In [1]:
%load_ext autoreload
%autoreload 2
import sys, os
from os.path import expanduser
# Project root folder. Set the MA_PREDICTION_PATH environment variable to point at
# your own checkout; if it is unset (or empty) the default location below is used.
path = os.environ.get("MA_PREDICTION_PATH") or "~/Documents/G3/MA-prediction"
# Expand a leading "~" to the current user's home directory.
path = expanduser(path)
# Make the project folder importable. Guarded so that re-running this cell does
# not keep appending duplicate entries to sys.path.
if path not in sys.path:
    sys.path.append(path)

import pandas as pd
import numpy as np

# Turn off pandas' chained-assignment warning for this session.
pd.set_option("mode.chained_assignment", None)
import warnings

# Hide pandas PerformanceWarning and FutureWarning messages for this session.
warnings.simplefilter('ignore', pd.errors.PerformanceWarning)
warnings.simplefilter('ignore', FutureWarning)

In [2]:
from MA_prediction.utils import *
from MA_prediction.preprocessing import *
from MA_prediction.mkt_calendar import *

# Data Cleaning 4: Variable Transformation

In this notebook we will do the following:

- Create `amend` and `choice` binary variables.
- Create competing group number and competing status code.
- Clean payment type.
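- Extract cash and stock terms from consideration.
- Create the `stock` indicator from the parsed payment type.
- Add `duration`, the number of trading days between `da` and `dr`.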

## I/O

- Input:
    - `data/intermediate/df_permno_delist_CRSP.h5`
    
- Output:
    - `data/cleaned/df_cleaned.h5`
    - `data/cleaned/df_cleaned.csv`


## Load data

In [3]:
# Read the notebook's input table (see the I/O section) into `df`.
# `path` is the project root configured in the first cell.
filepath = f"{path}/data/intermediate/df_permno_delist_CRSP.h5"
df = pd.read_hdf(filepath)

# Create `amend` and `choice` binary variables

In [4]:
# Build the binary `amend` variable and insert it into `df`, positioned
# relative to the existing `valamend` column (placement is decided by `insert_cols`).
amend_flag = create_amend(df)
insert_cols(df, 'valamend', 'amend', amend_flag)

# Build the binary `choice` variable and insert it into `df`, positioned
# relative to the existing `consid` column.
choice_flag = create_choice(df)
insert_cols(df, 'consid', 'choice', choice_flag)

# Create competing group number and status code

## Create competing group number

In [5]:
# Recode the `cha` column from 'Yes'/'No' to 1/0; other values are left unchanged.
yes_no_to_int = {'Yes': 1, 'No': 0}
df['cha'] = df['cha'].replace(yes_no_to_int)

In [6]:
# Compute the competing-deal group number for every deal and insert it into `df`,
# positioned relative to the `lbo` column.
group_numbers = create_compete_group_no(df)
insert_cols(df, 'lbo', 'compete_group_no', group_numbers)

# Sanity checks on the new column (messages only, nothing is raised):
#   * a value of -1 is reported as inconsistent group numbers within one group;
#   * a group number that occurs exactly once is reported as a single-deal group.
group_col = df['compete_group_no']
has_conflict = (group_col == -1).any()
has_singleton = (group_col.value_counts() == 1).any()
if has_conflict:
    print('Different group numbers in one group.')
if has_singleton:
    print('Group of a single deal.')

## Create competing status code

In [7]:
# Compute the competing status code for every deal and insert it into `df`,
# positioned relative to the `lbo` column.
status_codes = create_compete_status_code(df)
insert_cols(df, 'lbo', 'compete_statc_code', status_codes)

In [8]:
# Frequency of each competing status code.
df['compete_statc_code'].value_counts()

2.0    595
0.0    449
1.0     63
9.0      3
3.0      3
Name: compete_statc_code, dtype: int64

# Create new payment type `consido_new`

In [9]:
# List the distinct payment types that occur in the `consido` column.
extract_all_payment_types(df['consido'])

array(['Amer. Depy. Receipt', 'American Depy Share', 'Assets', 'Bond',
       'Capital Infusion', 'Capital Stock', 'Cash', 'Cash Dividend',
       'Cash Only', 'Common L.P. Unit', 'Common Stock',
       'Common Stock Type A', 'Common Stock Type B',
       'Common Stock Type C', 'Common Stock Type E',
       'Common Stock Type H', 'Conting. Value Right',
       'Contingent Val Pref', 'Convert. Debenture', 'Convert. Exch. Pfd.',
       'Convert. Preferred', 'Convert. Preferred A',
       'Convert. Preferred B', 'Convert. Preferred D',
       'Convert. Securities', 'Convert. Senior Pfd.',
       'Convert. Sub. Deb.', 'Convert. Sub. Note', 'Convertible Note',
       'Cum Cvt Exch Pfd', 'Cum. Convert. Pfd.', 'Cum. Exch. Preferred',
       'Cumulative Preferred', 'Cvt Cum Pfd Ser A',
       'Cvt. Promissory Note', 'Debenture', 'Debt', 'Debt Reduction',
       'Depositary Receipt', 'Depositary Share', 'Earnout',
       'Exch Redeemable Pfd', 'Exch. Preferred', 'Exch. Sub. Deb.',
       'Jr. S

In [10]:
# Payment-type labels treated as cash.
lst_cash = [
    'Cash',
    'Cash Only',
]

# Payment-type labels treated as (common) stock.
lst_stock = [
    'Common Stock',
    'Common Stock Type A',
    'Common Stock Type B',
    'Common Stock Type C',
    'Common Stock Type E',
    'Common Stock Type H',
    'Newly Issued Ord Sh',
    'Ordinary Share',
]

In [11]:
# TODO: payment types to be dealt with later (kept commented out on purpose).
# replace_lst_more = ['Amer. Depy. Receipt',
#                     'American Depy Share',
#                     'Cash Dividend',
#                     'Common L.P. Unit',
#                     'Depositary Receipt',
#                     'Depositary Share',
#                     'L.P. Unit',
#                     'Unit']

In [12]:
# Map every non-missing `consido` string through `transform_payment_str` using the
# cash / stock label lists, and insert the result as `consido_new`, positioned
# relative to the `consido` column.
consido_simplified = df['consido'].map(
    lambda payment: transform_payment_str(payment, lst_cash, lst_stock),
    na_action='ignore',
)
insert_cols(df, 'consido', 'consido_new', consido_simplified)

# Distribution of the new payment-type categories.
df['consido_new'].value_counts()

Cash                     6140
Common Stock             2638
Cash and Common Stock    1582
No Cash or Stock           82
Name: consido_new, dtype: int64

# Extract cash and stock terms from consideration.

In [13]:
# Run `correct_consid` on the two consideration columns and write the corrected
# values back into `df`.
cols = ['consid', 'consido']
corrected_consid = correct_consid(df[cols])
df[cols] = corrected_consid

In [14]:
# Rewrite every non-missing `consid` entry into a more readable format
# (missing values are passed through untouched).
df['consid'] = df['consid'].map(convert_consid_to_readable, na_action='ignore')

In [15]:
# Pull the deal terms out of the `consid` text and insert them as `terms`,
# positioned relative to the `consid` column.
consid_terms = extract_term_from_consid(df['consid'])
insert_cols(df, "consid", "terms", consid_terms)

In [16]:
# Apply `extract_cash_stock_from_term` to each entry of `terms` and collect the
# results as a DataFrame (`return_as_df=True`).
cash_stock_terms = apply_func_to_ser_df(
    df['terms'],
    extract_cash_stock_from_term,
    return_as_df=True,
)

100%|██████████████████████████████████| 10084/10084 [00:00<00:00, 18557.67it/s]


In [17]:
# Insert the extracted cash/stock term columns into `df`, each positioned relative
# to `consid`. No new names are given (None) -- presumably the columns keep the
# names they have in `cash_stock_terms`; assumes it has three columns (TODO confirm).
anchor_cols = ['consid', 'consid', 'consid']
insert_cols(df, anchor_cols, None, cash_stock_terms)

In [18]:
# Distribution of the parsed payment types.
df['payment_type'].value_counts()

Cash                                   5345
Common Stock                           2138
parse failed                           1838
Cash and Common Stock                   428
Common Stock, fixed dollar              236
Cash and Common Stock, fixed dollar      99
Name: payment_type, dtype: int64

In [19]:
# Stock indicator: 1 when the payment type is exactly 'Common Stock' or
# 'Cash and Common Stock', 0 for every other value.
# NOTE(review): the ", fixed dollar" variants and 'parse failed' therefore get 0 --
# confirm this is intended.
stock_payment_types = ['Common Stock', 'Cash and Common Stock']
stock_flag = df['payment_type'].isin(stock_payment_types).astype(int)
insert_cols(df, 'payment_type', "stock", stock_flag)

# Add duration

In [20]:
# Deal duration: result of `get_num_trading_days_between` for the `da` and `dr`
# columns, inserted as `duration` and positioned relative to `definitive_agt`.
deal_duration = get_num_trading_days_between(df['da'], df['dr'])
insert_cols(df, "definitive_agt", "duration", deal_duration)

# Save results

In [21]:
# Write the cleaned deal table to HDF5 and to a CSV file with the same base name.
filepath = f"{path}/data/cleaned/df_cleaned.h5"

# Make sure the output folder exists before writing.
os.makedirs(os.path.dirname(filepath), exist_ok=True)

df.to_hdf(filepath, key='df_cleaned', mode='w')
# Swap only the file extension. `filepath.replace("h5", "csv")` would also rewrite
# any other "h5" occurring in the path (e.g. inside a folder or user name).
df.to_csv(os.path.splitext(filepath)[0] + ".csv")