In [1]:
from IPython.display import display, HTML
# Inject a CSS rule that sets `.container` elements to 92% width
# (presumably to make the notebook page use more of the browser window).
display(HTML("<style>.container { width:92% !important; }</style>"))

%load_ext autoreload
%autoreload 2
import sys, os
from os.path import expanduser
# ACTION REQUIRED: change `path` to the project folder on your machine.
# The leading "~" is expanded to the home directory, and the folder is appended
# to `sys.path` so that modules located there can be imported
# (presumably `data_science_MA_kit`, imported right below — verify).
# The same folder is used as the root for the `{path}/data/...` input and output files.
path = "~/Documents/G3/MA-prediction"
path = expanduser(path)
sys.path.append(path)

import data_science_MA_kit as dsk
import pandas as pd
import numpy as np
import datetime
from tqdm import tqdm
import re
# import wrds

# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
# NOTE(review): this hides the warning only; chained assignments can still fail to update `df`.
pd.options.mode.chained_assignment = None
import warnings
# Silence pandas PerformanceWarning messages.
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

# Data preprocessing 4: Variable Transformation

In this notebook we will do the following:

- Create `amend` and `choice` binary variables.
- Create competing group number and competing status code.
- Create deal value adjusted by CPI index.
- Create new payment type.
- Extract cash and stock terms from consideration.

## I/O

- Input:
    - `df_dates_corrected.h5`
    
- Output:
    - `df_variable_transform.h5`

## Load data

In [2]:
# Read the input table listed in the I/O section (`df_dates_corrected.h5`)
# from the project's `data` folder.
filepath = f"{path}/data/df_dates_corrected.h5"
df = pd.read_hdf(path_or_buf=filepath)

# Create `amend` and `choice` binary variables

In [3]:
# Build the `amend` variable and insert it into `df`
# (presumably placed relative to the `valamend` column — see dsk.insert_cols).
amend_values = dsk.create_amend(df)
dsk.insert_cols(df, 'valamend', 'amend', amend_values)

# Build the `choice` variable and insert it into `df`
# (presumably placed relative to the `consid` column — see dsk.insert_cols).
choice_values = dsk.create_choice(df)
dsk.insert_cols(df, 'consid', 'choice', choice_values)

# Create competing group number and status code

## Create competing group number

In [4]:
# Recode the `cha` column: 'Yes' -> 1, 'No' -> 0 (other values are left untouched).
yes_no_to_binary = {'Yes':1, 'No':0}
df['cha'] = df['cha'].replace(yes_no_to_binary)

In [5]:
# Create competing deal group numbers and insert them into `df` as `compete_group_no`.
dsk.insert_cols(df, 'lbo', 'compete_group_no', dsk.create_compete_group_no(df))

# Update `cha`: every deal that received a group number is flagged 1
# (the database contains some errors in this field).
# A single `.loc` assignment is used instead of chained indexing (`df.cha[mask] = 1`):
# the chained form may write into a temporary object and leave `df` unchanged,
# and the corresponding warning is disabled at the top of this notebook.
df.loc[df.compete_group_no.notna(), 'cha'] = 1

# Sanity checks on the generated group numbers; they only print a message.
# A value of -1 is reported as inconsistent group numbers within one group.
if df.compete_group_no.eq(-1).any():
    print('Different group numbers in one group.')
# A group number that occurs exactly once means a "group" of a single deal.
if df.compete_group_no.value_counts().eq(1).any():
    print('Group of a single deal.')

## Create competing status code

In [6]:
# Compute the competing status code for every deal, then insert it into `df`
# as the column `compete_statc_code`.
compete_status_code = dsk.create_compete_status_code(df)
dsk.insert_cols(df, 'lbo', 'compete_statc_code', compete_status_code)

In [7]:
# Frequency of each competing status code (missing values are not counted).
df['compete_statc_code'].value_counts()

2.0    700
0.0    502
1.0     85
9.0      3
3.0      3
Name: compete_statc_code, dtype: int64

# Create announced year and month. Create `val` adjusted by CPI index

## Create announced year and month.
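From the announcement date column `da`, derive the announcement year (`ann_year`) and the announcement month as a monthly period (`ann_year_month`).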

In [8]:
# Derive the announcement year and the announcement month (as a monthly Period)
# from the `da` column.
announce_dates = df['da']
df['ann_year'] = announce_dates.map(lambda x: x.year)     # useful later
df['ann_year_month'] = announce_dates.map(
    lambda x: pd.Period(f'{x.year}-{x.month}', 'M')
)

## Create deal value adjusted by CPI index. Delete monthly period column.
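CPI data is downloaded from the FRED [website](https://fred.stlouisfed.org/series/CPIAUCSL). We use December 2020 as the base month and express every deal value in December 2020 dollars by multiplying it by CPI(Dec 2020) / CPI(announcement month). If the CPI series ends before the latest announcement month, the last available CPI value is carried forward.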

In [9]:
# URL of the FRED CSV export for the CPIAUCSL series (despite its name, `html` holds a URL).
html = "https://fred.stlouisfed.org/graph/fredgraph.csv?id=CPIAUCSL"
# Download the CSV straight into a DataFrame; this needs network access.
df_cpi = pd.read_csv(filepath_or_buffer=html)

In [10]:
# Re-index the CPI table by month using its 'DATE' column, then collapse the
# result to a Series with `squeeze()`.
cpi_monthly = dsk.to_monthly_period_index(df_cpi, 'DATE')
cpi_ser = cpi_monthly.squeeze()
cpi_ser.head()

month
1947-01    21.48
1947-02    21.62
1947-03    22.00
1947-04    22.00
1947-05    21.95
Freq: M, Name: CPIAUCSL, dtype: float64

In [11]:
# Display the CPI series (pandas shows its first and last rows),
# presumably to check the latest month available.
cpi_ser

month
1947-01     21.480
1947-02     21.620
1947-03     22.000
1947-04     22.000
1947-05     21.950
            ...   
2022-05    291.474
2022-06    295.328
2022-07    295.271
2022-08    295.620
2022-09    296.761
Freq: M, Name: CPIAUCSL, Length: 909, dtype: float64

In [12]:
# The CPI series may end before the latest announcement month in `df`.
# If so, append the missing months one at a time, each carrying the last
# available CPI value forward.
last_deal_month = df['ann_year_month'].max()
latest_cpi_month = cpi_ser.index[-1] + 1
while latest_cpi_month <= last_deal_month:
    cpi_ser.loc[latest_cpi_month] = cpi_ser.iloc[-1]
    latest_cpi_month = latest_cpi_month + 1

In [13]:
# Base month for the CPI adjustment: '2020-12'.
base = pd.Period('2020-12', 'M')
# Factor per month: CPI(base) / CPI(month).
adj_factor = cpi_ser[base] / cpi_ser
# Look up the factor for each deal's announcement month; `.values` drops the
# period index so the multiplication is positional, row by row.
deal_factor = adj_factor[df.ann_year_month].values
val_adj_by_cpi = df.val.mul(deal_factor)
# Insert the CPI-adjusted deal value into `df` as `val_adj_by_cpi`.
dsk.insert_cols(df, 'val', 'val_adj_by_cpi', val_adj_by_cpi)

In [14]:
# The period dtype is not supported by the HDF format used for saving,
# so drop the monthly-period helper column.
df = df.drop(columns='ann_year_month')

# Create new payment type `consido_new` from `consido`

In [15]:
# List the payment types extracted from the `consido` column.
dsk.extract_all_payment_types(df['consido'])

array(['Amer. Depy. Receipt', 'American Depy Share', 'Assets', 'Bond',
       'Capital Infusion', 'Capital Stock', 'Cash', 'Cash Dividend',
       'Cash Only', 'Common L.P. Unit', 'Common Stock',
       'Common Stock Type A', 'Common Stock Type B',
       'Common Stock Type C', 'Common Stock Type E',
       'Common Stock Type H', 'Conting. Value Right',
       'Contingent Val Pref', 'Convert. Debenture', 'Convert. Exch. Pfd.',
       'Convert. Preferred', 'Convert. Preferred A',
       'Convert. Preferred B', 'Convert. Preferred D',
       'Convert. Securities', 'Convert. Senior Pfd.',
       'Convert. Sub. Deb.', 'Convert. Sub. Note', 'Convertible Note',
       'Cum Cvt Exch Pfd', 'Cum. Convert. Pfd.', 'Cum. Exch. Preferred',
       'Cumulative Preferred', 'Cvt Cum Pfd Ser A',
       'Cvt. Promissory Note', 'Debenture', 'Debt', 'Debt Reduction',
       'Depositary Receipt', 'Depositary Share', 'Earnout',
       'Exch Redeemable Pfd', 'Exch. Preferred', 'Exch. Sub. Deb.',
       'Jr. S

In [16]:
# Payment-type labels treated as cash.
lst_cash = [
    'Cash',
    'Cash Only',
]

# Payment-type labels treated as common stock.
# NOTE(review): 'Common Stock Type E' appears in the payment types listed above
# but is not included here — confirm whether that is intentional.
lst_stock = [
    'Common Stock',
    'Common Stock Type A',
    'Common Stock Type B',
    'Common Stock Type C',
    'Common Stock Type H',
    'Newly Issued Ord Sh',
    'Ordinary Share',
]

In [17]:
# # TODO: payment types to be dealt with later (not used yet)
# replace_lst_more = ['Amer. Depy. Receipt',
#                     'American Depy Share',
#                     'Cash Dividend',
#                     'Common L.P. Unit',
#                     'Depositary Receipt',
#                     'Depositary Share',
#                     'L.P. Unit',
#                     'Unit']

In [18]:
# Build the new payment type `consido_new` from `consido`, using the cash and
# stock label lists; missing values are skipped (na_action='ignore').
def _to_new_payment_type(payment_str):
    """Transform one `consido` string with the cash/stock label lists."""
    return dsk.transform_payment_str(payment_str, lst_cash, lst_stock)

consido_new = df.consido.map(_to_new_payment_type, na_action='ignore')
dsk.insert_cols(df, 'consido', 'consido_new', consido_new)
df.consido_new.value_counts()

Cash                     6489
Common Stock             2922
Cash and Common Stock    1695
No Cash or Stock          898
Name: consido_new, dtype: int64

# Extract cash and stock terms from consideration.

In [19]:
# Rewrite the `consid` text in a more readable format (missing values are skipped).
readable_consid = df['consid'].map(dsk.convert_consid_to_readable, na_action='ignore')
df['consid'] = readable_consid

# Extract the terms from the converted `consid` text into a new `terms` column.
df['terms'] = dsk.extract_term_from_consid(df['consid'])

In [20]:
# Apply the cash/stock extractor to every entry of `terms`; the results are
# returned as a DataFrame.
cash_stock_terms = dsk.apply_func_to_ser_df(
    df['terms'],
    dsk.extract_cash_stock_from_term,
    return_as_df=True,
)
# One 'consid' entry per inserted column.
# NOTE(review): this assumes `cash_stock_terms` has exactly three columns — confirm.
anchor_cols = ['consid', 'consid', 'consid']
dsk.insert_cols(df, anchor_cols, cash_stock_terms.columns, cash_stock_terms)

100%|██████████████████████████████████| 10688/10688 [00:00<00:00, 14726.31it/s]


In [21]:
# Frequency of each `payment_type` value (presumably one of the columns inserted
# by the previous cell — verify).
df['payment_type'].value_counts()

Cash                                   5346
parse failed                           2413
Common Stock                           2154
Cash and Common Stock                   432
Common Stock, fixed dollar              240
Cash and Common Stock, fixed dollar     103
Name: payment_type, dtype: int64

In [22]:
# Stock indicator: True when the payment type is exactly one of the labels below.
# NOTE(review): the ', fixed dollar' variants shown in the payment_type counts
# are not included — confirm that this is intentional.
stock_payment_types = ['Common Stock', 'Cash and Common Stock']
df['stock'] = df['payment_type'].isin(stock_payment_types)

# Save results

In [23]:
# Write the transformed table to the output file listed in the I/O section.
# mode='w' creates a new file, replacing any existing one.
filepath = f"{path}/data/df_variable_transform.h5"
df.to_hdf(path_or_buf=filepath, key='df', mode='w')