In [1]:
# Widen the notebook container (92% of the page) so wide tables are easier to read.
from IPython.display import display, HTML
display(HTML("<style>.container { width:92% !important; }</style>"))

# Autoreload mode 2: re-import all modules before each cell executes, so edits
# to imported modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
import sys, os
from os.path import expanduser
# ACTION REQUIRED: change `path` to your own local project folder.
path = "~/Documents/G3/MA-prediction"
# Expand the leading "~" to the current user's home directory.
path = expanduser(path)
# Make modules in the project folder importable
# (presumably where `data_science_MA_kit` lives -- verify).
sys.path.append(path)

import data_science_MA_kit as dsk
import pandas as pd
import numpy as np
import datetime
from tqdm import tqdm
import re
# import wrds

# Disable pandas' chained-assignment warning for the whole session.
pd.options.mode.chained_assignment = None

# Data Processing Notebooks Outline: 

We download raw M&A deals data from SDC platinum, and process them in several notebooks (in order):
- Notebook 0: basic cleaning.
- Notebook 1: match with CRSP database.
- Notebook 2: date correction.
- Notebook 3: create new variables.
- Notebook 4: apply filters.

General guidelines for these data processing notebooks:
- We create new columns (variables) on all the rows first, before applying any filters.
- When filtering we should not drop any row directly, in case we want to retrieve them later. Instead we add another column called `retain` to indicate whether to retain the row after applying the filters. 
- These notebooks shall be highly modular, meaning that almost every data operation should be encapsulated in a function in the helper package. Each function is developed in its own notebook (hence tens of development notebooks). In this way the end user only needs to read the comments without digging into the code.
- From time to time we save the intermediate result as an `hdf` file, as some code (especially the parts querying the CRSP database) needs tens of minutes to run. We therefore want to run it only once and store the results for later use. The advantage of `hdf` over `csv` is that it preserves data types such as `datetime.date`. Only when we need to inspect the dataset in `Excel` or `Numbers` shall we save it as `csv`.


## Data Processing 0: Basic Cleaning
Specifically in this notebook we will do the following:

- Load column names from the report file. Load raw data. Change column names. 
- Transform date-like columns to `datetime.date` dtype. Transform float-like columns to float.
- Correct `consid` for some deals manually.
- Fill missing:
    - `pr_initial` by `pr`. 
    - `one_day` by the trading day immediately preceding `dao`.

# Load data
## Load column names
The full column names in the raw data are too long and unwieldy for Python operations; thus we replace them with the acronyms used in the database, which we extract from the report file. The correspondence between the two is saved as a `csv` file called `column_names.csv`. Another comprehensive file, `SDC_MA_guide.pdf`, explains the exact definition of all the variables in the database.

In [2]:
# Location of the report file inside the project's data folder.
filepath = f"{path}/data/report.rpt"
# extract colnames from report file. The first name is the index name
colnames = dsk.extract_colnames_from_report_file(filepath)
# show the last 10 column names
colnames[-10:]

['pricebook',
 'eqvalcf',
 'eqvalsales',
 'eqval',
 'tlia',
 'cass',
 'clia',
 'lockup',
 'dae',
 'vest']

## Load raw data
Load raw data from the `csv` file. 

In [3]:
# Read the raw deals table: the first CSV column becomes the index, and the
# placeholder strings 'nm' and 'np' are parsed as missing values.
filepath = f"{path}/data/df.csv"
csv_options = {"index_col": 0, "na_values": ['nm', 'np'], "low_memory": False}
df = pd.read_csv(filepath, **csv_options)

## Change column names
Change column names. 

In [4]:
# Full (long) names: the index name first, then every column name, each with
# runs of whitespace collapsed to single spaces.
raw_names = [df.index.name, *df.columns]
colnames_full = [" ".join(raw_name.split()) for raw_name in raw_names]

# Persist the acronym -> full-name lookup table for later reference.
filepath = f"{path}/data/column_names.csv"
name_lookup = pd.Series(colnames_full, index=colnames, name='column name')
name_lookup.to_csv(filepath)

In [5]:
# Swap the long names for acronyms: the first acronym names the index,
# the remaining ones name the columns.
df.index.name, df.columns = colnames[0], colnames[1:]

# Report the dataset size and preview the last rows.
dsk.print_shape(df)
df.tail()

The dataset is of size (9854, 94).


Unnamed: 0_level_0,statc,one_day,aone_day,dao,da,dateannorig_days,de,dateeffexp,dw,definitive_agt,...,pricebook,eqvalcf,eqvalsales,eqval,tlia,cass,clia,lockup,dae,vest
master_deal_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3846599020,C,12/14/21,12/14/21,12/15/21,12/15/21,0,07/06/22,07/06/22,,Yes,...,6.389,16.389,4.75,5699.204,1265.1,535.0,205.9,No,No,No
3847933020,C,12/16/21,,12/17/21,12/17/21,0,05/13/22,06/30/22,,Yes,...,5.573,59.34,5.089,2456.13,347.3,237.8,147.5,No,No,No
3848920020,C,07/09/21,12/17/21,07/11/21,12/19/21,161,03/31/22,04/01/22,,Yes,...,1.346,13.529,0.621,784.682,400.0,433.0,201.0,No,No,No
3847595020,C,12/15/21,12/17/21,12/16/21,12/20/21,4,06/08/22,06/06/22,,No,...,7.331,20.551,4.971,28373.205,3703.3,2440.2,1551.3,No,No,No
3851185020,C,,12/22/21,12/23/21,12/23/21,0,05/27/22,05/27/22,,Yes,...,14.716,11.498,2.984,6082.266,2905.9,943.8,487.7,No,No,No


# Transform date-like and float-like columns
Transform date-like columns to `datetime.date` dtype. Transform float-like columns to float.

In [6]:
# Columns holding date strings.
cols_dt = ['one_day', 'aone_day', 'dao', 'da', 'de', 'dateeffexp', 'dw', 'da_date', 'dateval', 'dcom', 'dcomeff']

# Columns holding numeric strings.
cols_float = ['val', 'mv', 'amv', 'pr', 'ppmday', 'ppmwk', 'ppm4wk', 'roe', 'tlia', 'cass', 'clia']

# Run the matching helper column by column: dates first, then numbers.
conversions = (
    (cols_dt, dsk.convert_date_str_ser_to_datetime),
    (cols_float, dsk.convert_num_str_ser_to_float),
)
for col_group, converter in conversions:
    df[col_group] = df[col_group].apply(converter)

# Correct `consid` and `consido` for some deals manually
Correct `consid` and `consido` for some deals manually.

In [7]:
# Apply the manual corrections from the helper package to the two
# consideration columns and write the result back.
cols = ['consid', 'consido']
consid_corrected = dsk.correct_consid(df[cols])
df[cols] = consid_corrected

# Fill missing 
## `pr_initial` by `pr`

In [8]:
# fill missing `pr_initial` by `pr`
# A single .loc assignment writes directly into `df`. The previous chained form
# (`df.pr_initial[mask] = ...`) assigned into an intermediate Series, which only
# reaches `df` when pandas happens to return a view and is a silent no-op under
# Copy-on-Write.
missing_pr_initial = df['pr_initial'].isna()
df.loc[missing_pr_initial, 'pr_initial'] = df.loc[missing_pr_initial, 'pr']

## `one_day` by the previous trading day to `dao`

In [9]:
# fill missing one_day by the previous trading day to <dao>
# Compute the missing-value mask once and assign through .loc so the write goes
# directly into `df`. The previous chained form (`df.one_day[mask] = ...`)
# assigned into an intermediate Series and is a silent no-op under Copy-on-Write.
missing_one_day = df['one_day'].isna()
df.loc[missing_one_day, 'one_day'] = dsk.get_trading_day_offset(df.loc[missing_one_day, 'dao'], -1)

# Save results

In [10]:
# Write the cleaned dataset to a fresh HDF5 file (mode 'w' overwrites any
# existing file) under the key 'df'.
filepath = f"{path}/data/df_basic_cleaning.h5"
df.to_hdf(path_or_buf=filepath, key='df', mode='w')

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block2_values] [items->Index(['statc', 'one_day', 'aone_day', 'dao', 'da', 'de', 'dateeffexp', 'dw',
       'definitive_agt', 'da_date', 'dateval', 'dcom', 'dcomeff', 'att',
       'attitude_change_yn', 'initial_rec', 'tn', 'ttic', 'tcu', 'texch', 'an',
       'apub', 'atic', 'acu', 'anatc', 'aexch', 'cross', 'ttf_macro_desc',
       'ttf_mid_desc', 'atf_macro_desc', 'atf_mid_desc', 'valamend',
       'consid_struct_desc', 'consid', 'consido', 'consids', 'cha', 'tend',
       'term', 'synop', 'hdate', 'hosthprice', 'hval', 'hevent', 'hosthval',
       'competecode', 'competeval', 'lbo', 'afinancial', 'alp', 'aspv', 'awk',
       'hedge_fund_involv_yn', 'collar', 'lockup', 'dae', 'vest'],
      dtype='object')]

  df.to_hdf(filepath, key = 'df', mode='w')


# Combine with the CRSP result
If we later need to modify this notebook, we can combine the new result with the saved CRSP result in this block. This saves the tens of minutes needed to pull the data from CRSP again.

In [11]:
# NOTE: this cell is intentionally left commented out. Uncomment it only after
# this notebook has been modified and re-run, to re-attach the previously saved
# CRSP results instead of pulling them from CRSP again.
# filepath = f"{path}/data/CRSP_results.h5"
# CRSP_results = pd.read_hdf(filepath)

# # combine CRSP result with the dataset after basic cleaning
# loc_names = ['ttic', 'ttic', 'tcu', 'tn'] + ['atic', 'atic', 'acu', 'an'] + ['att'] * 5
# dsk.insert_cols(df, loc_names, CRSP_results.columns, CRSP_results)

# filepath = f"{path}/data/df_merge_CRSP.h5"
# df.to_hdf(filepath, key = 'df', mode='w')