In [1]:
from IPython.display import display, HTML
# Widen the notebook container to 92% of the browser window.
display(HTML("<style>.container { width:92% !important; }</style>"))

# Reload edited modules automatically so changes to local helper code are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
import sys, os
from os.path import expanduser
# ACTION REQUIRED: set `path` to your local project folder. It is appended to
# sys.path below and is also the root of every data/ file this notebook
# reads or writes.
path = "~/Documents/G3/MA-prediction"
path = expanduser(path)
sys.path.append(path)

# Project helper module; must be imported after `path` is on sys.path
# (presumably it lives in that folder — confirm).
import data_science_MA_kit as dsk
import pandas as pd
import numpy as np
import datetime
from tqdm import tqdm
import re
import wrds

# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
pd.options.mode.chained_assignment = None

In [2]:
# `wrds` is already imported in the first (imports) cell, so it is not
# re-imported here; keeping all imports in one place makes dependencies visible.
# Open the WRDS session used by every CRSP query below. This prompts for
# credentials interactively (see the output under this cell).
db = wrds.Connection()

Enter your WRDS username [yizhan]:olivershu
Enter your password:········
WRDS recommends setting up a .pgpass file.
Create .pgpass file now [y/n]?: y
Created .pgpass file successfully.
Loading library list...
Done


# Data processing 1: Merge with CRSP

In this notebook we merge the deals dataset from SDC with the CRSP database. CRSP is a database of historical security prices, especially for equities.

Specifically we will do the following:

- Look up the `permno` of each target and acquiror in the CRSP database. `permno` is CRSP's own identifier for each equity.
- For completed deals, look up the delisting code, dates, and returns in the CRSP database.

## Load data

In [3]:
# Read the deals dataset produced by the basic-cleaning step from its HDF5 file.
filepath = f"{path}/data/df_basic_cleaning.h5"
df = pd.read_hdf(filepath)

# Report the dataset size, then show the last rows as the cell output.
dsk.print_shape(df)
df.tail()

The dataset is of size (9854, 94).


Unnamed: 0_level_0,statc,one_day,aone_day,dao,da,dateannorig_days,de,dateeffexp,dw,definitive_agt,...,pricebook,eqvalcf,eqvalsales,eqval,tlia,cass,clia,lockup,dae,vest
master_deal_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3846599020,C,2021-12-14,2021-12-14,2021-12-15,2021-12-15,0,2022-07-06,2022-07-06,NaT,Yes,...,6.389,16.389,4.75,5699.204,1265.1,535.0,205.9,No,No,No
3847933020,C,2021-12-16,NaT,2021-12-17,2021-12-17,0,2022-05-13,2022-06-30,NaT,Yes,...,5.573,59.34,5.089,2456.13,347.3,237.8,147.5,No,No,No
3848920020,C,2021-07-09,2021-12-17,2021-07-11,2021-12-19,161,2022-03-31,2022-04-01,NaT,Yes,...,1.346,13.529,0.621,784.682,400.0,433.0,201.0,No,No,No
3847595020,C,2021-12-15,2021-12-17,2021-12-16,2021-12-20,4,2022-06-08,2022-06-06,NaT,No,...,7.331,20.551,4.971,28373.205,3703.3,2440.2,1551.3,No,No,No
3851185020,C,2021-12-22,2021-12-22,2021-12-23,2021-12-23,0,2022-05-27,2022-05-27,NaT,Yes,...,14.716,11.498,2.984,6082.266,2905.9,943.8,487.7,No,No,No


# Get target & acquiror permno
`permno` is the unique identifier for each security in the CRSP database. We need to match each target and (public) acquiror with CRSP, using its ticker or CUSIP on the announcement date, to facilitate later use of CRSP.

In [4]:
# process ticker and cusip
df.ttic = df.ttic.str.replace("'", "")
df.atic = df.atic.str.replace("'", "")
df.tcu = df.tcu.str.upper()
df.acu = df.acu.str.upper()

In [5]:
# Target side: look up each target's CRSP permno from its ticker / CUSIP and
# the announcement date (`da`). Takes 5-10 mins to run.
# NOTE(review): `da` is passed twice — presumably as both ends of a date
# window expected by the dsk helper; confirm against its signature.
columns = ['tpermno', 'ttic_CRSP', 'tcu_CRSP', 'tn_CRSP']
target_lookup_inputs = df[['ttic', 'tcu', 'da', 'da']]
tpermno_match = dsk.apply_func_to_ser(
    target_lookup_inputs,
    dsk.get_stock_permno_by_ticker_and_cusip_CRSP,
    return_as_df=True,
    columns=columns,
    db=db,
    return_id=True,
)

100%|███████████████████████████████████████| 9854/9854 [05:46<00:00, 28.45it/s]


In [6]:
# Acquiror side: same CRSP look-up as for targets, using the acquiror's
# ticker / CUSIP and the announcement date (`da`). Takes 5-10 mins to run.
# NOTE(review): `da` is passed twice — presumably as both ends of a date
# window expected by the dsk helper; confirm against its signature.
columns = ['apermno', 'atic_CRSP', 'acu_CRSP', 'an_CRSP']
acquiror_lookup_inputs = df[['atic', 'acu', 'da', 'da']]
apermno_match = dsk.apply_func_to_ser(
    acquiror_lookup_inputs,
    dsk.get_stock_permno_by_ticker_and_cusip_CRSP,
    return_as_df=True,
    columns=columns,
    db=db,
    return_id=True,
)

100%|███████████████████████████████████████| 9854/9854 [07:46<00:00, 21.11it/s]


## Save intermediate results (optional)

In [7]:
# Optional checkpoint (disabled): uncomment to save the target and acquiror
# permno matches to their own files. The combined CRSP results are saved
# further below, so this is only needed to avoid re-running the slow look-ups.
# NOTE(review): the variable names here previously did not match the ones
# created above (`tpermno_match` / `apermno_match`); corrected.
# filepath = f"{path}/data/df_tpermno.h5"
# tpermno_match.to_hdf(filepath, key='tpermno')

# filepath = f"{path}/data/df_apermno.h5"
# apermno_match.to_hdf(filepath, key='apermno')

# Get target delisting information

In [8]:
# Delisting information is only looked up for deals whose status code
# `statc` equals 'C' (the completed deals described in the introduction).
completed_mask = df.statc.eq('C')
completed_tpermno = tpermno_match.tpermno[completed_mask]

columns = ['delist_code', 'last_trade_date', 'delist_date', 'delist_amount', 'delist_return']
delist = dsk.apply_func_to_ser(
    completed_tpermno,
    dsk.get_delisting_information,
    return_as_df=True,
    columns=columns,
    db=db,
)

100%|███████████████████████████████████████| 7807/7807 [02:14<00:00, 57.90it/s]


# Save results
We combine the three results from the CRSP database and save them to a single file.

In [9]:
# Place the three CRSP look-ups side by side (column-wise): target match,
# acquiror match, and target delisting information.
crsp_parts = [tpermno_match, apermno_match, delist]
CRSP_results = pd.concat(crsp_parts, axis=1)

# Write the combined frame to HDF5; mode='w' replaces any existing file.
pathfile = f"{path}/data/CRSP_results.h5"
CRSP_results.to_hdf(pathfile, key='CRSP', mode='w')

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block1_values] [items->Index(['ttic_CRSP', 'tcu_CRSP', 'tn_CRSP', 'atic_CRSP', 'acu_CRSP', 'an_CRSP',
       'last_trade_date', 'delist_date'],
      dtype='object')]

  CRSP_results.to_hdf(pathfile, key = 'CRSP', mode='w')


We also save a file combining the CRSP result and the dataset after basic cleaning.

In [10]:
# Reload the cleaned dataset from disk: `df` was edited in place earlier
# (ticker / CUSIP normalisation), and the saved output should start from the
# stored version.
filepath = f"{path}/data/df_basic_cleaning.h5"
df = pd.read_hdf(filepath)

# One anchor column name per CRSP result column, in the same order as
# CRSP_results.columns (4 target + 4 acquiror + 5 delisting columns).
# NOTE(review): dsk.insert_cols presumably inserts each CRSP column next to
# its anchor column and modifies `df` in place — confirm in the dsk module.
target_anchors = ['ttic', 'ttic', 'tcu', 'tn']
acquiror_anchors = ['atic', 'atic', 'acu', 'an']
delist_anchors = ['att'] * 5
loc_names = target_anchors + acquiror_anchors + delist_anchors
dsk.insert_cols(df, loc_names, CRSP_results.columns, CRSP_results)

# Save the merged dataset; mode='w' replaces any existing file.
filepath = f"{path}/data/df_merge_CRSP.h5"
df.to_hdf(filepath, key='df', mode='w')

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block2_values] [items->Index(['statc', 'one_day', 'aone_day', 'dao', 'da', 'de', 'dateeffexp', 'dw',
       'definitive_agt', 'da_date', 'dateval', 'dcom', 'dcomeff',
       'last_trade_date', 'delist_date', 'att', 'attitude_change_yn',
       'initial_rec', 'tn_CRSP', 'tn', 'ttic_CRSP', 'ttic', 'tcu_CRSP', 'tcu',
       'texch', 'an_CRSP', 'an', 'apub', 'atic_CRSP', 'atic', 'acu_CRSP',
       'acu', 'anatc', 'aexch', 'cross', 'ttf_macro_desc', 'ttf_mid_desc',
       'atf_macro_desc', 'atf_mid_desc', 'valamend', 'consid_struct_desc',
       'consid', 'consido', 'consids', 'cha', 'tend', 'term', 'synop', 'hdate',
       'hosthprice', 'hval', 'hevent', 'hosthval', 'competecode', 'competeval',
       'lbo', 'afinancial', 'alp', 'aspv', 'awk', 'hedge_fund_involv_yn',
       'collar', 'lockup', 'dae', 'vest'],
      dtype='object')]

  df.to_hdf(filepath, key = 