In [1]:
%load_ext autoreload
%autoreload 2
import sys, os
from os.path import expanduser
## actions required!!!!!!!!!!!!!!!!!!!! change your folder path 
path = "~/Documents/G3/MA-prediction"
path = expanduser(path)
sys.path.append(path)

import pandas as pd
import numpy as np
import datetime
from tqdm import tqdm

pd.options.mode.chained_assignment = None
import warnings
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
from MA_prediction.utils import *
from MA_prediction.preprocessing import *
from MA_prediction.crsp import *
from MA_prediction.mkt_calendar import *

In [3]:
import wrds
db = wrds.Connection()

Enter your WRDS username [yizhan]: olivershu
Enter your password: ········


WRDS recommends setting up a .pgpass file.


Create .pgpass file now [y/n]?:  y


Created .pgpass file successfully.
Loading library list...
Done


# Data processing 1: CRSP

In this notebook we merge the dataset from SDC with CRSP database. CRSP is the database for historical prices of equities. 

Specifically we will do the following:

- Look for the `permno` of all the targets and acquirors. `permno` is CRSP's own unique identifier for every stock.
- Look for delisting information (delisting code, last trade date, delisting date and delisting returns) in CRSP database, for all the targets.
- pull raw market data for all the targets and acquirors, from 40 (trading) days prior to `dao`, to 40 days after `dr`.

These market data will be used for backtesting after cleaning. For pending deals where `dr` is not available, we use the latest `de` or `da` in the dataset to indicate the last date.

## I/O

- Input: 
    - `df_basic_cleaning.h5`

- Output: 
    - `permno_CRSP.h5`
    - `delist_CRSP.h5`
    - `df_permno_delist_CRSP.h5`
    - `market_data_tgt_raw.pickle`, `market_data_acq_raw.pickle`

## Load data

In [4]:
# read hdf file
filepath = f"{path}/data/intermediate/df_basic_cleaning.h5"
df = pd.read_hdf(filepath)

print_shape(df)
df.tail()

The dataset is of size (12079, 95).


Unnamed: 0_level_0,statc,one_day,aone_day,dao,da,dateannorig_days,de,dateeffexp,dw,dr,...,pricebook,eqvalcf,eqvalsales,eqval,tlia,cass,clia,lockup,dae,vest
master_deal_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3992461020,P,2022-10-24,2022-12-16,2022-10-25,2022-12-19,54,,2023-12-31,,,...,8.659,16.011,2.087,4547.2,1748.5,1001.7,802.9,No,No,No
4015877020,P,2022-12-16,2022-12-16,2022-12-19,2022-12-19,0,,2023-02-28,,,...,4.839,,0.752,16.141,18.3,14.2,16.4,No,No,No
4016515020,P,2022-12-19,2022-12-19,2022-12-20,2022-12-20,0,,2023-06-30,,,...,,,,52.581,,,,No,No,No
4017224020,P,2022-12-20,2022-12-20,2022-12-21,2022-12-21,0,,2023-03-31,,,...,0.75,,2.912,55.152,61.3,97.6,11.2,No,No,No
4019588020,P,2022-12-23,2022-12-23,2022-12-27,2022-12-27,0,,NaT,,,...,,,0.895,25.412,52.4,34.1,36.4,No,No,No


# Get target & acquiror permno
`permno` is the unique identifier for each security in CRSP database. We need to match each target and (public) acquiror with CRSP, by its ticker or cusip at the announcement day, to facilitate later use of CRSP.

In [5]:
# clean ticker and cusip
df.ttic = df.ttic.str.replace("'", "")
df.atic = df.atic.str.replace("'", "")
df.tcu = df.tcu.str.upper()
df.acu = df.acu.str.upper()

In [6]:
def get_stock_permno_from_ser(ser, db=None):
    """
    get the permno and stock information, by a series of (ticker, cusip, date)
    """
    return get_stock_permno_by_ticker_and_cusip_CRSP(ser.iloc[0], ser.iloc[1], date=ser.iloc[2], return_names=True, db=db)

In [7]:
# get target permno, by its ticker, cusip and the announcement date
tpermno_match = apply_func_to_ser_df(df[['ttic', 'tcu', 'da']], get_stock_permno_from_ser, return_as_df=True, db=db)
tpermno_match.columns = ['tpermno', 'ttic_CRSP', 'tcu_CRSP', 'tn_CRSP']
print(f"{tpermno_match.tpermno.isnull().sum()} targets out of {len(df)} cannot find a permno.")

100%|█████████████████████████████████████| 12079/12079 [05:45<00:00, 34.92it/s]


893 targets out of 12079 cannot find a permno.


In [8]:
# get acquiror permno, by its ticker, cusip and the announcement date
apermno_match = apply_func_to_ser_df(df[['atic', 'acu', 'da']], get_stock_permno_from_ser, return_as_df=True, db=db)
apermno_match.columns = ['apermno', 'atic_CRSP', 'acu_CRSP', 'an_CRSP']
print(f"{apermno_match.apermno.isnull().sum()} acquirors out of {len(df)} cannot find a permno.")

100%|█████████████████████████████████████| 12079/12079 [06:14<00:00, 32.25it/s]


5068 acquirors out of 12079 cannot find a permno.


## Save intermediate results

We combine the permno search results and save it as a file.

In [9]:
# concat permno search results, and save it
permno_CRSP = pd.concat([tpermno_match, apermno_match], axis=1)
# save
pathfile = f"{path}/data/intermediate/permno_CRSP.h5"
permno_CRSP.to_hdf(pathfile, key = 'permno_CRSP', mode='w')

# concat into the dataset
loc_names = ['ttic', 'ttic', 'tcu', 'tn'] + ['atic', 'atic', 'acu', 'an']
insert_cols(df, loc_names, None, permno_CRSP)

# Get target delisting information

In [10]:
# columns = ['delist_code', 'last_trade_date', 'delist_date', 'delist_amount', 'delist_return']
delist_CRSP = apply_func_to_ser_df(df.tpermno, get_delisting_information, return_as_df=True, db=db)
complete = df.statc.eq('C')
print(f"{(complete & delist_CRSP.delist_return.isnull()).sum()} out of {complete.sum()} completed deals cannot find delisting returns.")

100%|█████████████████████████████████████| 11186/11186 [02:40<00:00, 69.90it/s]


865 out of 8967 completed deals cannot find delisting returns.


In [11]:
## Save intermediate results
pathfile = f"{path}/data/intermediate/delist_CRSP.h5"
delist_CRSP.to_hdf(pathfile, key = 'delist_CRSP', mode='w')
# concat into the dataset
loc_names = ['att'] * 5
insert_cols(df, loc_names, None, delist_CRSP)

In [12]:
# save
filepath = f"{path}/data/intermediate/df_permno_delist_CRSP.h5"
df.to_hdf(filepath, key = 'df', mode='w')

# pull raw market data

In [13]:
df['dao_40days_prior'] = get_trading_day_offset(df.dao, -40)
last_date = max(df.da.max(), df.dr.dropna().max())
df['dr_40days_after'] = get_trading_day_offset(df.dr.fillna(last_date), 40)

In [14]:
def get_stock_market_data_daily_from_ser_CRSP(ser, db=db):
    """
    get daily market data from a series of (permno, start_date, end_date)
    """
    return get_stock_market_data_daily_CRSP(ser.iloc[0], start_date=ser.iloc[1], end_date=ser.iloc[2], db=db)

In [15]:
mkt_data_tgt = apply_func_to_ser_df(df[['tpermno', 'dao_40days_prior', 'dr_40days_after']], 
                                    get_stock_market_data_daily_from_ser_CRSP, 
                                    return_as_df=False, 
                                    db=db)
print(f"{mkt_data_tgt.isna().sum()} out of {len(df)} deals cannot find target market data.")

100%|█████████████████████████████████████| 12079/12079 [03:56<00:00, 51.13it/s]


893 out of 12079 deals cannot find target market data.


In [16]:
mkt_data_acq = apply_func_to_ser_df(df[['apermno', 'dao_40days_prior', 'dr_40days_after']],
                                       get_stock_market_data_daily_from_ser_CRSP,
                                       return_as_df=False,
                                       db=db)
print(f"{mkt_data_acq.isna().sum()} out of {len(df)} deals cannot find acquiror market data.")

100%|█████████████████████████████████████| 12079/12079 [02:07<00:00, 94.88it/s]


5068 out of 12079 deals cannot find acquiror market data.


## save raw market data

In [17]:
import pickle

filepath_tgt = f"{path}/data/raw/mkt_data_tgt_raw.pickle"
with open(filepath_tgt, 'wb') as handle:
    pickle.dump(mkt_data_tgt, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
    
filepath_acq = f"{path}/data/raw/mkt_data_acq_raw.pickle"
with open(filepath_acq, 'wb') as handle:
    pickle.dump(mkt_data_acq, handle, protocol=pickle.HIGHEST_PROTOCOL)