In [25]:
%load_ext autoreload
%autoreload 2
import sys, os
from os.path import expanduser
## ACTION REQUIRED: point `path` at your local checkout of the MA-prediction project,
## either by editing the default below or by setting the MA_PREDICTION_PATH environment variable.
path = os.environ.get("MA_PREDICTION_PATH", "~/Documents/G3/MA-prediction")
path = expanduser(path)
# Make the project's `MA_prediction` package importable. The membership check keeps
# sys.path from growing by one duplicate entry every time this cell is re-run.
if path not in sys.path:
    sys.path.append(path)

import pandas as pd
import numpy as np

pd.options.mode.chained_assignment = None
import warnings
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)
warnings.simplefilter(action='ignore', category=FutureWarning)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [86]:
from MA_prediction.utils import *
from MA_prediction.preprocessing import *
from MA_prediction.crsp import *
from MA_prediction.mkt_calendar import *
from sklearn.utils import _is_arraylike_not_scalar

In [135]:
# Open a WRDS session. `db` is the connection handle that later cells pass to the
# CRSP lookup helpers via `db=db`. Per the recorded output, Connection() prompts
# interactively for a WRDS username and password.
import wrds
db = wrds.Connection()

Enter your WRDS username [yizhan]: olivershu
Enter your password: ········


WRDS recommends setting up a .pgpass file.


Create .pgpass file now [y/n]?:  y


Created .pgpass file successfully.
Loading library list...
Done


# Data processing 1: CRSP — match `permno`

In this notebook we merge the SDC dataset with the CRSP database, which holds historical equity prices. Specifically, for every target and acquiror we look up its `permno` — the unique identifier of a stock in CRSP — using the ticker or cusip provided by SDC. We then compare the CRSP prices with the prices recorded in SDC to make sure the match is accurate.

- First, look up `permno` by ticker and `da`.
- Pull the CRSP prices for the window from 5 trading days before to 5 trading days after `dao`, and compare them with the SDC columns `['pr1day', 'tprday', 'tpr1daya']`. If they are close, the match is accepted.
- Otherwise, look up `permno` by `cusip` and `da`, and check the stock price in the same manner.

We do the above for both target and acquiror.

## I/O

- Input: 
    - `df_basic_cleaning.h5`

- Output: 
    - `permno_CRSP.h5`
    - `delist_CRSP.h5`
    - `df_permno_delist_CRSP.h5`
    - `mkt_data_tgt_raw.pickle`, `mkt_data_acq_raw.pickle`

## Load data

In [4]:
# read hdf file
# Input of this notebook: the deal table written to the project's intermediate data
# folder as `df_basic_cleaning.h5`. The recorded output shows it is indexed by
# `master_deal_no`.
filepath = f"{path}/data/intermediate/df_basic_cleaning.h5"
df = pd.read_hdf(filepath)

# print_shape is a project helper (star-imported); the recorded output shows it reports the frame's size.
print_shape(df)
df.tail()

The dataset is of size (10445, 95).


Unnamed: 0_level_0,statc,one_day,aone_day,dao,da,dateannorig_days,de,dateeffexp,dw,dr,...,pricebook,eqvalcf,eqvalsales,eqval,tlia,cass,clia,lockup,dae,vest
master_deal_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3992461020,P,2022-10-24,2022-12-16,2022-10-25,2022-12-19,54,,2023-12-31,,,...,8.659,16.011,2.087,4547.2,1748.5,1001.7,802.9,No,No,No
4015877020,P,2022-12-16,2022-12-16,2022-12-19,2022-12-19,0,,2023-02-28,,,...,4.839,,0.752,16.141,18.3,14.2,16.4,No,No,No
4016515020,P,2022-12-19,2022-12-19,2022-12-20,2022-12-20,0,,2023-06-30,,,...,,,,52.581,,,,No,No,No
4017224020,P,2022-12-20,2022-12-20,2022-12-21,2022-12-21,0,,2023-03-31,,,...,0.75,,2.912,55.152,61.3,97.6,11.2,No,No,No
4019588020,P,2022-12-23,2022-12-23,2022-12-27,2022-12-27,0,,NaT,,,...,,,0.895,25.412,52.4,34.1,36.4,No,No,No


# Get target & acquiror permno
`permno` is the unique identifier of each security in the CRSP database. We need to match each target and (public) acquiror to CRSP, by its ticker or cusip on the announcement day, to facilitate later use of CRSP data.

In [5]:
# Normalise the SDC identifiers before they are used as CRSP lookup keys:
# tickers lose any apostrophe characters, cusips are upper-cased.
df["ttic"] = df["ttic"].str.replace("'", "")
df["atic"] = df["atic"].str.replace("'", "")
df["tcu"] = df["tcu"].str.upper()
df["acu"] = df["acu"].str.upper()

In [6]:
def get_stock_permno_from_ser(ser, id_type='ticker', db=None):
    """
    Row adapter for `get_stock_permno_CRSP`.

    Parameters
    ----------
    ser : positional series of (identifier, date); the identifier is read from
        position 0 and the date from position 1.
    id_type : forwarded unchanged; callers in this notebook pass "ticker" or "cusip".
    db : connection handle, forwarded unchanged.

    Returns
    -------
    Whatever `get_stock_permno_CRSP` returns when called with `return_names=True`.
    """
    identifier, as_of_date = ser.iloc[0], ser.iloc[1]
    return get_stock_permno_CRSP(
        identifier,
        id_type=id_type,
        date=as_of_date,
        return_names=True,
        db=db,
    )

In [45]:
def get_stock_market_data_daily_CRSP_from_ser(ser, db=None, expected_len=11):
    """
    Fetch the CRSP `prc` series for one deal row and keep it only if it is complete.

    Parameters
    ----------
    ser : positional series of (permno, start_date, end_date).
    db : connection handle, forwarded to `get_stock_market_data_daily_CRSP`.
    expected_len : number of observations a complete window must contain.
        The default of 11 matches the window the callers build (5 offsets before
        `dao`, `dao` itself, 5 offsets after).

    Returns
    -------
    The price data when it is array-like with exactly `expected_len` entries;
    otherwise `np.nan` (e.g. the lookup returned a scalar/NaN, or the stock
    stopped trading inside the window), so the deal can be dropped downstream.
    """
    prc_ser = get_stock_market_data_daily_CRSP(ser.iloc[0], start_date=ser.iloc[1], end_date=ser.iloc[2], cols=['prc'], db=db)
    # Same test as sklearn's private `_is_arraylike_not_scalar`, written inline so
    # this function does not depend on an underscore-prefixed third-party API.
    is_arraylike = hasattr(prc_ser, "__len__") or hasattr(prc_ser, "shape") or hasattr(prc_ser, "__array__")
    if is_arraylike and not np.isscalar(prc_ser) and len(prc_ser) == expected_len:
        return prc_ser
    return np.nan

In [152]:
def _lookup_and_verify_permno(ids, window_start, window_end, price, id_type, db):
    """
    One matching pass: look up a permno per row by `id_type`, then flag the rows
    whose CRSP prices agree with the SDC prices in `price`.

    `ids` is a two-column frame (identifier, date). `window_start`, `window_end`
    and `price` must be indexed like `ids`.
    Returns (candidates, is_verified): the lookup result frame and the mask
    returned by `isclose_any_col_df_to_df`.
    """
    n_deals = len(ids)
    candidates = apply_func_to_ser_df(ids, get_stock_permno_from_ser, return_as_df=True, id_type=id_type, db=db)
    # Ambiguous lookups come back as the string "Multiple permnos"; treat them as unmatched.
    candidates.permno = candidates.permno.replace({"Multiple permnos": np.nan})
    print(f"{candidates.permno.notna().sum()} out of {n_deals} deals find a permno by {id_type}")

    # (permno, window start, window end) per deal -> 11 daily CRSP prices per deal
    lookup_input = pd.concat([candidates.permno, window_start, window_end], axis=1)
    crsp_prices = apply_func_to_ser_df(lookup_input, get_stock_market_data_daily_CRSP_from_ser, return_as_df=True, use_new_cols=range(11), db=db)

    is_verified = isclose_any_col_df_to_df(crsp_prices, price, thres_percent=.05, thres_abs=2, logic='or')
    print(f"{is_verified.sum()} deals out of {n_deals} find the permno by {id_type}.")
    return candidates, is_verified


def match_permno(dff, price):
    """
    Match each deal to a CRSP permno, first by ticker and then by cusip, accepting
    a candidate only when its CRSP prices are close to the SDC prices.

    Parameters
    ----------
    dff : frame with columns 'tic', 'cu', 'dao'. `dao` is used both as the
        lookup date and as the centre of the -5/+5 offset price window.
    price : frame of SDC reference prices, indexed like `dff`.

    Returns
    -------
    (permno, permno_by_ticker, permno_by_cusip)
        permno : frame indexed like `dff` with columns
            ['permno', 'tic_CRSP', 'cu_CRSP', 'n_CRSP']; rows with no verified
            match stay NaN.
        permno_by_ticker : raw ticker lookup result for every row of `dff`.
        permno_by_cusip : raw cusip lookup result for the rows the ticker pass
            did not verify.

    Notes
    -----
    Reads the WRDS connection from the notebook-level variable `db`; the cell that
    creates it must have been run first.
    """
    window_start, window_end = get_trading_day_offset(dff.dao, -5), get_trading_day_offset(dff.dao, 5)
    # results
    permno = pd.DataFrame(np.nan, columns=['permno', 'tic_CRSP', 'cu_CRSP', 'n_CRSP'], index=dff.index)

    # pass 1: ticker
    permno_by_ticker, isclose_ticker = _lookup_and_verify_permno(
        dff[['tic', 'dao']], window_start, window_end, price, "ticker", db
    )
    permno.loc[isclose_ticker] = permno_by_ticker.loc[isclose_ticker].values

    # pass 2: cusip, only for the deals still unmatched after pass 1
    ind = dff.index[permno.permno.isna()]
    permno_by_cusip, isclose_cusip = _lookup_and_verify_permno(
        dff.loc[ind, ['cu', 'dao']], window_start.loc[ind], window_end.loc[ind], price.loc[ind], "cusip", db
    )
    # widen the mask back to the full index so it can address `permno`
    isclose_cusip = isclose_cusip.reindex(dff.index).fillna(False)
    permno.loc[isclose_cusip] = permno_by_cusip.loc[isclose_cusip].values

    return permno, permno_by_ticker, permno_by_cusip

In [176]:
# Acquirors are looked up in CRSP only when SDC lists them on one of these exchanges.
ind_a = df.aexch.isin(['Nasdaq', 'New York', 'American', 'NYSE Amex'])
df_a_input = df.loc[ind_a, ['atic', 'acu', 'dao']]
df_a_input.columns = ['tic', 'cu', 'dao']
df_a_price = df.loc[ind_a, ['ac1day', 'aprday', 'apr1daya']]

# FIXME: debug subset. Only the last `n_debug_rows` listed acquirors are matched,
# so `apermno` covers just those deals. Set to None to process every listed acquiror.
# One named switch replaces the two separate `.iloc[-500:]` slices, so the
# identifier rows and the price rows cannot drift apart.
n_debug_rows = 500
rows = slice(None) if n_debug_rows is None else slice(-n_debug_rows, None)
apermno, apermno_by_ticker, apermno_by_cusip = match_permno(df_a_input.iloc[rows], df_a_price.iloc[rows])

100%|█████████████████████████████████████████| 500/500 [00:11<00:00, 41.89it/s]


478 out of 500 deals find a permno by ticker


100%|█████████████████████████████████████████| 500/500 [00:06<00:00, 72.32it/s]


452 deals out of 500 find the permno by ticker.


100%|███████████████████████████████████████████| 48/48 [00:01<00:00, 38.18it/s]


34 out of 48 deals find a permno by cusip


100%|██████████████████████████████████████████| 48/48 [00:00<00:00, 102.52it/s]

14 deals out of 48 find the permno by cusip.





In [189]:
# Give the generic match_permno columns their acquiror names. Assigning the final
# names outright (rather than prepending "a" to whatever is there) makes this cell
# safe to re-run: it can no longer produce "aapermno", "aatic_CRSP", ...
apermno.columns = ['apermno', 'atic_CRSP', 'acu_CRSP', 'an_CRSP']
loc_names = ['atic', 'atic', 'acu', 'an']
# insert_cols adds the columns to `df` in place; skip if an earlier run already did.
if 'apermno' not in df.columns:
    insert_cols(df, loc_names, None, apermno)

In [191]:
# Acquiror: assign final column names (idempotent) and insert into `df` once.
apermno.columns = ['apermno', 'atic_CRSP', 'acu_CRSP', 'an_CRSP']
loc_names = ['atic', 'atic', 'acu', 'an']
if 'apermno' not in df.columns:
    insert_cols(df, loc_names, None, apermno)

# Target: the original line renamed `apermno` a second time here (copy-paste slip),
# leaving the target frame untouched. Name the target columns instead.
# NOTE(review): `tpermno` is created in a cell further down the notebook; that
# cell must have been run before this one.
tpermno.columns = ['tpermno', 'ttic_CRSP', 'tcu_CRSP', 'tn_CRSP']
loc_names = ['ttic', 'ttic', 'tcu', 'tn']
if 'tpermno' not in df.columns:
    insert_cols(df, loc_names, None, tpermno)

In [147]:
# Inspect the raw ticker lookup for acquirors; rows with no usable permno show as NaN.
apermno_by_ticker

Unnamed: 0_level_0,permno,ticker,cusip,comnam
master_deal_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
3492916020,78840.0,IAC,44891N20,IAC INTERACTIVECORP
3493668020,92550.0,BIP,G1625210,BROOKFIELD INFRASTRUC PARTNER LP
3496072020,92443.0,NFBK,66611T10,NORTHFIELD BANCORP INC DE
3493696020,,,,
3494334020,86876.0,WCC,95082P10,WESCO INTERNATIONAL INC
...,...,...,...,...
3992461020,25582.0,LHX,50243110,L3HARRIS TECHNOLOGIES INC
4015877020,,,,
4016515020,85656.0,MPB,59540G10,MID PENN BANCORP INC
4017224020,19504.0,BLI,08431010,BERKELEY LIGHTS INC


In [141]:
# NOTE(review): scratch arithmetic; the source of the two numbers is not recorded. Candidate for deletion.
500-227

273

In [144]:
# Number of acquirors for which the ticker lookup returned a permno.
apermno_by_ticker.permno.notna().sum()

252

In [143]:
# Number of acquirors for which the ticker lookup returned no permno.
apermno_by_ticker.permno.isna().sum()

248

In [107]:
# results
# One row per deal, all-NaN to start; rows are filled in only for deals whose
# candidate permno passes the price check in the cells below.
tpermno = pd.DataFrame(np.nan, columns=['tpermno', 'ttic_CRSP', 'tcu_CRSP', 'tn_CRSP'], index=df.index)

In [7]:
# first look for permno by ticker
# Each row (ttic, da) is passed to get_stock_permno_from_ser with id_type="ticker".
tpermno_by_ticker = apply_func_to_ser_df(df[['ttic', 'da']], get_stock_permno_from_ser, return_as_df=True, id_type="ticker", db=db)
# Ambiguous lookups come back as the string "Multiple permnos"; treat them as unmatched.
tpermno_by_ticker.permno = tpermno_by_ticker.permno.replace({"Multiple permnos": np.nan})
print(f"{tpermno_by_ticker.permno.notna().sum()} out of {len(df)} deals find a permno by ticker")

100%|█████████████████████████████████████| 10445/10445 [08:38<00:00, 20.13it/s]


In [42]:
# Price-check window around `dao`: offsets of -5 and +5 (trading days, going by the helper's name).
# ("mius" is a typo for "minus"; the name is reused in a later cell, so it is left as is.)
dao_mius_5, dao_plus_5 = get_trading_day_offset(df.dao, -5), get_trading_day_offset(df.dao, 5)
# Per deal: (candidate permno, window start, window end) — the input for the price lookup.
df_concat_ticker = pd.concat([tpermno_by_ticker.permno, dao_mius_5, dao_plus_5], axis=1)

In [46]:
# Fetch the CRSP prices for each deal's window; results are laid out in 11 columns (0..10).
# Deals whose window does not yield exactly 11 prices come back as NaN.
price_process_ticker = apply_func_to_ser_df(df_concat_ticker, get_stock_market_data_daily_CRSP_from_ser, return_as_df=True, use_new_cols=range(11), db=db)

100%|█████████████████████████████████████| 10445/10445 [05:40<00:00, 30.65it/s]


In [99]:
# Compare the CRSP window prices with the three SDC target price columns.
# NOTE(review): exact semantics of thres_percent / thres_abs / logic live in the project helper — confirm there.
isclose_ticker = isclose_any_col_df_to_df(price_process_ticker, df[['pr1day', 'tprday', 'tpr1daya']], thres_percent=.05, thres_abs=2, logic='or')
print(f"{isclose_ticker.sum()} deals out of {len(df)} find the permno by ticker.")
# save results
# Only price-verified rows are copied into the result frame.
tpermno.loc[isclose_ticker] = tpermno_by_ticker.loc[isclose_ticker].values

In [111]:
# need to match these deals by cusip
# i.e. every deal still without a verified permno after the ticker pass
ind = df.index[tpermno.tpermno.isna()]

In [116]:
# second look for permno by cusip
# Only the deals in `ind` (unverified after the ticker pass) are looked up again.
tpermno_by_cusip = apply_func_to_ser_df(df.loc[ind, ['tcu', 'da']], get_stock_permno_from_ser, return_as_df=True, id_type="cusip", db=db)
# Ambiguous lookups come back as the string "Multiple permnos". Blank them BEFORE
# counting, as the ticker pass does; otherwise they are reported as found permnos.
tpermno_by_cusip.permno = tpermno_by_cusip.permno.replace({"Multiple permnos": np.nan})
print(f"{tpermno_by_cusip.permno.notna().sum()} out of {len(tpermno_by_cusip)} deals find a permno by cusip")

100%|███████████████████████████████████████| 1178/1178 [00:34<00:00, 33.82it/s]


In [123]:
# Ambiguous cusip lookups come back as the string "Multiple permnos"; treat them as unmatched.
tpermno_by_cusip.permno = tpermno_by_cusip.permno.replace({"Multiple permnos": np.nan})

In [124]:
# Same price lookup as the ticker pass, restricted to the deals in `ind`.
df_concat_cusip = pd.concat([tpermno_by_cusip.permno, dao_mius_5.loc[ind], dao_plus_5.loc[ind]], axis=1)
price_process_cusip = apply_func_to_ser_df(df_concat_cusip, get_stock_market_data_daily_CRSP_from_ser, return_as_df=True, use_new_cols=range(11), db=db)

100%|███████████████████████████████████████| 1178/1178 [00:16<00:00, 71.70it/s]


In [132]:
# Price-check the cusip candidates against the same three SDC target price columns.
isclose_cusip = isclose_any_col_df_to_df(price_process_cusip, df.loc[ind, ['pr1day', 'tprday', 'tpr1daya']], thres_percent=.05, thres_abs=2, logic='or')
print(f"{isclose_cusip.sum()} deals out of {len(isclose_cusip)} find the permno by cusip.")
# save results
# Widen the mask from `ind` to the full deal index so it can address `tpermno`.
isclose_cusip = isclose_cusip.reindex(df.index).fillna(False).astype(bool)
# Copy from the CUSIP lookup. The original copied from `tpermno_by_ticker`, i.e. it
# stored ticker candidates that had already failed the price check.
tpermno.loc[isclose_cusip] = tpermno_by_cusip.loc[isclose_cusip].values

352 deals out of 1178 find the permno by ticker.


In [None]:
# NOTE(review): leftover scratch cell. It has no execution count and calls the helper
# with no arguments, whereas every other call in this notebook passes a permno plus
# start/end dates. Candidate for deletion.
get_stock_market_data_daily_CRSP()

In [6]:
def get_stock_permno_from_ser(ser, db=None, id_type='ticker'):
    """
    Get the permno and stock information for one deal row.

    This notebook defines `get_stock_permno_from_ser` twice with different
    contracts. This later definition accepts both row layouts, so the earlier
    cells keep working whichever definition was run last.

    Parameters
    ----------
    ser : positional series, either
        - (ticker, cusip, date): looked up with
          `get_stock_permno_by_ticker_and_cusip_CRSP` (the original behaviour of
          this definition), or
        - (identifier, date): looked up with `get_stock_permno_CRSP`, with
          `id_type` saying what the identifier is.
    db : connection handle, forwarded unchanged.
    id_type : only used for two-element rows; callers in this notebook pass
        "ticker" or "cusip".

    Returns
    -------
    Whatever the underlying lookup returns when called with `return_names=True`.
    """
    if len(ser) == 2:
        return get_stock_permno_CRSP(ser.iloc[0], id_type=id_type, date=ser.iloc[1], return_names=True, db=db)
    return get_stock_permno_by_ticker_and_cusip_CRSP(ser.iloc[0], ser.iloc[1], date=ser.iloc[2], return_names=True, db=db)

In [7]:
# get target permno, by its ticker, cusip and the announcement date
# One lookup per deal using (ttic, tcu, da) together; no price verification in this path.
tpermno_match = apply_func_to_ser_df(df[['ttic', 'tcu', 'da']], get_stock_permno_from_ser, return_as_df=True, db=db)
# Rename the four returned columns with the target ("t") prefix.
tpermno_match.columns = ['tpermno', 'ttic_CRSP', 'tcu_CRSP', 'tn_CRSP']
print(f"{tpermno_match.tpermno.isnull().sum()} targets out of {len(df)} cannot find a permno.")

100%|█████████████████████████████████████| 10445/10445 [08:41<00:00, 20.01it/s]


368 targets out of 10445 cannot find a permno.


In [8]:
# get acquiror permno, by its ticker, cusip and the announcement date
# One lookup per deal using (atic, acu, da) together. All deals are queried here,
# with no exchange filter and no price verification in this path.
apermno_match = apply_func_to_ser_df(df[['atic', 'acu', 'da']], get_stock_permno_from_ser, return_as_df=True, db=db)
# Rename the four returned columns with the acquiror ("a") prefix.
apermno_match.columns = ['apermno', 'atic_CRSP', 'acu_CRSP', 'an_CRSP']
print(f"{apermno_match.apermno.isnull().sum()} acquirors out of {len(df)} cannot find a permno.")

100%|█████████████████████████████████████| 10445/10445 [08:46<00:00, 19.85it/s]


4060 acquirors out of 10445 cannot find a permno.


## Save intermediate results

We combine the permno search results and save them to a file.

In [9]:
# concat permno search results, and save it
# Side by side: 4 target columns followed by 4 acquiror columns, one row per deal.
permno_CRSP = pd.concat([tpermno_match, apermno_match], axis=1)
# save
# mode='w' overwrites any existing file at this path.
pathfile = f"{path}/data/intermediate/permno_CRSP.h5"
permno_CRSP.to_hdf(pathfile, key = 'permno_CRSP', mode='w')

# concat into the dataset
# One entry of loc_names per column of permno_CRSP; insert_cols modifies `df` in place
# (later cells read df.tpermno / df['apermno']).
# NOTE(review): presumably each new column is placed relative to the named existing column — confirm in insert_cols.
loc_names = ['ttic', 'ttic', 'tcu', 'tn'] + ['atic', 'atic', 'acu', 'an']
insert_cols(df, loc_names, None, permno_CRSP)

# Get target delisting information

In [10]:
# columns = ['delist_code', 'last_trade_date', 'delist_date', 'delist_amount', 'delist_return']
# One delisting lookup per target permno.
delist_CRSP = apply_func_to_ser_df(df.tpermno, get_delisting_information, return_as_df=True, db=db)
# Deals with status code 'C' are the ones the message below calls "completed".
complete = df.statc.eq('C')
print(f"{(complete & delist_CRSP.delist_return.isnull()).sum()} out of {complete.sum()} completed deals cannot find delisting returns.")

100%|█████████████████████████████████████| 10077/10077 [04:37<00:00, 36.38it/s]


392 out of 8096 completed deals cannot find delisting returns.


In [11]:
## Save intermediate results
# mode='w' overwrites any existing file at this path.
pathfile = f"{path}/data/intermediate/delist_CRSP.h5"
delist_CRSP.to_hdf(pathfile, key = 'delist_CRSP', mode='w')
# concat into the dataset
# Five entries, one per delisting column, all anchored on the existing 'att' column;
# insert_cols modifies `df` in place.
loc_names = ['att'] * 5
insert_cols(df, loc_names, None, delist_CRSP)

In [12]:
# save
# Deal table with the permno and delisting columns inserted by the cells above;
# mode='w' overwrites any existing file at this path.
filepath = f"{path}/data/intermediate/df_permno_delist_CRSP.h5"
df.to_hdf(filepath, key = 'df', mode='w')

# pull raw market data

In [13]:
# Start of the market-data window: `dao` offset by -40 (trading days, going by the helper's name).
df['dao_40days_prior'] = get_trading_day_offset(df.dao, -40)
# Fallback end date for deals with no `dr`: the later of the latest `da` and the latest non-missing `dr`.
last_date = max(df.da.max(), df.dr.dropna().max())
# End of the window: `dr` (or the fallback) offset by +40.
df['dr_40days_after'] = get_trading_day_offset(df.dr.fillna(last_date), 40)

In [14]:
def get_stock_market_data_daily_from_ser_CRSP(ser, db=db):
    """
    Row adapter for `get_stock_market_data_daily_CRSP`.

    `ser` is read positionally as (permno, start_date, end_date). `db` is
    forwarded unchanged.

    Note: the default for `db` is the notebook-level `db` object as it exists
    when this cell is run; callers in this notebook pass `db=db` explicitly.
    """
    permno, window_start, window_end = ser.iloc[0], ser.iloc[1], ser.iloc[2]
    return get_stock_market_data_daily_CRSP(
        permno,
        start_date=window_start,
        end_date=window_end,
        db=db,
    )

In [15]:
# Pull daily CRSP market data per deal for the target, over the window built above.
# return_as_df=False: keep one result object per deal; the isna() count below relies
# on deals without data being NaN.
mkt_data_tgt = apply_func_to_ser_df(df[['tpermno', 'dao_40days_prior', 'dr_40days_after']], 
                                    get_stock_market_data_daily_from_ser_CRSP, 
                                    return_as_df=False, 
                                    db=db)
print(f"{mkt_data_tgt.isna().sum()} out of {len(df)} deals cannot find target market data.")

100%|█████████████████████████████████████| 10445/10445 [08:03<00:00, 21.61it/s]


368 out of 10445 deals cannot find target market data.


In [16]:
# Same pull as for targets, keyed on the acquiror permno.
mkt_data_acq = apply_func_to_ser_df(df[['apermno', 'dao_40days_prior', 'dr_40days_after']],
                                       get_stock_market_data_daily_from_ser_CRSP,
                                       return_as_df=False,
                                       db=db)
print(f"{mkt_data_acq.isna().sum()} out of {len(df)} deals cannot find acquiror market data.")

100%|█████████████████████████████████████| 10445/10445 [04:48<00:00, 36.23it/s]


4060 out of 10445 deals cannot find acquiror market data.


## save raw market data

In [17]:
import pickle

# Persist the per-deal market data as pickles under data/raw.
# Reminder: only unpickle files you produced yourself — loading a pickle can execute arbitrary code.
filepath_tgt = f"{path}/data/raw/mkt_data_tgt_raw.pickle"
with open(filepath_tgt, 'wb') as handle:
    pickle.dump(mkt_data_tgt, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
    
filepath_acq = f"{path}/data/raw/mkt_data_acq_raw.pickle"
with open(filepath_acq, 'wb') as handle:
    pickle.dump(mkt_data_acq, handle, protocol=pickle.HIGHEST_PROTOCOL)