In [1]:
%load_ext autoreload
%autoreload 2
import sys, os
from os.path import expanduser
# ACTION REQUIRED: point `path` at the root folder of your local MA-prediction project.
path = "~/Documents/G3/MA-prediction"
path = expanduser(path)  # expand "~" to the current user's home directory
# Put the project root on the import path; the `MA_prediction` imports in the
# next cell presumably rely on this.
sys.path.append(path)

import pandas as pd
import numpy as np

# Turn off pandas' chained-assignment warning for the whole notebook.
pd.options.mode.chained_assignment = None
import warnings
# Silence pandas PerformanceWarning and every FutureWarning for the whole notebook.
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
from MA_prediction.utils import *
from MA_prediction.mkt_calendar import *
from MA_prediction.preprocessing import *
from MA_prediction.crsp import *
from sklearn.utils import _is_arraylike_not_scalar

In [3]:
import wrds
# Open a WRDS session; this prompts interactively for credentials.
# `db` is a notebook-level global: `match_permno` below reads it directly and
# forwards it to the CRSP helper functions.
db = wrds.Connection()

Enter your WRDS username [yizhan]: olivershu
Enter your password: ········


WRDS recommends setting up a .pgpass file.


Create .pgpass file now [y/n]?:  y


Created .pgpass file successfully.
Loading library list...
Done


# data-2: CRSP, look for `permno`

In this notebook we use the tickers and cusips provided by SDC to find the `permno` of each stock in CRSP. CRSP is the database of historical equity prices, and its unique identifier for every stock is `permno`. We need this identifier because it facilitates pulling market data from CRSP and backtesting. Apart from tickers and cusips, SDC also provides a few market prices (4wk, 1wk, 1day, before/after announcement), which we use to verify that each match is indeed accurate.

We do the following for both targets and acquirors.

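- First, look for `permno` by ticker and announcement date (`dao` for targets, `da` for acquirors).
- Pull the CRSP prices from 5 trading days before to 5 trading days after that date, and compare them with the SDC price columns (`['pr1day', 'tprday', 'tpr1daya']` for targets, `['ac1day', 'aprday', 'apr1daya']` for acquirors). If they are close, the match is successful.
- Otherwise, look for `permno` by `cusip` and the same date, and then check the stock price in the same manner.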


## I/O

- Input: 
    - `data/intermediate/df_basic_processing.h5`

- Output: 
    - All in `data/intermediate/CRSP/`:
        - `tpermno.h5`, `apermno.h5`
        - `tpermno_by_ticker.h5`, `apermno_by_ticker.h5`
        - `tpermno_by_cusip.h5`, `apermno_by_cusip.h5`

## Load data

In [4]:
# Read the deal table from the intermediate HDF5 file (presumably written by the
# basic-processing notebook -- confirm against the pipeline).
filepath = f"{path}/data/intermediate/df_basic_processing.h5"
df = pd.read_hdf(filepath)

# Print the table's shape, then preview its last rows.
print_shape(df)
df.tail()

The dataset is of size (10446, 95).


Unnamed: 0_level_0,statc,one_day,aone_day,dao,da,dateannorig_days,de,dateeffexp,dw,dr,...,pricebook,eqvalcf,eqvalsales,eqval,tlia,cass,clia,lockup,dae,vest
master_deal_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3992461020,P,2022-10-24,2022-12-16,2022-10-25,2022-12-19,54,,2023-12-31,,,...,8.659,16.011,2.087,4547.2,1748.5,1001.7,802.9,No,No,No
4015877020,P,2022-12-16,2022-12-16,2022-12-19,2022-12-19,0,,2023-02-28,,,...,4.839,,0.752,16.141,18.3,14.2,16.4,No,No,No
4016515020,P,2022-12-19,2022-12-19,2022-12-20,2022-12-20,0,,2023-06-30,,,...,,,,52.581,,,,No,No,No
4017224020,P,2022-12-20,2022-12-20,2022-12-21,2022-12-21,0,,2023-03-31,,,...,0.75,,2.912,55.152,61.3,97.6,11.2,No,No,No
4019588020,P,2022-12-23,2022-12-23,2022-12-27,2022-12-27,0,,NaT,,,...,,,0.895,25.412,52.4,34.1,36.4,No,No,No


# Get target & acquiror permno
`permno` is the unique identifier for each security in the CRSP database. We need to match each target and (public) acquiror with CRSP, by its ticker or cusip on the announcement day, to facilitate later use of CRSP.

In [5]:
def get_stock_permno_from_ser(ser, id_type='ticker', db=None):
    """
    Row-wise adapter around `get_stock_permno_CRSP`.

    ser: a positional pair (id, date) -- the first element is the identifier to
        look up, the second is the date of the lookup.
    id_type: forwarded unchanged; this notebook uses 'ticker' and 'cusip'.
    db: forwarded unchanged (the database connection).

    Returns whatever `get_stock_permno_CRSP` returns when called with
    `return_names=True`.
    """
    identifier, lookup_date = ser.iloc[0], ser.iloc[1]
    lookup_kwargs = dict(id_type=id_type, date=lookup_date, return_names=True, db=db)
    return get_stock_permno_CRSP(identifier, **lookup_kwargs)

In [6]:
def get_stock_market_data_daily_CRSP_from_ser(ser, num_days = 11, db=None):
    """
    Row-wise adapter around `get_stock_market_data_daily_CRSP` for the 'prc' column.

    ser: a positional triple (permno, start_date, end_date).
    num_days: the number of observations the price window must contain.
    db: forwarded unchanged (the database connection).

    Returns the fetched prices only when the result is array-like and holds
    exactly `num_days` entries. Otherwise returns nan, so that the deal can be
    discarded (e.g. the stock stopped trading inside the window).
    """
    permno, start_date, end_date = ser.iloc[0], ser.iloc[1], ser.iloc[2]
    prices = get_stock_market_data_daily_CRSP(
        permno, start_date=start_date, end_date=end_date, cols=['prc'], db=db
    )
    # Anything that is not a full-length price window counts as "no data".
    if not _is_arraylike_not_scalar(prices):
        return np.nan
    if len(prices) != num_days:
        return np.nan
    return prices

In [7]:
def match_permno(dff, price, num_days = 11):
    """
    Find the CRSP `permno` for every deal in `dff`: first by ticker, then by
    cusip for the deals that are still unmatched.

    A candidate permno is kept only when `isclose_any_col_df_to_df` reports that
    the CRSP prices in a window around the deal date agree with the reference
    prices in `price` (called with thres_percent=.05, thres_abs=1, logic='or').

    Parameters
    ----------
    dff : DataFrame
        One row per deal, with columns 'tic', 'cu' and 'date'.
    price : DataFrame
        Reference prices to verify against, indexed like `dff`.
    num_days : int, default 11
        Number of daily prices in the verification window. It must be a positive
        odd number: the window runs from (num_days - 1) // 2 trading-day offsets
        before `date` to the same number after it (5 on each side by default).

    Returns
    -------
    (permno, permno_by_ticker, permno_by_cusip)
        `permno` holds the price-verified matches (columns 'permno', 'tic_CRSP',
        'cu_CRSP', 'n_CRSP', indexed like `dff`; unmatched rows stay NaN).
        The other two are the raw lookup results by ticker (all deals) and by
        cusip (only deals not verified by ticker), with "Multiple permnos"
        replaced by NaN.

    Raises
    ------
    ValueError
        If `num_days` is not a positive odd number. Such a window can never be
        centred on the deal date, so every price check would silently fail.

    Notes
    -----
    Reads the notebook-level `db` connection and forwards it to the helpers.
    """
    if num_days < 1 or num_days % 2 == 0:
        raise ValueError(f"num_days must be a positive odd number, got {num_days}")
    # Keep the window consistent with `num_days`: it used to be hard-coded to +/-5,
    # which only agrees with the length check for num_days == 11.
    half_window = (num_days - 1) // 2
    date_minus, date_plus = get_trading_day_offset(dff.date, -half_window), get_trading_day_offset(dff.date, half_window)
    # results
    permno = pd.DataFrame(np.nan, columns=['permno', 'tic_CRSP', 'cu_CRSP', 'n_CRSP'], index=dff.index) 

    # first look for permno by ticker
    permno_by_ticker = apply_func_to_ser_df(dff[['tic', 'date']], get_stock_permno_from_ser, return_as_df=True, id_type="ticker", db=db)
    permno_by_ticker.permno = permno_by_ticker.permno.replace({"Multiple permnos": np.nan})
    print(f"{permno_by_ticker.permno.notna().sum()} out of {len(dff)} deals find a permno by ticker")

    # pull the CRSP price window for every ticker candidate
    df_concat_ticker = pd.concat([permno_by_ticker.permno, date_minus, date_plus], axis=1)
    price_process_ticker = apply_func_to_ser_df(df_concat_ticker, get_stock_market_data_daily_CRSP_from_ser, return_as_df=True, use_new_cols=range(num_days), db=db)

    # keep only the candidates whose CRSP prices agree with the reference prices
    isclose_ticker = isclose_any_col_df_to_df(price_process_ticker, price, thres_percent=.05, thres_abs=1, logic='or')
    print(f"{isclose_ticker.sum()} deals out of {len(dff)} find a permno by ticker.")
    # save results (positional copy: relies on the lookup frame having the same column order as `permno`)
    permno.loc[isclose_ticker] = permno_by_ticker.loc[isclose_ticker].values

    # need to match these deals by cusip
    ind = dff.index[permno.permno.isna()]

    # second look for permno by cusip
    permno_by_cusip = apply_func_to_ser_df(dff.loc[ind, ['cu', 'date']], get_stock_permno_from_ser, return_as_df=True, id_type="cusip", db=db)
    permno_by_cusip.permno = permno_by_cusip.permno.replace({"Multiple permnos": np.nan})
    print(f"{permno_by_cusip.permno.notna().sum()} out of {len(ind)} deals find a permno by cusip")

    # pull the CRSP price window for every cusip candidate
    df_concat_cusip = pd.concat([permno_by_cusip.permno, date_minus.loc[ind], date_plus.loc[ind]], axis=1)
    price_process_cusip = apply_func_to_ser_df(df_concat_cusip, get_stock_market_data_daily_CRSP_from_ser, return_as_df=True, use_new_cols=range(num_days), db=db)

    isclose_cusip = isclose_any_col_df_to_df(price_process_cusip, price.loc[ind], thres_percent=.05, thres_abs=1, logic='or')
    print(f"{isclose_cusip.sum()} deals out of {len(ind)} find the permno by cusip.")
    # save results: widen the mask to all deals; `fill_value=False` plus the bool cast
    # keeps it a real boolean mask instead of an object-dtype column that needs downcasting
    isclose_cusip = isclose_cusip.reindex(dff.index, fill_value=False).fillna(False).astype(bool)
    permno.loc[isclose_cusip] = permno_by_cusip.loc[isclose_cusip].values

    return permno, permno_by_ticker, permno_by_cusip

In [8]:
# Targets: look up each target by its ticker / cusip as of `dao`, and verify the
# match against the SDC target price columns.
df_t_input = df[['ttic', 'tcu', 'dao']].set_axis(['tic', 'cu', 'date'], axis=1)
tpermno, tpermno_by_ticker, tpermno_by_cusip = match_permno(
    df_t_input,
    df[['pr1day', 'tprday', 'tpr1daya']],
)

100%|█████████████████████████████████████| 10446/10446 [04:45<00:00, 36.53it/s]


9703 out of 10446 deals find a permno by ticker


100%|█████████████████████████████████████| 10446/10446 [02:49<00:00, 61.76it/s]


9258 deals out of 10446 find a permno by ticker.


100%|███████████████████████████████████████| 1188/1188 [00:37<00:00, 31.56it/s]


737 out of 1188 deals find a permno by cusip


100%|██████████████████████████████████████| 1188/1188 [00:10<00:00, 111.85it/s]

356 deals out of 1188 find the permno by cusip.





In [9]:
# acq
ind_a = df.aexch.isin(['Nasdaq', 'New York', 'American', 'NYSE Amex'])
df_a_input = df.loc[ind_a, ['atic', 'acu', 'da']]
df_a_input.columns = ['tic', 'cu', 'date']
apermno, apermno_by_ticker, apermno_by_cusip = match_permno(df_a_input, df.loc[ind_a, ['ac1day', 'aprday', 'apr1daya']])

100%|███████████████████████████████████████| 5904/5904 [02:29<00:00, 39.53it/s]


5454 out of 5904 deals find a permno by ticker


100%|███████████████████████████████████████| 5904/5904 [01:36<00:00, 61.20it/s]


5152 deals out of 5904 find a permno by ticker.


100%|█████████████████████████████████████████| 752/752 [00:18<00:00, 39.77it/s]


557 out of 752 deals find a permno by cusip


100%|█████████████████████████████████████████| 752/752 [00:07<00:00, 98.25it/s]


303 deals out of 752 find the permno by cusip.


In [10]:
path_p = f"{path}/data/intermediate/CRSP"
# Make sure the output folder exists before writing into it.
os.makedirs(path_p, exist_ok=True)

# Prefix the columns with "t" (target) / "a" (acquiror). The guard makes the cell
# idempotent: the un-prefixed 'permno' column only exists before the first run, so
# re-running the cell no longer stacks prefixes ("ttpermno", "aapermno", ...).
if "permno" in tpermno.columns:
    tpermno = tpermno.add_prefix("t")
if "permno" in apermno.columns:
    apermno = apermno.add_prefix("a")

# Save the price-verified matches.
tpermno.to_hdf(f"{path_p}/tpermno.h5", key = 'tpermno', mode='w')
apermno.to_hdf(f"{path_p}/apermno.h5", key = 'apermno', mode='w')

In [11]:
# Save the raw (not price-verified) lookup results, one HDF5 file per table.
# Each file name and HDF key is the table's own name.
lookup_tables = {
    'tpermno_by_ticker': tpermno_by_ticker,
    'apermno_by_ticker': apermno_by_ticker,
    'tpermno_by_cusip': tpermno_by_cusip,
    'apermno_by_cusip': apermno_by_cusip,
}
for table_name, table in lookup_tables.items():
    table.to_hdf(f"{path_p}/{table_name}.h5", key=table_name, mode='w')