In [1]:
from IPython.display import display, HTML
display(HTML("<style>.container { width:92% !important; }</style>"))

%load_ext autoreload
%autoreload 2
import sys, os
from os.path import expanduser
## ACTION REQUIRED: set `path` below to your local project folder
path = "~/Documents/G3/MA-prediction"
path = expanduser(path)
sys.path.append(path)

import data_science_MA_kit as dsk
import pandas as pd
import numpy as np
import datetime
from tqdm import tqdm
import re
# import wrds

pd.options.mode.chained_assignment = None
import warnings
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

# Data preprocessing 5: Process market data

In this notebook we process the market data pulled from CRSP. Main issues include:

- For completed deals, add the delisting return.
- Negative price: there was no closing price for that day. We simply take the absolute value; the return reported for that day is still valid.
- Missing prices and returns: during some days when trading of a certain stock is not allowed, the prices and returns would be missing.
    - For a single day without trading, fill the missing price with the previous price and fill the missing return with 0.
    - For a longer period without trading (no prices from `da` through `da`+4), throw away the data.
- Adjust raw price by the cumulative factor, based on the factor on the announcement day.
- calculate mktcap.
    
    
## I/O

- Input:
    - `df_variable_transform.h5`
    - `market_data_tgt_raw.pickle`
    - `market_data_acq_raw.pickle`
- Output:
    - `market_data_tgt_corrected.pickle`
    - `market_data_acq_corrected.pickle`
    - `index_no_trading.pickle`

## load data

In [2]:
# Load the deal-level table. Later cells read its `da_corrected`, `stock`
# and `dateval` columns.
filepath = f"{path}/data/df_variable_transform.h5"
df = pd.read_hdf(filepath)

In [3]:
import pickle

# NOTE: pickle.load can execute arbitrary code embedded in the file, so only
# load pickles that were produced by this project's own pipeline.

# Raw market data for the `tgt` side (presumably the deal targets - confirm).
filepath_tgt = f"{path}/data/market_data_tgt_raw.pickle"
with open(filepath_tgt, 'rb') as handle:
    market_data_tgt = pickle.load(handle)
    
    
# Raw market data for the `acq` side (presumably the acquirers - confirm).
filepath_acq = f"{path}/data/market_data_acq_raw.pickle"
with open(filepath_acq, 'rb') as handle:
    market_data_acq = pickle.load(handle)

# Add delisting return for completed deals

In [4]:
# Pass the deal table and the target market data through
# dsk.add_delisting_return and keep the result under the same name.
# NOTE(review): because `market_data_tgt` is rebound, re-running this cell
# without reloading feeds already-processed data back in - confirm the helper
# is safe to apply twice.
market_data_tgt = dsk.add_delisting_return(df, market_data_tgt)

100%|███████████████████████████████████| 12031/12031 [00:07<00:00, 1603.89it/s]


# Filter out equities without trading between `da` and `da`+4

In [5]:
# End of the no-trading check window: `da_corrected` shifted by an offset of 4
# via dsk.get_trading_day_offset (presumably 4 trading days later - confirm in dsk).
da_4days_after = dsk.get_trading_day_offset(df.da_corrected, 4)

In [6]:
def all_missing_prices_within_time_range(ser):
    """
    Tell whether every price inside a date window is missing.

    `ser` is read by position: element 0 is the market data, element 1 the
    window start and element 2 the window end.

    Returns False when element 0 is not a DataFrame. Otherwise the frame is
    sliced by label with `.loc[start:end]` and the result is True only if every
    value of its `prc` column inside that slice is null.
    """
    frame, window_start, window_end = ser.iloc[0], ser.iloc[1], ser.iloc[2]
    if isinstance(frame, pd.DataFrame):
        prices_in_window = frame.loc[window_start:window_end].prc
        return prices_in_window.isna().all()
    # Nothing to inspect when no market-data frame is available.
    return False

# dsk.apply_func_to_ser_df(pd.concat([market_data_tgt, df.da, da_5days_after], axis=1), )

In [7]:
# Line up (market data, da_corrected, da_4days_after) per deal and run the
# window check on each triple through dsk.apply_func_to_ser_df
# (presumably applied once per row - confirm in dsk).
missing_prices_tgt=dsk.apply_func_to_ser_df(pd.concat([market_data_tgt, df.da_corrected, da_4days_after], axis=1), all_missing_prices_within_time_range)
missing_prices_acq=dsk.apply_func_to_ser_df(pd.concat([market_data_acq, df.da_corrected, da_4days_after], axis=1), all_missing_prices_within_time_range)

100%|███████████████████████████████████| 12031/12031 [00:01<00:00, 9918.30it/s]
100%|██████████████████████████████████| 12031/12031 [00:00<00:00, 15150.52it/s]


In [8]:
# Deals whose `tgt` market data was flagged by the window check.
no_trading_index_tgt = missing_prices_tgt.index[missing_prices_tgt]
# On the `acq` side a deal is only kept when its `df.stock` flag is also set
# (presumably marks deals involving stock consideration - confirm).
# NOTE(review): the boolean mask is applied positionally to the index, so this
# assumes `df.stock` and `missing_prices_acq` share the same index - confirm.
no_trading_index_acq = missing_prices_acq.index[df.stock & missing_prices_acq]

In [9]:
# Combine both sides: a deal is listed if it was flagged on either side.
no_trading_index = no_trading_index_acq.union(no_trading_index_tgt)
# Number of flagged deals.
print(len(no_trading_index))

20


In [10]:
import pickle

# Persist the flagged deal index to `index_no_trading.pickle` in the data folder.
with open(f'{path}/data/index_no_trading.pickle', 'wb') as handle:
    pickle.dump(no_trading_index, handle, protocol=pickle.HIGHEST_PROTOCOL)

#  Modify market data

In [11]:
# Run dsk.correct_prc_ret over both sides' market data (presumably the
# negative-price and missing price/return handling described in the notebook
# introduction - confirm in dsk). Results go to new `*_corrected` names, so the
# data loaded above is not rebound here.
market_data_tgt_corrected = dsk.apply_func_to_ser_df(market_data_tgt, dsk.correct_prc_ret)
market_data_acq_corrected = dsk.apply_func_to_ser_df(market_data_acq, dsk.correct_prc_ret)

100%|███████████████████████████████████| 11138/11138 [00:03<00:00, 3277.23it/s]
100%|█████████████████████████████████████| 6989/6989 [00:02<00:00, 3234.48it/s]


# calculate market cap

In [12]:
# Run dsk.calculate_mktcap over the `tgt` market data.
# NOTE(review): only the `tgt` side is processed here; `market_data_acq_corrected`
# is not passed through calculate_mktcap - confirm this is intentional.
market_data_tgt_corrected = dsk.apply_func_to_ser_df(market_data_tgt_corrected, dsk.calculate_mktcap)

100%|███████████████████████████████████| 11138/11138 [00:03<00:00, 3224.23it/s]


# Adjust price by cumulative factors

In [13]:
def adjust_price_to_ann_ser(ser):
    """
    Row-wise adapter around `dsk.adjust_price_to_ann`.

    `ser` is read by position: element 0 is the market data and element 1 is
    the announcement date (`da`).

    Returns np.nan when element 0 is not a DataFrame; otherwise returns
    whatever `dsk.adjust_price_to_ann(market_data, da)` returns.
    """
    market_data = ser.iloc[0]
    if isinstance(market_data, pd.DataFrame):
        # The date is only read once we know there is a frame to adjust.
        return dsk.adjust_price_to_ann(market_data, ser.iloc[1])
    return np.nan


# Pair each deal's corrected market data with its `da_corrected` and apply the
# adjustment wrapper; the results replace the `*_corrected` variables.
# NOTE(review): both names are rebound, so re-running this cell alone applies
# the adjustment to already-adjusted data - confirm that is harmless.
market_data_tgt_corrected = dsk.apply_func_to_ser_df(pd.concat([market_data_tgt_corrected, df.da_corrected], axis=1), adjust_price_to_ann_ser)
market_data_acq_corrected = dsk.apply_func_to_ser_df(pd.concat([market_data_acq_corrected, df.da_corrected], axis=1), adjust_price_to_ann_ser)

100%|███████████████████████████████████| 12031/12031 [00:04<00:00, 2993.00it/s]
100%|███████████████████████████████████| 12031/12031 [00:02<00:00, 4658.48it/s]


In [14]:
import pickle

# Write the corrected `tgt` market data to the data folder.
filepath_tgt = f"{path}/data/market_data_tgt_corrected.pickle"
with open(filepath_tgt, 'wb') as handle:
    pickle.dump(market_data_tgt_corrected, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
    
# Write the corrected `acq` market data to the data folder.
filepath_acq = f"{path}/data/market_data_acq_corrected.pickle"
with open(filepath_acq, 'wb') as handle:
    pickle.dump(market_data_acq_corrected, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [16]:
# Spot check: show the corrected `tgt` market data stored under deal number 96354020.
market_data_tgt_corrected.loc[96354020]

Unnamed: 0_level_0,permno,prc,ret,vol,shrout,cfacpr,cfacshr,mktcap,mktcap_prev,prc_adj_ann,prc_adj_ann_prev
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1983-12-01,42526.0,38.750,0.013072,109900.0,14927.0,1.0,1.0,578421.250,,38.750,
1983-12-02,42526.0,39.000,0.006452,59500.0,15436.0,1.0,1.0,602004.000,578421.250,39.000,38.750
1983-12-05,42526.0,38.250,-0.019231,83200.0,15436.0,1.0,1.0,590427.000,602004.000,38.250,39.000
1983-12-06,42526.0,38.000,-0.001830,7900.0,15436.0,1.0,1.0,586568.000,590427.000,38.000,38.250
1983-12-07,42526.0,36.500,-0.039474,134400.0,15436.0,1.0,1.0,563414.000,586568.000,36.500,38.000
...,...,...,...,...,...,...,...,...,...,...,...
1985-01-21,42526.0,42.250,-0.005882,54800.0,15471.0,1.0,1.0,653649.750,657517.500,42.250,42.500
1985-01-22,42526.0,42.625,0.008876,20200.0,15471.0,1.0,1.0,659451.375,653649.750,42.625,42.250
1985-01-23,42526.0,42.750,0.002933,32500.0,15471.0,1.0,1.0,661385.250,659451.375,42.750,42.625
1985-01-24,42526.0,42.750,0.000000,144200.0,15471.0,1.0,1.0,661385.250,661385.250,42.750,42.750


In [20]:
# Spot check: show the `dateval` column of the deal table.
df.dateval

master_deal_no
95088020             NaT
154109043            NaT
154130043            NaT
154182043            NaT
154200043            NaT
                 ...    
3980719020    2022-09-26
3981301020    2022-09-27
3981993020    2022-09-28
3983892020    2022-09-30
3983877020    2022-09-30
Name: dateval, Length: 12031, dtype: object

In [23]:
# Spot check: show the `da_corrected` column of the deal table.
df.da_corrected

master_deal_no
95088020      1984-01-06
154109043     1984-01-09
154130043     1984-01-11
154182043     1984-01-18
154200043     1984-01-23
                 ...    
3980719020    2022-09-27
3981301020    2022-09-28
3981993020    2022-09-28
3983892020    2022-09-30
3983877020    2022-09-30
Name: da_corrected, Length: 12031, dtype: object

In [32]:
# Spot check: show deals for which
# dsk.get_num_trading_days_between(dateval, da_corrected) is below -5
# (the sign convention is defined in dsk - confirm which date comes first).
df[dsk.get_num_trading_days_between(df.dateval, df.da_corrected).lt(-5)]

Unnamed: 0_level_0,statc,one_day_corrected,one_day,aone_day,dao_corrected,dao,da_corrected,da,dateannorig_days,de,...,eqval,tlia,cass,clia,lockup,dae,vest,ann_year,terms,stock
master_deal_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
96354020,C,1984-01-27,1984-01-27,NaT,1984-01-29,1984-01-29,1984-01-30,1984-01-29,0,1984-01-29,...,752.500,,,,No,Yes,No,1984,$43,False
95090020,W,1984-02-13,1984-02-13,NaT,1984-02-14,1984-02-14,1984-02-14,1984-02-14,0,NaT,...,,,,,No,No,No,1984,,False
97229020,W,1984-09-19,1984-09-19,NaT,1984-09-20,1984-09-20,1984-09-21,1984-09-20,0,NaT,...,1000.000,,,,No,No,No,1984,$50 cash,False
21592020,C,1984-10-18,1987-12-17,NaT,1984-10-19,1984-10-19,1984-10-19,1984-10-19,0,1988-11-10,...,78.931,8.1,52.8,7.7,Yes,No,No,1984,$10 cash,False
125287020,C,1984-11-02,1984-11-02,NaT,1984-11-05,1984-11-05,1984-11-05,1984-11-05,0,1984-12-21,...,702.900,,,,Yes,No,No,1984,$71 cash,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3890678020,C,2022-03-18,2022-03-18,NaT,2022-03-20,2022-03-20,2022-03-21,2022-03-20,0,2022-06-22,...,9878.136,529.6,522.0,484.7,No,No,No,2022,An amended $63.5 cash,False
3890790020,P,2022-03-11,2022-03-11,NaT,2022-03-14,2022-03-14,2022-03-21,2022-03-20,6,NaT,...,10059.491,7269.0,1140.0,657.0,No,No,No,2022,A sweetened $28 cash,False
3899034020,P,2022-02-04,2022-02-04,2022-04-04,2022-02-07,2022-02-07,2022-04-06,2022-04-05,57,NaT,...,3717.265,6841.8,1761.9,1737.1,No,No,Yes,2022,A thrice sweetened estimated value $34.15 cash,False
3917881020,C,2022-05-09,2022-05-09,2022-05-09,2022-05-10,2022-05-10,2022-05-10,2022-05-10,0,2022-10-03,...,21356.397,4252.9,257.5,240.5,No,No,No,2022,An amended 0.475 shs com,False
