In [1]:
from IPython.display import display, HTML
display(HTML("<style>.container { width:92% !important; }</style>"))

%load_ext autoreload
%autoreload 2
import sys, os
from os.path import expanduser
## ACTION REQUIRED: change `path` below to your local project folder
path = "~/Documents/G3/MA-prediction"
path = expanduser(path)
sys.path.append(path)

import data_science_MA_kit as dsk
import pandas as pd
import numpy as np
import datetime
from tqdm import tqdm
import re
import wrds

pd.options.mode.chained_assignment = None
import warnings
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

In [2]:
# Open a WRDS connection; `db` is handed to the CRSP volume queries further down.
# Creating the connection prompts interactively for the WRDS username/password.
import wrds
db = wrds.Connection()

Enter your WRDS username [yizhan]:olivershu
Enter your password:········
WRDS recommends setting up a .pgpass file.
Create .pgpass file now [y/n]?: y
Created .pgpass file successfully.
Loading library list...
Done


# Data preprocessing 2: Date Correction

Dates are essential in the analysis of corporate events, because trading volume and volatility are extremely high around them. As a result, misplacing an event date by even a single trading day can affect return calculations. We define the dates as follows:

- `dao`: max(min(`dao`, `da`), 126 trading days before `da`) 
- `da`: announcement date, the next trading day after the announcement. This day usually sees the highest trading volume. The complication is that the official announcement on the `da` provided in the database can be made either in the morning or in the evening, so it may affect either the current or the next trading day. We therefore correct the announcement date to whichever of `da` and `da`+1 trading day has the higher trading volume.
- `dr`: resolution date.
    - for completed deals, it is the effective date, defined as the first trading day after the equity's last trading day. We use the `delist_date` provided in CRSP. If it is not available, we just use `de`.
    - for withdrawn deals, it is the withdrawal date, defined as the first trading day after announcement of withdrawal. Same as `da`, we correct it to be one of `dw` and `dw` + 1 trading day with the higher trading volumes.

- After correcting the dates, we create the deal duration, which is the number of trading days between `da_corrected` and `dr_corrected`.



In this notebook we:

- correct `dao`, `one_day`: `dao` cannot be later than `da` or more than 126 trading days before the announcement.
- correct `da`: one of `da` and `da`+1 trading day that has the higher trading volumes.
- correct `dr`: 
    - `delist_date` or `de` for completed deals.
    - for withdrawn deals, whichever of `dw` and `dw`+1 trading day has the higher trading volume.
- create deal duration: the number of trading days between `da_corrected` and `dr_corrected`.

## I/O

- Input:
    - `df_merge_CRSP.h5`
    
- Output:
    - `df_dates_corrected.h5`
    - `dates_corrected.h5`

## load data

In [3]:
# Read the input DataFrame (`df_merge_CRSP.h5`) from the project's data folder.
# Every later cell adds or corrects columns on this `df` in place.
filepath = f"{path}/data/df_merge_CRSP.h5"
df = pd.read_hdf(filepath)

# `dao_corrected` and `one_day_corrected`

In [4]:
# dao_corrected must not be later than da: take the earlier of `dao` and `da`.
# (`dsk.insert_cols` presumably inserts the new column next to the named existing one -- confirm in dsk.)
dsk.insert_cols(df, 'dao', 'dao_corrected', np.minimum(df.da, df.dao))
# dao_corrected must not be more than 126 trading days (about 6 months) before the announcement
df.dao_corrected = np.maximum(df.dao_corrected, dsk.get_trading_day_offset(df.da, -126))

# one_day_corrected is the trading day immediately before dao_corrected
dsk.insert_cols(df, 'one_day', 'one_day_corrected', dsk.get_trading_day_offset(df.dao_corrected, -1))

# correct original `one_day`

In [5]:
# Some `one_day` values in the database are not trading days.
# Offset 0 is compared with the raw value to detect them: a differing result is
# treated as "one_day is not a trading day" (offset semantics live in dsk -- confirm there).
one_day_next_trading_day = dsk.get_trading_day_offset(df.one_day, 0)
one_day_prior_trading_day = dsk.get_trading_day_offset(df.one_day, -1)
# pick the index labels whose one_day is not a trading day
index_oneday_not_trading_day = df.index[one_day_next_trading_day.ne(df.one_day)]
# Move them back to the previous trading day.
# Use a single `.loc` assignment on the frame instead of the chained `df.one_day[...] = ...`,
# which writes to a temporary Series and leaves `df` untouched under pandas Copy-on-Write.
df.loc[index_oneday_not_trading_day, 'one_day'] = one_day_prior_trading_day.loc[index_oneday_not_trading_day]

# correct `da`
We define `da` to be the next trading day after the announcement, which should have the highest trading volume.

In [6]:
def get_stock_vol_from_ser_CRSP(ser, db=None):
    """
    Query daily CRSP trading volume for a single (permno, start_date, end_date) record.

    Parameters
    ----------
    ser : positional record with exactly this layout:
        position 0 -> permno, position 1 -> start_date, position 2 -> end_date.
        Values are read by position (`.iloc`), so the labels do not matter.
    db : connection object forwarded unchanged to
        `dsk.get_stock_market_data_daily_CRSP` (default None).

    Returns
    -------
    Whatever `dsk.get_stock_market_data_daily_CRSP` returns when asked for
    the single column `'vol'` over the given window.
    """
    permno, window_start, window_end = ser.iloc[0], ser.iloc[1], ser.iloc[2]
    return dsk.get_stock_market_data_daily_CRSP(
        permno,
        start_date=window_start,
        end_date=window_end,
        cols=['vol'],
        db=db,
    )

In [7]:
# `da` at trading-day offset 0 (presumably `da` itself when it is a trading day,
# otherwise the next trading day -- confirm in dsk.get_trading_day_offset)
da = dsk.get_trading_day_offset(df.da, 0)
# the trading day one offset after `da`
da_plus_one_day = dsk.get_trading_day_offset(df.da, 1)
# Pull trading volume data from CRSP; takes 2-4 mins.
# Each row is (tpermno, da, da+1), the layout get_stock_vol_from_ser_CRSP reads by position.
df_tpermno_da_da_plus_one = pd.concat([df.tpermno, da, da_plus_one_day], axis=1)
# The result is later read through its `vol_da` / `vol_da_plus_one` columns.
volumes_da_df = dsk.apply_func_to_ser_df(df_tpermno_da_da_plus_one, 
                                          get_stock_vol_from_ser_CRSP, 
                                          return_as_df=True, 
                                          use_new_cols=['vol_da', 'vol_da_plus_one'],
                                                        db=db)

100%|█████████████████████████████████████| 12031/12031 [02:54<00:00, 68.96it/s]


In [8]:
# Default: the corrected announcement date is `da` at trading-day offset 0.
dsk.insert_cols(df, 'da', 'da_corrected', da)
# Deals whose volume on da+1 is strictly higher than on da; a missing volume on
# either side is treated as 0 in the comparison (fill_value=0).
index_da_to_correct = df.index[volumes_da_df.vol_da_plus_one.gt(volumes_da_df.vol_da, fill_value=0)]
print(len(index_da_to_correct))
# Move those deals to da+1.
# Use a single `.loc` assignment on the frame instead of the chained `df.da_corrected[...] = ...`,
# which writes to a temporary Series and leaves `df` untouched under pandas Copy-on-Write.
df.loc[index_da_to_correct, 'da_corrected'] = da_plus_one_day.loc[index_da_to_correct]

3761


# create `dr`
We first create a raw `dr` using SDC dates, i.e. `de` for completed deals and `dw` for withdrawn deals.

Then we create a `dr_corrected` by our method:

- for completed deals, `dr_corrected` is the delisting date, the next trading day after last trade date, or just `de` if delisting date is not available.
- for withdrawn deals, `dr_corrected` is the next trading date after announcement of withdrawal, which should also have high trading volumes.

In [9]:
# Create a raw `dr` straight from the database dates:
# `de` for completed deals (statc == 'C'), `dw` for withdrawn deals (statc == 'W').
# Rows with any other status keep the NaN placeholder.
# NOTE(review): the placeholder is a float NaN; if `de`/`dw` are datetime64 a NaT
# placeholder would keep the column dtype consistent -- confirm before changing.
dsk.insert_cols(df, 'definitive_agt', 'dr', np.nan)
is_completed = df.statc.eq('C')
is_withdrawn = df.statc.eq('W')
# Use `.loc` on the frame instead of the chained `df.dr[mask] = ...`, which writes to a
# temporary Series and leaves `df` untouched under pandas Copy-on-Write.
df.loc[is_completed, 'dr'] = df.loc[is_completed, 'de']
df.loc[is_withdrawn, 'dr'] = df.loc[is_withdrawn, 'dw']

In [10]:
# `dr_corrected`: the resolution date corrected by our method.
# It starts as a NaN placeholder; this cell fills completed deals only.
dsk.insert_cols(df, 'definitive_agt', 'dr_corrected', np.nan)

is_completed = df.statc.eq('C')
# For completed deals, the resolution date is the delisting date.
# Use `.loc` on the frame instead of the chained `df.dr_corrected[mask] = ...`, which writes
# to a temporary Series and leaves `df` untouched under pandas Copy-on-Write.
df.loc[is_completed, 'dr_corrected'] = df.loc[is_completed, 'delist_date']
# For completed deals where the delisting date is missing, fall back to `de`.
index_na_delist_date = df.index[is_completed & df.delist_date.isna()]
df.loc[index_na_delist_date, 'dr_corrected'] = df.loc[index_na_delist_date, 'de']

In [11]:
# For withdrawn deals, the resolution date is whichever of dw and dw+1 has the
# higher trading volume. This cell only pulls the volumes for that comparison.
# extract withdrawn deals (statc == 'W')
df_w = df.loc[df.statc.eq('W')]
# `dw` at trading-day offsets 0 and +1 (offset semantics live in dsk.get_trading_day_offset)
dw = dsk.get_trading_day_offset(df_w.dw, 0)
dw_plus_one_day = dsk.get_trading_day_offset(df_w.dw, 1)
# Each row is (tpermno, dw, dw+1), the layout get_stock_vol_from_ser_CRSP reads by position.
df_tpermno_dw_dw_plus_one = pd.concat([df_w.tpermno, dw, dw_plus_one_day], axis=1)
# The result is later read through its `vol_dw` / `vol_dw_plus_one` columns.
volumes_dw_df = dsk.apply_func_to_ser_df(df_tpermno_dw_dw_plus_one, 
                                          get_stock_vol_from_ser_CRSP, 
                                          return_as_df=True, 
                                          use_new_cols=['vol_dw', 'vol_dw_plus_one'],
                                                        db=db)

100%|███████████████████████████████████████| 2891/2891 [00:39<00:00, 72.58it/s]


In [12]:
# Default for withdrawn deals: `dw` at trading-day offset 0.
# `dw` is indexed like the withdrawn subset `df_w`, so the assignment aligns on index labels.
# Use `.loc` on the frame instead of the chained `df.dr_corrected[mask] = ...`, which writes
# to a temporary Series and leaves `df` untouched under pandas Copy-on-Write.
df.loc[df.statc.eq('W'), 'dr_corrected'] = dw
# Withdrawn deals whose volume on dw+1 is strictly higher than on dw; a missing volume on
# either side is treated as 0 in the comparison (fill_value=0).
index_dw_to_correct = df_w.index[volumes_dw_df.vol_dw_plus_one.gt(volumes_dw_df.vol_dw, fill_value=0)]
print(len(index_dw_to_correct))
# Move those deals to dw+1.
df.loc[index_dw_to_correct, 'dr_corrected'] = dw_plus_one_day.loc[index_dw_to_correct]

1011


# create `duration`
Duration is the number of trading days between `da_corrected` and `dr_corrected`.

In [13]:
# Deal duration: number of trading days between the corrected announcement date
# (`da_corrected`) and the corrected resolution date (`dr_corrected`).
dsk.insert_cols(df, 'definitive_agt', 'duration', dsk.get_num_trading_days_between(df.da_corrected, df.dr_corrected))

# Save results

In [14]:
# Save only the date columns to `dates_corrected.h5` (key 'dates', overwriting any existing file).
# NOTE(review): this exports the raw `dr`, while `duration` was computed from `dr_corrected`;
# confirm whether `dr_corrected` should be saved here instead of, or alongside, `dr`.
filepath = f"{path}/data/dates_corrected.h5"
df[['one_day', 'da_corrected', 'dr', 'duration']].to_hdf(filepath, key='dates', mode='w')

In [15]:
# Save the full frame, including every column added above, to `df_dates_corrected.h5`
# (key 'df', overwriting any existing file).
filepath = f"{path}/data/df_dates_corrected.h5"
df.to_hdf(filepath, key='df', mode='w')