In [1]:
# Inject a CSS rule so the notebook's `.container` element spans 92% of the window width.
from IPython.display import display, HTML
display(HTML("<style>.container { width:92% !important; }</style>"))

%load_ext autoreload
%autoreload 2
import sys, os
from os.path import expanduser
# ACTION REQUIRED: change this to the folder of your local copy of the project.
path = expanduser("~/Documents/G3/MA-prediction")
# Put the project folder on the import path (presumably where `data_science_MA_kit`
# lives). The membership check keeps re-runs of this cell from appending the same
# entry to sys.path over and over.
if path not in sys.path:
    sys.path.append(path)

import data_science_MA_kit as dsk
import pandas as pd
import numpy as np
import datetime
from tqdm import tqdm
import re
import wrds

# Turn off pandas' SettingWithCopyWarning for the whole session.
# NOTE(review): this also hides genuine chained-assignment mistakes; writes into `df`
# are safest when done through `.loc`.
pd.options.mode.chained_assignment = None

In [2]:
# Open a WRDS session; `db` is passed to the CRSP volume pulls further down.
# As the recorded output shows, this prompts interactively for a username and password.
import wrds
db = wrds.Connection()

Enter your WRDS username [yizhan]:olivershu
Enter your password:········
WRDS recommends setting up a .pgpass file.
Create .pgpass file now [y/n]?: y
Created .pgpass file successfully.
Loading library list...
Done


# Data preprocessing 2: Date Correction

Dates are essential in corporate events, during which price volatility is extremely high. As a result, getting an event date wrong by even a single trading day affects the return calculation. In this notebook we correct the important dates for every deal as follows:

- `one_day`: the trading day immediately before `dao`. We correct it so that it is always an actual trading day.
- `dao`: original announcement date. Used mainly for pulling unaffected price. We don't work on correcting it now.
- `da`: announcement date, which we define as the next trading day after the announcement. This day usually sees the highest trading volume. The complication is that the official announcement on the `da` provided in the database may be made either in the morning or in the evening, so it may affect either the current or the next trading day. We therefore correct the announcement date to whichever of `da` and `da` + 1 trading day has the higher trading volume.
- `dr`: resolution date.
    - for completed deals, it is the effective date, defined as the first trading day after the equity's last trading day. We use the `delist_date` provided in CRSP.
    - for withdrawn deals, it is the withdrawal date, defined as the first trading day after the announcement of the withdrawal. As with `da`, we correct it to whichever of `dw` and `dw` + 1 trading day has the higher trading volume.

- After correcting the dates, we create the deal duration, which is the number of trading days between `da` and `dr`.

## load data

In [3]:
# Read the input table from the project's data folder into `df`.
filepath = f"{path}/data/df_merge_CRSP.h5"
df = pd.read_hdf(filepath)

# correct `one_day`

In [4]:
# Some `one_day` values in the database are not trading days.
# The offset-0 result is compared with the raw value; any mismatch is treated as
# "not a trading day".
one_day_trading_day = dsk.get_trading_day_offset(df.one_day, 0)
# Row labels whose `one_day` is not a trading day.
index = df.index[one_day_trading_day.ne(df.one_day)]
# Move those rows back to the previous trading day. The write goes through `.loc`
# instead of chained indexing (`df.one_day[index] = ...`), which can end up assigning
# to a temporary object rather than to `df` itself.
df.loc[index, 'one_day'] = dsk.get_trading_day_offset(df.one_day[index], -1)

# correct `da`
We define `da` to be the next trading day after the announcement, which should have the highest trading volume.

In [5]:
# Two candidate announcement dates per deal: trading-day offsets 0 and 1 from the raw `da`.
da = dsk.get_trading_day_offset(df.da, 0)
da_plus_one_day = dsk.get_trading_day_offset(df.da, 1)

# One row per deal: `tpermno` plus the two candidate dates.
announce_window = pd.concat([df.tpermno, da, da_plus_one_day], axis=1)

# Pull the CRSP 'vol' field for both candidate dates (slow: roughly 2-4 minutes).
volumes_announce = dsk.apply_func_to_ser(
    announce_window,
    dsk.get_stock_value_date_range_CRSP,
    'vol',
    return_as_df=True,
    columns=['day', 'day_plus_one'],
    db=db,
)

100%|███████████████████████████████████████| 9854/9854 [03:33<00:00, 46.09it/s]


In [6]:
# Start every deal at the offset-0 candidate date.
# NOTE(review): `insert_cols` presumably places the new column next to `da` - confirm.
dsk.insert_cols(df, 'da', 'da_corrected', da)
# Deals whose volume on the following trading day is strictly higher; a volume that is
# missing on one side is compared as 0 (`fill_value=0`).
index = volumes_announce.day_plus_one.gt(volumes_announce.day, fill_value=0)
# Switch those deals to the later date. Use `.loc` rather than chained indexing
# (`df.da_corrected[index] = ...`) so the assignment is guaranteed to land in `df`.
df.loc[index, 'da_corrected'] = da_plus_one_day[index]

# create `dr`
- for completed deals, `dr` is the delisting date, i.e. the next trading day after the last trade date
- for withdrawn deals, `dr` is the next trading day after the announcement of the withdrawal, which should also have a high trading volume.

In [7]:
# `dr` starts out as NaN for every deal.
dsk.insert_cols(df, 'definitive_agt', 'dr', np.nan)
# The column will hold date values, so make it object-typed up front instead of
# relying on pandas to upcast a float column implicitly on assignment.
df['dr'] = df['dr'].astype(object)

# For completed deals (statc == 'C'), the resolution date is the delisting date.
# The mask is computed once and the write goes through `.loc` (no chained indexing).
completed = df.statc.eq('C')
df.loc[completed, 'dr'] = df.loc[completed, 'delist_date']

In [8]:
# Withdrawn deals (statc == 'W'): the resolution date will be whichever of `dw` and
# `dw` + 1 trading day shows the higher trading volume.
df_w = df.loc[df.statc.eq('W')]

# Two candidate dates per withdrawn deal: trading-day offsets 0 and 1 from the raw `dw`.
dw = dsk.get_trading_day_offset(df_w.dw, 0)
dw_plus_one_day = dsk.get_trading_day_offset(df_w.dw, 1)

# One row per withdrawn deal: `tpermno` plus the two candidate dates.
withdraw_window = pd.concat([df_w.tpermno, dw, dw_plus_one_day], axis=1)

# Pull the CRSP 'vol' field for both candidate dates.
volumes_withdraw = dsk.apply_func_to_ser(
    withdraw_window,
    dsk.get_stock_value_date_range_CRSP,
    'vol',
    return_as_df=True,
    columns=['day', 'day_plus_one'],
    db=db,
)

100%|███████████████████████████████████████| 1895/1895 [00:29<00:00, 63.49it/s]


In [9]:
# Default the resolution date of withdrawn deals to the offset-0 candidate `dw`.
# Written through `.loc` instead of chained indexing (`df.dr[...] = ...`) so the
# assignment is guaranteed to land in `df`.
df.loc[df.statc.eq('W'), 'dr'] = dw
# Labels of withdrawn deals whose volume on the following trading day is strictly
# higher; a volume that is missing on one side is compared as 0 (`fill_value=0`).
index = df_w.index[volumes_withdraw.day_plus_one.gt(volumes_withdraw.day, fill_value=0)]
# Those deals take the later date instead.
df.loc[index, 'dr'] = dw_plus_one_day[index]

# create `duration`
Duration is the number of trading days between `da_corrected` and `dr`.

In [10]:
# Trading days elapsed from the corrected announcement date to the resolution date.
duration = dsk.trading_days_between(df.da_corrected, df.dr)
dsk.insert_cols(df, 'definitive_agt', 'duration', duration)

# Save results

In [11]:
# Save only the corrected date columns plus `duration`, under HDF key 'dates'.
# mode='w' replaces any existing file at this path.
filepath = f"{path}/data/dates_corrected.h5"
df[['one_day', 'da_corrected', 'dr', 'duration']].to_hdf(filepath, key='dates', mode='w')

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block0_values] [items->Index(['one_day', 'da_corrected', 'dr'], dtype='object')]

  df[['one_day', 'da_corrected', 'dr', 'duration']].to_hdf(filepath, key='dates', mode='w')


In [12]:
# Save the full table, including the corrected date columns, under HDF key 'df'.
# mode='w' replaces any existing file at this path.
filepath = f"{path}/data/df_dates_corrected.h5"
df.to_hdf(filepath, key='df', mode='w')

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block2_values] [items->Index(['statc', 'one_day', 'aone_day', 'dao', 'da_corrected', 'da', 'de',
       'dateeffexp', 'dw', 'dr', 'definitive_agt', 'da_date', 'dateval',
       'dcom', 'dcomeff', 'last_trade_date', 'delist_date', 'att',
       'attitude_change_yn', 'initial_rec', 'tn_CRSP', 'tn', 'ttic_CRSP',
       'ttic', 'tcu_CRSP', 'tcu', 'texch', 'an_CRSP', 'an', 'apub',
       'atic_CRSP', 'atic', 'acu_CRSP', 'acu', 'anatc', 'aexch', 'cross',
       'ttf_macro_desc', 'ttf_mid_desc', 'atf_macro_desc', 'atf_mid_desc',
       'valamend', 'consid_struct_desc', 'consid', 'consido', 'consids', 'cha',
       'tend', 'term', 'synop', 'hdate', 'hosthprice', 'hval', 'hevent',
       'hosthval', 'competecode', 'competeval', 'lbo', 'afinancial', 'alp',
       'aspv', 'awk', 'hedge_fund_involv_yn', 'collar', 'lockup', 'dae',
       'vest'],
      dtype='object')]
