In [1]:
%load_ext autoreload
%autoreload 2
import sys, os
from os.path import expanduser
# ACTION REQUIRED: change `path` to the root folder of your local project copy.
path = "~/Documents/G3/MA-prediction"
# Expand the leading "~" to the current user's home directory.
path = expanduser(path)
# Add the project root to the import search path (the `MA_prediction` imports below rely on it).
sys.path.append(path)

import pandas as pd
import numpy as np
import datetime as dt

# Turn off pandas' chained-assignment warning for the whole session.
# NOTE(review): this hides genuine chained-assignment mistakes; prefer `.loc`-based writes.
pd.options.mode.chained_assignment = None
import warnings
# Suppress pandas PerformanceWarning and every FutureWarning.
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
from MA_prediction.utils import *
from MA_prediction.preprocessing import *
from MA_prediction.crsp import *
from MA_prediction.mkt_calendar import *

# model-1: Feature engineering

In this notebook we perform the following feature engineering:

- Extract the acquirer price at date `aone_day` and combine it with the deal terms parsed from the consideration string to compute the deal price `pr_extracted`.
    - Then compare it with the deal price reported by SDC (`pr`) and drop (set `retain = False` for) the deals where the two differ by more than 10%.
- Modify dates:
    - `da_new`: the later of `da` and `dateval`, for retained deals.
    - `dao_new`: `dao`, but no earlier than 63 trading days before `da_new`.
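- Extract market prices (pre/post announcement):
    - Pre-announcement (unaffected) target market prices: trailing 20-day average price, taken 1 day, 2 weeks and 4 weeks (1, 11 and 21 trading days) before `dao_new`.
    - Pre-announcement deal price: computed from the unaffected acquirer price 1 trading day before `da_new`. Used to calculate premiums.
    - Post-announcement market and deal prices: target and acquirer prices 0, 1 and 4 trading days after `da_new`. The 4-day values are used to calculate the arbitrage spread (the market-implied probability is not implemented yet).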


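- Create new variables: `dao_new`, `pr1day_20_ave`, `mv_ratio`.
- Consolidate the attitude (`initial_rec`, `att`) and `apub` category names.
- Fill missing `psought` (and `phda`).
- Compute premiums and the arbitrage spread.
- Compute the financial ratios `curr_ratio` and `leverage`.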


## I/O

- Input:
    - `data/prediction-model/df_filtered.h5`
    - `mkt_data_tgt_processed.pickle`
    - `mkt_data_acq_processed.pickle`

- Output:
    - `data/prediction-model/df_feature_engineer.h5` (plus a `.csv` export of the retained deals)
    
    
## load data

In [3]:
# Deal-level table written by the filtering step.
filepath = f"{path}/data/prediction-model/df_filtered.h5"
df = pd.read_hdf(filepath)

# Processed market data for targets and acquirers (returned in that order).
(
    mkt_data_tgt_processed,
    mkt_data_acq_processed,
) = load_mkt_data(path, "processed")

# Deal price extracted from consideration

We extract the acquirer price at `aone_day` and use it to calculate the deal price (this matters for deals paid in stock). We then compare this price with the deal price reported by SDC, and drop the deals where the two prices differ by more than 10%.

In [4]:
# Look up the acquirer's `prc_adj` value at each deal's `aone_day`.
ac1day = extract_col_at_date_from_mkt_data_ser(
    mkt_data_acq_processed,
    df["aone_day"],
    "prc_adj",
)

In [5]:
# Deal price implied by the parsed deal terms, using the acquirer price from `ac1day`.
pr_extracted = calculate_deal_price_for_ser(
    df["cash_term"],
    df["stock_term"],
    df["payment_type"],
    ac1day,
)
insert_cols(df, "pr", "pr_extracted", pr_extracted)

# Only the second return value is kept; it is compared against a 10% threshold below.
_, diff_percent = compute_abs_percent_diff(df["pr"], df["pr_extracted"])

In [6]:
# Drop deals whose SDC deal price and our term-implied price differ by more than 10%.
num_old = df.retain.sum()

# Write through `.loc` so the flag is set on `df` itself. The previous
# `df.retain[mask] = False` is chained assignment, which only modifies a temporary
# object (and leaves `df` unchanged) when pandas Copy-on-Write is active.
df.loc[diff_percent.gt(.1), "retain"] = False

num = df.retain.sum()

string = f"delete {num_old-num} deals due to different deal prices (calculated by us vs from SDC). {num_old} -> {num}"
print(string)

delete 66 deals due to different deal prices (calculated by us vs from SDC). 4041 -> 3975


# Correct dates

In [7]:
# Correct `da`: start from the reported `da` ...
insert_cols(df, "da", "da_new", df.da)
# ... and, for retained deals, move it to `dateval` when `dateval` is later.
# Missing dates are compared as 1900-01-01 via `fill_value`.
idx = df.retain & df.dateval.gt(df.da, fill_value = dt.date(1900, 1, 1))
# Use `.loc` instead of the chained `df.da_new[idx] = ...`, which does not update
# `df` when pandas Copy-on-Write is active.
df.loc[idx, "da_new"] = df.loc[idx, "dateval"]

In [8]:
# Correct `dao`: copy the reported `dao`, then floor it at 63 trading days before `da_new`.
insert_cols(df, "dao", "dao_new", df["dao"])
df["dao_new"] = np.maximum(
    df["dao_new"],
    get_trading_day_offset(df["da_new"], -63),
)

# Mkt prices
## tgt unaffected prices

In [9]:
# Label -> trading-day offset relative to `dao_new`.
dict_days = {"4wk": -21, "2wk": -11, '1day': -1}
for name, offset in dict_days.items():
    dates = get_trading_day_offset(df["dao_new"], offset)
    # Target `prc_adj_trail_20_ave` value at the shifted date.
    trailing_ave = extract_col_at_date_from_mkt_data_ser(
        mkt_data_tgt_processed, dates, "prc_adj_trail_20_ave"
    )
    insert_cols(df, "tprday", f"pr{name}_20_ave", trailing_ave)

In [10]:
# Show the rows where all three trailing-average prices are available.
df.loc[:, ["pr1day_20_ave", "pr2wk_20_ave", "pr4wk_20_ave"]].dropna()

Unnamed: 0_level_0,pr1day_20_ave,pr2wk_20_ave,pr4wk_20_ave
master_deal_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
95523020,35.325000,34.650000,34.168750
109787020,61.000000,59.818750,58.475000
23184020,12.634375,12.362500,12.278125
15161020,28.668750,28.000000,26.900000
109800020,48.037500,47.675000,46.343750
...,...,...,...
4016122020,2.754000,2.583500,3.011000
3992461020,41.692000,41.889500,43.308500
4015877020,0.523808,0.702213,0.874915
4017224020,1.109815,1.454565,1.664080


## acq unaffected prices

In [11]:
# Acquirer `prc_adj` one trading day before the corrected announcement date.
ac1day_new = extract_col_at_date_from_mkt_data_ser(
    mkt_data_acq_processed,
    get_trading_day_offset(df["da_new"], -1),
    "prc_adj",
)

In [12]:
# Deal price from the deal terms, using the acquirer price from `ac1day_new`.
deal_pr_1day = calculate_deal_price_for_ser(
    df["cash_term"],
    df["stock_term"],
    df["payment_type"],
    ac1day_new,
)
insert_cols(df, "pr", "deal_pr_1day", deal_pr_1day)

## premiums

In [13]:
for name in dict_days:
    # Pre-announcement (trailing-average) target price for this horizon.
    pre_prc = df[f"pr{name}_20_ave"]
    # Premium in percent: (deal price - pre price) / pre price * 100, rounded to 2 decimals.
    premium = df["deal_pr_1day"].sub(pre_prc).div(pre_prc).mul(100).round(2)
    insert_cols(df, "ppmday", f"premium{name}", premium)

In [14]:
# Premiums of the retained deals only.
df.loc[df["retain"], ["premium1day", "premium2wk", "premium4wk"]]

Unnamed: 0_level_0,premium1day,premium2wk,premium4wk
master_deal_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
109800020,38.43,39.49,43.49
115481020,13.19,10.62,15.34
19411020,39.63,51.53,64.15
23185020,49.97,54.23,50.30
19699020,20.47,32.07,27.67
...,...,...,...
4009077020,18.69,27.48,37.21
4009317020,35.05,47.40,69.29
4012016020,41.47,62.25,61.60
4014370020,116.23,115.50,130.92


## post prices

In [15]:
# Label -> trading-day offset relative to `da_new`.
dict_post_days = {'day':0, "1daya":1, "4daya":4}
for name, offset in dict_post_days.items():
    dates = get_trading_day_offset(df["da_new"], offset)
    # Target and acquirer `prc_adj` at the shifted date.
    tpr = extract_col_at_date_from_mkt_data_ser(mkt_data_tgt_processed, dates, "prc_adj")
    apr = extract_col_at_date_from_mkt_data_ser(mkt_data_acq_processed, dates, "prc_adj")
    insert_cols(df, "tprday", f"tpr{name}_new", tpr)
    insert_cols(df, "aprday", f"apr{name}_new", apr)

In [16]:
# Deal price from the deal terms, using the acquirer price 4 trading days after `da_new`.
deal_pr_4daya = calculate_deal_price_for_ser(
    df["cash_term"],
    df["stock_term"],
    df["payment_type"],
    df["apr4daya_new"],
)
insert_cols(df, "pr_initial", "deal_pr_4daya", deal_pr_4daya)

In [17]:
# Arbitrage spread in percent: (deal price - target price) / target price * 100, 2 decimals.
arb_spread_4daya = (
    df["deal_pr_4daya"]
    .sub(df["tpr4daya_new"])
    .div(df["tpr4daya_new"])
    .mul(100)
    .round(2)
)
insert_cols(df, "ac4wk", "arb_spread_4daya", arb_spread_4daya)

## implied prob

In [18]:
# for name in dict_days:
#     implied_prob = 

# Variable transformation

## `mv_ratio`

In [19]:
# Ratio of `amv` to `mv`, inserted relative to the `amv` column.
insert_cols(df, "amv", "mv_ratio", df["amv"].div(df["mv"]))

## Fill missing `phda` and `psought`

In [20]:
# A missing `phda` is treated as 0.
df["phda"] = df["phda"].fillna(0.)

In [21]:
# Where `psought` is missing, fill it with 100 - `phda`.
idx = df.psought.isna()
# Use `.loc` instead of the chained `df.psought[idx] = ...`, which does not update
# `df` when pandas Copy-on-Write is active.
df.loc[idx, "psought"] = 100 - df.loc[idx, "phda"]

## change attitude names

In [22]:
# Collapse the `initial_rec` categories into Friendly / Unfriendly.
dict_initial_rec = {
    **dict.fromkeys(["Agreed", "Solicited", "Not Appl.", "Neutral"], "Friendly"),
    **dict.fromkeys(["Unsolic.", "Hostile"], "Unfriendly"),
}
initial_rec_mapped = df["initial_rec"].replace(dict_initial_rec)
insert_cols(df, "initial_rec", "initial_rec_new", initial_rec_mapped)

# Category counts among retained deals.
df.loc[df["retain"], "initial_rec_new"].value_counts()

Friendly      3765
Unfriendly     208
Name: initial_rec_new, dtype: int64

In [23]:
# Collapse the `att` categories into Friendly / Unfriendly.
dict_att = {'Neutral': 'Friendly', 
            'Not Appl.':'Friendly', 
            'Unsolic.': 'Unfriendly', 
            'Hostile':'Unfriendly'}
insert_cols(df, "att", "att_new", df.att.replace(dict_att))

# Backfill missing `initial_rec_new` from `att_new`. Written through `.loc` because the
# chained `df.initial_rec_new[idx] = ...` does not update `df` under pandas Copy-on-Write.
idx = df.initial_rec_new.isna()
df.loc[idx, "initial_rec_new"] = df.loc[idx, "att_new"]

# Kept as the LAST expression so the counts are actually displayed; an expression
# in the middle of a cell produces no output.
df.loc[df.retain, "att_new"].value_counts()

Friendly      3783
Unfriendly     192
Name: att_new, dtype: int64

## `apub`

In [24]:
# `apub` category counts among retained deals.
df.loc[df["retain"], "apub"].value_counts()

Public    2845
Priv.      551
Sub.       540
J.V.        24
Inv.        10
Mutual       3
Govt.        2
Name: apub, dtype: int64

In [25]:
# Every listed `apub` category is relabelled 'Private'; values not listed are left as they are.
dict_apub = dict.fromkeys(
    ["Priv.", "Sub.", "J.V.", "Inv.", "Mutual", "Govt."],
    "Private",
)
apub_mapped = df["apub"].replace(dict_apub)
insert_cols(df, "apub", 'apub_new', apub_mapped)

# Category counts among retained deals.
df.loc[df["retain"], "apub_new"].value_counts()

Public     2845
Private    1130
Name: apub_new, dtype: int64

## financials

In [26]:
# NOTE(review): if `clia` / `cass` are current liabilities / current assets, this is the
# inverse of the conventional current ratio (assets / liabilities) — confirm intent.
df["curr_ratio"] = df["clia"] / df["cass"]
# `tlia` relative to `eqval`.
df["leverage"] = df["tlia"] / df["eqval"]

# Save

In [27]:
# Persist the full engineered table (retained and non-retained deals), overwriting any existing file.
filepath = f"{path}/data/prediction-model/df_feature_engineer.h5"
df.to_hdf(
    filepath,
    key="df",
    mode="w",
)

In [39]:
# Export the retained deals next to the HDF5 file, swapping only the extension.
# `filepath.replace("h5", "csv")` would rewrite every "h5" occurring anywhere in the
# path (e.g. inside a directory name), so split off the extension instead.
csv_path = os.path.splitext(filepath)[0] + ".csv"
df.loc[df.retain].to_csv(csv_path)

## Create market variables

In [28]:
# def calculate_deal_price(cash_term, stock_term, stock, acq_prc):
#     """
#     calculate the deal price from terms
#     """
#     if stock:
#         return cash_term + stock_term * acq_prc
#     return cash_term + stock_term

# def calculate_deal_price_ser(ser):
#     return calculate_deal_price(*ser)

# df['pr_4days_after_da'] = apply_func_to_ser_df(df[['cash_term', 'stock_term', 'stock', 'apr4days_after_da']], calculate_deal_price_ser)

In [29]:
# import wrds
# db = wrds.Connection()

In [30]:
# market_returns = db.raw_sql("""select caldt, vwretd from crsp_m_indexes.msix""")
# market_returns = dsk.to_monthly_period_index(market_returns, 'caldt')
# market_returns = market_returns.rolling(12).sum().shift()

In [31]:
# ann_yr_month_period = df.da_corrected.map(lambda x: pd.Period(f"{x.year}-{x.month}", 'M'))

In [32]:
# df['mkt_ret_prev_yr'] = market_returns.loc[list(ann_yr_month_period)].values

## Industries

In [33]:
# df.ttf_macro_desc = df.ttf_macro_desc.str.replace(' ', '_')
# df.atf_macro_desc = df.atf_macro_desc.str.replace(' ', '_')

In [34]:
# industries = pd.get_dummies(df.ttf_macro_desc)
# industries.columns = list(map(lambda x: 'industry_' + x, industries.columns))

In [35]:
# hist_fail_rate = pd.DataFrame(np.nan, columns=['fail_rate'],
#                               index=pd.period_range('1993-01', '2020-12', freq='M'))
# df_c = df[~df['compete_statc_code'].isin([2, 3, 4])].copy()
# df_c['statc'].replace({'P':1, 'W':1, 'C':0}, inplace=True)
# for i in hist_fail_rate.index:
#     sub = np.logical_and(df['ann_yr_mon']>=i-24, df['ann_yr_mon']<i)
#     hist_fail_rate.loc[i] = df_c.loc[sub, 'statc'].mean()
# hist_fail_rate

In [36]:
# hist_industry_fail_rate = pd.DataFrame(np.nan, columns=df['ttf_macro_desc'].unique(),
#                               index=pd.period_range('1993-01', '2020-12', freq='M'))
# for i in hist_industry_fail_rate.index:
#     for industry in hist_industry_fail_rate.columns:
#         sub = np.logical_and(df['ann_yr_mon']>=i-24, df['ann_yr_mon']<i)
#         sub = np.logical_and(sub, df['ttf_macro_desc']==industry)
#         if sub.sum() == 0:
#             hist_industry_fail_rate.loc[i, industry] = 0#hist_fail_rate.loc[i].values[0]
#         else:
#             hist_industry_fail_rate.loc[i, industry] = df_c.loc[sub, 'statc'].mean()
# hist_industry_fail_rate

In [37]:
# df.loc[index, 'fail_rate_2y'] = round(hist_fail_rate.loc[df.loc[index, 'ann_yr_mon']]*100, 2).squeeze().values

In [38]:
# for i in index:
#     df.loc[i, 'fail_rate_ind_2y'] = \
#     round(hist_industry_fail_rate.loc[df.loc[i, 'ann_yr_mon'], df.loc[i, 'ttf_macro_desc']]*100, 2)

# #df.loc[index, 'fail_rate_ind_2y'] = round(hist_fail_rate.loc[df.loc[index, 'ann_yr_mon']]*100, 2).squeeze().values