In [1]:
# Auto-reload imported modules before each cell runs, so edits to project code
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
import sys, os
from os.path import expanduser
# ACTION REQUIRED: change `path` to the root folder of your local project checkout.
path = "~/Documents/G3/MA-prediction"
path = expanduser(path)  # expand "~" to the current user's home directory
sys.path.append(path)  # make modules under the project root importable (presumably the MA_prediction package imported below)

import pandas as pd
import numpy as np

# Turn off pandas' chained-assignment (SettingWithCopy) warning for this session.
pd.options.mode.chained_assignment = None
import warnings
# Hide pandas PerformanceWarning and all FutureWarning messages.
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
from MA_prediction.utils import *
from MA_prediction.preprocessing import *
from MA_prediction.crsp import *
from MA_prediction.mkt_calendar import *

# Data Filters
Previously we deleted the following deals

- not applicable to our research (detected manually).
- Price is missing.
- Deal consideration is missing. 


Now we delete the following deals:

- lack of accurate information:
    - unsuccessful permno match for the tgt, and the acq in a stock deal. 
    - completed:
        - delisting code not due to MA.
        - delisting return is missing (complex payment)
        - difference between effective date and delisting date is too large (> 40 trading days)
    - `dae` is Yes, i.e. the announcement date is estimated.

- inapplicable to our research:
    - small deals: deal value adjusted by inflation < 200m.
    - duration too short/long: fewer than 10 trading days, or longer than 2 yrs (504 trading days).
    - low price: deal price < \$3.
- too complex for the model to handle:
    - unsuccessful parsing of deal terms
    - amendment
    - competition
    - `dateval` too far away from `da` (> 6 trading days)

## I/O

- input:
    - `/data/processed/df_processed.h5`.
    - `/data/reference/filters_basic.txt`

- output:
    - `/data/prediction-model/df_filtered.h5` and `csv`.
    - `/data/reference/filters.txt`

## load data

In [3]:
# Read the processed deal dataset; every filter below flags rows of this frame.
filepath = f"{path}/data/processed/df_processed.h5"
df = pd.read_hdf(path_or_buf=filepath)

In [4]:
# Filters that were already applied: read the existing log and echo it so the
# running deal counts are visible in this notebook.
textpath_old = f"{path}/data/reference/filters_basic.txt"
with open(textpath_old, "r") as prev_file:  # context manager closes the read handle
    prev_filters = prev_file.read()
print(prev_filters)

# Start the new filter log with the previous entries. `textfile` is left open
# on purpose: the filtering cells append to it and it is closed before saving.
textpath = f"{path}/data/reference/filters.txt"
textfile = open(textpath, "w")
_ = textfile.write(prev_filters)

delete 1633 deals due to missing pricing information. 12082 -> 10449
delete 3 deals due to missing consideration. 10449 -> 10446



# Filtering

In [5]:
# Every deal starts out retained; each filter below flips `retain` to False.
# `num` tracks the number of deals still retained.
df['retain'] = True
num = df.shape[0]

## tgt permno is matched successfully

In [6]:
# Drop deals whose target has no permno (tpermno is missing).
num_old = num

# Assign through .loc on the frame itself: `df.retain[mask] = ...` is chained
# assignment and does not write back to `df` under pandas Copy-on-Write.
df.loc[df.tpermno.isna(), 'retain'] = False

num = df.retain.sum()

string = f"delete {num_old-num} deals due to failure of matching target in CRSP. {num_old} -> {num}"
print_and_save_string(string, textfile)

delete 832 deals due to failure of matching target in CRSP. 10446 -> 9614


## acq permno is matched successfully, for stock deals

In [7]:
# Drop stock deals whose acquiror has no permno (apermno is missing).
num_old = num

# .loc writes to `df` directly; the chained form `df.retain[mask] = ...`
# does not update `df` under pandas Copy-on-Write.
df.loc[df.apermno.isna() & df.stock, 'retain'] = False

num = df.retain.sum()

string = f"delete {num_old-num} stock deals due to failure of matching acquiror in CRSP. {num_old} -> {num}"
print_and_save_string(string, textfile)

delete 236 stock deals due to failure of matching acquiror in CRSP. 9614 -> 9378


## delisting

### delisting reason is MA

In [8]:
# Drop completed deals whose delisting code is outside [200, 300).
num_old = num

# Completed deals (statc == 'C'); this mask is reused by the delisting filters below.
complete = df.statc.eq('C')

# A missing delist_code fails the range test, so such completed deals are dropped too.
# .loc writes to `df` directly; the chained form `df.retain[mask] = ...`
# does not update `df` under pandas Copy-on-Write.
df.loc[complete & ~df.delist_code.between(200, 300, inclusive='left'), 'retain'] = False

num = df.retain.sum()

string = f"delete {num_old-num} deals due to non-M&A-related delisting code for completed deals. {num_old} -> {num}"
print_and_save_string(string, textfile)

delete 213 deals due to non-M&A-related delisting code for completed deals. 9378 -> 9165


### delisting return is not missing

In [9]:
# Drop completed deals with a missing delisting return
# (from CRSP: the deal involves payment that is difficult to price).
num_old = num

# .loc writes to `df` directly; the chained form `df.retain[mask] = ...`
# does not update `df` under pandas Copy-on-Write.
df.loc[complete & df.delist_return.isna(), 'retain'] = False

num = df.retain.sum()

string = f"delete {num_old-num} deals due to missing delist price/return (complex payment) for completed deals. {num_old} -> {num}"
print_and_save_string(string, textfile)

delete 46 deals due to missing delist price/return (complex payment) for completed deals. 9165 -> 9119


### delisting date and effective date don't differ too much

In [10]:
# Drop completed deals whose delisting date is more than 40 trading days
# away (in either direction) from the effective date.
num_old = num

days_between_de_delist_date = get_num_trading_days_between(df.de, df.delist_date)
# .loc writes to `df` directly; the chained form `df.retain[mask] = ...`
# does not update `df` under pandas Copy-on-Write.
df.loc[complete & abs(days_between_de_delist_date).gt(40), 'retain'] = False

num = df.retain.sum()

string = f"delete {num_old-num} deals due to mismatch between delisting date in CRSP and effective date in SDC. {num_old} -> {num}"
print_and_save_string(string, textfile)

delete 129 deals due to mismatch between delisting date in CRSP and effective date in SDC. 9119 -> 8990


## announcement date is estimated

In [11]:
# Drop deals whose announcement date is flagged as estimated (dae == 'Yes').
num_old = num

# .loc writes to `df` directly; the chained form `df.retain[mask] = ...`
# does not update `df` under pandas Copy-on-Write.
df.loc[df.dae.eq('Yes'), 'retain'] = False

num = df.retain.sum()

string = f"delete {num_old-num} deals due to estimated announcement date. {num_old} -> {num}"
print_and_save_string(string, textfile)

delete 50 deals due to estimated announcement date. 8990 -> 8940


## deals inapplicable to our research

In [12]:
# Drop the deals selected by the project helper `get_delete_index`.
num_old = num

# NOTE(review): assumes get_delete_index returns a boolean mask or row labels
# of `df` — both work with .loc; confirm against the helper.
# .loc writes to `df` directly; the chained form `df.retain[idx] = ...`
# does not update `df` under pandas Copy-on-Write.
df.loc[get_delete_index(df), 'retain'] = False

num = df.retain.sum()

string = f"delete {num_old-num} deals inapplicable to our research. {num_old} -> {num}"
print_and_save_string(string, textfile)

delete 3 deals inapplicable to our research. 8940 -> 8937


## failure of extracting deal consideration

In [13]:
# Drop deals whose payment type is not one of the recognised values below
# (deal terms could not be extracted from the consideration string).
num_old = num

payment_type_lst = ['Cash', 'Common Stock', 'Cash and Common Stock', 
                     'Common Stock, fixed dollar', 'Cash and Common Stock, fixed dollar']
# .loc writes to `df` directly; the chained form `df.retain[mask] = ...`
# does not update `df` under pandas Copy-on-Write.
df.loc[~df.payment_type.isin(payment_type_lst), 'retain'] = False

num = df.retain.sum()

string = f"delete {num_old-num} deals due to the failure of extracting deal terms from consideration string. {num_old} -> {num}"
print_and_save_string(string, textfile)

delete 1830 deals due to the failure of extracting deal terms from consideration string. 8937 -> 7107


## deal value (adjusted by CPI) smaller than \$200m

In [14]:
# Drop small deals: CPI-adjusted deal value strictly below 200.
num_old = num

# .loc writes to `df` directly; the chained form `df.retain[mask] = ...`
# does not update `df` under pandas Copy-on-Write.
df.loc[df.val_adj_by_cpi.lt(200.), 'retain'] = False

num = df.retain.sum()

string = f"delete {num_old-num} deals due to deal value adjusted by inflation smaller than $200m. {num_old} -> {num}"
print_and_save_string(string, textfile)

delete 2160 deals due to deal value adjusted by inflation smaller than $200m. 7107 -> 4947


## duration

In [15]:
# Drop deals whose duration is too short (< 10) or too long (> 504).
# Each bound is logged as its own filter step.
# .loc writes to `df` directly; the chained form `df.retain[mask] = ...`
# does not update `df` under pandas Copy-on-Write.
num_old = num

df.loc[df.duration.lt(10), 'retain'] = False

num = df.retain.sum()

string = f"delete {num_old-num} deals due to duration shorter than 10 trading days. {num_old} -> {num}"
print_and_save_string(string, textfile)

num_old = num

# 504 is the "2 years" cut-off (2 x 252 trading days).
df.loc[df.duration.gt(504), 'retain'] = False

num = df.retain.sum()

string = f"delete {num_old-num} deals due to duration longer than 2 years. {num_old} -> {num}"
print_and_save_string(string, textfile)

delete 89 deals due to duration shorter than 10 trading days. 4947 -> 4858
delete 9 deals due to duration longer than 2 years. 4858 -> 4849


## deal amendment

In [16]:
# Drop amended deals (amend == 1).
num_old = num

# .loc writes to `df` directly; the chained form `df.retain[mask] = ...`
# does not update `df` under pandas Copy-on-Write.
df.loc[df.amend.eq(1), 'retain'] = False

num = df.retain.sum()

string = f"delete {num_old-num} deals due to deal amendment. {num_old} -> {num}"
print_and_save_string(string, textfile)

delete 316 deals due to deal amendment. 4849 -> 4533


## deal competition

In [17]:
# Drop deals involved in deal competition (compete_group_no is set).
num_old = num

# .loc writes to `df` directly; the chained form `df.retain[mask] = ...`
# does not update `df` under pandas Copy-on-Write.
df.loc[df.compete_group_no.notna(), 'retain'] = False

num = df.retain.sum()

string = f"delete {num_old-num} deals due to deal competition. {num_old} -> {num}"
print_and_save_string(string, textfile)

delete 314 deals due to deal competition. 4533 -> 4219


## small target price

In [18]:
# Drop deals whose price (`pr`) is strictly below 3.
num_old = num

# .loc writes to `df` directly; the chained form `df.retain[mask] = ...`
# does not update `df` under pandas Copy-on-Write.
df.loc[df.pr.lt(3.), 'retain'] = False

num = df.retain.sum()

string = f"delete {num_old-num} deals due to deal price less than $3. {num_old} -> {num}"
print_and_save_string(string, textfile)

delete 54 deals due to deal price less than $3. 4219 -> 4165


## `da` and `dateval`

In [19]:
# Drop deals whose valuation date (`dateval`) is more than 6 trading days
# away (in either direction) from the announcement date (`da`).
num_old = num

num_days = get_num_trading_days_between(df.da, df.dateval)
# .loc writes to `df` directly; the chained form `df.retain[mask] = ...`
# does not update `df` under pandas Copy-on-Write.
df.loc[abs(num_days).gt(6.), 'retain'] = False

num = df.retain.sum()

string = f"delete {num_old-num} deals due to valuation date too far from announcement date. {num_old} -> {num}"
print_and_save_string(string, textfile)

delete 124 deals due to valuation date too far from announcement date. 4165 -> 4041


# save

In [20]:
# Close the filter log so every buffered entry is flushed to filters.txt.
textfile.close()

In [21]:
# Write the flagged frame to HDF5, overwriting any existing file (mode='w').
# NOTE(review): rows with retain == False are still written here; presumably
# downstream code selects on `retain` — confirm.
filepath = f"{path}/data/prediction-model/df_filtered.h5"
df.to_hdf(path_or_buf=filepath, key='df', mode='w')