In [1]:
from IPython.display import display, HTML
# Widen the notebook page: inject CSS that sets the `.container` width to 92%.
display(HTML("<style>.container { width:92% !important; }</style>"))

# Load the autoreload extension in mode 2 so that edits to imported modules
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
import sys, os
from os.path import expanduser
# ACTION REQUIRED: change `path` to your local project folder.
# The folder is appended to `sys.path` below so that `data_science_MA_kit`
# can be imported (presumably the module lives in that folder -- confirm).
path = "~/Documents/G3/MA-prediction"
path = expanduser(path)  # resolve the leading `~` to the user's home directory
sys.path.append(path)

import data_science_MA_kit as dsk
import pandas as pd
import numpy as np
import datetime
from tqdm import tqdm
import re
import wrds

# Globally silence pandas' chained-assignment (SettingWithCopy) warning.
pd.options.mode.chained_assignment = None

# Data preprocessing 3: Data Filters

Apply filters:

- no manually detected data errors.
- deal price, initial deal price >= \$1. 
- target stock price 1 day, 1 week and 4 weeks before >= \$1.
- deal value adjusted by inflation >= \$200 m.
- date announced is not estimated.
- deal cash and stock terms parsing successful.
- target matched in CRSP; for stock deals, acquiror also matched in CRSP.
- For completed deals, delisting code in CRSP is M&A related (code in [200, 300)).
- For completed deals, the effective date in SDC and the target delisting date in CRSP are close (at most 252 trading days apart).
- duration no fewer than 5 trading days.


Results of filtering:

- delete 3 deals due to data error. 9854 -> 9851
- delete 1503 deals due to deal price smaller than \$1. 9851 -> 8348
- delete 490 deals due to target price smaller than \$1. 8348 -> 7858
- delete 2473 deals due to deal value adjusted by inflation smaller than \$200 m. 7858 -> 5385
- delete 0 deals due to estimated announcement date. 5385 -> 5385
- delete 1117 deals due to failure of extracting cash and stock terms. 5385 -> 4268
- delete 31 deals due to failure of matching target in CRSP. 4268 -> 4237
- delete 33 deals due to failure of matching acquiror in CRSP for stock deals. 4237 -> 4204
- delete 21 deals due to non-M&A-related delisting code for completed deals. 4204 -> 4183
- delete 10 deals due to mismatch between delisting date in CRSP and effective date in SDC. 4183 -> 4173
- delete 39 deals due to duration shorter than 5 trading days. 4173 -> 4134

## load data

In [2]:
# Build the location of the HDF5 input under the project folder `path`
# and read it into `df`, the frame that every filter below operates on.
pathfile = f"{path}/data/df_variable_transform.h5"
df = pd.read_hdf(pathfile)

# Filters

In [3]:
# Start with every deal retained; each filter below sets `retain` to False
# for the rows it rejects, so rows are flagged rather than dropped.
df['retain'] = True

In [4]:
# Filter: flag deals identified as data errors by the project helper.
# `dsk.get_delete_index` is defined outside this notebook; its result is used
# as a row selector and is assumed to hold index labels or a boolean mask
# (TODO confirm against the helper).
num_orig = df.retain.sum()
index_del = dsk.get_delete_index(df)
# Write through `.loc` on the frame itself. The chained form
# `df.retain[...] = False` assigns into a temporary Series and leaves `df`
# untouched when pandas Copy-on-Write is active.
df.loc[index_del, 'retain'] = False
num_new = df.retain.sum()
print(f"delete {num_orig-num_new} deals due to data error. {num_orig} -> {num_new}")

delete 3 deals due to data error. 9854 -> 9851


In [5]:
# Filter: flag deals whose deal price or initial deal price is below $1.
# `fill_value=0.` makes a missing price compare as 0, so deals with a
# missing price are flagged as well.
num_orig = df.retain.sum()
low_deal_price = df.pr.lt(1., fill_value=0.) | df.pr_initial.lt(1., fill_value=0.)
# `.loc` writes into `df` directly; chained `df.retain[mask] = ...` does not
# update the frame under pandas Copy-on-Write.
df.loc[low_deal_price, 'retain'] = False
num_new = df.retain.sum()
print(f"delete {num_orig-num_new} deals due to deal price smaller than $1. {num_orig} -> {num_new}")

delete 1503 deals due to deal price smaller than $1. 9851 -> 8348


In [6]:
# Filter: flag deals whose target price 1 day, 1 week or 4 weeks before is
# below $1. Missing prices are treated as 0 (`fill_value=0.`) and therefore
# flagged too.
num_orig = df.retain.sum()
low_target_price = (
    df.pr1day.lt(1., fill_value=0.)
    | df.pr1wk.lt(1., fill_value=0.)
    | df.pr4wk.lt(1., fill_value=0.)
)
# `.loc` writes into `df` directly; chained `df.retain[mask] = ...` does not
# update the frame under pandas Copy-on-Write.
df.loc[low_target_price, 'retain'] = False
num_new = df.retain.sum()
print(f"delete {num_orig-num_new} deals due to target price smaller than $1. {num_orig} -> {num_new}")

delete 490 deals due to target price smaller than $1. 8348 -> 7858


In [7]:
# Filter: flag deals whose CPI-adjusted deal value is below 200 (reported as
# "$200 m" in the message). A missing value is treated as 0 and flagged.
num_orig = df.retain.sum()
# `.loc` writes into `df` directly; chained `df.retain[mask] = ...` does not
# update the frame under pandas Copy-on-Write.
df.loc[df.val_adj_by_cpi.lt(200., fill_value=0.), 'retain'] = False
num_new = df.retain.sum()
print(f"delete {num_orig-num_new} deals due to deal value adjusted by inflation smaller than $200 m. {num_orig} -> {num_new}")

delete 2473 deals due to deal value adjusted by inflation smaller than $200 m. 7858 -> 5385


In [8]:
# Filter: flag deals whose `da` column equals 'Yes' (presumably the
# "announcement date is estimated" flag -- confirm against the data source).
num_orig = df.retain.sum()
# `.loc` writes into `df` directly; chained `df.retain[mask] = ...` does not
# update the frame under pandas Copy-on-Write.
df.loc[df.da.eq('Yes'), 'retain'] = False
num_new = df.retain.sum()
print(f"delete {num_orig-num_new} deals due to estimated announcement date. {num_orig} -> {num_new}")

delete 0 deals due to estimated announcement date. 5385 -> 5385


In [9]:
# Filter: keep only deals whose `payment_type` is one of the recognised
# cash/stock categories; any other value (including missing) is flagged.
num_orig = df.retain.sum()

payment_types_lst = ['Cash', 'Common Stock', 'Cash and Common Stock', 
                     'Common Stock, fixed dollar', 'Cash and Common Stock, fixed dollar']
# `.loc` writes into `df` directly; chained `df.retain[mask] = ...` does not
# update the frame under pandas Copy-on-Write.
df.loc[~df.payment_type.isin(payment_types_lst), 'retain'] = False

num_new = df.retain.sum()
print(f"delete {num_orig-num_new} deals due to failure of extracting cash and stock terms. {num_orig} -> {num_new}")

delete 1117 deals due to failure of extracting cash and stock terms. 5385 -> 4268


In [10]:
# Filter: flag deals with a missing `tpermno`, i.e. the target could not be
# matched in CRSP.
num_orig = df.retain.sum()

# `.loc` writes into `df` directly; chained `df.retain[mask] = ...` does not
# update the frame under pandas Copy-on-Write.
df.loc[df.tpermno.isna(), 'retain'] = False

num_new = df.retain.sum()
print(f"delete {num_orig-num_new} deals due to failure of matching target in CRSP. {num_orig} -> {num_new}")

delete 31 deals due to failure of matching target in CRSP. 4268 -> 4237


In [11]:
# Filter: for 'Common Stock' and 'Cash and Common Stock' deals, flag those
# with a missing `apermno`, i.e. the acquiror could not be matched in CRSP.
# The "fixed dollar" payment types are not subject to this check.
num_orig = df.retain.sum()

stock_deal_without_acquiror = (
    df.payment_type.isin(['Common Stock', 'Cash and Common Stock'])
    & df.apermno.isna()
)
# `.loc` writes into `df` directly; chained `df.retain[mask] = ...` does not
# update the frame under pandas Copy-on-Write.
df.loc[stock_deal_without_acquiror, 'retain'] = False

num_new = df.retain.sum()
print(f"delete {num_orig-num_new} deals due to failure of matching acquiror in CRSP for stock deals. {num_orig} -> {num_new}")

delete 33 deals due to failure of matching acquiror in CRSP for stock deals. 4237 -> 4204


In [12]:
# Filter: for completed deals (`statc == 'C'`), flag those whose delisting
# code is outside [200, 300). A missing code is not "between" the bounds,
# so completed deals without a delisting code are flagged as well.
num_orig = df.retain.sum()

completed_non_ma_delist = (
    df.statc.eq('C')
    & ~df.delist_code.between(200, 300, inclusive='left')
)
# `.loc` writes into `df` directly; chained `df.retain[mask] = ...` does not
# update the frame under pandas Copy-on-Write.
df.loc[completed_non_ma_delist, 'retain'] = False

num_new = df.retain.sum()
print(f"delete {num_orig-num_new} deals due to non-M&A-related delisting code for completed deals. {num_orig} -> {num_new}")

delete 21 deals due to non-M&A-related delisting code for completed deals. 4204 -> 4183


In [13]:
# Filter: for completed deals (`statc == 'C'`), flag those whose effective
# date (`de`) and delisting date (`delist_date`) are more than 252 trading
# days apart in either direction. `dsk.trading_days_between` is a project
# helper; rows where the comparison is not True (e.g. a missing gap) are kept.
num_orig = df.retain.sum()

days_between_de_delist_date = dsk.trading_days_between(df.de, df.delist_date)
completed_date_mismatch = df.statc.eq('C') & abs(days_between_de_delist_date).gt(252)
# `.loc` writes into `df` directly; chained `df.retain[mask] = ...` does not
# update the frame under pandas Copy-on-Write.
df.loc[completed_date_mismatch, 'retain'] = False

num_new = df.retain.sum()
print(f"delete {num_orig-num_new} deals due to mismatch between delisting date in CRSP and effective date in SDC. {num_orig} -> {num_new}")

delete 10 deals due to mismatch between delisting date in CRSP and effective date in SDC. 4183 -> 4173


In [14]:
# Filter: flag deals whose `duration` is below 5 (trading days, per the
# message). A missing duration is treated as 0 and flagged.
num_orig = df.retain.sum()

# `.loc` writes into `df` directly; chained `df.retain[mask] = ...` does not
# update the frame under pandas Copy-on-Write.
df.loc[df.duration.lt(5, fill_value=0.), 'retain'] = False

num_new = df.retain.sum()
print(f"delete {num_orig-num_new} deals due to duration shorter than 5 trading days. {num_orig} -> {num_new}")

delete 39 deals due to duration shorter than 5 trading days. 4173 -> 4134
