## Import Statements

In [1]:
# Import sciPy as a statistical package
from scipy import stats

# Import pickle to read the data
import pickle
# Import pandas for data managment
import pandas as pd
# Import numpy for supportive calculations
import numpy as np

# Import the matplotlib python package for plotting
import matplotlib.pyplot as plt
# Set the plot display format to 'retina' for more detailed graphs
%config InlineBackend.figure_format='retina'
%matplotlib inline

from matplotlib import colors

from sklearn.model_selection import train_test_split


## Data Cleaning

In [2]:
# Open the pickle file
with open('insider_data.pkl', 'rb') as f:
    # Workaround to read pickle file to do bug in Pandas 1.3.0
    # (The bug is fixed in version 1.3.1)
    data = pd.read_pickle(f)
    
# Define the value of the trade ($ value)
data['value_of_trade'] = data['shares_transacted']*data['executionPrice']


In [3]:
data.sector.unique()

array(['Energy', 'Consumer Staples', 'Utilities', 'Materials',
       'Financials', 'Health Care', 'Consumer Discretionary',
       'Information Technology', 'Industrials', 'Communication Services',
       None, 'Real Estate'], dtype=object)

In [4]:
data

Unnamed: 0_level_0,Unnamed: 1_level_0,companyid,tic,ownerObjectId,insiderTradeId,transaction_type,shares_transacted,executionPrice,percentOfSharesTraded,pct_change_in_shares_held_by_insider,n_shares_owned_post_transaction,...,yield,cap,marketcap,marketcap_rank,beta_tv_median,sector,group,ind,cohort_name,value_of_trade
datadate,issueid,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2003-12-31,00851201,296104,PETD,615007,3002586,Open Market Disposition,-7899.0,24.645948,-0.050542,-28.312842,20000.0,...,YIELD_REG,CAP_MICRO,370.393862,2089.0,0.620775,Energy,Energy,"Oil, Gas & Consumable Fuels",energy_commodities,-1.946783e+05
2003-12-31,03065101,341990,DAR,9993943,1944208,Open Market Acquisition,185000.0,2.694643,0.292230,0.890176,20967387.0,...,YIELD_REG,CAP_MICRO,174.725233,2709.0,0.955000,Consumer Staples,"Food, Beverage & Tobacco",Food Products,,4.985090e+05
2004-01-05,00797401,292092,NI,753642,3311983,Open Market Disposition,-54358.0,21.765000,-0.020710,-11.179069,431890.0,...,YIELD_HIGH,CAP_MID,5619.372374,374.0,0.610000,Utilities,Utilities,Gas Utilities,cfar,-1.183102e+06
2004-01-05,00869201,297620,PCH,533349,3337544,Open Market Derivative Acquisition,148.6,0.000000,,0.896088,16732.0,...,YIELD_HIGH,CAP_SMALL,1043.612222,1248.0,1.056147,Materials,Materials,Paper & Forest Products,high_yield_junk,0.000000e+00
2004-01-06,00797401,292092,NI,753642,741180,Open Market Disposition,-4063.0,21.765000,-0.001548,-0.912238,441325.0,...,YIELD_HIGH,CAP_MID,5572.128701,375.0,0.595000,Utilities,Utilities,Gas Utilities,cfar,-8.843120e+04
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-07-02,17710901,13413134,CAI,204975497,7291252,Open Market Disposition,-1947.0,25.900000,,,,...,YIELD_REG,CAP_MICRO,958.281260,2195.0,0.413640,Industrials,Capital Goods,Trading Companies & Distributors,industrials,-5.042730e+04
2021-07-02,17710901,13413134,CAI,204975497,7291253,Open Market Disposition,-2937.0,25.899475,,,,...,YIELD_REG,CAP_MICRO,958.281260,2195.0,0.413640,Industrials,Capital Goods,Trading Companies & Distributors,industrials,-7.606676e+04
2021-07-02,17801501,34125194,LULU,412691818,7293539,Open Market Disposition,-4000.0,366.755068,-0.003201,-65.445026,2112.0,...,YIELD_REG,CAP_LARGE,47958.312705,182.0,1.084751,Consumer Discretionary,Consumer Durables & Apparel,"Textiles, Apparel & Luxury Goods",hyper_growth,-1.467020e+06
2021-07-02,18336601,13103664,H,112268879,7291440,Open Market Disposition,-93750.0,77.630000,-0.227822,-17.774836,433681.0,...,YIELD_REG,CAP_MID,8025.994139,776.0,1.326529,Consumer Discretionary,Consumer Services,"Hotels, Restaurants & Leisure",other_junk,-7.277812e+06


In [5]:
# Make a copy of the data for testing
df = data.copy()

In [6]:
# Add one-hot encoding for sectors
sector_values = list(data.sector.unique())
sector_values.remove(None)

df[sector_values] = pd.get_dummies(data['sector'])

In [7]:
# Add one-hot encoding for transaction_type
transaction_values = list(data.transaction_type.unique())

df[transaction_values] = pd.get_dummies(data['transaction_type'])

In [8]:
# Add one-hot encoding for cohort
cohort_names = list(df.cohort_name.unique())
[cohort_names.remove(x) for x in cohort_names if not type(x) == str]

df[cohort_names] = pd.get_dummies(df['cohort_name'])

In [9]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,companyid,tic,ownerObjectId,insiderTradeId,transaction_type,shares_transacted,executionPrice,percentOfSharesTraded,pct_change_in_shares_held_by_insider,n_shares_owned_post_transaction,...,financials,tmt,high_yield,staples,transports,industrials,aerospace,healthcare,pharma_biotech,utilities
datadate,issueid,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2003-12-31,00851201,296104,PETD,615007,3002586,Open Market Disposition,-7899.0,24.645948,-0.050542,-28.312842,20000.0,...,0,0,0,0,0,0,0,0,0,0
2003-12-31,03065101,341990,DAR,9993943,1944208,Open Market Acquisition,185000.0,2.694643,0.292230,0.890176,20967387.0,...,0,0,0,0,0,0,0,0,0,0
2004-01-05,00797401,292092,NI,753642,3311983,Open Market Disposition,-54358.0,21.765000,-0.020710,-11.179069,431890.0,...,0,0,0,0,0,0,0,0,0,0
2004-01-05,00869201,297620,PCH,533349,3337544,Open Market Derivative Acquisition,148.6,0.000000,,0.896088,16732.0,...,0,0,0,0,0,0,0,0,0,0
2004-01-06,00797401,292092,NI,753642,741180,Open Market Disposition,-4063.0,21.765000,-0.001548,-0.912238,441325.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-07-02,17710901,13413134,CAI,204975497,7291252,Open Market Disposition,-1947.0,25.900000,,,,...,0,1,0,0,0,0,0,0,0,0
2021-07-02,17710901,13413134,CAI,204975497,7291253,Open Market Disposition,-2937.0,25.899475,,,,...,0,1,0,0,0,0,0,0,0,0
2021-07-02,17801501,34125194,LULU,412691818,7293539,Open Market Disposition,-4000.0,366.755068,-0.003201,-65.445026,2112.0,...,0,0,0,0,0,0,0,0,0,0
2021-07-02,18336601,13103664,H,112268879,7291440,Open Market Disposition,-93750.0,77.630000,-0.227822,-17.774836,433681.0,...,0,0,0,1,0,0,0,0,0,0


In [10]:
# Define the list of variables
variables = ['percentOfSharesTraded', 'pct_change_in_shares_held_by_insider',
            'n_shares_owned_post_transaction', 'beta_tv_median', 'marketcap'] + \
            sector_values + transaction_values + cohort_names

In [11]:
objective_var = 'fwd_21d_return'

In [12]:
# Drop na values
df.dropna(subset=variables+[objective_var], how='any', inplace=True)

# Display the length of the data
len(df)

409658

In [13]:
def sign_classifier(percent):
    
    # Classify the forward 21 day return for each trade as positive or negative
    # Returns: 0 for negative and 1 for positive percent increase in stock
    
    if percent >= 0:
        return 1
    else:
        return 0

In [14]:
df[objective_var+'_classified'] = df[objective_var].apply(sign_classifier)
df['fwd_21d_return_classified'].value_counts()

0    207527
1    202131
Name: fwd_21d_return_classified, dtype: int64

In [15]:
def sample_df_equally_by_group(df, column, n):
    # your answer goes here
    df0 = df[df[column]==0].sample(n=n)
    df1 = df[df[column]==1].sample(n=n)
    df2 = [df0, df1]
    final = pd.concat(df2)
    
    return final

In [16]:
final_data = sample_df_equally_by_group(df, column="fwd_21d_return_classified", n=100000)
final_data.shape

(200000, 84)

In [17]:
# Define the y
y = final_data['fwd_21d_return_classified']
y

datadate    issueid 
2015-11-09  00853001    0
2015-12-17  17913201    0
2004-11-03  00225501    0
2014-03-17  03047301    0
2013-12-13  16392001    0
                       ..
2008-07-16  01695501    1
2012-01-26  02380901    1
2007-12-12  00579201    1
2007-03-15  15685701    1
2011-05-17  14391001    1
Name: fwd_21d_return_classified, Length: 200000, dtype: int64

In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier

estimators = {
    'Logistic Regression': LogisticRegression(),
    'k-Nearest Neighbor': KNeighborsClassifier(),
    'MLP Classifier': MLPClassifier(),
    'Decision Tree': DecisionTreeClassifier(criterion="entropy")
    }

In [19]:
def classifiers_test(features, target, estimators, variables):
    
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import accuracy_score
    
    X_train, X_test, y_train, y_test = train_test_split(features, target, random_state=3000)
    
    for estimator_name, estimator_object, in estimators.items():
        
        model = estimator_object.fit(X_train[variables], y_train)
        
        yhat = model.predict(X_test[variables])
        acc = accuracy_score(y_test, yhat)
        
        print(estimator_name + '\n\t' + f'Classification accuracy on the test data: {acc:.2%}' + '\n')
        
        long_or_short = pd.Series(yhat).apply(lambda x: 1.0 if x == 1 else -1.0)
                        
        returns = long_or_short*X_test['fwd_21d_return'].values
        
        print('\t\tMean Return: {:.4f}, Standard Deviation: {:.4f}'.format(np.mean(returns), np.std(returns)))
        
        if estimator_name == 'Decision Tree':
            return long_or_short, X_test['fwd_21d_return']
        

In [20]:
trade, returns = classifiers_test(final_data, y, estimators, variables)

Logistic Regression
	Classification accuracy on the test data: 50.22%

		Mean Return: -0.0052, Standard Deviation: 0.1165




k-Nearest Neighbor
	Classification accuracy on the test data: 53.20%

		Mean Return: 0.0047, Standard Deviation: 0.1166
MLP Classifier
	Classification accuracy on the test data: 49.66%

		Mean Return: 0.0051, Standard Deviation: 0.1165
Decision Tree
	Classification accuracy on the test data: 60.05%

		Mean Return: 0.0136, Standard Deviation: 0.1159
