# Imports 

In [3]:
import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import  StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import  accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.dummy import DummyClassifier


# Data Retrieval

In [4]:
df_private = pd.read_csv("data/private_financials.csv").iloc[:, 1:]
df_public = pd.read_csv("data/public_financials.csv").iloc[:, 1:]

In [5]:
df_private = df_private[['UNITID','F2CORREV','F2TUFEPC', 'F2GVGCPC', 'F2PGGCPC','F2INVRPC', 
'F2OTRVPC', 'F2TUFEFT','F2GVGCFT','F2PGGCFT','F2INVRFT','F2OTRVFT','F2COREXP',
'F2INSTPC','F2RSRCPC','F2PBSVPC','F2ACSPPC','F2STSVPC','F2INSUPC','F2OTEXPC',
'F2INSTFT','F2RSRCFT','F2PBSVFT','F2ACSPFT','F2STSVFT','F2INSUFT','F2OTEXFT',
'F2SAFBPC','F2SALRPC','F2ENDMFT','F2EQUITR','GBA6RTBK', 'year']]
df_public = df_public[['UNITID','F1CORREV','F1TUFEPC','F1GVGCPC','F1PGGCPC','F1INVRPC',
'F1OTRVPC','F1TUFEFT','F1STAPFT','F1LCAPFT','F1GVGCFT','F1PGGCFT','F1INVRFT',
'F1OTRVFT','F1COREXP','F1INSTPC','F1RSRCPC','F1PBSVPC','F1ACSPPC','F1STSVPC',
'F1INSUPC','F1OTEXPC','F1INSTFT','F1RSRCFT','F1PBSVFT','F1ACSPFT','F1STSVFT',
'F1INSUFT','F1OTEXFT','F1SAFBPC','F1SALRPC','F1ENDMFT','F1EQUITR','GBA6RTBK', 'year']]


# Data Cleaning

In [6]:
#if you don't have the target, why are you even here?
df_private = df_private[~df_private['GBA6RTBK'].isna()]
#and again for public
df_public = df_public[~df_public['GBA6RTBK'].isna()]

In [7]:
#drop rows with no information
df_private.set_index(['UNITID'], inplace=True)
df_private.dropna(how='all')
#and again for public
df_public.set_index(['UNITID'], inplace=True)
df_public.dropna(how='all')

Unnamed: 0_level_0,Unnamed: 1_level_0,F1CORREV,F1TUFEPC,F1GVGCPC,F1PGGCPC,F1INVRPC,F1OTRVPC,F1TUFEFT,F1STAPFT,F1LCAPFT,F1GVGCFT,...,F1PBSVFT,F1ACSPFT,F1STSVFT,F1INSUFT,F1OTEXFT,F1SAFBPC,F1SALRPC,F1ENDMFT,F1EQUITR,GBA6RTBK
year,UNITID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2019,100654,,,,,,,,,,,...,,,,,,,,,,30.0
2019,100663,,,,,,,,,,,...,,,,,,,,,,56.0
2019,100706,,,,,,,,,,,...,,,,,,,,,,43.0
2019,100724,,,,,,,,,,,...,,,,,,,,,,31.0
2019,100751,,,,,,,,,,,...,,,,,,,,,,57.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2014,436818,26794102.0,19.0,14.0,3.0,0.0,27.0,2367.0,4604.0,0.0,1797.0,...,774.0,1490.0,516.0,1314.0,102.0,67.0,45.0,2943.0,52.0,43.0
2014,436827,23641855.0,19.0,14.0,3.0,0.0,27.0,7517.0,14623.0,0.0,5707.0,...,2459.0,4732.0,1640.0,4172.0,323.0,67.0,45.0,9347.0,52.0,50.0
2014,436836,18913484.0,19.0,14.0,3.0,0.0,27.0,3126.0,6082.0,0.0,2374.0,...,1023.0,1968.0,682.0,1735.0,134.0,67.0,45.0,3887.0,52.0,43.0
2014,448886,255065860.0,49.0,21.0,5.0,1.0,6.0,12430.0,4358.0,0.0,5353.0,...,614.0,3731.0,1234.0,2081.0,1900.0,55.0,41.0,8673.0,47.0,48.0


# Target Split 

In [8]:
#bin target to binary
df_private.loc[df_private['GBA6RTBK'] > 66,'abv_avg_gr'] = 1
df_private.loc[df_private['GBA6RTBK'] <= 66,'abv_avg_gr'] = 0
#and again for public
df_public.loc[df_public['GBA6RTBK'] > 58,'abv_avg_gr'] = 1
df_public.loc[df_public['GBA6RTBK'] <= 58,'abv_avg_gr'] = 0

In [10]:
#private
y_private = df_private['abv_avg_gr']
X_private = df_private.drop(columns = ['GBA6RTBK', 'abv_avg_gr'], axis = 1)
X_train_private, X_test_private, y_train_private, y_test_private = train_test_split(X_private, y_private, random_state = 42)
#and for public
y_public = df_public['abv_avg_gr']
X_public = df_public.drop(columns = ['GBA6RTBK', 'abv_avg_gr'], axis = 1)
X_train_public, X_test_public, y_train_public, y_test_public = train_test_split(X_public, y_public, random_state = 42)


In [11]:
#Concatenate to one dataframe, check for nan's
df_private = pd.concat([X_train_private, y_train_private], axis=1)
df_private.dropna(inplace=True)
#for public
df_public = pd.concat([X_train_public, y_train_public], axis=1)
df_public.dropna(inplace=True)

In [12]:
#split back
y_train_private = df_private['abv_avg_gr']
X_train_private = df_private.drop(columns = ['abv_avg_gr'], axis = 1)
#for public
y_train_public = df_public['abv_avg_gr']
X_train_public = df_public.drop(columns = ['abv_avg_gr'], axis = 1)


# First Simple Model

## Pipeline

In [13]:
#Set up pipeline for scaling continuous variables
continuous_pipeline_private= Pipeline(steps=[
    ('ss', StandardScaler())
])

trans_private = ColumnTransformer(transformers=[
    ('continuous', continuous_pipeline_private, X_train_private.columns),
])


In [14]:
#for public
continuous_pipeline_public = Pipeline(steps=[
    ('ss', StandardScaler())
])

trans_public = ColumnTransformer(transformers=[
    ('continuous', continuous_pipeline_public, X_train_public.columns),
])

## Dummy

In [15]:
#Pipeline for running the model
dummy_private = Pipeline(steps=[
    ('trans', trans_private),
    ('dummy', DummyClassifier(random_state = 42))
])
#Fitting and checking the score
dummy_private.fit(X_train_private, y_train_private)
dummy_private.score(X_train_private, y_train_private)




0.6579094466182225

In [16]:
#Public
dummy_public = Pipeline(steps=[
    ('trans', trans_public),
    ('dummy', DummyClassifier(random_state = 42))
])
#Fitting and checking the score
dummy_public.fit(X_train_public, y_train_public)
dummy_public.score(X_train_public, y_train_public)



0.6860524961399898

## Decision Tree

In [17]:
model_one_private = Pipeline(steps=[
    ('trans', trans_private),
    ('simple_dt', DecisionTreeClassifier(max_depth = 5, random_state = 42))
])
#Fit model on all the data
model_one_private.fit(X_train_private, y_train_private)
#Grab predictions and print precision
y_pred_private = model_one_private.predict(X_train_private)
print("Training Score:" + str(accuracy_score(y_train_private, y_pred_private)))
#Run a cross validation to test for overfitting
scores_private = np.mean(cross_val_score(model_one_private, X_train_private, y_train_private, cv=5, scoring = 'accuracy'))
print("Validation Score:" + str(scores_private))

Training Score:0.8870877585243152
Validation Score:0.8602578427159433


In [18]:
model_one_public = Pipeline(steps=[
    ('trans', trans_public),
    ('simple_dt', DecisionTreeClassifier(max_depth = 5, random_state = 42))
])
#Fit model on all the data
model_one_public.fit(X_train_public, y_train_public)
#Grab predictions and print precision
y_pred_public = model_one_public.predict(X_train_public)
print("Training Score:" + str(accuracy_score(y_train_public, y_pred_public)))
#Run a cross validation to test for overfitting
scores_public = np.mean(cross_val_score(model_one_public, X_train_public, y_train_public, cv=5, scoring = 'accuracy'))
print("Validation Score:" + str(scores_public))

Training Score:0.9104477611940298
Validation Score:0.8769869875175577


In [20]:
#Feature Importance
important_private = []
for name, importance in zip(X_train_private.columns, model_one_private['simple_dt'].feature_importances_):
    if importance > 0:
        important_private.append((name, importance))

important_private

[('F2CORREV', 0.006054950617189415),
 ('F2TUFEPC', 0.013407390652347988),
 ('F2OTRVPC', 0.013430638675368735),
 ('F2TUFEFT', 0.008722367009712668),
 ('F2GVGCFT', 0.012905001459879275),
 ('F2INVRFT', 0.006195307141505239),
 ('F2OTRVFT', 0.005654672687909007),
 ('F2COREXP', 0.03603246084599486),
 ('F2INSUPC', 0.01102122345264889),
 ('F2INSTFT', 0.7309876101712495),
 ('F2RSRCFT', 0.0035962496458839957),
 ('F2ACSPFT', 0.029313497014764826),
 ('F2STSVFT', 0.00840076935047876),
 ('F2INSUFT', 0.014291372166103076),
 ('F2OTEXFT', 0.005680795800275848),
 ('F2SAFBPC', 0.0053079112553283845),
 ('F2ENDMFT', 0.07037372071170604),
 ('F2EQUITR', 0.018624061341653415)]

In [21]:
#For Public
important_public = []
for name, importance in zip(X_train_public.columns, model_one_public['simple_dt'].feature_importances_):
    if importance > 0:
        important_public.append((name, importance))
important_public

[('F1CORREV', 0.12257084383154572),
 ('F1TUFEPC', 0.004788588835288123),
 ('F1GVGCPC', 0.010522376251136377),
 ('F1OTRVPC', 0.034999036735456075),
 ('F1TUFEFT', 0.37303198300219736),
 ('F1STAPFT', 0.083204331340647),
 ('F1GVGCFT', 0.05313300812894642),
 ('F1COREXP', 0.04184371979847999),
 ('F1INSTPC', 0.021307380271763977),
 ('F1RSRCPC', 0.020733564915404252),
 ('F1PBSVPC', 0.011588384981397243),
 ('F1ACSPPC', 0.006129393709168796),
 ('F1STSVPC', 0.0073171664319111665),
 ('F1INSUPC', 0.026747426964485663),
 ('F1OTEXPC', 0.005804350103379544),
 ('F1INSTFT', 0.023537537402579668),
 ('F1ACSPFT', 0.020206016695642152),
 ('F1STSVFT', 0.02408290586208681),
 ('F1OTEXFT', 0.01986580282525244),
 ('F1SAFBPC', 0.019478661886611164),
 ('F1ENDMFT', 0.03226723299624813),
 ('F1EQUITR', 0.03684028703037206)]