# Imports 

In [1]:
import pyodbc
import numpy as np
import pandas as pd
from functools import reduce
import statsmodels.api as sm

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import  StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import  accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.dummy import DummyClassifier


# Data Retrieval

In [4]:
df = pd.read_csv("data/Gr_Lakes_public_financials.csv").iloc[:, 1:]
df.set_index(['year','UNITID'], inplace=True)

# Data Cleaning

In [5]:
#drop columns with 40% missing values
res2 = df.columns[df.isnull().sum() > 311]
df.drop(res2, inplace=True, axis=1)
df.dropna(inplace=True)

# Target Split 

In [6]:
y = df['GBA6RTBK']
X = df.drop(columns = ['GBA6RTBK'], axis = 1)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

In [7]:
#Concatenate to one dataframe, check for nan's
df = pd.concat([X_train, y_train], axis=1)
df.isna().sum().sort_values(ascending=False)
df.dropna(inplace=True)

In [8]:
#split back
y_train = df['GBA6RTBK']
X_train = df.drop(columns = ['GBA6RTBK'], axis = 1)

In [9]:
#bin target to binary
y_train.loc[y_train > .604] = 1
y_train.loc[y_train < .604] = 0

# First Simple Model

## Pipeline

In [10]:
#Set up pipeline for scaling continuous variables
continuous_pipeline = Pipeline(steps=[
    ('ss', StandardScaler())
])

trans = ColumnTransformer(transformers=[
    ('continuous', continuous_pipeline, X_train.columns),
])

## Dummy

In [11]:
#Pipeline for running the model
dummy = Pipeline(steps=[
    ('trans', trans),
    ('dummy', DummyClassifier(random_state = 42))
])
#Fitting and checking the score
dummy.fit(X_train, y_train)
dummy.score(X_train, y_train)



0.9653679653679653

## Decision Tree

In [12]:
model_one = Pipeline(steps=[
    ('trans', trans),
    ('simple_dt', DecisionTreeClassifier(max_depth = 5, random_state = 42))
])
#Fit model on all the data
model_one.fit(X_train, y_train)
#Grab predictions and print precision
y_pred = model_one.predict(X_train)
print("Training Score:" + str(accuracy_score(y_train, y_pred)))
#Run a cross validation to test for overfitting
scores = np.mean(cross_val_score(model_one, X_train, y_train, cv=5, scoring = 'accuracy'))
print("Validation Score:" + str(scores))

Training Score:0.9956709956709957
Validation Score:0.9566386161757832


In [17]:
#Feature Importance
for name, importance in zip(X_train.columns, model_one['simple_dt'].feature_importances_):
    print(name, importance)


F1A01 0.0
F1A31 0.0
F1A04 0.0
F1A05 0.0
F1A06 0.0
F1A07 0.0
F1A08 0.0
F1A09 0.0
F1A10 0.0
F1A11 0.0
F1A12 0.0
F1A13 0.0
F1A14 0.0
F1A15 0.0
F1A16 0.0
F1A17 0.0
F1A18 0.0
F1A214 0.0
F1A224 0.0
F1A234 0.0
F1A324 0.0
F1A274 0.0
F1A27T4 0.0
F1A284 0.0
F1A334 0.0
F1A344 0.0
F1D01 0.0
F1D02 0.0
F1D03 0.0
F1D04 0.10368272304833724
F1D05 0.14841525401176253
F1D06 0.0
F1B01 0.0
F1B02 0.09221066077653887
F1B03 0.0
F1B04 0.0
F1B04A 0.0
F1B04B 0.0
F1B05 0.0
F1B06 0.0
F1B26 0.0
F1B07 0.0
F1B08 0.0
F1B09 0.0
F1B10 0.0
F1B11 0.0
F1B12 0.0
F1B13 0.0
F1B14 0.0
F1B15 0.0
F1B16 0.0
F1B17 0.0
F1B18 0.0
F1B19 0.0
F1B27 0.0
F1B20 0.0
F1B21 0.0
F1B22 0.0
F1B23 0.0
F1B24 0.0
F1B25 0.0
F1C011 0.0
F1C012 0.0
F1C021 0.0
F1C022 0.0
F1C031 0.0
F1C032 0.0
F1C051 0.0
F1C052 0.0
F1C061 0.0
F1C062 0.0
F1C071 0.0
F1C072 0.0
F1C101 0.0
F1C111 0.0
F1C112 0.0
F1C121 0.0
F1C122 0.0
F1C131 0.0
F1C132 0.0
F1C141 0.2338896289814328
F1C142 0.0
F1C191 0.0
F1C192 0.0
F1C193 0.0
F1E01 0.0
F1E02 0.0
F1E03 0.0
F1E04 0.0
F1E05 0.0
F

In [None]:
important = ['F1D04','F1D05','F1B02','F1C141','F1E07','F1TUFEFT','F1STSVFT']