# Feature Creation

# Modeling

In [39]:
#Import all the needed modules
import numpy as np
import pandas as pd
from functools import reduce
import statsmodels.api as sm

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.impute import MissingIndicator, SimpleImputer
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier

In [54]:
#Load the financials CSV, discard its first column by position, and index
#the rows by (year, UNITID)
df = (
    pd.read_csv("data/Gr_Lakes_public_financials.csv")
    .iloc[:, 1:]
    .set_index(['year', 'UNITID'])
)

In [55]:
# df = df.corr().abs().stack().reset_index().sort_values(0, ascending=False)
# df = X.corr().abs().stack().reset_index().sort_values(0, ascending=False)
# df['pairs'] = list(zip(df.level_0, df.level_1))
# df.set_index(['pairs'], inplace = True)
# df.drop(columns=['level_1', 'level_0'], inplace = True)
# df.columns = ['cc']
# df.drop_duplicates(inplace=True)
# df[(df.cc>.75) & (df.cc <1)]


In [56]:
#The target is the GBA6RTBK column; every other column is a candidate feature
X = df.drop(columns='GBA6RTBK')
y = df['GBA6RTBK']
#Seeded split using train_test_split's default train/test proportions
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [57]:
#Recombine the training features and target column-wise (this rebinds df to
#the training rows only), then list null counts per column, largest first
df = pd.concat((X_train, y_train), axis='columns')
df.isnull().sum().sort_values(ascending=False)

F1M07     726
F1M08     726
F1M06     726
F1M05     726
F1A19     522
         ... 
F1C012     12
F1C021     12
F1C022     12
F1C031     12
F1E06      12
Length: 211, dtype: int64

In [58]:
#Display the (rows, columns) dimensions of df
df.shape

(780, 211)

In [59]:
#Drop columns in which at least 40% of the rows are missing, then drop every
#row that still contains a missing value.
#The cutoff is derived from the frame's own length instead of the hard-coded
#count 311, so it stays at 40% if the data or the split changes. For the
#780-row training frame shown above, ">= 0.4" selects the same columns as the
#original "> 311" (i.e. 312 or more nulls).
MAX_MISSING_FRACTION = 0.4
missing_fraction = df.isnull().mean()
res2 = df.columns[missing_fraction >= MAX_MISSING_FRACTION]
#Rebind instead of mutating in place so re-running the cell is predictable
df = df.drop(columns=res2).dropna()

In [60]:
#Separate the cleaned frame into features and target again
X_train = df.drop(columns='GBA6RTBK')
y_train = df['GBA6RTBK']



In [61]:
#Binarize the target: 1 when the value is above the 0.604 cutoff, otherwise 0.
#A single comparison replaces the two in-place .loc assignments, which left
#values exactly equal to 0.604 untouched (producing a third class) and
#mutated a Series selected from df. Ties now fall into class 0, and
#re-running the cell leaves an already-binarized target unchanged.
y_train = (y_train > .604).astype(int)

In [77]:
#Restrict the training features to these six columns
X_train = X_train.loc[:, ['F1B19', 'F1STSVPC', 'F1TUFEFT', 'F1E09', 'F1A14', 'F1OTEXFT']]

In [78]:
#Single-step pipeline that standardizes the continuous features
continuous_pipeline = Pipeline([('ss', StandardScaler())])

In [79]:
#Route every column of X_train through the continuous (scaling) pipeline
trans = ColumnTransformer([('continuous', continuous_pipeline, X_train.columns)])

In [80]:
#Depth-limited decision tree placed behind the preprocessing transformer
model_one = Pipeline([
    ('trans', trans),
    ('simple_dt', DecisionTreeClassifier(random_state=42, max_depth=5)),
])
#Train on the training split and report accuracy on that same split
model_one.fit(X_train, y_train)
y_pred = model_one.predict(X_train)
print(f"Training Score:{accuracy_score(y_train, y_pred)!s}")
#Mean 5-fold cross-validated accuracy, to compare against the training score
scores = cross_val_score(model_one, X_train, y_train, scoring='accuracy', cv=5).mean()
print(f"Validation Score:{scores!s}")

Training Score:1.0
Validation Score:0.9805750350631136


In [81]:
#Print each X_train column name next to the fitted tree's importance value
#at the same position
importances = model_one['simple_dt'].feature_importances_
for column, weight in zip(X_train.columns, importances):
    print(column, weight)

F1B19 0.35259178673297703
F1STSVPC 0.03081664371763838
F1TUFEFT 0.26798418972332017
F1E09 0.1955551086082057
F1A14 0.0909712843581109
F1OTEXFT 0.06208098685974785


In [82]:
#Display the pairwise correlation matrix of the columns in X_train
X_train.corr()

Unnamed: 0,F1B19,F1STSVPC,F1TUFEFT,F1E09,F1A14,F1OTEXFT
F1B19,1.0,-0.518087,0.648069,0.597588,0.854113,0.271116
F1STSVPC,-0.518087,1.0,-0.583611,-0.366469,-0.462333,-0.126459
F1TUFEFT,0.648069,-0.583611,1.0,0.641629,0.690133,0.167303
F1E09,0.597588,-0.366469,0.641629,1.0,0.708646,0.10783
F1A14,0.854113,-0.462333,0.690133,0.708646,1.0,0.274963
F1OTEXFT,0.271116,-0.126459,0.167303,0.10783,0.274963,1.0


# Phase 3 Code

In [10]:
#Imputation pipeline: replace NaNs using SimpleImputer's default strategy
pipeline = Pipeline(steps=[
    ('si', SimpleImputer(missing_values=np.nan))
])

#Column transformer: apply the imputer to every column of X_train
trans = ColumnTransformer(transformers=[
    ('pipeline', pipeline, X_train.columns)
])

#Baseline model behind the transformer.
#The original passed strategy='' which is not a valid DummyClassifier
#strategy; 'most_frequent' matches the baseline built later in this notebook.
dummy = Pipeline(steps=[
    ('trans', trans),
    ('dummy', DummyClassifier(random_state=42, strategy='most_frequent'))
])

SyntaxError: invalid syntax (<ipython-input-10-3fc4cefdca38>, line 14)

In [None]:


#Collect the five most frequent installer values of the training split,
#lower-cased, in inst_list, then map the installer column through install_bin
#on both splits.
#NOTE(review): install_bin is defined outside this view; presumably it reads
#inst_list -- confirm.
inst_five = X_train['installer'].value_counts()[:5]
inst_list = [label.lower() for label in inst_five.index]
X_train['installer'] = X_train['installer'].apply(install_bin)
X_test['installer'] = X_test['installer'].apply(install_bin)
      
#Collect the most frequent scheme_management values of the training split,
#lower-cased, in scheme_list, then map the scheme_management column through
#scheme_bin on both splits.
#Fix: the test split was previously passed through install_bin (a copy-paste
#slip from the installer step), so train and test were binned by different
#rules.
#NOTE(review): the slice keeps nine values although the variable is named
#scheme_eight -- confirm which count is intended.
#NOTE(review): scheme_bin is defined outside this view; presumably it reads
#scheme_list -- confirm.
scheme_eight = X_train['scheme_management'].value_counts()[:9]
scheme_list = [label.lower() for label in scheme_eight.index]
X_train['scheme_management'] = X_train['scheme_management'].apply(scheme_bin)
X_test['scheme_management'] = X_test['scheme_management'].apply(scheme_bin)

#Partition the training columns by dtype: object columns are treated as
#categorical, float64/int64 columns as continuous
X_train_cont = X_train.select_dtypes(include=['float64', 'int64'])
X_train_cat = X_train.select_dtypes(include='object')

#Continuous features are standardized
continuous_pipeline = Pipeline([('ss', StandardScaler())])

#Categorical features are one-hot encoded, dropping the first level of each
categorical_pipeline = Pipeline([('ohe', OneHotEncoder(drop='first'))])

#Combine both preprocessing branches: scaling for the continuous columns,
#one-hot encoding for the categorical ones
trans = ColumnTransformer([
    ('continuous', continuous_pipeline, X_train_cont.columns),
    ('categorical', categorical_pipeline, X_train_cat.columns),
])

#Baseline model: always predicts the most frequent training class
dummy = Pipeline([
    ('trans', trans),
    ('dummy', DummyClassifier(strategy='most_frequent', random_state=42)),
])

#Fit on the training split; the cell displays the baseline's score on it
dummy.fit(X_train, y_train)
dummy.score(X_train, y_train)

In [None]:
#Depth-limited decision tree placed behind the preprocessing transformer
model_one = Pipeline([
    ('trans', trans),
    ('simple_dt', DecisionTreeClassifier(random_state=42, max_depth=5)),
])

#Train on the training split and report precision on that same split
model_one.fit(X_train, y_train)
y_pred = model_one.predict(X_train)
print(f"Training Score:{precision_score(y_train, y_pred)!s}")
#Mean 5-fold cross-validated precision, to compare against the training score
scores = cross_val_score(model_one, X_train, y_train, scoring='precision', cv=5).mean()
print(f"Validation Score:{scores!s}")

In [None]:
#Label each importance with the transformer's output feature name.
#Zipping against X_train.columns mislabels (and silently truncates) the
#importances once the transformer one-hot encodes categorical columns: the
#tree then sees more features than X_train has columns, and the transformer
#emits its continuous block before its categorical block.
#NOTE: get_feature_names_out requires scikit-learn >= 1.0.
feature_names = model_one['trans'].get_feature_names_out()
for name, importance in zip(feature_names, model_one['simple_dt'].feature_importances_):
    print(name, importance)

In [None]:
#Narrow the feature set to four columns of df_trim
#NOTE(review): df_trim and y are defined outside this view -- confirm they
#exist before this cell runs.
X = df_trim.loc[:, ['amount_tsh', 'permit', 'installer', 'extraction_type_class']]

#Seeded re-split of the narrowed feature set
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

#Recompute the five most frequent installer values (lower-cased) on the new
#training split, then map installer through install_bin on both splits.
#NOTE(review): install_bin is defined outside this view; presumably it reads
#inst_list -- confirm.
inst_five = X_train['installer'].value_counts()[:5]
inst_list = [label.lower() for label in inst_five.index]
X_train['installer'] = X_train['installer'].apply(install_bin)
X_test['installer'] = X_test['installer'].apply(install_bin)

#Column roles are now listed by name (plain lists, not DataFrames)
X_train_cont = ['amount_tsh']
X_train_cat = ['permit', 'installer', 'extraction_type_class']

#Rebuild the transformer around the explicit column-name lists
trans = ColumnTransformer([
    ('continuous', continuous_pipeline, X_train_cont),
    ('categorical', categorical_pipeline, X_train_cat),
])

#Logistic regression placed behind the preprocessing transformer
logreg = Pipeline([
    ('trans', trans),
    ('logr', LogisticRegression(random_state=42)),
])

#Train on the training split
logreg.fit(X_train, y_train)
#Report precision on the training split, then the mean 5-fold
#cross-validated precision
y_pred = logreg.predict(X_train)
print(f"Training Score:{precision_score(y_train, y_pred)!s}")
scores = cross_val_score(logreg, X_train, y_train, scoring='precision', cv=5).mean()
print(f"Validation Score:{scores!s}")

In [3]:
#check assumptions

In [4]:
#Probability of the second class returned by predict_proba, which is the
#positive class when labels are 0/1 (assumed -- confirm against y_train).
#The original took column 0, so the plotted values were the negated
#log-odds of the class that the precision scores refer to.
pred = logreg.predict_proba(X_train)[:, 1]
log_odds = np.log(pred / (1 - pred))
#Plot log-odds against the continuous feature; a roughly linear relationship
#supports the logistic regression linearity assumption.
#NOTE(review): plt is not imported in the notebook's import cell shown above;
#matplotlib.pyplot must be imported as plt before this cell can run.
plt.scatter(x = X_train['amount_tsh'].values, y = log_odds)
plt.title("Logistic Regression Assumption Test")
plt.xlabel("amount_tsh")
plt.ticklabel_format(axis='x', style='sci', scilimits=(0,0))
plt.ylabel("Log-odds")
plt.show();

NameError: name 'logreg' is not defined

In [None]:
#Random forest placed behind the preprocessing transformer
ensemble = Pipeline(steps=[
    ('trans', trans),
    ('rfc', RandomForestClassifier(random_state = 42))
])

#Train on the training split
ensemble.fit(X_train, y_train)
#Report precision on the training split
y_pred = ensemble.predict(X_train)
print("Training Score:" + str(precision_score(y_train, y_pred)))
#Fix: cross-validate with scoring='precision'. Without it cross_val_score
#falls back to the estimator's default score (accuracy), so the validation
#number was not comparable to the precision printed above or to the other
#models' validation scores.
scores = np.mean(cross_val_score(ensemble, X_train, y_train, cv=5, scoring = 'precision'))
print("Validation Score:" + str(scores))

In [None]:
#To avoid a long runtime, the grid search below is kept for reference but
#disabled by wrapping it in a triple-quoted string; the trailing ';'
#suppresses the cell's output.
"""
#Create parameters to test
params = {
    'rfc__criterion': ['gini', 'entropy'],
    'rfc__n_estimators': [100, 300, 500],
    'rfc__min_samples_split': [2, 5, 10]
}

#Fit gridsearch on model and prints out the best parameters
search = GridSearchCV(ensemble, param_grid = params, scoring = 'precision')
search.fit(X_train, y_train)
search.best_params_
""";

In [None]:
#Random forest with fixed hyperparameters (presumably the best parameters
#found by the disabled grid search -- confirm)
ensemble_tuned = Pipeline(steps=[
    ('trans', trans),
    ('rfc', RandomForestClassifier(criterion = 'entropy', min_samples_split = 5, 
                                   n_estimators = 300, random_state = 42))
])

#Train on the training split and report precision on it
ensemble_tuned.fit(X_train, y_train)
y_pred = ensemble_tuned.predict(X_train)
print("Training Score:" + str(precision_score(y_train, y_pred)))
#Fix: cross-validate with scoring='precision'. Without it cross_val_score
#falls back to the estimator's default score (accuracy), so the validation
#number was not comparable to the precision printed above.
scores = np.mean(cross_val_score(ensemble_tuned, X_train, y_train, cv=5, scoring = 'precision'))
print("Validation Score:" + str(scores))

# Evaluation

In [None]:
#Logistic regression scores using the estimator's default metric: first on
#the training split, then the mean over 5 cross-validation folds
print(f"Training Accuracy:{logreg.score(X_train, y_train)!s}")
scores = cross_val_score(logreg, X_train, y_train, cv=5).mean()
print(f"Validation Accuracy:{scores!s}")

In [None]:
#Evaluate the logistic regression on the held-out test split: precision of
#its predictions, then the pipeline's default score.
#Fix: the second label was misspelled "Tets Accuracy:" in the printed output.
test_pred = logreg.predict(X_test)
print("Test Score:" + str(precision_score(y_test, test_pred)))
print("Test Accuracy:" + str(logreg.score(X_test, y_test)))