# Problem Statement:

Identifying pediatric obesity from Electronic Health Record data is a challenge. It is especially arduous given that ICD10 codes for obesity are undercoded. In this exercise, I intend to develop a machine learning methodology that uses patient-level information for demographic characteristics(sex, age, region), type of insurance, co-occurring conditions and medical expenditures associated with weight status to predict obesity. Data includes individuals aged 2 to 19 years and has been obtained from deidentified IQVIA's AEMR and PharMetrics Plus commercial claims data.

This is a work in progress as model needs to be futher improved.


In [1]:
#import the libraries
import pandas as pd
import pandasql as ps
#display options
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [2]:
import os
import sys

import matplotlib
import matplotlib.pyplot as plt

import numpy as np
import seaborn as sns

import textwrap
from IPython.display import display

from patsy import dmatrices
import sklearn
from sklearn.metrics import confusion_matrix, accuracy_score, balanced_accuracy_score

import pygwalker as pyg

In [3]:
# Enable inline plotting for graphics
%matplotlib inline

In [4]:
# Get Version information
print(textwrap.fill(sys.version),'\n')
print("Pandas version: {0}".format(pd.__version__),'\n')
print("Matplotlib version: {0}".format(matplotlib.__version__),'\n')
print("Numpy version: {0}".format(np.__version__),'\n')
print("Seaborn version: {0}".format(sns.__version__),'\n')

3.7.11 (default, Jul 27 2021, 09:42:29) [MSC v.1916 64 bit (AMD64)] 

Pandas version: 1.3.4 

Matplotlib version: 3.5.0 

Numpy version: 1.20.3 

Seaborn version: 0.11.1 



In [5]:
# So all output comes through from Ipython
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [6]:
# Working Directory
print(os.getcwd())

C:\Users\run1\OneDrive - CDC\Learning Python


In [7]:
# Set New Working Directory 
# the try except tells Python to try to run the code
# and if an error occurs, print a message
#try:
os.chdir(r"C:\Users\run1\OneDrive - CDC\IQVIA on My Disk")
#except:
 #   print("ERROR: You probably forgot to change the directory path!")

In [8]:
print("My working directory:\n" + os.getcwd())

My working directory:
C:\Users\run1\OneDrive - CDC\IQVIA on My Disk


In [9]:
df = pd.read_stata("Full_working_matched_cleaned_non-preg_chronic_25March22Cut_2018.dta")

In [10]:
df = df[['obese','Cancer', 'Congenital_mal_n', 'Congenital_mal_c','Congenital_mal_m', 'Sickle_cell', 'Congenital_HD',
       'Esophageal_atresia', 'EoE', 'Seizure', 'Brain_injury','Slow_fetal_growth', 'Technology_dep', 'Gastro_eso_reflux',
       'Lactose_int', 'Milk_protein_int', 'Malabs', 'Pancreatic_ins','Cirrhosis', 'Renal_tub', 'Diabetes_ins', 'Chronic_ren_ins',
       'Kartagener_syn', 'Hypothyroidism', 'Adrenal_ins','Growth_horm_def', 'Inborn_err_met', 'Phenylketonuria','Maple_syr_urine_dis', 'Fructose_int', 'Gauchers_dis',
       'Urea_cyc_dis', 'Fetal_alc_syn', 'Hyper_Ige_syn','Common_var_imm_def', 
       'inpatient_visit', 'total_inpatient_visit', 'agedays', 
       'gender','patient_state','oop_positive_exp', 'agecat1', 'agecat2','agecat3', 'agecat4', 'ethnicity_cat1', 'ethnicity_cat2',
       'ethnicity_cat3', 'ethnicity_cat4', 'ethnicity_cat5','ethnicity_cat6', 'payert_cat1', 'payert_cat2', 'payert_cat3',
       'region_cat1', 'region_cat2', 'region_cat3', 'region_cat4','region_cat5', 'total_inpatient_cost', 'total_outpatient_cost','oop_inpatient_cost', 'oop_outpatient_cost']]

In [11]:
with pd.option_context("display.max_columns", None):
    display(df)

Unnamed: 0,obese,Cancer,Congenital_mal_n,Congenital_mal_c,Congenital_mal_m,Sickle_cell,Congenital_HD,Esophageal_atresia,EoE,Seizure,Brain_injury,Slow_fetal_growth,Technology_dep,Gastro_eso_reflux,Lactose_int,Milk_protein_int,Malabs,Pancreatic_ins,Cirrhosis,Renal_tub,Diabetes_ins,Chronic_ren_ins,Kartagener_syn,Hypothyroidism,Adrenal_ins,Growth_horm_def,Inborn_err_met,Phenylketonuria,Maple_syr_urine_dis,Fructose_int,Gauchers_dis,Urea_cyc_dis,Fetal_alc_syn,Hyper_Ige_syn,Common_var_imm_def,inpatient_visit,total_inpatient_visit,agedays,gender,patient_state,oop_positive_exp,agecat1,agecat2,agecat3,agecat4,ethnicity_cat1,ethnicity_cat2,ethnicity_cat3,ethnicity_cat4,ethnicity_cat5,ethnicity_cat6,payert_cat1,payert_cat2,payert_cat3,region_cat1,region_cat2,region_cat3,region_cat4,region_cat5,total_inpatient_cost,total_outpatient_cost,oop_inpatient_cost,oop_outpatient_cost
0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4923,Female,FL,1.0,0,0,1,0,0,0,1,0,0,0,1,0,0,0.0,0.0,0.0,1.0,0.0,0.0,91.320000,0.0,91.320000
1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1914,Female,IL,1.0,1,0,0,0,0,0,0,0,0,1,0,0,1,0.0,1.0,0.0,0.0,0.0,0.0,730.290010,0.0,93.650005
2,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6233,Male,TN,1.0,0,0,1,0,0,0,0,0,0,1,1,0,0,0.0,0.0,0.0,1.0,0.0,0.0,1188.730002,0.0,642.979987
3,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5636,Male,FL,1.0,0,0,1,0,1,0,0,0,0,0,0,1,0,0.0,0.0,0.0,1.0,0.0,0.0,16445.579994,0.0,2491.529915
4,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5145,Male,NC,1.0,0,0,1,0,0,0,1,0,0,0,0,0,1,0.0,0.0,0.0,1.0,0.0,0.0,415.149997,0.0,159.780001
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
205871,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4930,Female,MN,1.0,0,0,1,0,0,0,1,0,0,0,0,0,1,0.0,1.0,0.0,0.0,0.0,0.0,4909.750080,0.0,1212.949993
205872,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7031,Female,KY,1.0,0,0,0,1,0,0,1,0,0,0,1,0,0,0.0,0.0,0.0,1.0,0.0,0.0,2771.520005,0.0,1852.400000
205873,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,772,Male,FL,1.0,1,0,0,0,0,1,0,0,0,0,1,0,0,0.0,0.0,0.0,1.0,0.0,0.0,1779.329996,0.0,264.089996
205874,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7033,Female,IN,1.0,0,0,0,1,0,0,1,0,0,0,0,0,1,0.0,1.0,0.0,0.0,0.0,0.0,20125.609992,0.0,2105.559931


In [12]:
df.info()
df.describe()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 205876 entries, 0 to 205875
Data columns (total 63 columns):
 #   Column                 Non-Null Count   Dtype   
---  ------                 --------------   -----   
 0   obese                  205876 non-null  int8    
 1   Cancer                 205876 non-null  float64 
 2   Congenital_mal_n       205876 non-null  float64 
 3   Congenital_mal_c       205876 non-null  float64 
 4   Congenital_mal_m       205876 non-null  float64 
 5   Sickle_cell            205876 non-null  float64 
 6   Congenital_HD          205876 non-null  float64 
 7   Esophageal_atresia     205876 non-null  float64 
 8   EoE                    205876 non-null  float64 
 9   Seizure                205876 non-null  float64 
 10  Brain_injury           205876 non-null  float64 
 11  Slow_fetal_growth      205876 non-null  float64 
 12  Technology_dep         205876 non-null  float64 
 13  Gastro_eso_reflux      205876 non-null  float64 
 14  Lactose_int         

Unnamed: 0,obese,Cancer,Congenital_mal_n,Congenital_mal_c,Congenital_mal_m,Sickle_cell,Congenital_HD,Esophageal_atresia,EoE,Seizure,Brain_injury,Slow_fetal_growth,Technology_dep,Gastro_eso_reflux,Lactose_int,Milk_protein_int,Malabs,Pancreatic_ins,Cirrhosis,Renal_tub,Diabetes_ins,Chronic_ren_ins,Kartagener_syn,Hypothyroidism,Adrenal_ins,Growth_horm_def,Inborn_err_met,Phenylketonuria,Maple_syr_urine_dis,Fructose_int,Gauchers_dis,Urea_cyc_dis,Fetal_alc_syn,Hyper_Ige_syn,Common_var_imm_def,inpatient_visit,total_inpatient_visit,agedays,oop_positive_exp,agecat1,agecat2,agecat3,agecat4,ethnicity_cat1,ethnicity_cat2,ethnicity_cat3,ethnicity_cat4,ethnicity_cat5,ethnicity_cat6,payert_cat1,payert_cat2,payert_cat3,region_cat1,region_cat2,region_cat3,region_cat4,region_cat5,total_inpatient_cost,total_outpatient_cost,oop_inpatient_cost,oop_outpatient_cost
count,205876.0,205876.0,205876.0,205876.0,205876.0,205876.0,205876.0,205876.0,205876.0,205876.0,205876.0,205876.0,205876.0,205876.0,205876.0,205876.0,205876.0,205876.0,205876.0,205876.0,205876.0,205876.0,205876.0,205876.0,205876.0,205876.0,205876.0,205876.0,205876.0,205876.0,205876.0,205876.0,205876.0,205876.0,205876.0,205876.0,205876.0,205876.0,205876.0,205876.0,205876.0,205876.0,205876.0,205876.0,205876.0,205876.0,205876.0,205876.0,205876.0,205876.0,205876.0,205876.0,205826.0,205826.0,205826.0,205826.0,205826.0,205876.0,205876.0,205876.0,205876.0
mean,0.155773,0.001787,0.002283,0.004658,0.001253,0.000306,0.000437,6.8e-05,0.001899,0.005387,0.015446,3.9e-05,0.002006,0.017953,0.000991,0.001331,0.002808,0.000471,0.000107,4.9e-05,0.00016,0.000136,5.3e-05,0.006577,0.000355,0.002681,1.5e-05,0.00016,1.5e-05,5e-06,1e-05,4.9e-05,0.000209,1.5e-05,0.000209,0.019939,0.026739,4230.571509,0.885052,0.193617,0.306306,0.3618,0.138277,0.054951,0.02519,0.688303,0.005338,0.022169,0.20405,0.349312,0.082827,0.567861,0.150938,0.344208,1e-05,0.421366,0.083478,662.5742,2480.736079,45.415488,587.837128
std,0.362641,0.042241,0.047726,0.068092,0.035378,0.01749,0.020904,0.008246,0.043539,0.073197,0.123319,0.006234,0.044744,0.132779,0.031463,0.036457,0.052912,0.021701,0.010337,0.006969,0.01266,0.011661,0.007309,0.08083,0.018827,0.051711,0.003817,0.01266,0.003817,0.002204,0.003117,0.006969,0.014451,0.003817,0.014451,0.13993,0.2376,1903.691404,0.319311,0.395133,0.460959,0.480523,0.345192,0.227884,0.156702,0.463188,0.072868,0.147232,0.403007,0.476754,0.275621,0.495375,0.357989,0.475111,0.003117,0.493779,0.276604,11494.49,6992.525004,1342.91503,1426.562868
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,731.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-34708.279224,-5219.890446
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2618.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,401.740001,0.0,50.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4383.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,854.115,0.0,199.945003
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5858.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,2121.352502,0.0,640.000024
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,22.0,7304.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1901839.0,667090.227844,336561.997265,309166.730882


In [13]:
# Create formula for all variables in the model
vars_remove= ['obese']
vars_left = set(df.columns) - set(vars_remove)
formula = "obese ~ " + "+ ".join(vars_left)
formula

'obese ~ Fructose_int+ Cancer+ Milk_protein_int+ Common_var_imm_def+ agecat4+ ethnicity_cat6+ Congenital_mal_m+ Pancreatic_ins+ ethnicity_cat2+ Gauchers_dis+ Adrenal_ins+ Lactose_int+ Inborn_err_met+ total_outpatient_cost+ Congenital_HD+ Cirrhosis+ Chronic_ren_ins+ Malabs+ Urea_cyc_dis+ Technology_dep+ inpatient_visit+ Phenylketonuria+ EoE+ agedays+ region_cat5+ oop_inpatient_cost+ oop_outpatient_cost+ ethnicity_cat5+ Esophageal_atresia+ Congenital_mal_n+ ethnicity_cat4+ Maple_syr_urine_dis+ ethnicity_cat1+ Diabetes_ins+ payert_cat1+ Slow_fetal_growth+ Fetal_alc_syn+ total_inpatient_visit+ agecat1+ ethnicity_cat3+ patient_state+ oop_positive_exp+ agecat2+ Hyper_Ige_syn+ region_cat3+ Congenital_mal_c+ Sickle_cell+ agecat3+ region_cat4+ Kartagener_syn+ payert_cat3+ Brain_injury+ Renal_tub+ Seizure+ region_cat2+ total_inpatient_cost+ Gastro_eso_reflux+ payert_cat2+ Hypothyroidism+ region_cat1+ gender+ Growth_horm_def'

In [14]:
## use Patsy to create model matrices
from patsy import dmatrices
Y, X = dmatrices(formula, df)

In [15]:
Y


DesignMatrix with shape (205824, 1)
  obese
      0
      0
      1
      1
      1
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      1
      0
      1
      0
      0
      0
      0
      0
      1
      0
      0
      0
      0
  [205794 rows omitted]
  Terms:
    'obese' (column 0)
  (to view full data, use np.asarray(this_obj))

DesignMatrix with shape (205824, 112)
  Columns:
    ['Intercept',
     'patient_state[T.AL]',
     'patient_state[T.AR]',
     'patient_state[T.AZ]',
     'patient_state[T.CA]',
     'patient_state[T.CO]',
     'patient_state[T.CT]',
     'patient_state[T.DC]',
     'patient_state[T.DE]',
     'patient_state[T.FL]',
     'patient_state[T.GA]',
     'patient_state[T.HI]',
     'patient_state[T.IA]',
     'patient_state[T.ID]',
     'patient_state[T.IL]',
     'patient_state[T.IN]',
     'patient_state[T.KS]',
     'patient_state[T.KY]',
     'patient_state[T.LA]',
     'patient_state[T.MA]',
     'patient_state[T.MD]',
     'patient_state[T.ME]',
     'patient_state[T.MI]',
     'patient_state[T.MN]',
     'patient_state[T.MO]',
     'patient_state[T.MS]',
     'patient_state[T.MT]',
     'patient_state[T.NC]',
     'patient_state[T.ND]',
     'patient_state[T.NE]',
     'patient_state[T.NH]',
     'patient_state[T.NJ]',
     'patient_state[T.NM]',
     'patient_state[T.NV]',
     'pat

In [82]:
X

DesignMatrix with shape (205824, 112)
  Columns:
    ['Intercept',
     'patient_state[T.AL]',
     'patient_state[T.AR]',
     'patient_state[T.AZ]',
     'patient_state[T.CA]',
     'patient_state[T.CO]',
     'patient_state[T.CT]',
     'patient_state[T.DC]',
     'patient_state[T.DE]',
     'patient_state[T.FL]',
     'patient_state[T.GA]',
     'patient_state[T.HI]',
     'patient_state[T.IA]',
     'patient_state[T.ID]',
     'patient_state[T.IL]',
     'patient_state[T.IN]',
     'patient_state[T.KS]',
     'patient_state[T.KY]',
     'patient_state[T.LA]',
     'patient_state[T.MA]',
     'patient_state[T.MD]',
     'patient_state[T.ME]',
     'patient_state[T.MI]',
     'patient_state[T.MN]',
     'patient_state[T.MO]',
     'patient_state[T.MS]',
     'patient_state[T.MT]',
     'patient_state[T.NC]',
     'patient_state[T.ND]',
     'patient_state[T.NE]',
     'patient_state[T.NH]',
     'patient_state[T.NJ]',
     'patient_state[T.NM]',
     'patient_state[T.NV]',
     'pat

In [16]:
## Set default figure size to be larger
## this may only work in matplotlib 2.0+!

matplotlib.rcParams['figure.figsize'] = [10.0, 6.0]
matplotlib.rcParams['font.serif'] = "Georgia"
matplotlib.rcParams['font.family'] = "serif"

## Enable multiple outputs from jupyter cells
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import warnings
warnings.filterwarnings('ignore')

In [17]:
## Split Data into training and testing
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,Y,
test_size=0.20,
random_state=0)

In [18]:
## Confirm dimensions of covariate matrix
X_train.shape
X_test.shape

(164659, 112)

(41165, 112)

In [19]:
## Confirm dimensions of target matrix
y_train.shape
y_test.shape

(164659, 1)

(41165, 1)

# Defining Function to print detailed classification results
Before we run any model lets define a function to print full classification report and then create a dictionary to store results.

In [20]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
def print_score(clf, X_train, y_train, X_test, y_test, train=True):
    if train:
        pred = clf.predict(X_train)
        clf_report = pd.DataFrame(classification_report(y_train, pred, output_dict=True))
        print("Train Result:\n================================================")
        print(f"Accuracy Score: {accuracy_score(y_train, pred) * 100:.2f}%")
        print("_______________________________________________")
        print(f"CLASSIFICATION REPORT:\n{clf_report}")
        print("_______________________________________________")
        print(f"Confusion Matrix: \n {confusion_matrix(y_train, pred)}\n")
        
    elif train==False:
        pred = clf.predict(X_test)
        clf_report = pd.DataFrame(classification_report(y_test, pred, output_dict=True))
        print("Test Result:\n================================================")        
        print(f"Accuracy Score: {accuracy_score(y_test, pred) * 100:.2f}%")
        print("_______________________________________________")
        print(f"CLASSIFICATION REPORT:\n{clf_report}")
        print("_______________________________________________")
        print(f"Confusion Matrix: \n {confusion_matrix(y_test, pred)}\n")

# First Model: Logistic Regression

In [21]:
## import linear model
from sklearn import linear_model

## Define model parameters
clf = linear_model.LogisticRegression(penalty='none', solver='newton-cg')

# Penalty - see here https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression

## fit model using data with .fit
clf.fit(X_train, y_train)

LogisticRegression(penalty='none', solver='newton-cg')

In [22]:
# Get coefficients
clf.coef_

array([[-6.96068493e-01, -6.50058809e-02,  4.02520563e-02,
         4.31888354e-02, -1.90229357e-01, -3.48804701e-01,
         4.70787330e-03, -1.69419675e-01, -4.43186371e-02,
        -2.17659075e-01,  9.21805990e-02,  2.25939147e-01,
        -1.75686305e-02, -2.09089559e-01, -1.93148126e-01,
         9.68148245e-02, -3.41344179e-02,  2.97921855e-01,
         1.93088015e-01, -4.59299420e-01, -2.68083909e-01,
         2.41858424e-01, -1.11812605e-01, -1.16189871e-01,
         2.28014605e-01,  1.02577376e-01,  1.00357834e-01,
        -7.12450190e-02,  1.78098530e-01,  6.68757453e-02,
        -5.84194195e-02, -1.97513230e-01, -2.04356567e-03,
        -1.56542492e-01,  4.16437185e-02,  8.78164891e-02,
        -3.74927333e-02,  2.54905524e-01, -9.08244336e-03,
         6.57032153e-02,  6.28830520e-02, -1.78239330e-02,
        -1.40146409e-02, -1.03938833e-01,  8.15474152e-02,
        -1.92485041e-01,  1.86872805e-01,  1.86650972e-01,
        -3.20812434e-01,  3.09538563e-01,  2.52947834e-0

In [23]:
## get accuracy score?
clf.score(X_train, y_train)

0.844964441664288

In [24]:
## get confusion matrix on training data
cf1 = confusion_matrix(y_train, clf.predict(X_train))
cf1

array([[139129,      3],
       [ 25525,      2]], dtype=int64)

In [25]:
## get confusion matrix on test data
cf1_test = confusion_matrix(y_test, clf.predict(X_test))
cf1_test

array([[34627,     1],
       [ 6532,     5]], dtype=int64)

In [26]:
## identify accuracy by hand from confusion matrix
accuracy = (cf1[0, 0] + cf1[1, 1])/np.sum(cf1)
print(f'Model accuracy was {accuracy:.3f}')

Model accuracy was 0.845


In [27]:
## balanced accuracy
balanced_accuracy_score(y_train, clf.predict(X_train))

0.5000283930789894

In [28]:
## Get kappa
sklearn.metrics.cohen_kappa_score(y_train,
clf.predict(X_train))

9.595229134262695e-05

In [29]:
## get many classification metrics
print(sklearn.metrics.classification_report(y_train, clf.predict(X_train)))

              precision    recall  f1-score   support

         0.0       0.84      1.00      0.92    139132
         1.0       0.40      0.00      0.00     25527

    accuracy                           0.84    164659
   macro avg       0.62      0.50      0.46    164659
weighted avg       0.78      0.84      0.77    164659



# Creating Dictionary to store results

In [30]:
## Create dict to store all these results:
result_scores = {}

## Score the Model on Training and Testing Set
result_scores['Logistic'] = \
(accuracy_score(y_train, clf.predict(X_train)),
accuracy_score(y_test, clf.predict(X_test)))

In [31]:
## Create Function to Print Results
def get_results(x1):
    print("\n{0:20} {1:4} {2:4}".format('Model','Train','Test'))
    print('-------------------------------------------')
    for i in x1.keys():
        print("{0:20} {1:<6.4} {2:<6.4}".format(i,x1[i][0],x1[i][1]))

In [32]:
get_results(result_scores)


Model                Train Test
-------------------------------------------
Logistic             0.845  0.8413


# Logistic Regression with L1 Penalty

In [33]:
## Logistic Regression with l1 penalty
clf = linear_model.LogisticRegression(penalty='l1', # specify penalty
C=1,
solver='liblinear')
clf.fit(X_train, y_train)

## get confusion matrix
confusion_matrix(y_train, clf.predict(X_train))

LogisticRegression(C=1, penalty='l1', solver='liblinear')

array([[139127,      5],
       [ 25518,      9]], dtype=int64)

In [34]:
## get accuracy
accuracy_score(y_train,clf.predict(X_train))
## Get kappa
sklearn.metrics.cohen_kappa_score(y_train,clf.predict(X_train))
## get classification metrics
print(sklearn.metrics.classification_report(y_train, clf.predict(X_train)))
## get classification metrics on test
print(sklearn.metrics.classification_report(y_test, clf.predict(X_test)))

0.8449948074505493

0.000534884940184055

              precision    recall  f1-score   support

         0.0       0.85      1.00      0.92    139132
         1.0       0.64      0.00      0.00     25527

    accuracy                           0.84    164659
   macro avg       0.74      0.50      0.46    164659
weighted avg       0.81      0.84      0.77    164659

              precision    recall  f1-score   support

         0.0       0.84      1.00      0.91     34628
         1.0       0.86      0.00      0.00      6537

    accuracy                           0.84     41165
   macro avg       0.85      0.50      0.46     41165
weighted avg       0.84      0.84      0.77     41165



In [35]:
## Score the Model on Training and Testing Set
result_scores['Logistic_L1'] = \
(accuracy_score(y_train, clf.predict(X_train)),
accuracy_score(y_test, clf.predict(X_test)))

# Save the Model results in a function
get_results(result_scores)


Model                Train Test
-------------------------------------------
Logistic             0.845  0.8413
Logistic_L1          0.845  0.8413


# Scaling/Pipeline
Scaling is used when we use shrinkage methods

In [36]:
## LASSO regression, set alpha
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
clf = linear_model.LogisticRegression(penalty='l1',
C=0.5,
solver='liblinear')
scaler = preprocessing.StandardScaler().fit(X_train)
pipe1 = Pipeline([("scale", scaler), ("LASSO", clf)])
pipe1.fit(X_train, y_train)

Pipeline(steps=[('scale', StandardScaler()),
                ('LASSO',
                 LogisticRegression(C=0.5, penalty='l1', solver='liblinear'))])

In [37]:
## Score on training data
pipe1.score(X_train, y_train)

0.844964441664288

In [38]:
## Score on testing data
pipe1.score(X_test, y_test)

0.8413700959553019

In [39]:
## Score the Model on Training and Testing Set
result_scores['Logistic_L1_0.5'] = \
(accuracy_score(y_train, pipe1.predict(X_train)),
accuracy_score(y_test, pipe1.predict(X_test)))

# Store Model Results
get_results(result_scores)


Model                Train Test
-------------------------------------------
Logistic             0.845  0.8413
Logistic_L1          0.845  0.8413
Logistic_L1_0.5      0.845  0.8414


# Selecting Parameters through Cross Validation
C is the inverse of regulatization strength in LASSO: Smaller C means stronger Regularization

In [40]:
## Select the alpha through cross validation (leave one out)
clf = linear_model.LogisticRegressionCV(Cs=[0.01, 0.05, 0.1, 0.15, 0.2, 0.5, 1])
scaler = preprocessing.StandardScaler()
pipe2 = Pipeline([("scale", scaler), ("LASSO", clf)])
pipe2.fit(X_train, y_train)

Pipeline(steps=[('scale', StandardScaler()),
                ('LASSO',
                 LogisticRegressionCV(Cs=[0.01, 0.05, 0.1, 0.15, 0.2, 0.5,
                                          1]))])

In [41]:
pipe2.named_steps['LASSO'].C_

array([0.01])

In [42]:
## Examine the score of the model on training data
pipe2.score(X_train, y_train)

## Score on validation/test data
pipe2.score(X_test, y_test)

0.8449705148215403

0.8413700959553019

In [43]:
## Score the Model on Training and Testing Set
result_scores['Logistic_L1_C'] = \
(accuracy_score(y_train, pipe2.predict(X_train)),
accuracy_score(y_test, pipe2.predict(X_test)))

# Store Model Results
get_results(result_scores)


Model                Train Test
-------------------------------------------
Logistic             0.845  0.8413
Logistic_L1          0.845  0.8413
Logistic_L1_0.5      0.845  0.8414
Logistic_L1_C        0.845  0.8414


# Random Forest

In [58]:
#### Fit Random Forest (Scaling is not necessary)

from sklearn import ensemble

from sklearn import ensemble
clf = ensemble.RandomForestClassifier(random_state=42, max_depth=3000)
clf.fit(X_train, y_train)

## get confusion matrix on training data
confusion_matrix(y_train, clf.predict(X_train))
# This produces great predictive model for training data, showing overfitting

RandomForestClassifier(max_depth=3000, random_state=42)

array([[139130,      2],
       [    15,  25512]], dtype=int64)

In [59]:
## get confusion matrix on test data
confusion_matrix(y_test, clf.predict(X_test))

array([[33985,   643],
       [ 6359,   178]], dtype=int64)

In [60]:
## Score the Model on Training and Testing Set
result_scores['RandomForest'] = \
(accuracy_score(y_train, clf.predict(X_train)),
accuracy_score(y_test, clf.predict(X_test)))

# Store Model Results
get_results(result_scores)


Model                Train Test
-------------------------------------------
Logistic             0.845  0.8413
Logistic_L1          0.845  0.8413
Logistic_L1_0.5      0.845  0.8414
Logistic_L1_C        0.845  0.8414
RandomForest         0.9999 0.8299


# Random Forest with Parameter Tuning

In [132]:
rf_grid = {"n_estimators": np.arange(10, 200, 10),
           "bootstrap": [True, False],
           "max_depth": [None, 3, 10, 1000, 3000, 5000],
           "min_samples_split": np.arange(2, 1000, 2),
           "min_samples_leaf": np.arange(1, 1000, 2),
           "max_features": [0.5, 1, "sqrt", "auto"],
           "max_samples": [10000]}

# Instantiate RandomizedSearchCV model
rs_model = RandomizedSearchCV(RandomForestClassifier(n_jobs=-1, random_state=42),
                              param_distributions=rf_grid,
                              n_iter=10,
                              cv=5,
                              verbose=True,
                              return_train_score = True,
                              refit=True)
# fit
rs_model.fit(X_train, y_train)


Fitting 5 folds for each of 10 candidates, totalling 50 fits


RandomizedSearchCV(cv=5,
                   estimator=RandomForestClassifier(n_jobs=-1, random_state=42),
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [None, 3, 10, 1000, 3000,
                                                      5000],
                                        'max_features': [0.5, 1, 'sqrt',
                                                         'auto'],
                                        'max_samples': [10000],
                                        'min_samples_leaf': array([  1,   3,   5,   7,   9,  11,  13,  15,  17,  19,  21,  23,  25,
        27,  29,  31,  33,  35,  37,  39,  41,  43,  45,  47,  49,  51,
        53,  55,  57,  59,  61,  63,  65,  67...
       886, 888, 890, 892, 894, 896, 898, 900, 902, 904, 906, 908, 910,
       912, 914, 916, 918, 920, 922, 924, 926, 928, 930, 932, 934, 936,
       938, 940, 942, 944, 946, 948, 950, 952, 954, 956, 958, 960, 962,
       964, 96

In [133]:
rs_model.best_params_

{'n_estimators': 130,
 'min_samples_split': 338,
 'min_samples_leaf': 559,
 'max_samples': 10000,
 'max_features': 'sqrt',
 'max_depth': 10,
 'bootstrap': False}

In [134]:
cv_result_score = pd.DataFrame(rs_model.cv_results_)
cv_result_score

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,param_min_samples_split,param_min_samples_leaf,param_max_samples,param_max_features,param_max_depth,param_bootstrap,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,5.539201,0.13064,0.125647,0.001253,130,338,559,10000,sqrt,10.0,False,"{'n_estimators': 130, 'min_samples_split': 338...",0.844953,0.844953,0.844984,0.844984,0.844979,0.844971,1.4e-05,1,0.844975,0.844975,0.844967,0.844967,0.844968,0.844971,4e-06
1,0.747026,0.039232,0.12455,0.001233,110,142,971,10000,1,10.0,True,"{'n_estimators': 110, 'min_samples_split': 142...",0.844953,0.844953,0.844984,0.844984,0.844979,0.844971,1.4e-05,1,0.844975,0.844975,0.844967,0.844967,0.844968,0.844971,4e-06
2,0.89454,0.066865,0.125815,0.000945,10,956,507,10000,sqrt,1000.0,False,"{'n_estimators': 10, 'min_samples_split': 956,...",0.844953,0.844953,0.844984,0.844984,0.844979,0.844971,1.4e-05,1,0.844975,0.844975,0.844967,0.844967,0.844968,0.844971,4e-06
3,0.690829,0.047724,0.124738,0.000704,100,536,595,10000,1,10.0,True,"{'n_estimators': 100, 'min_samples_split': 536...",0.844953,0.844953,0.844984,0.844984,0.844979,0.844971,1.4e-05,1,0.844975,0.844975,0.844967,0.844967,0.844968,0.844971,4e-06
4,0.756272,0.04815,0.124493,0.001238,90,562,419,10000,sqrt,,True,"{'n_estimators': 90, 'min_samples_split': 562,...",0.844953,0.844953,0.844984,0.844984,0.844979,0.844971,1.4e-05,1,0.844975,0.844975,0.844967,0.844967,0.844968,0.844971,4e-06
5,2.598834,0.085638,0.124896,0.001133,120,630,893,10000,auto,3.0,False,"{'n_estimators': 120, 'min_samples_split': 630...",0.844953,0.844953,0.844984,0.844984,0.844979,0.844971,1.4e-05,1,0.844975,0.844975,0.844967,0.844967,0.844968,0.844971,4e-06
6,1.013862,0.057995,0.125864,0.000747,140,496,689,10000,1,10.0,False,"{'n_estimators': 140, 'min_samples_split': 496...",0.844953,0.844953,0.844984,0.844984,0.844979,0.844971,1.4e-05,1,0.844975,0.844975,0.844967,0.844967,0.844968,0.844971,4e-06
7,1.197236,0.055278,0.127267,0.000481,160,648,419,10000,1,5000.0,False,"{'n_estimators': 160, 'min_samples_split': 648...",0.844953,0.844953,0.844984,0.844984,0.844979,0.844971,1.4e-05,1,0.844975,0.844975,0.844967,0.844967,0.844968,0.844971,4e-06
8,0.71114,0.036685,0.127261,0.000523,70,248,159,10000,sqrt,,True,"{'n_estimators': 70, 'min_samples_split': 248,...",0.844953,0.844953,0.844984,0.844984,0.844979,0.844971,1.4e-05,1,0.844975,0.844975,0.844967,0.844967,0.844968,0.844971,4e-06
9,2.622917,0.066524,0.125939,0.000839,50,602,693,10000,auto,1000.0,False,"{'n_estimators': 50, 'min_samples_split': 602,...",0.844953,0.844953,0.844984,0.844984,0.844979,0.844971,1.4e-05,1,0.844975,0.844975,0.844967,0.844967,0.844968,0.844971,4e-06


In [135]:
confusion_matrix(y_train, rs_model.predict(X_train))
confusion_matrix(y_test, rs_model.predict(X_test))

array([[139132,      0],
       [ 25527,      0]], dtype=int64)

array([[34628,     0],
       [ 6537,     0]], dtype=int64)

In [136]:
## Score the Model on Training and Testing Set
result_scores['RandomForest_CV'] = \
(accuracy_score(y_train, rs_model.predict(X_train)),
accuracy_score(y_test, rs_model.predict(X_test)))

# Store Model Results
get_results(result_scores)


Model                Train Test
-------------------------------------------
Logistic             0.845  0.8413
Logistic_L1          0.845  0.8413
Logistic_L1_0.5      0.845  0.8414
Logistic_L1_C        0.845  0.8414
RandomForest         0.9999 0.8299
RandomForest_CV      0.845  0.8412


# Gradient Boosting Classifier

In [137]:
from sklearn.ensemble import GradientBoostingClassifier
gbc = GradientBoostingClassifier(random_state=0)
gbc.fit(X_train, y_train)

confusion_matrix(y_train,gbc.predict(X_train))

GradientBoostingClassifier(random_state=0)

array([[139131,      1],
       [ 25505,     22]], dtype=int64)

In [138]:
## Score the Model on Training and Testing Set
result_scores['GBC'] = \
(sklearn.metrics.accuracy_score(y_train, gbc.predict(X_train)),
sklearn.metrics.accuracy_score(y_test, gbc.predict(X_test)))

# Store Model Results
get_results(result_scores)


Model                Train Test
-------------------------------------------
Logistic             0.845  0.8413
Logistic_L1          0.845  0.8413
Logistic_L1_0.5      0.845  0.8414
Logistic_L1_C        0.845  0.8414
RandomForest         0.9999 0.8299
RandomForest_CV      0.845  0.8412
GBC                  0.8451 0.8412


# Support Vector Machine Classification

In [139]:
# Linear Kernel
from sklearn.svm import LinearSVC

clf = LinearSVC(loss='hinge', dual=True)
clf.fit(X_train, y_train)


LinearSVC(loss='hinge')

In [140]:
## LASSO regression, set alpha
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

clf = LinearSVC(loss='hinge', dual=True)
scaler = preprocessing.StandardScaler().fit(X_train)
pipe1 = Pipeline([("scale", scaler), ("LASSO", clf)])
pipe1.fit(X_train, y_train)

Pipeline(steps=[('scale', StandardScaler()),
                ('LASSO', LinearSVC(loss='hinge'))])

In [141]:
pipe1.score(X_train, y_train)

## Score on validation/test data
pipe1.score(X_test, y_test)

0.84491585640627

0.8411757561034859

In [142]:
confusion_matrix(y_train,pipe1.predict(X_train))

array([[139104,     28],
       [ 25508,     19]], dtype=int64)

In [143]:
confusion_matrix(y_test,pipe1.predict(X_test))

array([[34623,     5],
       [ 6533,     4]], dtype=int64)

In [144]:
print_score(pipe1, X_train, y_train, X_test, y_test, train=True)
print_score(pipe1, X_train, y_train, X_test, y_test, train=False)

Train Result:
Accuracy Score: 84.49%
_______________________________________________
CLASSIFICATION REPORT:
                     0.0           1.0  accuracy      macro avg   weighted avg
precision       0.845042      0.404255  0.844916       0.624648       0.776707
recall          0.999799      0.000744  0.844916       0.500272       0.844916
f1-score        0.915929      0.001486  0.844916       0.458708       0.774164
support    139132.000000  25527.000000  0.844916  164659.000000  164659.000000
_______________________________________________
Confusion Matrix: 
 [[139104     28]
 [ 25508     19]]

Test Result:
Accuracy Score: 84.12%
_______________________________________________
CLASSIFICATION REPORT:
                    0.0          1.0  accuracy     macro avg  weighted avg
precision      0.841263     0.444444  0.841176      0.642853      0.778248
recall         0.999856     0.000612  0.841176      0.500234      0.841176
f1-score       0.913728     0.001222  0.841176      0.457475 

In [145]:
## Score the Model on Training and Testing Set
result_scores['SVC Linear'] = \
(sklearn.metrics.accuracy_score(y_train, pipe1.predict(X_train)),
sklearn.metrics.accuracy_score(y_test, pipe1.predict(X_test)))

# Store Model Results
get_results(result_scores)


Model                Train Test
-------------------------------------------
Logistic             0.845  0.8413
Logistic_L1          0.845  0.8413
Logistic_L1_0.5      0.845  0.8414
Logistic_L1_C        0.845  0.8414
RandomForest         0.9999 0.8299
RandomForest_CV      0.845  0.8412
GBC                  0.8451 0.8412
SVC Linear           0.8449 0.8412


# Adding xgboost

In [146]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import time

xgb = XGBClassifier(n_estimators=100)

training_start = time.perf_counter()
xgb= xgb.fit(X_train, y_train)
training_end = time.perf_counter()

prediction_start = time.perf_counter()
preds = xgb.predict(X_test)
prediction_end = time.perf_counter()

# Save rounded values in predictions
predictions = [round(value) for value in preds]

# evaluate predictions
accuracy = accuracy_score(y_test, predictions)
acc_xgb = accuracy*100

#print("Accuracy: %.2f%%" % (accuracy * 100.0))

#acc_xgb = (preds == y_test).sum().astype(float) / len(preds)*100

xgb_train_time = training_end-training_start
xgb_prediction_time = prediction_end-prediction_start

print("XGBoost's prediction accuracy is: %3.2f" % (acc_xgb))
print("Time consumed for training: %4.3f" % (xgb_train_time))
print("Time consumed for prediction: %6.5f seconds" % (xgb_prediction_time))


XGBoost's prediction accuracy is: 84.09
Time consumed for training: 8.828
Time consumed for prediction: 0.03813 seconds


In [147]:
## Score the Model on Training and Testing Set
result_scores['XG Boost'] = \
(sklearn.metrics.accuracy_score(y_train, xgb.predict(X_train)),
sklearn.metrics.accuracy_score(y_test, xgb.predict(X_test)))

# Store Model Results
get_results(result_scores)


Model                Train Test
-------------------------------------------
Logistic             0.845  0.8413
Logistic_L1          0.845  0.8413
Logistic_L1_0.5      0.845  0.8414
Logistic_L1_C        0.845  0.8414
RandomForest         0.9999 0.8299
RandomForest_CV      0.845  0.8412
GBC                  0.8451 0.8412
SVC Linear           0.8449 0.8412
XG Boost             0.8467 0.8409


In [148]:
# Getting cross validation scores for XGB to see result sensitivity
from sklearn.model_selection import cross_val_score
xgb_cv = XGBClassifier(n_estimators=100)
scores = cross_val_score(xgb_cv, X_train, y_train, cv=10, scoring = "accuracy")
print("Scores:", scores)
print("Mean:", scores.mean())
print("Standard Deviation:", scores.std())

Scores: [0.84446739 0.84477104 0.84471031 0.84392081 0.84398154 0.84434592
 0.84452812 0.84428519 0.84458885 0.84476162]
Mean: 0.8444360789603762
Standard Deviation: 0.0002879543409145578


# Summary

Overall, the default Random Forest performs the best on training data but not much better performance on test data. It shows an evidence of overfitting. All other classification methods performed in a very similar fashion. More feature engineering such as PCA, Lasso or Ridge regression can be done to improve the model prediction.

# Other methods that I planned to include 

In [None]:
# Polynomial Kernel in SVM
# Took long time to learn
from sklearn.svm import SVC

#The hyperparameter coef0 controls how much the model is influenced by high degree ploynomials 
model = SVC(kernel='poly', degree=2, gamma='auto', coef0=1, C=5)
model.fit(X_train, y_train)

print_score(model, X_train, y_train, X_test, y_test, train=True)
print_score(model, X_train, y_train, X_test, y_test, train=False)

In [None]:
## Score the Model on Training and Testing Set
result_scores['SVC Polynomial'] = \
(sklearn.metrics.accuracy_score(y_train, model.predict(X_train)),
sklearn.metrics.accuracy_score(y_test, model.predict(X_test)))

# Store Model Results
get_results(result_scores)

In [None]:
# Choosing default gamma
from sklearn.svm import SVC
from sklearn import preprocessing
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

mod = make_pipeline(StandardScaler(), SVC(gamma='auto'))
mod.fit(X_train, y_train)
Pipeline(steps=[('standardscaler', StandardScaler()),
                ('svc', SVC(gamma='scale'))])

#scaler = preprocessing.StandardScaler().fit(X_train)

#pipe1 = Pipeline(steps=[('standardscaler', StandardScaler()),
  #              ('svc', SVC(gamma='auto'))])

#pipe1.fit(X_train, y_train)

In [None]:
print_score(mod, X_train, y_train, X_test, y_test, train=True)
print_score(mod, X_train, y_train, X_test, y_test, train=False)

# Further Cross Validation under Random Forest

## This takes a while to run! Wasn't able to run this.

from sklearn.model_selection import GridSearchCV

## Specify grid
parameters = {'n_estimators': [500, 1000],
'max_features': [5,10]}

## Specify model without hyperparameters
rf_model = ensemble.RandomForestClassifier(random_state=32)

## Specify search with model
clf = GridSearchCV(rf_model,
parameters,
cv=5,
return_train_score=True)
clf.fit(X_train,y_train)


# Manual Cross validation in GBM

## Specify grid in GBM

parameters3 = {'n_estimators': (100,500),
'learning_rate':(0.1,0.3)}

## Specify model without hyperparameters
gbc_model = GradientBoostingClassifier()

## Specify search with model
gbc2 = GridSearchCV(gbc_model,
parameters3,
cv=5,
return_train_score=True)

## Now fit the model
gbc2.fit(X_train,y_train)

## Explore best hyperparameters
gbc2.best_params_

## Score the Model on Training and Testing Set
result_scores['GBC2'] = \
(sklearn.metrics.accuracy_score(y_train, gbc2.predict(X_train)),
sklearn.metrics.accuracy_score(y_test, gbc2.predict(X_test)))

## Store Model Results
get_results(result_scores)