In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
import itertools
import math
import re
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score as accuracy
import graphviz 
from datetime import datetime, timedelta
from sklearn.metrics import roc_curve, auc, classification_report, confusion_matrix
from sklearn import preprocessing, svm, metrics, tree, decomposition, svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors.nearest_centroid import NearestCentroid
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import ParameterGrid
from dateutil.relativedelta import relativedelta
from sklearn.preprocessing import StandardScaler


# personal modules
import ml_pipeline_lch as ml
import ml_explore as exp
import ml_modeling as md


sns.set(style = "ticks", color_codes = True)
%matplotlib inline 

### Read and Pre-Process Data

#### Read Data

In [2]:
outcomes_df = ml.retrieve_data(filename = 'data/outcomes.csv', headers = True, set_ind = 0)

In [3]:
outcomes_df.head()

Unnamed: 0_level_0,is_exciting,at_least_1_teacher_referred_donor,fully_funded,at_least_1_green_donation,great_chat,three_or_more_non_teacher_referred_donors,one_non_teacher_referred_donor_giving_100_plus,donation_from_thoughtful_donor,great_messages_proportion,teacher_referred_count,non_teacher_referred_count
projectid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
ffffc4f85b60efc5b52347df489d0238,f,,f,,f,,,,,,
ffffac55ee02a49d1abc87ba6fc61135,f,f,t,t,f,t,f,f,57.0,0.0,7.0
ffff97ed93720407d70a2787475932b0,f,f,t,t,t,t,t,f,100.0,0.0,3.0
ffff418bb42fad24347527ad96100f81,f,f,f,t,t,f,f,f,100.0,0.0,1.0
ffff2d9c769c8fb5335e949c615425eb,t,t,t,t,t,f,t,f,63.0,6.0,2.0


In [4]:
outcomes_df.replace(to_replace='t', value=1, inplace=True)
outcomes_df.replace(to_replace='f', value=0, inplace=True)

In [5]:
outcomes_df.head()

Unnamed: 0_level_0,is_exciting,at_least_1_teacher_referred_donor,fully_funded,at_least_1_green_donation,great_chat,three_or_more_non_teacher_referred_donors,one_non_teacher_referred_donor_giving_100_plus,donation_from_thoughtful_donor,great_messages_proportion,teacher_referred_count,non_teacher_referred_count
projectid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
ffffc4f85b60efc5b52347df489d0238,0,,0,,0,,,,,,
ffffac55ee02a49d1abc87ba6fc61135,0,0.0,1,1.0,0,1.0,0.0,0.0,57.0,0.0,7.0
ffff97ed93720407d70a2787475932b0,0,0.0,1,1.0,1,1.0,1.0,0.0,100.0,0.0,3.0
ffff418bb42fad24347527ad96100f81,0,0.0,0,1.0,1,0.0,0.0,0.0,100.0,0.0,1.0
ffff2d9c769c8fb5335e949c615425eb,1,1.0,1,1.0,1,0.0,1.0,0.0,63.0,6.0,2.0


In [6]:
projects_df = ml.retrieve_data(filename = 'data/projects.csv', headers = True, set_ind = 0)

In [7]:
project_info = pd.merge(projects_df, pd.DataFrame(outcomes_df['fully_funded']), how = 'inner', right_index = True, left_index = True)

In [8]:
project_info.head()

Unnamed: 0_level_0,teacher_acctid,schoolid,school_ncesid,school_latitude,school_longitude,school_city,school_state,school_zip,school_metro,school_district,...,poverty_level,grade_level,fulfillment_labor_materials,total_price_excluding_optional_support,total_price_including_optional_support,students_reached,eligible_double_your_impact_match,eligible_almost_home_match,date_posted,fully_funded
projectid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
62526d85d2a1818432d03d600969e99c,ebc7c90b6c92a069432e0714b8d93dfd,5aca9711ff0e4b37db48701f46f73036,171371000000.0,41.972419,-88.174597,Bartlett,IL,60103.0,suburban,Elgin School District U-46,...,moderate poverty,Grades 3-5,30.0,444.36,522.78,7.0,f,f,2013-12-31,1
33d59ac771b80222ad63ef0f4ac47ade,de83b4c1f6428a15032c207c1d5e572a,d91a805b213bf74ae77b94e0de2b73ad,160153000000.0,43.501154,-112.05678,Idaho Falls,ID,83402.0,urban,Idaho Falls School District 91,...,high poverty,Grades 3-5,30.0,233.24,274.4,30.0,f,f,2013-12-31,0
1a3aaeffc56dd2a421e37d8298024c0a,f4c9ed095b85458dcf858e25f203af00,9310d3eb447a4e46bc5fc31ed007ceac,330261000000.0,42.888244,-71.320224,Derry,NH,3038.0,suburban,School Administrative Unit 10,...,moderate poverty,Grades 6-8,30.0,285.09,335.4,230.0,f,f,2013-12-31,0
33aa19ee4da4c5adf47d0dfb84fab5ef,17768031eb40de8d4497dbb54df48742,9ac70da58322783f82152eecc140a812,510324000000.0,37.476158,-77.488397,Richmond,VA,23224.0,urban,Richmond City School District,...,highest poverty,Grades PreK-2,30.0,232.94,274.05,18.0,f,f,2013-12-31,0
e31c0ea8b68f404699dfb0d39e9bc99b,0f1bc5b4700fd33383be104442660178,cb9f688cf59e3ee22a087d616ca8f5d7,170993000000.0,41.952851,-87.650233,Chicago,IL,60613.0,urban,Ravenswood-ridge Elem Network,...,highest poverty,Grades 6-8,30.0,513.41,604.01,70.0,t,f,2013-12-31,1


### Date Transformation and Narrowing

In [9]:
project_info.date_posted.dtype

dtype('O')

In [10]:
project_info['date_posted'] = ml.convert_dates(project_info['date_posted'])

In [11]:
project_info['year'] = project_info['date_posted'].apply(lambda x: x.year)

In [12]:
project_info['year'].unique()

array([2013, 2012, 2011, 2010, 2009, 2008, 2007, 2006, 2005, 2004, 2003,
       2002])

In [13]:
project_info = project_info[(project_info['date_posted'] >= datetime.strptime('2011-01-01', '%Y-%m-%d')) & (project_info['date_posted'] <= datetime.strptime('2013-12-31', '%Y-%m-%d'))]            
                                     

In [14]:
project_info['year'].unique()

array([2013, 2012, 2011])

### Column Details Exploration

In [40]:
project_info.describe().round(3).transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
school_ncesid,328293.0,250312700000.0,164817300000.0,10000500000.0,63723010000.0,240009000000.0,390444000000.0,610000800000.0
school_latitude,353151.0,36.869,4.927,18.249,33.916,36.676,40.68,67.258
school_longitude,353151.0,-95.398,18.118,-171.691,-117.247,-89.886,-80.762,-66.628
school_zip,353151.0,56301.76,31043.52,410.0,29388.0,60613.0,90007.0,99926.0
fulfillment_labor_materials,353151.0,32.22,2.484,30.0,30.0,30.0,35.0,35.0
total_price_excluding_optional_support,353151.0,531.372,844.069,37.03,288.75,427.27,606.72,139725.4
total_price_including_optional_support,353151.0,625.144,993.023,43.56,339.71,502.67,713.79,164382.8
students_reached,353050.0,94.187,156.987,1.0,23.0,31.0,100.0,12143.0
fully_funded,353151.0,0.701,0.458,0.0,0.0,1.0,1.0,1.0
year,353151.0,2012.077,0.813,2011.0,2011.0,2012.0,2013.0,2013.0


In [15]:
def organize_variables(df, col_names, indicator, var_dict = None):
    if var_dict is None:
        var_dict = {'binary': [], 'tops': [], 'drop': [], 'ids': [], 'geo': [], 'multi': [], 'numeric': []}
    
    if indicator == 'binary':
        var_dict[indicator] += col_names
    elif indicator == 'multi':
        var_dict[indicator] += col_names
    elif indicator == 'numeric':
        var_dict[indicator] += col_names
    elif indicator == 'geo':
        var_dict[indicator] += col_names
    elif indicator == 'ids':
        var_dict[indicator] += col_names
    elif indicator == 'tops':
        var_dict[indicator] += col_names
    elif indicator == 'drop':
        var_dict[indicator] += col_names
    
    return var_dict

In [49]:
# check column types
def check_col_types(df):
    return pd.DataFrame(df.dtypes, df.columns).rename({0: 'data_type'}, axis = 1)

In [50]:
# check column types
ml.check_col_types(project_info)

Unnamed: 0,data_type
teacher_acctid,object
schoolid,object
school_ncesid,float64
school_latitude,float64
school_longitude,float64
school_city,object
school_state,object
school_zip,float64
school_metro,object
school_district,object


In [51]:
# view values across columns
def view_cols(df):
    '''
    View unique values across columns in given data frame.
    '''
    for col in df.columns:
        print(col)
        print(df[col].unique())
        print()

In [52]:
ml.view_cols(project_info)

teacher_acctid
['ebc7c90b6c92a069432e0714b8d93dfd' 'de83b4c1f6428a15032c207c1d5e572a'
 'f4c9ed095b85458dcf858e25f203af00' ... '4a7fd60460dccc2a4d09069e53c6b964'
 '8c458627ae58a3057232e8de4f1f51ad' 'e0b03d3b814958dbba3d0236f237aab6']

schoolid
['5aca9711ff0e4b37db48701f46f73036' 'd91a805b213bf74ae77b94e0de2b73ad'
 '9310d3eb447a4e46bc5fc31ed007ceac' ... 'fda28249bfc517eb67f57875063f7c4a'
 '1cac20ff36dab7b47161703fdd0134a5' 'c17558ef1c387875778e241a9054946d']

school_ncesid
[1.71371006e+11 1.60153000e+11 3.30261000e+11 ... 3.41053005e+11
 1.91269001e+11 3.61941004e+11]

school_latitude
[41.972419 43.501154 42.888244 ... 41.287719 41.046086 40.958734]

school_longitude
[ -88.174597 -112.05678   -71.320224 ...  -74.79199   -95.74642
  -72.994883]

school_city
['Bartlett' 'Idaho Falls' 'Derry' ... 'Mead' 'Mahopac' 'Miller Place']

school_state
['IL' 'ID' 'NH' 'VA' 'SC' 'KY' 'FL' 'WY' 'NY' 'NM' 'NC' 'AZ' 'MI' 'MO'
 'TX' 'NJ' 'CO' 'MT' 'CA' 'AK' 'MN' 'CT' 'GA' 'IN' 'MA' 'PA' 'TN' 'OK'
 'LA' 'O

In [60]:
# log ids
type_dict = organize_variables(project_info, col_names = ['teacher_acctid', 'schoolid', 'school_ncesid'], indicator = 'ids')


In [61]:
# log binary variables
type_dict = organize_variables(project_info, col_names = ['school_charter', 'school_magnet', 'school_year_found', 'school_nlns', 'school_kipp', 'school_charter_ready_promise', 'teacher_teach_for_america', 'teacher_ny_teaching_fellow', 'eligible_double_your_impact_match', 'eligibile_almost_home_match'], indicator = 'binary', var_dict = type_dict)


In [62]:
# log variables to drop
type_dict = organize_variables(project_info, col_names = ['school_latitude', 'school_longitude'], indicator = 'drop', var_dict = type_dict)


In [63]:
# log geographical info to keep
type_dict = organize_variables(project_info, col_names = ['school_city', 'school_state', 'school_zip'], indicator = 'geo', var_dict = type_dict)


In [64]:
# log items with multiple values to convert to dummy variables
type_dict = organize_variables(project_info, col_names = ['school_metro', 'teacher_prefix', 'primary_focus_area', 'resource_type', 'poverty_level', 'grade_level'], indicator = 'multi', var_dict = type_dict)


In [29]:
# log variables to create dummy variables for solely top-occuring values
type_dict = organize_variables(project_info, col_names = ['school_district', 'school_county', 'school_state', 'school_city', 'primary_focus_subject', 'secondary_focus_subject', 'secondary_focus_area'], indicator = 'tops', var_dict = type_dict)


In [65]:
type_dict

{'binary': ['school_charter',
  'school_magnet',
  'school_year_found',
  'school_nlns',
  'school_kipp',
  'school_charter_ready_promise',
  'teacher_teach_for_america',
  'teacher_ny_teaching_fellow',
  'eligible_double_your_impact_match',
  'eligibile_almost_home_match'],
 'tops': [],
 'drop': ['school_latitude', 'school_longitude'],
 'ids': ['teacher_acctid', 'schoolid', 'school_ncesid'],
 'geo': ['school_city', 'school_state', 'school_zip'],
 'multi': ['school_metro',
  'teacher_prefix',
  'primary_focus_area',
  'resource_type',
  'poverty_level',
  'grade_level'],
 'numeric': []}

### Outlier Evaluation

In [None]:
maxes = ml.view_max_mins(project_info, max = True)
maxes.tail().round(2)

In [None]:
likely_outliers_upper = ml.view_likely_outliers(project_info)
likely_outliers_upper.tail()

In [None]:
mins = ml.view_max_mins(project_info, max = False)
mins

In [None]:
likely_outliers_lower = ml.view_likely_outliers(project_info, max = False)
likely_outliers_lower

#### Create copy of dataframe to manipulate and explore

In [75]:
manip_df = project_info[list(project_info.columns)]

In [76]:
manip_df.head()

Unnamed: 0_level_0,teacher_acctid,schoolid,school_ncesid,school_latitude,school_longitude,school_city,school_state,school_zip,school_metro,school_district,...,grade_level,fulfillment_labor_materials,total_price_excluding_optional_support,total_price_including_optional_support,students_reached,eligible_double_your_impact_match,eligible_almost_home_match,date_posted,fully_funded,year
projectid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
62526d85d2a1818432d03d600969e99c,ebc7c90b6c92a069432e0714b8d93dfd,5aca9711ff0e4b37db48701f46f73036,171371000000.0,41.972419,-88.174597,Bartlett,IL,60103.0,suburban,Elgin School District U-46,...,Grades 3-5,30.0,444.36,522.78,7.0,f,f,2013-12-31,1,2013
33d59ac771b80222ad63ef0f4ac47ade,de83b4c1f6428a15032c207c1d5e572a,d91a805b213bf74ae77b94e0de2b73ad,160153000000.0,43.501154,-112.05678,Idaho Falls,ID,83402.0,urban,Idaho Falls School District 91,...,Grades 3-5,30.0,233.24,274.4,30.0,f,f,2013-12-31,0,2013
1a3aaeffc56dd2a421e37d8298024c0a,f4c9ed095b85458dcf858e25f203af00,9310d3eb447a4e46bc5fc31ed007ceac,330261000000.0,42.888244,-71.320224,Derry,NH,3038.0,suburban,School Administrative Unit 10,...,Grades 6-8,30.0,285.09,335.4,230.0,f,f,2013-12-31,0,2013
33aa19ee4da4c5adf47d0dfb84fab5ef,17768031eb40de8d4497dbb54df48742,9ac70da58322783f82152eecc140a812,510324000000.0,37.476158,-77.488397,Richmond,VA,23224.0,urban,Richmond City School District,...,Grades PreK-2,30.0,232.94,274.05,18.0,f,f,2013-12-31,0,2013
e31c0ea8b68f404699dfb0d39e9bc99b,0f1bc5b4700fd33383be104442660178,cb9f688cf59e3ee22a087d616ca8f5d7,170993000000.0,41.952851,-87.650233,Chicago,IL,60613.0,urban,Ravenswood-ridge Elem Network,...,Grades 6-8,30.0,513.41,604.01,70.0,t,f,2013-12-31,1,2013


#### Outlier Removal

In [77]:
# total price and students reached values should not be inf 
ml.remove_over_under_threshold(manip_df, col = 'total_price_excluding_optional_support', min_val = 0, max_val = False, lwr_threshold = 0.001, upr_threshold = False)
ml.remove_over_under_threshold(manip_df, col = 'total_price_including_optional_support', min_val = 0, max_val = False, lwr_threshold = 0.001, upr_threshold = False)
ml.remove_over_under_threshold(manip_df, col = 'students_reached', min_val = 0, max_val = False, lwr_threshold = 0.005, upr_threshold = False)


In [78]:
# check that expected values were removed
likely_outliers_lower_post = ml.view_likely_outliers(manip_df, max = False)
likely_outliers_lower_post

Unnamed: 0,school_ncesid,school_latitude,school_longitude,school_zip,fulfillment_labor_materials,total_price_excluding_optional_support,total_price_including_optional_support,students_reached,fully_funded,year
0.0,,,,,,,,,,
0.001,0.002649951,0.143325,-0.078619,1.726829,0.0,0.011051,0.011057,0.0,,0.0
0.002,0.007479675,0.023015,-0.001138,0.432916,0.0,0.012375,0.012354,0.0,,0.0
0.003,0.007127319,0.001851,-0.000891,0.004994,0.0,0.008899,0.00894,0.2,,0.0
0.004,0.00294871,0.005969,-0.000905,0.142857,0.0,0.008966,0.008984,0.0,,0.0
0.005,0.00323409,0.180625,-0.01669,0.006522,0.0,0.007008,0.007001,0.0,,0.0
0.006,0.002735179,0.007932,-0.034093,0.093952,0.0,0.005237,0.005245,0.0,,0.0
0.007,0.002545706,0.003906,-0.00059,0.045903,0.0,0.005567,0.005521,0.166667,,0.0
0.008,0.004651396,0.001726,-0.170409,0.001416,0.0,0.004614,0.004646,0.0,,0.0
0.009,0.9361638,0.001303,-0.003872,0.002356,0.0,0.004027,0.004024,0.0,,0.0


### Binary and Dummy conversion

In [79]:
# check for null values
ml.print_null_freq(manip_df)

value                                    False   True 
variable                                              
date_posted                             353151       0
eligible_almost_home_match              353151       0
eligible_double_your_impact_match       353151       0
fulfillment_labor_materials             353151       0
fully_funded                            353151       0
grade_level                             353145       6
poverty_level                           353151       0
primary_focus_area                      353116      35
primary_focus_subject                   353116      35
resource_type                           353109      42
school_charter                          353151       0
school_charter_ready_promise            353151       0
school_city                             353151       0
school_county                           353151       0
school_district                         352722     429
school_kipp                             353151       0
school_lat

In [80]:
# convert binary values
manip_df.replace({'t': 1, 'f': 0}, inplace=True)

In [81]:
# confirm change
manip_df.head()

Unnamed: 0_level_0,teacher_acctid,schoolid,school_ncesid,school_latitude,school_longitude,school_city,school_state,school_zip,school_metro,school_district,...,grade_level,fulfillment_labor_materials,total_price_excluding_optional_support,total_price_including_optional_support,students_reached,eligible_double_your_impact_match,eligible_almost_home_match,date_posted,fully_funded,year
projectid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
62526d85d2a1818432d03d600969e99c,ebc7c90b6c92a069432e0714b8d93dfd,5aca9711ff0e4b37db48701f46f73036,171371000000.0,41.972419,-88.174597,Bartlett,IL,60103.0,suburban,Elgin School District U-46,...,Grades 3-5,30.0,444.36,522.78,7.0,0,0,2013-12-31,1,2013
33d59ac771b80222ad63ef0f4ac47ade,de83b4c1f6428a15032c207c1d5e572a,d91a805b213bf74ae77b94e0de2b73ad,160153000000.0,43.501154,-112.05678,Idaho Falls,ID,83402.0,urban,Idaho Falls School District 91,...,Grades 3-5,30.0,233.24,274.4,30.0,0,0,2013-12-31,0,2013
1a3aaeffc56dd2a421e37d8298024c0a,f4c9ed095b85458dcf858e25f203af00,9310d3eb447a4e46bc5fc31ed007ceac,330261000000.0,42.888244,-71.320224,Derry,NH,3038.0,suburban,School Administrative Unit 10,...,Grades 6-8,30.0,285.09,335.4,230.0,0,0,2013-12-31,0,2013
33aa19ee4da4c5adf47d0dfb84fab5ef,17768031eb40de8d4497dbb54df48742,9ac70da58322783f82152eecc140a812,510324000000.0,37.476158,-77.488397,Richmond,VA,23224.0,urban,Richmond City School District,...,Grades PreK-2,30.0,232.94,274.05,18.0,0,0,2013-12-31,0,2013
e31c0ea8b68f404699dfb0d39e9bc99b,0f1bc5b4700fd33383be104442660178,cb9f688cf59e3ee22a087d616ca8f5d7,170993000000.0,41.952851,-87.650233,Chicago,IL,60613.0,urban,Ravenswood-ridge Elem Network,...,Grades 6-8,30.0,513.41,604.01,70.0,1,0,2013-12-31,1,2013


In [82]:
manip_df.columns

Index(['teacher_acctid', 'schoolid', 'school_ncesid', 'school_latitude',
       'school_longitude', 'school_city', 'school_state', 'school_zip',
       'school_metro', 'school_district', 'school_county', 'school_charter',
       'school_magnet', 'school_year_round', 'school_nlns', 'school_kipp',
       'school_charter_ready_promise', 'teacher_prefix',
       'teacher_teach_for_america', 'teacher_ny_teaching_fellow',
       'primary_focus_subject', 'primary_focus_area',
       'secondary_focus_subject', 'secondary_focus_area', 'resource_type',
       'poverty_level', 'grade_level', 'fulfillment_labor_materials',
       'total_price_excluding_optional_support',
       'total_price_including_optional_support', 'students_reached',
       'eligible_double_your_impact_match', 'eligible_almost_home_match',
       'date_posted', 'fully_funded', 'year'],
      dtype='object')

In [83]:
type_dict['drop']

['school_latitude', 'school_longitude']

In [84]:
type_dict['tops']

[]

In [98]:
type_dict['multi']

['school_metro',
 'teacher_prefix',
 'primary_focus_area',
 'resource_type',
 'poverty_level',
 'grade_level']

In [99]:
def replace_dummies(df, cols_to_dummy):
    df = pd.get_dummies(df, columns = cols_to_dummy, dummy_na=True)


In [100]:
replace_dummies(manip_df, type_dict['multi'])


In [103]:
manip_df = pd.get_dummies(manip_df, columns = type_dict['multi'], dummy_na=True)

In [104]:
manip_df.columns

Index(['teacher_acctid', 'schoolid', 'school_ncesid', 'school_latitude',
       'school_longitude', 'school_city', 'school_state', 'school_zip',
       'school_district', 'school_county', 'school_charter', 'school_magnet',
       'school_year_round', 'school_nlns', 'school_kipp',
       'school_charter_ready_promise', 'teacher_teach_for_america',
       'teacher_ny_teaching_fellow', 'primary_focus_subject',
       'secondary_focus_subject', 'secondary_focus_area',
       'fulfillment_labor_materials', 'total_price_excluding_optional_support',
       'total_price_including_optional_support', 'students_reached',
       'eligible_double_your_impact_match', 'eligible_almost_home_match',
       'date_posted', 'fully_funded', 'year', 'school_metro_rural',
       'school_metro_suburban', 'school_metro_urban', 'school_metro_nan',
       'teacher_prefix_Dr.', 'teacher_prefix_Mr.', 'teacher_prefix_Mrs.',
       'teacher_prefix_Ms.', 'teacher_prefix_nan',
       'primary_focus_area_Applied Learni

In [None]:
# fill null values for which zero where it is most likely true value
ml.basic_fill_vals(manip_df, "fully_funded", method = "replace", replace_with = 0)


In [None]:
def record_nulls(df):
    for col in list(df.columns):
        title = col + "_was_null"
        df[title] = df[col].isnull().astype(int)
    df = df.loc[:, (df != 0).any(axis=0)]

In [None]:
# record_nulls(manip_df)

In [None]:
manip_df.head()

In [None]:
ml.print_null_freq(manip_df)

In [None]:
exp.view_dist(manip_df, geo_columns= True,
             fig_size = (20, 20),
             labels = ["Feature Distributions", "Feature", "Frequency"])

In [None]:
# # exp.view_dist(manip_df, geo_columns= True,
#              fig_size = (20, 20),
#              labels = ["Feature Distributions", "Feature", "Frequency"])


In [None]:
manip_df.head()

In [None]:
train_yr_qtr, test_yr_qtr = ml.time_series_split(manip_df, date_col = 'date_posted', train_size = 12, test_size = 3, increment = 'month', specify_start = '2011-01-01')

In [None]:
train_15_qtr, test_15_qtr = ml.time_series_split(manip_df, date_col = 'date_posted', train_size = 15, test_size = 3, increment = 'month', specify_start = '2011-01-01')


In [None]:
train_18_qtr, test_18_qtr = ml.time_series_split(manip_df, date_col = 'date_posted', train_size = 18, test_size = 3, increment = 'month', specify_start = '2011-01-01')


In [None]:
train_21_qtr, test_21_qtr = ml.time_series_split(manip_df, date_col = 'date_posted', train_size = 18, test_size = 3, increment = 'month', specify_start = '2011-01-01')


In [None]:
pd.get_dummies(manip_df, prefix="dummy", prefix_sep='_', dummy_na=False, columns=None)

In [None]:
params_test = { 
    'RF':{'n_estimators': [1], 'max_depth': [1], 'max_features': ['sqrt'],'min_samples_split': [10], 'n_jobs': [-1]},
    'LR': { 'penalty': ['l1'], 'C': [0.01]},
    'SGD': { 'loss': ['perceptron'], 'penalty': ['l2']},
    'ET': { 'n_estimators': [1], 'criterion' : ['gini'] ,'max_depth': [1], 'max_features': ['sqrt'],'min_samples_split': [10], 'n_jobs': [-1]},
    'AB': { 'algorithm': ['SAMME'], 'n_estimators': [1]},
    'GB': {'n_estimators': [1], 'learning_rate' : [0.1],'subsample' : [0.5], 'max_depth': [1]},
    'NB' : {},
    'DT': {'criterion': ['gini'], 'max_depth': [1], 'max_features': [None],'min_samples_split': [10]},
    'SVM' :{'C' :[0.01],'kernel':['linear']},
    'KNN' :{'n_neighbors': [5],'weights': ['uniform'],'algorithm': ['auto']}
           }

In [63]:
small_grid = { 
    "Random Forest":{'n_estimators': [100, 10000], 'max_depth': [5,50], 'max_features': ['sqrt','log2'],'min_samples_split': [2,10], 'n_jobs':[-1]},
    "Logistic Regression": { 'penalty': ['l1','l2'], 'C': [0.00001,0.001,0.1,1,10]},
    "Naive Bayes" : {},
    "Decision Tree": {'criterion': ['gini', 'entropy'], 'max_depth': [1,5,10,20,50,100], 'max_features': [None,'sqrt','log2'],'min_samples_split': [2,5,10]},
    'SVM' :{'C' :[0.00001,0.0001,0.001,0.01,0.1,1,10],'kernel':['linear']},
    'KNN' :{'n_neighbors': [1,5,10,25,50,100],'weights': ['uniform','distance'],'algorithm': ['auto','ball_tree','kd_tree']}
           }

In [74]:
for clf_type, params in small_grid.items():
    print(clf_type)
    print(params)
    print()
    for tried in list(itertools.product(params.values())):
        print(tried)
    

Random Forest
{'n_estimators': [100, 10000], 'max_depth': [5, 50], 'max_features': ['sqrt', 'log2'], 'min_samples_split': [2, 10], 'n_jobs': [-1]}

([100, 10000],)
([5, 50],)
(['sqrt', 'log2'],)
([2, 10],)
([-1],)
Logistic Regression
{'penalty': ['l1', 'l2'], 'C': [1e-05, 0.001, 0.1, 1, 10]}

(['l1', 'l2'],)
([1e-05, 0.001, 0.1, 1, 10],)
Naive Bayes
{}

Decision Tree
{'criterion': ['gini', 'entropy'], 'max_depth': [1, 5, 10, 20, 50, 100], 'max_features': [None, 'sqrt', 'log2'], 'min_samples_split': [2, 5, 10]}

(['gini', 'entropy'],)
([1, 5, 10, 20, 50, 100],)
([None, 'sqrt', 'log2'],)
([2, 5, 10],)
SVM
{'C': [1e-05, 0.0001, 0.001, 0.01, 0.1, 1, 10], 'kernel': ['linear']}

([1e-05, 0.0001, 0.001, 0.01, 0.1, 1, 10],)
(['linear'],)
KNN
{'n_neighbors': [1, 5, 10, 25, 50, 100], 'weights': ['uniform', 'distance'], 'algorithm': ['auto', 'ball_tree', 'kd_tree']}

([1, 5, 10, 25, 50, 100],)
(['uniform', 'distance'],)
(['auto', 'ball_tree', 'kd_tree'],)


In [114]:
# parameters for looping sourced from example at https://github.com/rayidghani/magicloops/blob/master/mlfunctions.py
dt_params = {'criterion': ['gini', 'entropy'], 'max_depth': [1,5,10,20,50,100], 'max_features': [None, 'sqrt','log2'],'min_samples_split': [2,5,10]}
dt_parameter = list(dt_params.keys())
# for params in list(itertools.product(*dt_params.values())):
#     print(params)
list_params = list(itertools.product(*dt_params.values()))

In [115]:
dt_parameter

['criterion', 'max_depth', 'max_features', 'min_samples_split']

In [116]:
list_params

[('gini', 1, None, 2),
 ('gini', 1, None, 5),
 ('gini', 1, None, 10),
 ('gini', 1, 'sqrt', 2),
 ('gini', 1, 'sqrt', 5),
 ('gini', 1, 'sqrt', 10),
 ('gini', 1, 'log2', 2),
 ('gini', 1, 'log2', 5),
 ('gini', 1, 'log2', 10),
 ('gini', 5, None, 2),
 ('gini', 5, None, 5),
 ('gini', 5, None, 10),
 ('gini', 5, 'sqrt', 2),
 ('gini', 5, 'sqrt', 5),
 ('gini', 5, 'sqrt', 10),
 ('gini', 5, 'log2', 2),
 ('gini', 5, 'log2', 5),
 ('gini', 5, 'log2', 10),
 ('gini', 10, None, 2),
 ('gini', 10, None, 5),
 ('gini', 10, None, 10),
 ('gini', 10, 'sqrt', 2),
 ('gini', 10, 'sqrt', 5),
 ('gini', 10, 'sqrt', 10),
 ('gini', 10, 'log2', 2),
 ('gini', 10, 'log2', 5),
 ('gini', 10, 'log2', 10),
 ('gini', 20, None, 2),
 ('gini', 20, None, 5),
 ('gini', 20, None, 10),
 ('gini', 20, 'sqrt', 2),
 ('gini', 20, 'sqrt', 5),
 ('gini', 20, 'sqrt', 10),
 ('gini', 20, 'log2', 2),
 ('gini', 20, 'log2', 5),
 ('gini', 20, 'log2', 10),
 ('gini', 50, None, 2),
 ('gini', 50, None, 5),
 ('gini', 50, None, 10),
 ('gini', 50, 'sqrt',

In [117]:
all_dicts = []
for params in list_params:
    kwargs_dict = dict(zip(dt_parameter, params))
    all_dicts.append(kwargs_dict)


In [120]:
all_dicts

[{'criterion': 'gini',
  'max_depth': 1,
  'max_features': None,
  'min_samples_split': 2},
 {'criterion': 'gini',
  'max_depth': 1,
  'max_features': None,
  'min_samples_split': 5},
 {'criterion': 'gini',
  'max_depth': 1,
  'max_features': None,
  'min_samples_split': 10},
 {'criterion': 'gini',
  'max_depth': 1,
  'max_features': 'sqrt',
  'min_samples_split': 2},
 {'criterion': 'gini',
  'max_depth': 1,
  'max_features': 'sqrt',
  'min_samples_split': 5},
 {'criterion': 'gini',
  'max_depth': 1,
  'max_features': 'sqrt',
  'min_samples_split': 10},
 {'criterion': 'gini',
  'max_depth': 1,
  'max_features': 'log2',
  'min_samples_split': 2},
 {'criterion': 'gini',
  'max_depth': 1,
  'max_features': 'log2',
  'min_samples_split': 5},
 {'criterion': 'gini',
  'max_depth': 1,
  'max_features': 'log2',
  'min_samples_split': 10},
 {'criterion': 'gini',
  'max_depth': 5,
  'max_features': None,
  'min_samples_split': 2},
 {'criterion': 'gini',
  'max_depth': 5,
  'max_features': None,


In [131]:
args = all_dicts[0]
dec_tree = DecisionTreeClassifier(**args)


In [151]:
x = {'A': [1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0], 'B': ['pink', 'blue','pink', 'blue', 'pink', 'pink', 'blue', 'blue',  'blue','pink', 'blue', 'pink', 'blue','pink']}

In [152]:
y = {'C': [1, 9, 1, 0, 1, 4, 0, 6, 1, 7, 1, 1, 3, 1]}

In [153]:
features = pd.DataFrame(x)
features

Unnamed: 0,A,B
0,1,pink
1,0,blue
2,1,pink
3,0,blue
4,1,pink
5,1,pink
6,0,blue
7,1,blue
8,0,blue
9,1,pink


In [154]:
features = pd.get_dummies(features, 'B')
features

Unnamed: 0,A,B_blue,B_pink
0,1,0,1
1,0,1,0
2,1,0,1
3,0,1,0
4,1,0,1
5,1,0,1
6,0,1,0
7,1,1,0
8,0,1,0
9,1,0,1


In [155]:
ys = pd.DataFrame(y)
ys

Unnamed: 0,C
0,1
1,9
2,1
3,0
4,1
5,4
6,0
7,6
8,1
9,7


In [156]:
x_train, x_test, y_train, y_test = train_test_split(features, ys, test_size = 0.2, random_state = 0)

In [157]:
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(11, 3)
(11, 1)
(3, 3)
(3, 1)


In [158]:
dec_tree.fit(x_train, y_train)
train_pred = dec_tree.predict(x_train)
test_pred = dec_tree.predict(x_test)
train_acc = accuracy(x_train, y_train)
test_acc = accuracy(x_test, y_test)
print(train_acc, test_acc)

ValueError: Classification metrics can't handle a mix of multilabel-indicator and multiclass targets