# Homework 3
## Prediction of non funded projects using cross validation
Machine Learning for Public Policy

Camilo Arias

- load, clean and transform functions in pipeline.py

- classifiers in classifiers.py

- Functions to run cross validation in prediction.py

In [49]:
%load_ext autoreload
%autoreload 2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pipeline as ppln
import classifiers as classif
import prediction
from sklearn.model_selection import train_test_split
from sklearn.model_selection import TimeSeriesSplit

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Parameters of the model

In [54]:
seed = 1234
params = {
    'days': 60,
    'test_size': 0.3,
    'work_with_sample': 0.1,
    'thresholds': [0.5, 0.3, 0.2, 0.1, 0.05, 0.02, 0.01],
    'svm_scores': [-1, -0.8, -0.6, -0.4, -0.3, -0.1, 0],
    'models_to_run': ['KNN', 'decision_tree', 'logistic_reg', 'svm', 'random_forest', 'gradient_boost'],
    'KNN': {'k': 10,
            'weights': 'uniform',
            'metric': 'euclidean'},

    'decision_tree': {'criterion': 'gini',
                      'max_depth': 25},

    'logistic_reg': {'C': 1,
                     'penalty': 'l2',
                     'fit_intercept': True,
                     'seed': seed},

    'svm': {'C': 1,
            'seed': seed},
    'random_forest': {'criterion': 'gini',
                      'max_depth': 15,
                      'n_estimators': 100,
                      'seed': seed},

    'gradient_boost': {'max_depth': 15,
                       'n_estimators': 100,
                       'loss': 'deviance',
                       'seed': seed},
    'out_csv': 'results1.csv'
}
outcome_var = "not_funded_in_{}_days".format(params['days'])

## Loading data and cleaning

In [61]:
projects_df = ppln.load_from_csv('projects_2012_2013.csv')
projects_df = ppln.create_outcome_var(projects_df, params['days'])
initial_length = projects_df.shape[1]
to_discrete = ['total_price_including_optional_support', 'students_reached']
to_dummy = ['school_state', 'school_metro', 'teacher_prefix', 'primary_focus_subject',
            'primary_focus_area', 'secondary_focus_subject', 'secondary_focus_area',
            'resource_type', 'poverty_level', 'grade_level', 'total_price_including_optional_support',
            'students_reached']
for col in to_discrete:
    projects_df[col] = ppln.discretize(projects_df[col], 4, string=True)
projects_df = ppln.make_dummies_from_categorical(projects_df, to_dummy)
projects_df['semester'], semesters = ppln.set_semester(projects_df['date_posted'])


### To run model only using a sample of samplesize: params['work_with_sample']

In [64]:
if params['work_with_sample']:
    projects_df = projects_df.sample(frac=params['work_with_sample'],
                                     random_state=seed)
projects_df.shape                                                      

(124976, 166)


(12498, 166)

### Setting X and Y

In [65]:
features = list(projects_df.columns[initial_length - len(to_dummy):]) #Get only new dummies
features += ['school_charter', 'school_magnet', 'eligible_double_your_impact_match']
y = projects_df[outcome_var]
x = projects_df[features]

In [82]:
dict_results = prediction.run_(x, y, projects_df['semester'], params)

Begining cross k: 1
Train set has 2616 rows, with semester values of [0]
Test set has 3256 rows, with semester values of [1]

Fitting KNN

Classifying model KNN with threshold 0.5
Classifying model KNN with threshold 0.3
Classifying model KNN with threshold 0.2
Classifying model KNN with threshold 0.1
Classifying model KNN with threshold 0.05
Classifying model KNN with threshold 0.02
Classifying model KNN with threshold 0.01

Fitting decision_tree

Classifying model decision_tree with threshold 0.5
Classifying model decision_tree with threshold 0.3
Classifying model decision_tree with threshold 0.2
Classifying model decision_tree with threshold 0.1
Classifying model decision_tree with threshold 0.05
Classifying model decision_tree with threshold 0.02
Classifying model decision_tree with threshold 0.01

Fitting logistic_reg

Classifying model logistic_reg with threshold 0.5
Classifying model logistic_reg with threshold 0.3
Classifying model logistic_reg with threshold 0.2
Classifying mo

  'precision', 'predicted', average, warn_for)


Classifying model svm with threshold -1
Classifying model svm with threshold -0.8
Classifying model svm with threshold -0.6
Classifying model svm with threshold -0.4
Classifying model svm with threshold -0.3
Classifying model svm with threshold -0.1
Classifying model svm with threshold 0

Fitting random_forest

Classifying model random_forest with threshold 0.5
Classifying model random_forest with threshold 0.3
Classifying model random_forest with threshold 0.2
Classifying model random_forest with threshold 0.1
Classifying model random_forest with threshold 0.05
Classifying model random_forest with threshold 0.02
Classifying model random_forest with threshold 0.01

Fitting gradient_boost



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Classifying model gradient_boost with threshold 0.5
Classifying model gradient_boost with threshold 0.3
Classifying model gradient_boost with threshold 0.2
Classifying model gradient_boost with threshold 0.1
Classifying model gradient_boost with threshold 0.05
Classifying model gradient_boost with threshold 0.02
Classifying model gradient_boost with threshold 0.01
Begining cross k: 2
Train set has 5872 rows, with semester values of [0 1]
Test set has 2229 rows, with semester values of [2]

Fitting KNN

Classifying model KNN with threshold 0.5
Classifying model KNN with threshold 0.3
Classifying model KNN with threshold 0.2
Classifying model KNN with threshold 0.1
Classifying model KNN with threshold 0.05
Classifying model KNN with threshold 0.02
Classifying model KNN with threshold 0.01

Fitting decision_tree

Classifying model decision_tree with threshold 0.5
Classifying model decision_tree with threshold 0.3
Classifying model decision_tree with threshold 0.2
Classifying model decisio

  'precision', 'predicted', average, warn_for)


Classifying model svm with threshold -1
Classifying model svm with threshold -0.8
Classifying model svm with threshold -0.6
Classifying model svm with threshold -0.4
Classifying model svm with threshold -0.3
Classifying model svm with threshold -0.1
Classifying model svm with threshold 0

Fitting random_forest

Classifying model random_forest with threshold 0.5
Classifying model random_forest with threshold 0.3
Classifying model random_forest with threshold 0.2
Classifying model random_forest with threshold 0.1
Classifying model random_forest with threshold 0.05
Classifying model random_forest with threshold 0.02
Classifying model random_forest with threshold 0.01

Fitting gradient_boost



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Classifying model gradient_boost with threshold 0.5
Classifying model gradient_boost with threshold 0.3
Classifying model gradient_boost with threshold 0.2
Classifying model gradient_boost with threshold 0.1
Classifying model gradient_boost with threshold 0.05
Classifying model gradient_boost with threshold 0.02
Classifying model gradient_boost with threshold 0.01
Begining cross k: 3
Train set has 8101 rows, with semester values of [0 1 2]
Test set has 4397 rows, with semester values of [3]

Fitting KNN

Classifying model KNN with threshold 0.5
Classifying model KNN with threshold 0.3
Classifying model KNN with threshold 0.2
Classifying model KNN with threshold 0.1
Classifying model KNN with threshold 0.05
Classifying model KNN with threshold 0.02
Classifying model KNN with threshold 0.01

Fitting decision_tree

Classifying model decision_tree with threshold 0.5
Classifying model decision_tree with threshold 0.3
Classifying model decision_tree with threshold 0.2
Classifying model decis

  'precision', 'predicted', average, warn_for)


Classifying model svm with threshold -1
Classifying model svm with threshold -0.8
Classifying model svm with threshold -0.6
Classifying model svm with threshold -0.4
Classifying model svm with threshold -0.3
Classifying model svm with threshold -0.1
Classifying model svm with threshold 0

Fitting random_forest

Classifying model random_forest with threshold 0.5
Classifying model random_forest with threshold 0.3
Classifying model random_forest with threshold 0.2
Classifying model random_forest with threshold 0.1
Classifying model random_forest with threshold 0.05
Classifying model random_forest with threshold 0.02
Classifying model random_forest with threshold 0.01

Fitting gradient_boost



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Classifying model gradient_boost with threshold 0.5
Classifying model gradient_boost with threshold 0.3
Classifying model gradient_boost with threshold 0.2
Classifying model gradient_boost with threshold 0.1
Classifying model gradient_boost with threshold 0.05
Classifying model gradient_boost with threshold 0.02
Classifying model gradient_boost with threshold 0.01


In [83]:
results = pd.DataFrame(dict_results)

In [84]:
results.head(30)

Unnamed: 0,model,cross_k,threshold,precision,recall,AUC ROC
0,KNN,1,0.5,0.325,0.273854,0.578358
1,KNN,1,0.3,0.2795,0.692689,0.578358
2,KNN,1,0.2,0.263781,0.883519,0.578358
3,KNN,1,0.1,0.251364,0.97026,0.578358
4,KNN,1,0.05,0.251364,0.97026,0.578358
5,KNN,1,0.02,0.251364,0.97026,0.578358
6,KNN,1,0.01,0.251364,0.97026,0.578358
7,decision_tree,1,0.5,0.303621,0.405204,0.556766
8,decision_tree,1,0.3,0.305842,0.44114,0.556766
9,decision_tree,1,0.2,0.307432,0.451053,0.556766


In [91]:
multi_index = results.sort_values(['model', 'cross_k', 'threshold']).set_index(['model', 'cross_k', 'AUC ROC', 'threshold'])

In [92]:
with pd.option_context("display.max_rows", 200, "display.max_columns", 10):
    print(multi_index)

                                           precision    recall
model          cross_k AUC ROC  threshold                     
KNN            1       0.578358  0.01       0.251364  0.970260
                                 0.02       0.251364  0.970260
                                 0.05       0.251364  0.970260
                                 0.10       0.251364  0.970260
                                 0.20       0.263781  0.883519
                                 0.30       0.279500  0.692689
                                 0.50       0.325000  0.273854
               2       0.582469  0.01       0.317336  0.941860
                                 0.02       0.317336  0.941860
                                 0.05       0.317336  0.941860
                                 0.10       0.317336  0.941860
                                 0.20       0.336437  0.787791
                                 0.30       0.359316  0.549419
                                 0.50       0.461538  0