In [1]:
# Project-local modules.
# NOTE(review): the wildcard imports hide where the helpers called below
# (read_config, read_data, summarize_data, temporal_loop, evaluation_table)
# are defined — presumably config / extract_data / model; confirm and
# consider importing those names explicitly.
import pipeline
from model import *
from extract_data import *
from config import *
import warnings
# Suppresses all warnings for the rest of the session, so any warning
# emitted by the cells below is hidden from the notebook output.
warnings.simplefilter('ignore')

In [2]:
# Load the run settings from the YAML file that sits next to this notebook.
CONFIG_PATH = './config.yaml'
CONFIG = read_config(CONFIG_PATH)

## 1. Read/Preprocessing Data

Reading and preprocessing the data has largely been automated. To reproduce this step, run:

    $ python3 preprocessing.py
    
And you should see something like:
    

    The data is successfully loaded!

    ################################################################

    Summary for the loaded dataset

    Data Shape: (124976, 25)

    Descritive Statistics:

           school_ncesid  school_latitude  school_longitude  total_price_including_optional_support  students_reached
    count   1.157430e+05    124976.000000     124976.000000                           124976.000000     124917.000000
    mean    2.448448e+11        36.827284        -95.859299                              654.011811         95.445760
    std     1.644728e+11         4.963669         18.392876                             1098.015854        163.481912
    min     1.000050e+10        18.249140       -171.690554                               92.000000          1.000000
    25%     6.344101e+10        33.872504       -117.806418                              345.810000         23.000000
    50%     2.200870e+11        36.617410        -90.101563                              510.500000         30.000000
    75%     3.704880e+11        40.676156        -80.713740                              752.960000        100.000000
    max     6.100010e+11        65.672562        -66.628036                           164382.840000      12143.000000

    ################################################################

    ATTEMPTING TO PREPROCESS THE DATA

    The following features have been selected:
    ['school_latitude', 'school_longitude', 'school_metro', 'school_charter', 'school_magnet', 'primary_focus_area', 'poverty_level', 'total_price_including_optional_support', 'students_reached', 'eligible_double_your_impact_match', 'date_posted', 'datefullyfunded']

    The label is appended as fullyfunded_within_60days.
    Dummy variables are created based on ['school_metro', 'school_charter', 'school_magnet', 'primary_focus_area', 'poverty_level', 'eligible_double_your_impact_match'].
                      NaN  Percent of NaN
    students_reached   59            0.05

    ################################################################

    The following features have missing values:
    ['students_reached']

    - students_reached has 59 missing values, which are 0.05% of the entire data
    For students_reached, median is selected.

    Imputation completed!
    The cleaned data is saved as ./data/clean_projects_2012_2013.csv.


In [3]:
# Cleaned dataset written by preprocessing.py (see the log above).
CLEAN_DATA_PATH = './data/clean_projects_2012_2013.csv'
df = read_data(CLEAN_DATA_PATH)

In [6]:
# Print the shape and descriptive statistics of the cleaned data as a sanity
# check that the imputed file loaded as expected.
summarize_data(df)

################################################################

Summary for the loaded dataset

Data Shape: (124976, 26)

Descritive Statistics:

       school_latitude  school_longitude  \
count  124976.000000    124976.000000      
mean   36.827284       -95.859299          
std    4.963669         18.392876          
min    18.249140       -171.690554         
25%    33.872504       -117.806418         
50%    36.617410       -90.101563          
75%    40.676156       -80.713740          
max    65.672562       -66.628036          

       total_price_including_optional_support  students_reached  \
count  124976.000000                           124976.000000      
mean   654.011811                              95.414864          
std    1098.015854                             163.449500         
min    92.000000                               1.000000           
25%    345.810000                              23.000000          
50%    510.500000                              30.000

## 2. Train / Evaluate Models

We will be using various classifiers: __*Logistic Regression, K-Nearest Neighbor, Decision Trees, SVM, Random Forests, Boosting, and Bagging*__.

Furthermore, we will use temporal validation to create three sets of training and testing data. For this analysis, each testing set spans 6 months, starting on '2012-07-01', '2013-01-01', and '2013-07-01', respectively.

We will first split the data temporally.

In [7]:
# Start date of each testing window and the window length in months.
TEST_START_DATES = ['2012-07-01', '2013-01-01', '2013-07-01']
TEST_WINDOW_MONTHS = 6
temporal_sets = temporal_loop(
    df,
    'date_posted',
    TEST_START_DATES,
    TEST_WINDOW_MONTHS,
)

__*evaluation_table()*__ reports evaluation metrics including accuracy, F1, area under the ROC curve, and precision and recall at different levels. Specifically, it displays precision and recall at 1%, 2%, 5%, 10%, 20%, 30%, and 50%.

By calling evaluation_table(temporal_sets, grid=True), we can use custom parameters set in config.yaml for each classifier. For demonstration purposes, we run every supported classifier with its default parameters.

In [8]:
# Evaluate the classifiers on each temporal split; called without grid=True,
# so the Parameters column of the result reads "Default".
full_results = evaluation_table(temporal_sets)

In [9]:
# Preview the first rows of the results table.
full_results.head()

Unnamed: 0,Date,Model,Parameters,Accuracy,F1,AUC_ROC,Precision_at_1%,Recall_at_1%,Precision_at_2%,Recall_at_2%,Precision_at_5%,Recall_at_5%,Precision_at_10%,Recall_at_10%,Precision_at_20%,Recall_at_20%,Precision_at_30%,Recall_at_30%,Precision_at_50%,Recall_at_50%
0,2012-07-01,LR,Default,0.737956,0.847161,0.623556,0.887195,0.0119257,0.893293,0.0240154,0.893358,0.0600795,0.875723,0.117823,0.855946,0.230359,0.840016,0.339125,0.81022,0.545183
1,2012-07-01,KNN,Default,0.669986,0.783826,0.576162,0.871951,0.0117208,0.859756,0.0231138,0.845216,0.0568419,0.831252,0.11184,0.823359,0.221589,0.800832,0.323306,0.785919,0.528831
2,2012-07-01,DT,Default,0.621475,0.733388,0.546529,0.759146,0.0102045,0.753049,0.0202451,0.759293,0.0510635,0.758148,0.102004,0.762905,0.205319,0.764592,0.308676,0.768317,0.516987
3,2012-07-01,SVM,Default,0.743072,0.8526,0.396636,0.530488,0.00713086,0.585366,0.0157371,0.636807,0.0428261,0.652757,0.0878243,0.653114,0.175771,0.664197,0.268145,0.684695,0.460719
4,2012-07-01,RF,Default,0.699099,0.810487,0.606221,0.853659,0.0114749,0.864329,0.0232368,0.86167,0.0579484,0.865062,0.116389,0.840871,0.226302,0.829357,0.334822,0.801815,0.539527


In [10]:
# Rank every model/date combination from highest to lowest F1 score.
full_results.sort_values(by='F1', ascending=False)

Unnamed: 0,Date,Model,Parameters,Accuracy,F1,AUC_ROC,Precision_at_1%,Recall_at_1%,Precision_at_2%,Recall_at_2%,Precision_at_5%,Recall_at_5%,Precision_at_10%,Recall_at_10%,Precision_at_20%,Recall_at_20%,Precision_at_30%,Recall_at_30%,Precision_at_50%,Recall_at_50%
3,2012-07-01,SVM,Default,0.743072,0.8526,0.396636,0.530488,0.00713086,0.585366,0.0157371,0.636807,0.0428261,0.652757,0.0878243,0.653114,0.175771,0.664197,0.268145,0.684695,0.460719
0,2012-07-01,LR,Default,0.737956,0.847161,0.623556,0.887195,0.0119257,0.893293,0.0240154,0.893358,0.0600795,0.875723,0.117823,0.855946,0.230359,0.840016,0.339125,0.81022,0.545183
5,2012-07-01,GB,Default,0.725349,0.831851,0.66585,0.987805,0.0132781,0.972561,0.0261465,0.960999,0.0646285,0.940299,0.126511,0.90437,0.243392,0.88123,0.355764,0.836714,0.56301
14,2013-07-01,LR,Default,0.71198,0.829021,0.641628,0.945578,0.0131983,0.92299,0.0257952,0.899004,0.0628264,0.875453,0.122361,0.847843,0.237031,0.830264,0.348188,0.802654,0.561006
19,2013-07-01,GB,Default,0.716236,0.827623,0.689562,0.986395,0.013768,0.978482,0.0273461,0.965127,0.0674474,0.943388,0.131856,0.908638,0.254028,0.881358,0.369615,0.828465,0.579047
17,2013-07-01,SVM,Default,0.71044,0.826149,0.669144,0.941043,0.013135,0.944507,0.0263966,0.934783,0.0653268,0.914855,0.127868,0.881694,0.246495,0.863472,0.362114,0.819363,0.572686
12,2013-01-01,GB,Default,0.690804,0.811735,0.690484,0.962791,0.0139988,0.969838,0.0282681,0.953661,0.0695882,0.91937,0.134172,0.883947,0.258065,0.857143,0.37533,0.806245,0.588422
7,2013-01-01,LR,Default,0.685059,0.811731,0.63118,0.930233,0.0135254,0.914153,0.026645,0.877665,0.0640427,0.855422,0.124839,0.818624,0.238994,0.792896,0.347197,0.765289,0.558531
4,2012-07-01,RF,Default,0.699099,0.810487,0.606221,0.853659,0.0114749,0.864329,0.0232368,0.86167,0.0579484,0.865062,0.116389,0.840871,0.226302,0.829357,0.334822,0.801815,0.539527
6,2012-07-01,BG,Default,0.699494,0.80952,0.607188,0.841463,0.011311,0.86128,0.0231548,0.86167,0.0579484,0.86293,0.116102,0.84011,0.226097,0.827733,0.334167,0.80151,0.539322


By changing *models* in config.yaml, we can choose a specific classifier model and experiment with different parameters.

In [11]:
# NOTE(review): this is the same call that produced full_results, yet the
# output below differs (e.g. the SVM row, and slightly DT/RF). Presumably
# config.yaml was edited between the two runs and/or the classifiers are not
# seeded — confirm, and record the config change so the run is reproducible.
test_results = evaluation_table(temporal_sets)

In [12]:
# Preview the first rows of the re-run results for comparison with full_results.
test_results.head()

Unnamed: 0,Date,Model,Parameters,Accuracy,F1,AUC_ROC,Precision_at_1%,Recall_at_1%,Precision_at_2%,Recall_at_2%,Precision_at_5%,Recall_at_5%,Precision_at_10%,Recall_at_10%,Precision_at_20%,Recall_at_20%,Precision_at_30%,Recall_at_30%,Precision_at_50%,Recall_at_50%
0,2012-07-01,LR,Default,0.737956,0.847161,0.623556,0.887195,0.0119257,0.893293,0.0240154,0.893358,0.0600795,0.875723,0.117823,0.855946,0.230359,0.840016,0.339125,0.81022,0.545183
1,2012-07-01,KNN,Default,0.669986,0.783826,0.576162,0.871951,0.0117208,0.859756,0.0231138,0.845216,0.0568419,0.831252,0.11184,0.823359,0.221589,0.800832,0.323306,0.785919,0.528831
2,2012-07-01,DT,Default,0.620592,0.732393,0.546549,0.737805,0.00991763,0.751524,0.0202041,0.756856,0.0508996,0.755407,0.101635,0.76123,0.204869,0.762562,0.307856,0.767343,0.516331
3,2012-07-01,SVM,Default,0.725958,0.838342,0.601623,0.92378,0.0124175,0.907012,0.0243842,0.899452,0.0604893,0.877551,0.118069,0.853205,0.229622,0.83626,0.337609,0.80218,0.539773
4,2012-07-01,RF,Default,0.702479,0.812512,0.61121,0.847561,0.011393,0.853659,0.0229499,0.853138,0.0573747,0.860493,0.115774,0.840871,0.226302,0.831591,0.335724,0.805043,0.541699
