In [1]:
%load_ext autoreload
%autoreload 2
import numpy as np
import pandas as pd
import warnings
from dateutil import parser

import pipeline as pp

# Notebook-wide settings.
# Suppress every warning for the rest of the session. This keeps cell output
# short, but it also hides deprecation notices and other diagnostics.
warnings.filterwarnings(action='ignore')
# Allow up to 999 columns to be shown when a DataFrame is displayed.
pd.set_option('display.max_columns', 999)

In [2]:
# Load the raw project records through the project's CSV helper.
file = 'data/projects_2012_2013.csv'
# Both date columns are needed as datetimes: the next cell subtracts them.
parse_dates = ['date_posted', 'datefullyfunded']
# Read the NCES school id as text (presumably to avoid numeric coercion of an
# identifier -- confirm against pipeline.read_csv).
coltypes = dict(school_ncesid=str)
df = pp.read_csv(file, parse_dates=parse_dates, coltypes=coltypes)

In [3]:
# Data preparation

## Generate outcome variable

# A project counts as funded "in time" when it is fully funded within this many
# days of being posted.
FUNDING_WINDOW_DAYS = 60

# Whole days between posting and full funding. The vectorized `.dt.days`
# accessor replaces the row-by-row `apply`; rows with a missing date yield NaN.
df['time_till_funded'] = (df.datefullyfunded - df.date_posted).dt.days

# Target variable: 1 if the project was NOT fully funded within the window,
# 0 if it was. The test is written as "funded in time -> 0, otherwise 1" so
# that a row with a missing date (NaN days, for which every comparison is
# False) is labelled 1 instead of being silently counted as funded in time.
df['not_funded_wi_60d'] = np.where(df.time_till_funded <= FUNDING_WINDOW_DAYS, 0, 1)

In [12]:
# Construct pipeline
# Instantiate the project's Pipeline class with its default arguments. Later
# cells call `.run(...)` on this object and read its `.clfs` attribute.
pipeline = pp.Pipeline()

In [13]:
# Pipeline configuration: every value below is handed to `pipeline.run`.

# Name of the 0/1 outcome column created in the data-preparation cell.
label = 'not_funded_wi_60d'

# List of candidate feature sets; only one set is defined here.
predictor_sets = [[
    # school attributes
    'school_city',
    'school_state',
    'school_metro',
    'school_district',
    'school_county',
    'school_charter',
    'school_magnet',
    # teacher and project attributes
    'teacher_prefix',
    'primary_focus_subject',
    'primary_focus_area',
    'secondary_focus_subject',
    'secondary_focus_area',
    'resource_type',
    'poverty_level',
    'grade_level',
    # cost, reach and matching
    'total_price_including_optional_support',
    'students_reached',
    'eligible_double_your_impact_match',
]]

# Date column and overall date range used for the temporal train/test splits.
time_col = 'date_posted'
start = parser.parse('2012-01-01')
end = parser.parse('2013-12-31')

# Length of each test window in months, and the gap in days left for the
# outcome to be observed (matches the 60-day funding window of the label).
test_window_months = 6
outcome_lag_days = 60

# Where the evaluation results are written.
output_dir = 'output'
output_filename = 'evaluations.csv'

# Which hyper-parameter grid the pipeline should use.
grid_size = 'test'

In [None]:
# Run the pipeline with the settings from the parameter cell.
# `grid_size` is now taken from the configured variable instead of repeating
# the literal 'test', so changing it in the parameter cell actually takes effect.
pipeline.run(
    df, time_col, predictor_sets, label, start, end, test_window_months,
    outcome_lag_days, output_dir, output_filename,
    grid_size=grid_size,
    thresholds=[],
    # 0.0, 0.1, ..., 0.9 -- built with a float step, so some values carry
    # floating-point rounding (e.g. 0.30000000000000004).
    ks=list(np.arange(0, 1, 0.1)),
    save_output=True,
    debug=True,
)

START
set up done. output: output/evaluations.csv
## TRAIN: 2012-01-01 00:00:00 - 2012-06-30 00:00:00, TEST:2012-08-30 00:00:00 - 2013-02-27 00:00:00 ##
### Predictors: ['school_city', 'school_state', 'school_metro', 'school_district', 'school_county', 'school_charter', 'school_magnet', 'teacher_prefix', 'primary_focus_subject', 'primary_focus_area', 'secondary_focus_subject', 'secondary_focus_area', 'resource_type', 'poverty_level', 'grade_level', 'total_price_including_optional_support', 'students_reached', 'eligible_double_your_impact_match']
...train test split done
...pre-processing done
...feature generation done
#### RF
	{'max_depth': 1, 'max_features': 'sqrt', 'min_samples_split': 10, 'n_estimators': 1, 'n_jobs': -1}
set params
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=1, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_spl

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Scratch check: build a forest with the same constructor arguments shown for
# 'RF' in `pipeline.clfs`, then override them with the parameter combination
# printed in the pipeline's debug output just before its "set params" message.
rf = RandomForestClassifier(n_estimators=50, n_jobs=-1)
# `set_params` returns the estimator itself, so the cell displays the result.
rf.set_params(
    max_depth=1,
    max_features='sqrt',
    min_samples_split=10,
    n_estimators=1,
    n_jobs=-1,
)

In [14]:
# Display the pipeline's `clfs` attribute. The output below shows it as a dict
# mapping short model codes (e.g. 'RF', 'ET', 'AB') to estimator instances.
pipeline.clfs

{'RF': RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
             max_depth=None, max_features='auto', max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=-1,
             oob_score=False, random_state=None, verbose=0,
             warm_start=False),
 'ET': ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0, warm_start=False),
 'AB': AdaBoostClassifier(algorithm='SAMME',
           base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=1,
