## Random Forests

In [1]:
# Setup Libraries if needed
import pandas as pd
import sklearn
import numpy as np
import matplotlib as mpl
%matplotlib inline
mpl.pyplot.style.use('ggplot')

In [2]:
# import the training data
training = pd.read_csv('training.csv', index_col='idx')

# view the data
training.head()

Unnamed: 0_level_0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30to59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60to89DaysPastDueNotWorse,NumberOfDependents
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,1,0.766127,45,2,0.802982,9120,13,0,6,0,2
2,0,0.957151,40,0,0.121876,2600,4,0,0,0,1
3,0,0.65818,38,1,0.085113,3042,2,1,0,0,0
4,0,0.23381,30,0,0.03605,3300,5,0,0,0,0
5,0,0.907239,49,1,0.024926,63588,7,0,1,0,0


In [3]:
# Import SciKit Learn functions
from sklearn.metrics import roc_curve, auc , roc_auc_score, confusion_matrix, mean_absolute_error
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import train_test_split
from sklearn import grid_search

In [4]:
# Split the target from the input variables
X = training.iloc[:, 1:]
y = training.iloc[:, 0]

# Now, split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=1)

In [None]:
# Create Random Forest Object and fit
rf = RandomForestClassifier(n_estimators=1000, criterion='entropy', n_jobs=4, 
                            oob_score=True)
rf.fit(X,y)

In [5]:
from datetime import datetime
startTime = datetime.now()

from sklearn import grid_search
# list my parameters
parameters = { 
    'n_estimators': [200, 700,1000,1500,2000,5000],
    'max_features': ['auto', 'log2'],
    }

# Create my tree with grid search
clftree = grid_search.GridSearchCV(RandomForestClassifier(criterion='gini', oob_score=True), parameters, n_jobs=4,
                                   scoring='roc_auc',refit=True)

clftree.fit(X_train, y_train)
tree_model = clftree.best_estimator_
print (clftree.best_score_, clftree.best_params_) 

print datetime.now() - startTime

(0.84919080913550915, {'max_features': 'auto', 'n_estimators': 2000})
0:36:35.760592


In [None]:
# Fit the optimal configuration to the training set and then 
# Look at the accuracy metrics

theForest = RandomForestClassifier(criterion='gini', oob_score=True ,n_estimators=2000,max_features='auto',
                                  n_jobs=3) 

theForest.fit(X_train,y_train)

# Fit the decision tree classifier
theForest.predict(X_test)

tree_preds = theForest.predict_proba(X_test)[:, 1]

# Look at the accuracy metrics
print "The area under the curve is", roc_auc_score(y_test,tree_preds)


In [None]:
print theForest.oob_score_ 
print theForest.feature_importances_