In [35]:
%matplotlib inline
import sys
import string
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import arff
from operator import itemgetter
from scipy.stats import randint as sp_randint
from time import time
import random

from sklearn.ensemble import RandomForestClassifier
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV
from sklearn import linear_model
from sklearn.externals.six import StringIO
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import cross_val_score
from IPython.core.display import Image

import pydot

from plot_learning_curve import plot_learning_curve

In [36]:
%%time
# Integer codes for the two nominal columns, so every feature value is numeric.
gender = {'Male' : 0, 'Female' : 1, 'Unknown' : 2}
language = {'English' : 0, 'EnglishandAnother' : 1, 'Another' : 2}

# Positions of the nominal attributes inside each ARFF data row.
GENDER_COL = 0
LANGUAGE_COL = 12

# Forward slashes resolve on Windows as well as on POSIX systems; the previous
# backslash-separated literal only resolved on Windows.
trainfile = 'sample_data/wekamissing_data.arff'

x_train = []
y_train = []

with open(trainfile, 'rb') as af:
    arffFile = arff.load(af)
    # Names of every attribute except the last one, which is used as the label.
    # NOTE(review): the original comment here said "do not include college gpa
    # and college credits", but only the final attribute is dropped — confirm
    # which columns the ARFF file actually ends with.
    features = [attribute[0].encode("ascii") for attribute in arffFile['attributes'][:-1]]
    data = arffFile['data']
    for row in data:
        # Rows are re-coded in place, so `data` holds the integer codes afterwards.
        row[GENDER_COL] = gender[row[GENDER_COL]]
        row[LANGUAGE_COL] = language[row[LANGUAGE_COL]]
        # Everything but the last column is a feature; the last column is the label.
        # NOTE(review): the original comment called the last column "number of
        # credits taken first year" — verify against the ARFF header.
        x_train.append(row[:-1])
        y_train.append(row[-1])

print('Number of examples: %d' % len(x_train))

Number of examples: 3002
Wall time: 124 ms


# Cross Validation

In [37]:
%%time
# 5-fold cross-validated score of a 1000-tree random forest on all loaded examples.
rfc = RandomForestClassifier(n_estimators=1000)
scores = cross_val_score(estimator=rfc, X=x_train, y=y_train, cv=5)
# Report the mean fold score together with a two-standard-deviation band.
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), 2 * scores.std()))

Accuracy: 0.34 (+/- 0.04)
Wall time: 38.2 s


# Hold-Out Evaluation on a Test Set (No Cross-Validation)

In [38]:
%%time
# Hold out 10% of the examples as a test set, with a fixed random_state.
# NOTE(review): this rebinds x_train / y_train to the reduced training split,
# so running the cell a second time splits the already-reduced data again.
x_train, x_test, y_train, y_test = train_test_split(
    x_train,
    y_train,
    test_size=0.1,
    random_state=0,
)

Wall time: 0 ns


In [39]:
%%time
# Train a fresh 1000-tree forest on the current training split.
rfc = RandomForestClassifier(n_estimators=1000)
rfc.fit(X=x_train, y=y_train)
# Echo the estimator so the hyperparameters in effect are recorded in the output.
print(rfc)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
Wall time: 7.87 s


In [40]:
%%time
def calcError(model, X, Y, printWrong=False):
    """Print how many of the examples in ``X`` the model labels correctly.

    Parameters
    ----------
    model : object with a ``predict(samples)`` method returning one
        prediction per sample, in order.
    X : sequence of feature rows to classify.
    Y : sequence of true labels, aligned with ``X`` by position.
    printWrong : bool, default False
        When True, also list every misclassified example as
        (position in ``X``, prediction, true label).

    Returns
    -------
    None. All results are written to standard output.
    """
    testCount = len(X)
    # Classify the rows that were passed in (the old body read the global
    # x_test instead of X) with a single batched call rather than one
    # predict() call per row.
    predictions = model.predict(X) if testCount else []

    numCorrect = 0
    incorrectList = []
    for index in range(testCount):
        prediction = predictions[index]
        if prediction == Y[index]:
            numCorrect += 1
        else:
            # The row position stands in for a subject identifier, since no
            # separate identifier list is passed to this function.
            incorrectList.append((index, prediction, Y[index]))
    numIncorrect = testCount - numCorrect

    print('Correctly Classified: %d' % numCorrect)
    print('Incorrectly Classified: %d' % numIncorrect)
    if testCount:
        accuracy = float(numCorrect) / float(testCount)
        print('Correct: %f' % accuracy)
        print('Error: %f' % (1 - accuracy))
    else:
        # Avoid dividing by zero when there is nothing to score.
        print('No examples to evaluate')

    if printWrong:
        print('====Incorrect Subjects====')
        # Print incorrect examples
        for wrong in incorrectList:
            print('Incorrect Sub. %d, Prediction: %s, Correct Label: %s' % wrong)
        print('==========================')

# Score the fitted forest on the held-out split.
calcError(model=rfc, X=x_test, Y=y_test)

Correctly Classified: 105
Incorrectly Classified: 196
Correct: 0.348837
Error: 0.651163
Wall time: 12.4 s


# Hyperparameter Search (Randomized and Grid)

In [41]:
# Utility function to report best scores
def report(grid_scores, n_top=3):
    """Print a short summary of the ``n_top`` highest-ranked search results.

    Entries are ordered by their element at index 1, largest first.
    Presumably that element is the mean validation score, as in
    scikit-learn's ``grid_scores_`` tuples — verify against the caller.

    Parameters
    ----------
    grid_scores : iterable of indexable entries that also expose
        ``mean_validation_score``, ``cv_validation_scores`` and ``parameters``.
    n_top : int, default 3
        Number of leading entries to print.

    Returns
    -------
    None. The summary is written to standard output.
    """
    ranked = sorted(grid_scores, key=itemgetter(1), reverse=True)
    for rank, entry in enumerate(ranked[:n_top], 1):
        print("Model with rank: {0}".format(rank))
        mean_score = entry.mean_validation_score
        # Spread of the per-fold scores behind the mean.
        fold_std = np.std(entry.cv_validation_scores)
        print("Mean validation score: {0:.3f} (std: {1:.3f})".format(mean_score, fold_std))
        print("Parameters: {0}".format(entry.parameters))
        print("")

In [42]:
%%time
# Base estimator for the hyperparameter searches: a 20-tree forest.
clf = RandomForestClassifier(n_estimators=20)

# Search space for the randomized search. Lists give discrete choices;
# sp_randint(1, 11) is an integer distribution over 1..10 (upper bound exclusive).
# NOTE(review): newer scikit-learn releases are believed to reject
# min_samples_split=1 — confirm before upgrading the library.
param_dist = dict(
    max_depth=[3, None],
    max_features=sp_randint(1, 11),
    min_samples_split=sp_randint(1, 11),
    min_samples_leaf=sp_randint(1, 11),
    bootstrap=[True, False],
    criterion=["gini", "entropy"],
)

# Number of parameter settings the randomized search will sample.
n_iter_search = 20
random_search = RandomizedSearchCV(
    estimator=clf,
    param_distributions=param_dist,
    n_iter=n_iter_search,
)

Wall time: 0 ns


In [43]:
# --- Randomized search: fit, time it, and show the best candidates ---
start = time()
random_search.fit(x_train, y_train)
elapsed = time() - start
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % (elapsed, n_iter_search))
report(random_search.grid_scores_)

# --- Exhaustive search over a fixed grid (2 * 3 * 3 * 3 * 2 * 2 = 216 combinations) ---
param_grid = dict(
    max_depth=[3, None],
    max_features=[1, 3, 10],
    min_samples_split=[1, 3, 10],
    min_samples_leaf=[1, 3, 10],
    bootstrap=[True, False],
    criterion=["gini", "entropy"],
)

grid_search = GridSearchCV(estimator=clf, param_grid=param_grid)
start = time()
grid_search.fit(x_train, y_train)
elapsed = time() - start

n_candidates = len(grid_search.grid_scores_)
print("GridSearchCV took %.2f seconds for %d candidate parameter settings."
      % (elapsed, n_candidates))
report(grid_search.grid_scores_)

RandomizedSearchCV took 5.26 seconds for 20 candidates parameter settings.
Model with rank: 1
Mean validation score: 0.336 (std: 0.008)
Parameters: {'bootstrap': True, 'min_samples_leaf': 8, 'min_samples_split': 2, 'criterion': 'gini', 'max_features': 9, 'max_depth': 3}

Model with rank: 2
Mean validation score: 0.336 (std: 0.011)
Parameters: {'bootstrap': False, 'min_samples_leaf': 6, 'min_samples_split': 3, 'criterion': 'gini', 'max_features': 10, 'max_depth': None}

Model with rank: 3
Mean validation score: 0.335 (std: 0.001)
Parameters: {'bootstrap': False, 'min_samples_leaf': 9, 'min_samples_split': 1, 'criterion': 'entropy', 'max_features': 7, 'max_depth': None}

GridSearchCV took 74.56 seconds for 216 candidate parameter settings.
Model with rank: 1
Mean validation score: 0.341 (std: 0.009)
Parameters: {'bootstrap': True, 'min_samples_leaf': 10, 'min_samples_split': 3, 'criterion': 'entropy', 'max_features': 3, 'max_depth': None}

Model with rank: 2
Mean validation score: 0.339 