In [10]:
%matplotlib inline
import sys
import string
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import arff
from operator import itemgetter
from scipy.stats import randint as sp_randint
from time import time
import random

from sklearn.ensemble import RandomForestClassifier
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV
from sklearn import linear_model
from sklearn.externals.six import StringIO
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import cross_val_score
from IPython.core.display import Image

import pydot

from plot_learning_curve import plot_learning_curve

# Load Training Set

In [11]:
%%time
gender = {'Male' : 0, 'Female' : 1, 'Unknown' : 2}
language = {'English' : 0, 'EnglishandAnother' : 1, 'Another' : 2}

trainfile = 'sample_data\\train_subj_norm.arff'

x_train = []
x_trainSubnum = []
features = []
y_train = []

with open(trainfile, 'rb') as af:
    arffFile = arff.load(af)
    # do not include college gpa, college credits and subject number
    features = [arffFile['attributes'][1:-3][i][0].encode("ascii") for i in range(len(arffFile['attributes'][1:-3]))]
    data = arffFile['data']
    for row in data:
        row[1] = gender[row[1]] # index of gender
        row[13] = language[row[13]] # index of language
        x_trainSubnum.append(row[0]) # sepporate subject number
        x_train.append(row[1:-3]) # do not include number of credits taken first year
        y_train.append(row[-1])
        
print 'Number of examples:', len(x_train)
print 'Features: ', features

Number of examples: 2709
Features:  ['gender', 'Firgen', 'famincome', 'SATCRDG', 'SATMATH', 'SATWRTG', 'SATTotal', 'HSGPA', 'ACTRead', 'ACTMath', 'ACTEngWrit', 'APIScore', 'FirstLang', 'HSGPAunweighted']
Wall time: 106 ms


# 10-Fold Cross Validation

In [12]:
%%time
rfc = RandomForestClassifier(n_estimators=1000)
scores = cross_val_score(rfc, x_train, y_train, cv=10)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.81 (+/- 0.04)
Wall time: 32.3 s


# Fit RandomForests

In [13]:
%%time
rfc = RandomForestClassifier(n_estimators=1000)
rfc.fit(x_train, y_train)
print rfc

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
Wall time: 3.36 s


# Evaluate on Test Set

In [14]:
%%time
testfile = 'sample_data\\test_subj_norm.arff'

x_test = []
x_testSubnum = []
y_test = []

with open(testfile, 'rb') as af:
    arffFile = arff.load(af)
    # do not include college gpa, college credits and subject number
    data = arffFile['data']
    for row in data:
        row[1] = gender[row[1]] # index of gender
        row[13] = language[row[13]] # index of language
        x_testSubnum.append(row[0]) # sepporate subject number
        x_test.append(row[1:-3]) # do not include number of credits taken first year
        y_test.append(row[-1])
        
print 'Number of examples:', len(x_test)

Number of examples: 46
Wall time: 4 ms


In [35]:
%%time
def calcError(model, X, Y, printWrong=False):
    numCorrect = 0
    numIncorrect = 0
    incorrectList = []
    testCount = len(X)
    for index in range(testCount):
        x = X[index]
        prediction = model.predict(x)
        if prediction == Y[index]:
            numCorrect += 1
        else:
            numIncorrect += 1
            if printWrong:
                incorrectList.append((x_testSubnum[index], prediction, Y[index]))
    print 'Correctly Classified:', numCorrect
    print 'Incorrectly Classified:', numIncorrect
    print 'Correct: %f' % (float(numCorrect) / float(testCount))
    print 'Error: %f' % (1 - float(numCorrect) / float(testCount))
    if printWrong:
        print '====Incorrect Subjects===='
        # Print incorrect examples
        for x in incorrectList:
            print 'Incorrect Sub. %d, Prediction: %s, Correct Label: %s' % (x)
        print '=========================='

calcError(rfc, x_test, y_test)

Correctly Classified: 38
Incorrectly Classified: 8
Correct: 0.826087
Error: 0.173913
Wall time: 1.48 s


# Perform Grid Search

In [16]:
# Utility function to report best scores
def report(grid_scores, n_top=3):
    top_scores = sorted(grid_scores, key=itemgetter(1), reverse=True)[:n_top]
    for i, score in enumerate(top_scores):
        print("Model with rank: {0}".format(i + 1))
        print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
              score.mean_validation_score,
              np.std(score.cv_validation_scores)))
        print("Parameters: {0}".format(score.parameters))
        print("")

In [17]:
%%time
clf = RandomForestClassifier(n_estimators=30)

# specify parameters and distributions to sample from
param_dist = {"max_depth": [3, None],
              "max_features": sp_randint(1, 11),
              "min_samples_split": sp_randint(1, 11),
              "min_samples_leaf": sp_randint(1, 11),
              #"n_estimators": [10,20,30,40,50],
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

# run randomized search
n_iter_search = 20
random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                   n_iter=n_iter_search)
start = time()
random_search.fit(x_train, y_train)
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))
report(random_search.grid_scores_)

RandomizedSearchCV took 5.78 seconds for 20 candidates parameter settings.
Model with rank: 1
Mean validation score: 0.815 (std: 0.015)
Parameters: {'bootstrap': True, 'min_samples_leaf': 7, 'min_samples_split': 9, 'criterion': 'gini', 'max_features': 4, 'max_depth': None}

Model with rank: 2
Mean validation score: 0.813 (std: 0.014)
Parameters: {'bootstrap': False, 'min_samples_leaf': 2, 'min_samples_split': 8, 'criterion': 'gini', 'max_features': 4, 'max_depth': None}

Model with rank: 3
Mean validation score: 0.808 (std: 0.011)
Parameters: {'bootstrap': True, 'min_samples_leaf': 4, 'min_samples_split': 4, 'criterion': 'entropy', 'max_features': 5, 'max_depth': None}

Wall time: 5.78 s


In [18]:
# use a full grid over all parameters
param_grid = {"max_depth": [None],
              "max_features": [1, 5, len(features)],
              "min_samples_split": [1, 5, 10],
              "min_samples_leaf": [1, 5, 10],
              #"n_estimators": [10,20,30,40,50],
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

# run grid search
grid_search = GridSearchCV(clf, param_grid=param_grid)
start = time()
grid_search.fit(x_train, y_train)

print("GridSearchCV took %.2f seconds for %d candidate parameter settings."
      % (time() - start, len(grid_search.grid_scores_)))
report(grid_search.grid_scores_)

GridSearchCV took 52.47 seconds for 108 candidate parameter settings.
Model with rank: 1
Mean validation score: 0.814 (std: 0.012)
Parameters: {'bootstrap': False, 'min_samples_leaf': 5, 'min_samples_split': 5, 'criterion': 'gini', 'max_features': 5, 'max_depth': None}

Model with rank: 2
Mean validation score: 0.814 (std: 0.016)
Parameters: {'bootstrap': False, 'min_samples_leaf': 10, 'min_samples_split': 5, 'criterion': 'gini', 'max_features': 5, 'max_depth': None}

Model with rank: 3
Mean validation score: 0.813 (std: 0.011)
Parameters: {'bootstrap': True, 'min_samples_leaf': 5, 'min_samples_split': 10, 'criterion': 'gini', 'max_features': 5, 'max_depth': None}



# Testing on another Test Set

In [39]:
%%time
xtrain, xtest, ytrain, ytest = train_test_split(x_train, y_train, test_size=0.1, random_state=0)
print "Length of Training Set:", len(xtrain)
print "Length of Test Set:", len(xtest)
rfc = RandomForestClassifier(n_estimators=1000)
rfc.fit(xtrain, ytrain)
calcError(rfc, xtest, ytest)

Length of Training Set: 2302
Length of Test Set: 407
Correctly Classified: 337
Incorrectly Classified: 70
Correct: 0.828010
Error: 0.171990
Wall time: 16.5 s


In [19]:
%%time
calcError(random_search, x_test, y_test)

Correctly Classified: 40
Incorrectly Classified: 6
Correct: 0.869565
Error: 0.130435
Wall time: 57 ms


In [41]:
%%time
calcError(grid_search.best_estimator_, x_test, y_test)

Correctly Classified: 39
Incorrectly Classified: 7
Correct: 0.847826
Error: 0.152174
Wall time: 58 ms
