In [116]:
%matplotlib inline
import sys
import string
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import arff
from operator import itemgetter
from scipy.stats import randint as sp_randint
from time import time
import random

from sklearn.ensemble import RandomForestClassifier
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV
from sklearn import linear_model
from sklearn.externals.six import StringIO
from sklearn.cross_validation import train_test_split
from IPython.core.display import Image

import pydot

from plot_learning_curve import plot_learning_curve

# Load Training Set

In [117]:
%%time
gender = {'Male' : 0, 'Female' : 1, 'Unknown' : 2}
language = {'English' : 0, 'EnglishandAnother' : 1, 'Another' : 2}

trainfile = 'sample_data\\train_subj_norm.arff'

x_train = []
x_trainSubnum = []
features = []
y_train = []

with open(trainfile, 'rb') as af:
    arffFile = arff.load(af)
    # do not include college gpa, college credits and subject number
    features = [arffFile['attributes'][1:-3][i][0].encode("ascii") for i in range(len(arffFile['attributes'][1:-3]))]
    data = arffFile['data']
    for row in data:
        row[1] = gender[row[1]] # index of gender
        row[13] = language[row[13]] # index of language
        x_trainSubnum.append(row[0]) # sepporate subject number
        x_train.append(row[1:-3]) # do not include number of credits taken first year
        y_train.append(row[-1])
        
print 'Number of examples:', len(x_train)
print 'Features: ', features

Number of examples: 2755
Features:  ['gender', 'Firgen', 'famincome', 'SATCRDG', 'SATMATH', 'SATWRTG', 'SATTotal', 'HSGPA', 'ACTRead', 'ACTMath', 'ACTEngWrit', 'APIScore', 'FirstLang', 'HSGPAunweighted']
Wall time: 117 ms


# Fit RandomForests

In [118]:
%%time
rfc = RandomForestClassifier(n_estimators=1000)
rfc.fit(x_train, y_train)
print rfc

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
Wall time: 3.3 s


# Evaluate on Test Set

In [119]:
%%time
testfile = 'sample_data\\test_subj_norm.arff'

x_test = []
x_testSubnum = []
y_test = []

with open(testfile, 'rb') as af:
    arffFile = arff.load(af)
    # do not include college gpa, college credits and subject number
    data = arffFile['data']
    for row in data:
        row[1] = gender[row[1]] # index of gender
        row[13] = language[row[13]] # index of language
        x_testSubnum.append(row[0]) # sepporate subject number
        x_test.append(row[1:-3]) # do not include number of credits taken first year
        y_test.append(row[-1])
        
print 'Number of examples:', len(x_test)

Number of examples: 247
Wall time: 11 ms


In [120]:
%%time
def calcError(model, X, Y, printWrong=False):
    numCorrect = 0
    numIncorrect = 0
    incorrectList = []
    testCount = len(X)
    for index in range(testCount):
        x = x_test[index]
        prediction = model.predict(x)
        if prediction == Y[index]:
            numCorrect += 1
        else:
            numIncorrect += 1
            incorrectList.append((x_testSubnum[index], prediction, y_test[index]))
    print 'Correctly Classified:', numCorrect
    print 'Incorrectly Classified:', numIncorrect
    print 'Correct: %f' % (float(numCorrect) / float(testCount))
    print 'Error: %f' % (1 - float(numCorrect) / float(testCount))
    if printWrong:
        print '====Incorrect Subjects===='
        # Print incorrect examples
        for x in incorrectList:
            print 'Incorrect Sub. %d, Prediction: %s, Correct Label: %s' % (x)
        print '=========================='

calcError(rfc, x_test, y_test)

Correctly Classified: 206
Incorrectly Classified: 41
Correct: 0.834008
Error: 0.165992
Wall time: 10.9 s


# Perform Grid Search

In [121]:
# Utility function to report best scores
def report(grid_scores, n_top=3):
    top_scores = sorted(grid_scores, key=itemgetter(1), reverse=True)[:n_top]
    for i, score in enumerate(top_scores):
        print("Model with rank: {0}".format(i + 1))
        print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
              score.mean_validation_score,
              np.std(score.cv_validation_scores)))
        print("Parameters: {0}".format(score.parameters))
        print("")

In [122]:
%%time
clf = RandomForestClassifier(n_estimators=20)

# specify parameters and distributions to sample from
param_dist = {"max_depth": [3, None],
              "max_features": sp_randint(1, 11),
              "min_samples_split": sp_randint(1, 11),
              "min_samples_leaf": sp_randint(1, 11),
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

# run randomized search
n_iter_search = 20
random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                   n_iter=n_iter_search)

Wall time: 2 ms


In [123]:
start = time()
random_search.fit(x_train, y_train)
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))
report(random_search.grid_scores_)

# use a full grid over all parameters
param_grid = {"max_depth": [3, None],
              "max_features": [1, 3, 10],
              "min_samples_split": [1, 3, 10],
              "min_samples_leaf": [1, 3, 10],
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

# run grid search
grid_search = GridSearchCV(clf, param_grid=param_grid)
start = time()
grid_search.fit(x_train, y_train)

print("GridSearchCV took %.2f seconds for %d candidate parameter settings."
      % (time() - start, len(grid_search.grid_scores_)))
report(grid_search.grid_scores_)

RandomizedSearchCV took 4.65 seconds for 20 candidates parameter settings.
Model with rank: 1
Mean validation score: 0.809 (std: 0.010)
Parameters: {'bootstrap': True, 'min_samples_leaf': 4, 'min_samples_split': 2, 'criterion': 'gini', 'max_features': 5, 'max_depth': None}

Model with rank: 2
Mean validation score: 0.808 (std: 0.010)
Parameters: {'bootstrap': True, 'min_samples_leaf': 5, 'min_samples_split': 6, 'criterion': 'gini', 'max_features': 4, 'max_depth': None}

Model with rank: 3
Mean validation score: 0.804 (std: 0.017)
Parameters: {'bootstrap': False, 'min_samples_leaf': 9, 'min_samples_split': 2, 'criterion': 'entropy', 'max_features': 9, 'max_depth': None}

GridSearchCV took 41.03 seconds for 216 candidate parameter settings.
Model with rank: 1
Mean validation score: 0.813 (std: 0.017)
Parameters: {'bootstrap': False, 'min_samples_leaf': 10, 'min_samples_split': 3, 'criterion': 'entropy', 'max_features': 3, 'max_depth': None}

Model with rank: 2
Mean validation score: 0.81

In [124]:
%%time
calcError(random_search, x_test, y_test)

Correctly Classified: 199
Incorrectly Classified: 48
Correct: 0.805668
Error: 0.194332
Wall time: 243 ms


In [125]:
%%time
calcError(grid_search, x_test, y_test)

Correctly Classified: 205
Incorrectly Classified: 42
Correct: 0.829960
Error: 0.170040
Wall time: 240 ms
