In [112]:
%matplotlib inline
import sys
import string
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import arff
from operator import itemgetter
from scipy.stats import randint as sp_randint
from time import time
import random
import csv


from sklearn.ensemble import RandomForestClassifier
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV
from sklearn import linear_model
from sklearn.externals.six import StringIO
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import cross_val_score
from IPython.core.display import Image

import pydot

from plot_learning_curve import plot_learning_curve

# Load Training Set

In [121]:
%%time
gender = {'Male' : 0, 'Female' : 1, 'Unknown' : 2}
language = {'English' : 0, 'EnglishandAnother' : 1, 'Another' : 2}

trainfile = 'sample_data\\train_conversions.arff'
trainfile = 'sample_data\\train_means.arff'
#trainfile = 'sample_data\\train_conversions_norm.arff'

x_train = []
x_trainSubnum = []
features = []
y_train = []

with open(trainfile, 'rb') as af:
    arffFile = arff.load(af)
    # do not include college gpa, college credits and subject number
    features = [arffFile['attributes'][1:-3][i][0].encode("ascii") for i in range(len(arffFile['attributes'][1:-3]))]
    data = arffFile['data']
    for row in data:
        row[1] = gender[row[1]] # index of gender
        row[13] = language[row[13]] # index of language
        x_trainSubnum.append(row[0]) # sepporate subject number
        x_train.append(row[1:-3]) # do not include number of credits taken first year
        y_train.append(row[-1])
        
print x_train[0]
print 'Number of examples:', len(x_train)
print 'Features: ', len(features)

[1, 300.0, 48000.0, 590.0, 470.0, 570.0, 1630.0, 4.0, 33.0, 24.0, 29.0, 5.88602941176, 0, 4.0]
Number of examples: 3002
Features:  14
Wall time: 141 ms


# 10-Fold Cross Validation

In [100]:
%%time
rfc = RandomForestClassifier(n_estimators=1000)
scores = cross_val_score(rfc, x_train, y_train, cv=10)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.81 (+/- 0.05)
Wall time: 33.9 s


# Fit RandomForests

In [122]:
%%time
rfc = RandomForestClassifier(n_estimators=1000)
rfc.fit(x_train, y_train)
print rfc

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
Wall time: 3.78 s


# Evaluate on Test Set

In [131]:
%%time
testfile = 'sample_data\\no_lables.arff'
#testfile = 'sample_data\\pollo_arff.arff'

x_test = []
x_testSubnum = []
y_test = []

with open(testfile, 'rb') as af:
    arffFile = arff.load(af)
    # do not include college gpa, college credits and subject number
    data = arffFile['data']
    for row in data:
        row[1] = gender[row[1]] # index of gender
        row[13] = language[row[13]] # index of language
        x_testSubnum.append(row[0]) # sepporate subject number
        #x_test.append(row[1:-3]) # do not include number of credits taken first year
        x_test.append(row[1:]) # do not include number of credits taken first year
        y_test.append(row[-1])
        
print x_test[0]
print 'Number of examples:', len(x_test)

[1, 0.0, 175000.0, 630.0, 630.0, 590.0, 1850.0, 3.72, 24.688073, 24.174312, 22.906542, 10.0, 2, 3.45]
Number of examples: 300
Wall time: 11 ms


In [124]:
predictions = []
for index in range(len(x_test)):
    x = x_test[index]
    predictions.append([x_testSubnum[index], rfc.predict(x)[0]])
    
print predictions[1][1]

1


In [128]:
with open('saveme.csv', 'wb') as csvfile:
    spamwriter = csv.writer(csvfile, delimiter=',', quotechar=' ', quoting=csv.QUOTE_MINIMAL)
    spamwriter.writerow(['Subjnum', 'Firstyrcumgpa'])
    for i in predictions:
        spamwriter.writerow(i)

In [103]:
%%time
def calcError(model, X, Y, printWrong=False):
    numCorrect = 0
    numIncorrect = 0
    incorrectList = []
    testCount = len(X)
    for index in range(testCount):
        x = X[index]
        prediction = model.predict(x)
        if prediction == Y[index]:
            numCorrect += 1
        else:
            numIncorrect += 1
            if printWrong:
                incorrectList.append((x_testSubnum[index], prediction, Y[index]))
    print 'Correctly Classified:', numCorrect
    print 'Incorrectly Classified:', numIncorrect
    print 'Correct: %f' % (float(numCorrect) / float(testCount))
    print 'Error: %f' % (1 - float(numCorrect) / float(testCount))
    if printWrong:
        print '====Incorrect Subjects===='
        # Print incorrect examples
        for x in incorrectList:
            print 'Incorrect Sub. %d, Prediction: %s, Correct Label: %s' % (x)
        print '=========================='

calcError(rfc, x_test, y_test)

Correctly Classified: 0
Incorrectly Classified: 300
Correct: 0.000000
Error: 1.000000
Wall time: 10.4 s


# Perform Grid Search

In [104]:
# Utility function to report best scores
def report(grid_scores, n_top=3):
    top_scores = sorted(grid_scores, key=itemgetter(1), reverse=True)[:n_top]
    for i, score in enumerate(top_scores):
        print("Model with rank: {0}".format(i + 1))
        print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
              score.mean_validation_score,
              np.std(score.cv_validation_scores)))
        print("Parameters: {0}".format(score.parameters))
        print("")

In [105]:
%%time
clf = RandomForestClassifier(n_estimators=30)

# specify parameters and distributions to sample from
param_dist = {"max_depth": [3, None],
              "max_features": sp_randint(1, 11),
              "min_samples_split": sp_randint(1, 11),
              "min_samples_leaf": sp_randint(1, 11),
              #"n_estimators": [10,20,30,40,50],
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

# run randomized search
n_iter_search = 20
random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                   n_iter=n_iter_search)
start = time()
random_search.fit(x_train, y_train)
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))
report(random_search.grid_scores_)

RandomizedSearchCV took 7.53 seconds for 20 candidates parameter settings.
Model with rank: 1
Mean validation score: 0.814 (std: 0.006)
Parameters: {'bootstrap': False, 'min_samples_leaf': 4, 'min_samples_split': 5, 'criterion': 'gini', 'max_features': 5, 'max_depth': None}

Model with rank: 2
Mean validation score: 0.812 (std: 0.020)
Parameters: {'bootstrap': True, 'min_samples_leaf': 3, 'min_samples_split': 4, 'criterion': 'gini', 'max_features': 3, 'max_depth': None}

Model with rank: 3
Mean validation score: 0.810 (std: 0.022)
Parameters: {'bootstrap': True, 'min_samples_leaf': 8, 'min_samples_split': 7, 'criterion': 'entropy', 'max_features': 5, 'max_depth': None}

Wall time: 7.86 s


In [106]:
# use a full grid over all parameters
param_grid = {"max_depth": [None],
              "max_features": [1, 5, len(features)],
              "min_samples_split": [1, 5, 10],
              "min_samples_leaf": [1, 5, 10],
              #"n_estimators": [10,20,30,40,50],
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

# run grid search
grid_search = GridSearchCV(clf, param_grid=param_grid)
start = time()
grid_search.fit(x_train, y_train)

print("GridSearchCV took %.2f seconds for %d candidate parameter settings."
      % (time() - start, len(grid_search.grid_scores_)))
report(grid_search.grid_scores_)

KeyboardInterrupt: 

# Testing on another Test Set

In [None]:
%%time
xtrain, xtest, ytrain, ytest = train_test_split(x_train, y_train, test_size=0.1, random_state=0)
print "Length of Training Set:", len(xtrain)
print "Length of Test Set:", len(xtest)
rfc = RandomForestClassifier(n_estimators=1000)
rfc.fit(xtrain, ytrain)
calcError(rfc, xtest, ytest)

In [None]:
%%time
calcError(random_search, x_test, y_test)

In [None]:
%%time
calcError(grid_search.best_estimator_, x_test, y_test)