In [1]:
%matplotlib inline
import os
import random
import string
import sys

import arff
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pydot
from IPython.core.display import Image
from sklearn import linear_model
from sklearn import tree
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.externals.six import StringIO

from plot_learning_curve import plot_learning_curve


Plotting Learning Curves

On the left side the learning curve of a naive Bayes classifier is shown for
the digits dataset. Note that the training score and the cross-validation score
are both not very good at the end. However, the shape of the curve can be found
in more complex datasets very often: the training score is very high at the
beginning and decreases and the cross-validation score is very low at the
beginning and increases. On the right side we see the learning curve of an SVM
with RBF kernel. We can see clearly that the training score is still around
the maximum and the validation score could be increased with more training
samples.



# Load Training Set

In [2]:
%%time
gender = {'Male' : 0, 'Female' : 1, 'Unknown' : 2}
language = {'English' : 0, 'EnglishandAnother' : 1, 'Another' : 2}

trainfile = 'sample_data\\train_subj_norm.arff'

x_train = []
x_trainSubnum = []
features = []
y_train = []

with open(trainfile, 'rb') as af:
    arffFile = arff.load(af)
    # do not include college gpa, college credits and subject number
    features = [arffFile['attributes'][1:-3][i][0].encode("ascii") for i in range(len(arffFile['attributes'][1:-3]))]
    data = arffFile['data']
    for row in data:
        row[1] = gender[row[1]] # index of gender
        row[13] = language[row[13]] # index of language
        x_trainSubnum.append(row[0]) # sepporate subject number
        x_train.append(row[1:-3]) # do not include number of credits taken first year
        y_train.append(row[-1])
        
print 'Number of examples:', len(x_train)
print 'Features: ', features

Number of examples: 2755
Features:  ['gender', 'Firgen', 'famincome', 'SATCRDG', 'SATMATH', 'SATWRTG', 'SATTotal', 'HSGPA', 'ACTRead', 'ACTMath', 'ACTEngWrit', 'APIScore', 'FirstLang', 'HSGPAunweighted']
Wall time: 108 ms


# Fit Extra-Trees Classifier

In [3]:
%%time
rfc = ExtraTreesClassifier(n_estimators=1000)
rfc.fit(x_train, y_train)
print rfc

ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)
Wall time: 3.19 s


# Evaluate on Test Set

In [4]:
%%time
testfile = 'sample_data\\test_subj_norm.arff'

x_test = []
x_testSubnum = []
y_test = []

with open(testfile, 'rb') as af:
    arffFile = arff.load(af)
    # do not include college gpa, college credits and subject number
    data = arffFile['data']
    for row in data:
        row[1] = gender[row[1]] # index of gender
        row[13] = language[row[13]] # index of language
        x_testSubnum.append(row[0]) # sepporate subject number
        x_test.append(row[1:-3]) # do not include number of credits taken first year
        y_test.append(row[-1])
        
print 'Number of examples:', len(x_test)

Number of examples: 247
Wall time: 10 ms


In [5]:
%%time
numCorrect = 0
numIncorrect = 0
incorrectList = []
testCount = len(x_test)
for index in range(testCount):
    x = x_test[index]
    prediction = rfc.predict(x)
    if prediction == y_test[index]:
        numCorrect += 1
    else:
        numIncorrect += 1
        incorrectList.append((x_testSubnum[index], prediction, y_test[index]))
print 'Correctly Classified:', numCorrect
print 'Incorrectly Classified:', numIncorrect
print 'Correct: %f' % (float(numCorrect) / float(testCount))
print 'Error: %f' % (1 - float(numCorrect) / float(testCount))
print '====Incorrect Subjects===='
# Print incorrect examples
for x in incorrectList:
    print 'Incorrect Sub. %d, Prediction: %s, Correct Label: %s' % (x)
print '=========================='

Correctly Classified: 206
Incorrectly Classified: 41
Correct: 0.834008
Error: 0.165992
====Incorrect Subjects====
Incorrect Sub. 48, Prediction: [u'0'], Correct Label: 1
Incorrect Sub. 121, Prediction: [u'0'], Correct Label: 1
Incorrect Sub. 122, Prediction: [u'2'], Correct Label: 1
Incorrect Sub. 124, Prediction: [u'0'], Correct Label: 1
Incorrect Sub. 152, Prediction: [u'2'], Correct Label: 1
Incorrect Sub. 169, Prediction: [u'3'], Correct Label: 1
Incorrect Sub. 185, Prediction: [u'0'], Correct Label: 1
Incorrect Sub. 191, Prediction: [u'4'], Correct Label: 1
Incorrect Sub. 296, Prediction: [u'1'], Correct Label: 0
Incorrect Sub. 329, Prediction: [u'4'], Correct Label: 0
Incorrect Sub. 354, Prediction: [u'2'], Correct Label: 0
Incorrect Sub. 397, Prediction: [u'4'], Correct Label: 0
Incorrect Sub. 405, Prediction: [u'2'], Correct Label: 0
Incorrect Sub. 432, Prediction: [u'2'], Correct Label: 0
Incorrect Sub. 501, Prediction: [u'2'], Correct Label: 0
Incorrect Sub. 591, Prediction: 