In [25]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import cluster
from sklearn import ensemble
from sklearn import cross_validation
from sklearn.metrics import roc_auc_score as auc
from sklearn.linear_model import LogisticRegression
import time
import os

from assistments_workbench.config_reader import config

In [26]:
data = pd.read_csv(os.path.join(config.get('localfiles', 'data_path'), 'arrs_model_data.csv'))

In [27]:
# data = data.loc[data['adaptive_mode'] == 0]

In [28]:
data = data.loc[data['delay_days'] == 7]

In [29]:
data.describe()

Unnamed: 0,class_grade,current_grade,sequence_id,delay_days,work_delay_in_day,prereq_seq_performance,ms,easiness,class_performance,class_assignment_performance,correct
count,33906.0,33906.0,33906.0,33906,33906.0,33906.0,33906.0,33906.0,33906.0,33906.0,33906.0
mean,7.145225,7.279803,107654.675279,7,22.636466,0.731683,5.233263,0.73064,0.716627,0.760222,0.810859
std,1.21497,1.292595,166553.159006,0,33.173737,0.094494,3.915801,0.143877,0.048088,0.126498,0.391626
min,5.0,0.0,5897.0,7,1.0,0.25,1.0,0.0,0.399967,0.230769,0.0
25%,6.0,6.0,7195.0,7,3.0,0.676471,3.0,0.637255,0.686536,0.673077,1.0
50%,8.0,8.0,21257.0,7,9.0,0.740741,4.0,0.754032,0.713132,0.776471,1.0
75%,8.0,8.0,164496.0,7,27.0,0.797561,6.0,0.833612,0.754116,0.858974,1.0
max,11.0,13.0,687015.0,7,261.0,0.99359,70.0,1.0,0.842996,1.0,1.0


In [30]:
labels = data['correct']

In [31]:
features = data.drop(['correct'], axis=1)

In [32]:
trainLabels = labels

In [33]:
trainFeatures = features

In [34]:
X_train, X_valid, y_train, y_valid = cross_validation.train_test_split(trainFeatures, trainLabels, 
                                                                       test_size=0.5, random_state=1)


# verySimpleLearner = ensemble.GradientBoostingClassifier(n_estimators=20, max_features=1, max_depth=3, 
#                                                         min_samples_leaf=100, learning_rate=0.1, 
#                                                         subsample=0.65, loss='deviance', random_state=1)

verySimpleLearner = LogisticRegression()

startTime = time.time()
singleFeatureTable = pd.DataFrame(index=range(len(X_train.columns)), columns=['feature','AUC'])
for k,feature in enumerate(X_train.columns):
    trainInputFeature = X_train[feature].values.reshape(-1,1)
    validInputFeature = X_valid[feature].values.reshape(-1,1)
    verySimpleLearner.fit(trainInputFeature, y_train)
    
    validAUC = auc(y_valid, verySimpleLearner.predict_proba(validInputFeature)[:,1])
    singleFeatureTable.ix[k,'feature'] = feature
    singleFeatureTable.ix[k,'AUC'] = validAUC
        
print("finished evaluating single features. took %.2f minutes" %((time.time()-startTime)/60))

finished evaluating single features. took 0.00 minutes


In [35]:
singleFeatureTable = singleFeatureTable.sort_values(by='AUC', axis=0, ascending=False).reset_index(drop=True)

singleFeatureTable.ix[:15,:]

Unnamed: 0,feature,AUC
0,easiness,0.707435
1,class_assignment_performance,0.666973
2,prereq_seq_performance,0.63228
3,work_delay_in_day,0.611465
4,ms,0.583425
5,class_performance,0.581483
6,sequence_id,0.543571
7,class_grade,0.53762
8,delay_days,0.5
9,current_grade,0.493273


In [36]:
numFeaturesInCombination = 5
numCombinations = 200
numBestSingleFeaturesToSelectFrom = 20

X_train, X_valid, y_train, y_valid = cross_validation.train_test_split(trainFeatures, trainLabels, 
                                                                       test_size=0.5, random_state=1)
# weakLearner = ensemble.GradientBoostingClassifier(n_estimators=30, max_features=2, max_depth=3, 
#                                                   min_samples_leaf=100,learning_rate=0.1, 
#                                                   subsample=0.65, loss='deviance', random_state=1)

weakLearner = LogisticRegression()

featuresToUse = singleFeatureTable.ix[0:numBestSingleFeaturesToSelectFrom-1,'feature']
featureColumnNames = ['feature'+str(x+1) for x in range(numFeaturesInCombination)]
featureCombinationsTable = pd.DataFrame(index=range(numCombinations), columns=featureColumnNames + ['combinedAUC'])

# for numCombinations iterations 
startTime = time.time()
for combination in range(numCombinations):
    # generate random feature combination
    randomSelectionOfFeatures = sorted(np.random.choice(len(featuresToUse), numFeaturesInCombination, replace=False))

    # store the feature names
    combinationFeatureNames = [featuresToUse[x] for x in randomSelectionOfFeatures]
    for i in range(len(randomSelectionOfFeatures)):
        featureCombinationsTable.ix[combination,featureColumnNames[i]] = combinationFeatureNames[i]

    # build features matrix to get the combination AUC
    trainInputFeatures = X_train.ix[:,combinationFeatureNames]
    validInputFeatures = X_valid.ix[:,combinationFeatureNames]
    # train learner
    weakLearner.fit(trainInputFeatures, y_train)
    # store AUC results
    validAUC = auc(y_valid, weakLearner.predict_proba(validInputFeatures)[:,1])        
    featureCombinationsTable.ix[combination,'combinedAUC'] = validAUC

validAUC = np.array(featureCombinationsTable.ix[:,'combinedAUC'])
print("(min,max) AUC = (%.4f,%.4f). took %.1f minutes" % (validAUC.min(),validAUC.max(), (time.time()-startTime)/60))

# show the histogram of the feature combinations performance 
plt.figure(); plt.hist(validAUC, 100, facecolor='blue', alpha=0.75)
plt.xlabel('AUC'); plt.ylabel('frequency'); plt.title('feature combination AUC histogram'); plt.show()

(min,max) AUC = (0.4608,0.7655). took 0.1 minutes


In [37]:
featureCombinationsTable = featureCombinationsTable.sort_values(by='combinedAUC', axis=0, ascending=False).reset_index(drop=True)
featureCombinationsTable.ix[:20,:]

Unnamed: 0,feature1,feature2,feature3,feature4,feature5,combinedAUC
0,easiness,class_assignment_performance,prereq_seq_performance,work_delay_in_day,current_grade,0.765538
1,easiness,class_assignment_performance,prereq_seq_performance,work_delay_in_day,ms,0.76509
2,easiness,class_assignment_performance,prereq_seq_performance,work_delay_in_day,ms,0.76509
3,easiness,prereq_seq_performance,work_delay_in_day,class_grade,current_grade,0.76454
4,easiness,prereq_seq_performance,work_delay_in_day,class_grade,current_grade,0.76454
5,easiness,prereq_seq_performance,work_delay_in_day,class_grade,current_grade,0.76454
6,easiness,class_assignment_performance,prereq_seq_performance,class_grade,current_grade,0.762513
7,easiness,class_assignment_performance,prereq_seq_performance,ms,class_grade,0.761165
8,easiness,prereq_seq_performance,work_delay_in_day,delay_days,current_grade,0.759394
9,easiness,prereq_seq_performance,work_delay_in_day,delay_days,current_grade,0.759394
