**WARNING: This is a rough workpad. Steps may not be sequential and there is no coherent narrative.**

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
import h5py
from collections import OrderedDict

from sklearn.cross_validation import cross_val_score
from sklearn.cross_validation import StratifiedKFold
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import precision_recall_curve
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import auc
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.cross_validation import train_test_split

%matplotlib inline

In [2]:
# Load the training feature matrix stored as 'dataset_1' in the HDF5 file.
with h5py.File('./../pickles/final_training_data.h5','r') as hf:
    print('List of arrays in this file: \n', hf.keys())
    # Materialise the dataset as a NumPy array while the file is still open.
    Xtrain = np.array(hf.get('dataset_1'))
    print('Shape of the array dataset_1: \n', Xtrain.shape)

('List of arrays in this file: \n', [u'dataset_1'])
('Shape of the array dataset_1: \n', (14048, 50))


In [3]:
# Load the held-out test feature matrix stored as 'dataset_1' in the HDF5 file.
with h5py.File('./../pickles/final_test_data.h5','r') as hf:
    print('List of arrays in this file: \n', hf.keys())
    # Materialise the dataset as a NumPy array while the file is still open.
    Xtest = np.array(hf.get('dataset_1'))
    print('Shape of the array dataset_1: \n', Xtest.shape)

('List of arrays in this file: \n', [u'dataset_1'])
('Shape of the array dataset_1: \n', (3599, 50))


In [4]:
# Sanity check: train and test matrices must have the same number of feature columns.
Xtrain.shape, Xtest.shape

((14048, 50), (3599, 50))

In [5]:
# Targets are read as DataFrames; downstream code flattens them with
# np.array(...).ravel() before fitting.
ytrain = pd.read_csv('./../pickles/training_targets.csv')
ytest = pd.read_csv('./../pickles/test_targets.csv')

In [6]:
# Sanity check: row counts should match the corresponding feature matrices.
ytrain.shape, ytest.shape

((14048, 1), (3599, 1))

In [None]:
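# #encode labels if needed
# NOTE: later cells reference `label_bins` and `ybin`, which are only defined by the
# commented-out code below (and `targets` is not loaded anywhere in this notebook).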

# outcomes = targets.category.unique()
# i = range(len(outcomes))

# label_bins_tups = zip(outcomes, i)

# label_bins = OrderedDict(label_bins_tups)

# label_bins.values()

# targets['ybinarized'] = targets.category.map(label_bins)

# ybin = targets.ybinarized

In an exploratory run in the "Build_Test_Train" notebook, I found that Logistic Regression and Random Forests (with default parameters and **class-weight-based sampling** turned on) gave the highest and second-highest macro F1 scores on the holdout sets, *respectively*. I'll now run a grid search to tune the parameters of each algorithm, using the macro F1 score as the selection metric.



In [10]:

def best_config(model_info, parameters, train_instances, judgements, cv):
    """
    Tune one classifier family with an exhaustive grid search scored by macro F1.

    Parameters
    ----------
    model_info : sequence of (name, model)
        Display name and the unfitted estimator to tune.
    parameters : dict or list of dict
        Parameter grid, passed straight to GridSearchCV.
    train_instances : array-like
        Training feature matrix.
    judgements : array-like
        Training targets; flattened to a 1-D vector before fitting.
    cv : int or cross-validation generator
        Passed straight to GridSearchCV.

    Returns
    -------
    list
        [str(best_params_), best_score_, best_estimator_, fitted GridSearchCV object]
    """
    name, model = model_info
    # Single-argument print(...) renders identically under Python 2 and Python 3.
    print('Grid search for... ' + name)
    clf = GridSearchCV(model, parameters, cv=cv, scoring="f1_macro", verbose=1, n_jobs=-1)
    # Targets may arrive as a single-column DataFrame; flatten them to a plain vector.
    clf.fit(train_instances, np.array(judgements).ravel())
    print('Best configuration: %s | Best CV score (macro f1): %s'
          % (clf.best_params_, clf.best_score_))
    return [str(clf.best_params_), clf.best_score_, clf.best_estimator_, clf]


In [8]:
# Returns the best model from a set of model families, given training data, using cross-validation
def best_model(classifier_families, train_instances, judgements, cv, holdout_feats, holdout_targets):
    """
    Tune every classifier family with best_config and pick the winner by holdout macro F1.

    Parameters
    ----------
    classifier_families : iterable of (name, model, parameters)
        Display name, unfitted estimator and parameter grid for each family.
    train_instances, judgements : array-like
        Training features and targets, forwarded to best_config.
    cv : int or cross-validation generator
        Forwarded to best_config.
    holdout_feats, holdout_targets : array-like
        Holdout features and targets used to rank the tuned classifiers.

    Returns
    -------
    The tuned estimator (GridSearchCV's best_estimator_) of the family with the
    highest macro F1 on the holdout set. Ties keep the earlier family.

    Raises
    ------
    ValueError
        If classifier_families yields no entries.
    """
    # Update ash:
    # Because training is forced onto balanced samples (with the possibility of
    # over-sampling), the cross-validation score is not trusted for ranking. Each tuned
    # classifier is re-scored on the holdout set and that score decides the winner.
    # (ROC plotting per classifier is currently disabled.)

    # Flatten once so a single-column DataFrame of targets scores like a 1-D vector,
    # matching how best_config treats the training targets.
    holdout_truth = np.array(holdout_targets).ravel()

    classifiers = []
    for name, model, parameters in classifier_families:
        bestparams, bestscore, bestest, clf = best_config(
            [name, model], parameters, train_instances, judgements, cv)
        # GridSearchCV predicts with the refitted best estimator.
        holdout_preds = clf.predict(holdout_feats)
        mf1 = f1_score(holdout_truth, holdout_preds, average='macro')

        print('Holdout Macro F1 performance for %s = %f \n' % (name, mf1))
        classifiers.append((name, bestparams, bestscore, bestest, mf1))

    if not classifiers:
        raise ValueError('classifier_families is empty; there is nothing to compare')

    # Start below any attainable score so a family whose holdout F1 is exactly 0.0
    # can still be selected instead of leaving the winner undefined.
    best_quality = float('-inf')
    best_classifier = None
    for name, bestparams, cv_quality, classifier, holdout_quality in classifiers:
        print('Considering classifier... %s %s' % (name, bestparams))
        if holdout_quality > best_quality:
            best_quality = holdout_quality
            best_classifier = [name, classifier]

    print('Best classifier... ' + best_classifier[0])
    return best_classifier[1]

In [11]:
def candidate_families():
    """
    Build the classifier families to compare, each with its tuning grid.

    Returns
    -------
    list of [name, estimator, parameter_grid]
        One entry per family (SVM, RandomForest, LogisticRegression), in the form
        best_model expects. The grids are searched with macro F1 as the score.
    """
    candidates = []

    # SVM with linear and RBF kernels.
    # `gamma` has no effect on the linear kernel, so it is only crossed with 'rbf';
    # a single combined grid would refit every linear configuration once per gamma value.
    svm_c_values = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
    svm_tuned_parameters = [{'kernel': ['linear'],
                             'C': svm_c_values,
                             'class_weight': ['balanced', None]},
                            {'kernel': ['rbf'],
                             'C': svm_c_values,
                             'gamma': [0.0000001, 0.000001, 0.0001, 0.001, 0.1, 1, 10, 100],
                             'class_weight': ['balanced', None]}]
    candidates.append(["SVM", SVC(), svm_tuned_parameters])

    # Random Forest
    rf_tuned_parameters = [{"n_estimators": [75, 500, 1500],
                            "criterion": ['gini', 'entropy'],
                            "class_weight": ['balanced', 'balanced_subsample'],
                            "min_samples_leaf": [5, 15, 50, 150]}]
    candidates.append(["RandomForest", RandomForestClassifier(n_jobs=6), rf_tuned_parameters])

    # Logistic Regression
    LR_tuned_parameters = [{"penalty": ['l1', 'l2'],
                            "class_weight": [None, 'balanced'],
                            "C": [0.001, 0.01, 0.1, 1, 10, 100, 1000]}]
    candidates.append(["LogisticRegression", LogisticRegression(), LR_tuned_parameters])

    return candidates

In [None]:
# Tune every candidate family with 5-fold CV on the training data; best_model then ranks
# the tuned classifiers by macro F1 on (Xtest, ytest).
# NOTE(review): because the test set picks the winner, it is no longer an unbiased
# estimate of final performance.
classifier = best_model(candidate_families(), Xtrain, ytrain, 5, Xtest, ytest)


Before the error, we discovered that the SVM with the parameters below still performed horribly on the Holdout set with a Macro F1 of 0.068256.

> Best configuration: {'kernel': 'rbf', 'C': 1000, 'gamma': 0.1, 'class_weight': 'balanced'}   
Cross Validation F1 Score (balanced dataset) = 0.216158415487  

> Holdout Macro F1 performance for SVM = 0.068256 


In [None]:
classifier = best_model(candidate_families(), Xtrain, ytrain, 3, Xtest, ytest) 
# Re-running with 3-fold CV after some code tweaks and with the SVM family commented out.

In [None]:
# Same 3-fold call as the previous cell, executed again.
classifier = best_model(candidate_families(), Xtrain, ytrain, 3, Xtest, ytest) 

Grid search for... LogisticRegression
Fitting 3 folds for each of 28 candidates, totalling 84 fits


**Have to redo: I forgot to transform the test set using the training vectorizer!**

In [None]:
def plot_ROC_PR(Xtrain, ytrain, k, rf_estimators, class_subsampling=None, cutoff=0.5,
                label_names=None):
    """
    Cross-validate a Random Forest and print macro F1 plus a classification report per fold.

    Despite the name, no figure is drawn: the ROC / precision-recall plotting that this
    function used to do is disabled, and only the per-fold text metrics remain.

    Parameters
    ----------
    Xtrain : array-like
        Feature matrix; must support row selection with an integer index array
        (``Xtrain[indices]``).
    ytrain : array-like
        Targets (array, Series or single-column DataFrame); flattened to 1-D.
    k : int
        Number of stratified folds.
    rf_estimators : int
        Number of trees in the forest.
    class_subsampling : str, dict or None
        Passed to RandomForestClassifier as ``class_weight``.
    cutoff : float
        Decision threshold on the second class's probability. Only used when the
        fitted model has exactly two classes; otherwise the most probable class wins.
    label_names : mapping of display name -> label value, optional
        Labels (and their display names) to include in the reports. When omitted,
        the labels found in ``ytrain`` are used and named after their values.

    Returns
    -------
    None
    """
    # Work on a flat NumPy vector so positional fold indices apply regardless of
    # whether the targets came in as an array, Series or DataFrame.
    y = np.array(ytrain).ravel()

    if label_names is None:
        labels = list(np.unique(y))
        target_names = ['%s' % label for label in labels]
    else:
        labels = list(label_names.values())
        target_names = list(label_names.keys())

    cv = StratifiedKFold(y, n_folds=k)  # preserves class %
    clf_rf = RandomForestClassifier(n_estimators=rf_estimators, verbose=0, criterion='gini',
                                    n_jobs=-1, class_weight=class_subsampling)

    for i, (train, cval) in enumerate(cv):
        print('fitting RF on cv run {}...\n'.format(i))
        clf_rf.fit(Xtrain[train], y[train])

        probas_ = clf_rf.predict_proba(Xtrain[cval])
        classes = clf_rf.classes_
        if len(classes) == 2:
            # Binary case: apply the custom decision threshold to the second class.
            ypred = np.where(probas_[:, 1] > cutoff, classes[1], classes[0])
        else:
            # A single threshold on one column cannot separate more than two classes,
            # so pick the most probable class instead.
            ypred = classes[np.argmax(probas_, axis=1)]

        f1 = f1_score(y[cval], ypred, labels=labels, average='macro')
        print('Cross Validation #{} macro F1 score = {}\n'.format(i, f1))

        print('Classification Report for CV run {}: \n'.format(i))
        print(classification_report(y[cval], ypred, labels=labels,
                                    target_names=target_names,
                                    sample_weight=None, digits=5))

In [None]:
# 2 folds, 500 trees, 'balanced_subsample' class weights, 0.5 decision cutoff.
# NOTE(review): `dat` and `ybin` are not defined by any active cell in this notebook
# (`ybin` only appears in the commented-out label-encoding cell) - define them first.
plot_ROC_PR(dat, ybin, 2, 500, 'balanced_subsample', 0.5)

In [None]:
# Pipelining classifiers.
# Only the logistic-regression step is active. Steps tried earlier and now disabled:
#   ('reduce_dim', PCA(n_components=0.90))
#   ('svm linear kernel', LinearSVC())
#   ('random forest', RandomForestClassifier(n_estimators=300, n_jobs=-1))
estimators = [
    ('logistic reg', LogisticRegression(n_jobs=-1)),
]

clf = Pipeline(steps=estimators)


In [None]:
# Stratified split of (dat, ybin); the fixed seed makes the split, and every metric
# computed from it below, reproducible across kernel restarts.
# NOTE: this rebinds `ytrain` / `ytest`, replacing the targets loaded from CSV earlier.
xtrain, xtest, ytrain, ytest = train_test_split(dat, ybin, stratify=ybin, random_state=42)

In [None]:
%%time
# Fit the pipeline on the training split; the %%time cell magic reports the duration.
clf.fit(xtrain, ytrain)

In [None]:
# Leftover sanity-check cell (confirms the kernel is responsive after the long fit).
print('hi')

In [None]:
# Predict labels for the held-out split with the fitted pipeline.
preds = clf.predict(xtest)

In [None]:
# Macro-averaged F1: the unweighted mean of the per-class F1 scores.
macrof1 = f1_score(ytest, preds, average='macro')

In [None]:
# Display the holdout macro F1.
macrof1

In [None]:
# Rows are true labels, columns are predicted labels.
confusion_matrix(ytest, preds)

In [None]:
# `label_bins` (category name -> integer code) only exists if the optional label-encoding
# cell has been run; otherwise report on the labels actually present in the data so this
# cell still works on a fresh kernel.
if 'label_bins' in globals():
    report_labels = list(label_bins.values())
    report_names = list(label_bins.keys())
else:
    report_labels = list(np.unique(np.concatenate([np.array(ytest).ravel(),
                                                   np.array(preds).ravel()])))
    report_names = ['%s' % label for label in report_labels]

print(classification_report(ytest, preds, labels=report_labels,
                            target_names=report_names,
                            sample_weight=None, digits=5))