In [1]:
import matplotlib.pyplot as plt

import random
from collections import Counter

import numpy as np

from sklearn.grid_search import GridSearchCV

from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

import warnings
warnings.filterwarnings('ignore')

# Load Pickled Labels

In [2]:
# Folder that every .dat array in this notebook is read from.
directory = 'generated_data'

# Load the label arrays for both splits (training first, then testing).
train_labels, test_labels = (
    np.load(path.format(directory))
    for path in ('{0}/train_labels.dat', '{0}/test_labels.dat')
)
print('Training labels shape: {0}'.format(np.shape(train_labels)))
print('Testing labels shape:  {0}'.format(np.shape(test_labels)))

Training labels shape: (7326,)
Testing labels shape:  (2603,)


# Load Custom Conversion Data

In [3]:
# Load the custom-conversion feature matrices (training first, then testing).
train_CustGray_2d, test_CustGray_2d = (
    np.load(path.format(directory))
    for path in ('{0}/train_CustGray_2d.dat', '{0}/test_CustGray_2d.dat')
)
print('Custom Conversion training shape: {0}'.format(np.shape(train_CustGray_2d)))
print('Custom Conversion testing shape:  {0}'.format(np.shape(test_CustGray_2d)))

Custom Conversion training shape: (7326, 640)
Custom Conversion testing shape:  (2603, 640)


# Load Otsu's Binarization Threshold Data

In [4]:
# Load the Otsu-threshold feature matrices (training first, then testing).
train_OBT_2d, test_OBT_2d = (
    np.load(path.format(directory))
    for path in ('{0}/train_OBT_2d.dat', '{0}/test_OBT_2d.dat')
)
print('Otsu training shape: {0}'.format(np.shape(train_OBT_2d)))
print('Otsu testing shape:  {0}'.format(np.shape(test_OBT_2d)))

Otsu training shape: (7326, 640)
Otsu testing shape:  (2603, 640)


# Load Adaptive Mean Threshold Data

In [5]:
# Load the adaptive-mean-threshold feature matrices (training first, then testing).
train_AMT_2d, test_AMT_2d = (
    np.load(path.format(directory))
    for path in ('{0}/train_AMT_2d.dat', '{0}/test_AMT_2d.dat')
)
print('Adaptive mean training shape: {0}'.format(np.shape(train_AMT_2d)))
print('Adaptive mean testing shape:  {0}'.format(np.shape(test_AMT_2d)))

Adaptive mean training shape: (7326, 640)
Adaptive mean testing shape:  (2603, 640)


# Load Adaptive Gaussian Threshold Data

In [6]:
# Load the adaptive-Gaussian-threshold feature matrices (training first, then testing).
train_AGT_2d, test_AGT_2d = (
    np.load(path.format(directory))
    for path in ('{0}/train_AGT_2d.dat', '{0}/test_AGT_2d.dat')
)
print('Adaptive Gaussian training shape: {0}'.format(np.shape(train_AGT_2d)))
print('Adaptive Gaussian testing shape:  {0}'.format(np.shape(test_AGT_2d)))

Adaptive Gaussian training shape: (7326, 640)
Adaptive Gaussian testing shape:  (2603, 640)


# Load Principal Component Analysis Data

In [7]:
# Load the PCA feature matrices (training first, then testing).
train_PCA_2d, test_PCA_2d = (
    np.load(path.format(directory))
    for path in ('{0}/train_PCA_2d.dat', '{0}/test_PCA_2d.dat')
)
print('PCA training shape: {0}'.format(np.shape(train_PCA_2d)))
print('PCA testing shape:  {0}'.format(np.shape(test_PCA_2d)))

PCA training shape: (7326, 40)
PCA testing shape:  (2603, 40)


# Lists of Datasets

In [45]:
# Each row pairs a short dataset label with its training and testing matrix;
# transposing the rows yields three parallel lists that share one ordering.
names, l_train, l_test = map(list, zip(
    ('Cust', train_CustGray_2d, test_CustGray_2d),
    ('OBT', train_OBT_2d, test_OBT_2d),
    ('AMT', train_AMT_2d, test_AMT_2d),
    ('AGT', train_AGT_2d, test_AGT_2d),
    ('PCA', train_PCA_2d, test_PCA_2d),
))

# Container for the per-classifier result dictionaries.
overall_dict = dict()

# Visualize Important Pixels

In [9]:
def important_pixels(mdl, ht, wd, save_f):
    '''
    Draw a model's feature importances as a heat map and save the figure.

    mdl    : object exposing a `feature_importances_` array
    ht, wd : target grid shape; the importances are reshaped to (ht, wd)
    save_f : destination handed to `plt.savefig`

    source: http://scikit-learn.org/stable/auto_examples/ensemble/plot_forest_importances_faces.html#example-ensemble-plot-forest-importances-faces-py
    '''
    # Lay the flat importance vector out on the ht x wd grid.
    heat_map = mdl.feature_importances_.reshape(ht, wd)

    # Render with the "hot" colour map, then write the figure out.
    plt.matshow(heat_map, cmap=plt.cm.hot)
    plt.savefig(save_f)

# Ada Boost Classifier

In [23]:
from sklearn.cross_validation import cross_val_score
from sklearn.ensemble import AdaBoostClassifier

# Attributes
# estimators_ : list of classifiers
# classes_ : array of shape = [n_classes]
# n_classes_ : int
# estimator_weights_ : array of floats
# estimator_errors_ : array of floats
# feature_importances_ : array of shape = [n_features]

# Mean cross-validation score (scaled to percent) of AdaBoost on every
# training matrix, keyed by (label, number of estimators).
ab_dict = {}
n_estimators = 50

for name, dataset in zip(names, l_train):
    ab_clf = AdaBoostClassifier(
        base_estimator=None,
        n_estimators=n_estimators,
        learning_rate=1.0,
        algorithm='SAMME.R',
        random_state=None,
    )
    fold_scores = cross_val_score(ab_clf, dataset, train_labels)
    result_key = ('ada_boost_{0}'.format(name), n_estimators)
    ab_dict[result_key] = 100 * fold_scores.mean()

In [46]:
# Register the AdaBoost scores in the overall results, then display them.
overall_dict.update(ada_boost=ab_dict)
ab_dict

{('ada_boost_AGT', 50): 19.601571674291097,
 ('ada_boost_AMT', 50): 19.901917264897509,
 ('ada_boost_Cust', 50): 20.760998576551458,
 ('ada_boost_OBT', 50): 20.48886398197746,
 ('ada_boost_PCA', 50): 28.022917438729621}

# Bagging Classifier

In [33]:
from sklearn.ensemble import BaggingClassifier

# Attributes
# base_estimator_ : list of estimators
# estimators_ : list of estimators
# estimators_samples_ : list of arrays
# estimators_features_ : list of arrays
# classes_ : array of shape = [n_classes]
# n_classes_ : int or list
# oob_score_ : float
# oob_decision_function_ : array of shape = [n_samples, n_classes]

# Mean cross-validation score (scaled to percent) of a bagging ensemble on
# every training matrix, keyed by dataset label.
bag_dict = {}

for name, dataset in zip(names, l_train):
    bag_clf = BaggingClassifier(
        base_estimator=None,
        n_estimators=10,
        max_samples=1.0,
        max_features=1.0,
        bootstrap=True,
        bootstrap_features=False,
        oob_score=False,
        warm_start=False,
        n_jobs=-1,
        random_state=None,
        verbose=0,
    )
    fold_scores = cross_val_score(bag_clf, dataset, train_labels)
    bag_dict['bagging_{0}'.format(name)] = 100 * fold_scores.mean()

In [47]:
# Register the bagging scores in the overall results, then display them.
overall_dict.update(bagging=bag_dict)
bag_dict

{'bagging_AGT': 51.323460316333801,
 'bagging_AMT': 56.659094020191304,
 'bagging_Cust': 45.304261573046027,
 'bagging_OBT': 59.773823683722483,
 'bagging_PCA': 45.835940750864538}

# Gradient Boosting Classifier - TAKES A LONG TIME TO RUN

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Attributes
# feature_importances_ : array, shape = [n_features]
# oob_improvement_ : array, shape = [n_estimators]
# train_score_ : array, shape = [n_estimators]
# loss_ : LossFunction
# init : BaseEstimator
# estimators_ : ndarray of DecisionTreeRegressor, shape = [n_estimators, loss_.K]
    
# Mean cross-validation score (scaled to percent) of gradient boosting on
# every training matrix, keyed by (label, number of estimators).
gb_dict = {}
n_estimators = 100

for name, dataset in zip(names, l_train):
    gb_clf = GradientBoostingClassifier(
        loss='deviance',
        learning_rate=0.1,
        n_estimators=n_estimators,
        subsample=1.0,
        min_samples_split=2,
        min_samples_leaf=1,
        min_weight_fraction_leaf=0.0,
        max_depth=3,
        init=None,
        random_state=None,
        max_features=None,
        verbose=0,
        max_leaf_nodes=None,
        warm_start=False,
        presort='auto',
    )
    fold_scores = cross_val_score(gb_clf, dataset, train_labels)
    result_key = ('gradient_boost_{0}'.format(name), n_estimators)
    gb_dict[result_key] = 100 * fold_scores.mean()

In [None]:
# Register the gradient-boosting scores in the overall results, then display them.
overall_dict.update(gradient_boost=gb_dict)
gb_dict

# Decision Tree Classifier

In [37]:
from sklearn.tree import DecisionTreeClassifier

# classes_ : array of shape = [n_classes] or a list of such arrays
# feature_importances_ : array of shape = [n_features]
# max_features_ : int,
# n_classes_ : int or list
# n_features_ : int
# n_outputs_ : int
# tree_ : Tree object

# Mean cross-validation score (scaled to percent) of a single decision tree
# on every training matrix, keyed by dataset label.
dtr_dict = {}

for name, dataset in zip(names, l_train):
    dtr_clf = DecisionTreeClassifier(
        criterion='gini',
        splitter='best',
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        min_weight_fraction_leaf=0.0,
        max_features=None,
        random_state=None,
        max_leaf_nodes=None,
        class_weight=None,
        presort=False,
    )
    fold_scores = cross_val_score(dtr_clf, dataset, train_labels)
    dtr_dict['decision_tree_{0}'.format(name)] = 100 * fold_scores.mean()

In [48]:
# Register the decision-tree scores in the overall results, then display them.
overall_dict.update(decision_tree=dtr_dict)
dtr_dict

{'decision_tree_AGT': 36.144703900143284,
 'decision_tree_AMT': 40.171028268996253,
 'decision_tree_Cust': 32.570295247530275,
 'decision_tree_OBT': 45.412963166240502,
 'decision_tree_PCA': 33.170064779635595}

# Extra Trees Classifier

In [39]:
from sklearn.ensemble import ExtraTreesClassifier

# Mean cross-validation score (scaled to percent) of an extra-trees ensemble
# on every training matrix, keyed by dataset label.
etr_dict = {}

for name, dataset in zip(names, l_train):
    etr_clf = ExtraTreesClassifier(
        n_estimators=10,
        criterion='gini',
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        min_weight_fraction_leaf=0.0,
        max_features='auto',
        max_leaf_nodes=None,
        bootstrap=False,
        oob_score=False,
        n_jobs=-1,
        random_state=None,
        verbose=0,
        warm_start=False,
        class_weight=None,
    )
    fold_scores = cross_val_score(etr_clf, dataset, train_labels)
    etr_dict['extra_trees_{0}'.format(name)] = 100 * fold_scores.mean()

In [49]:
# Register the extra-trees scores in the overall results, then display them.
overall_dict.update(extra_trees=etr_dict)
etr_dict

{'extra_trees_AGT': 51.555399075043461,
 'extra_trees_AMT': 57.69721530246126,
 'extra_trees_Cust': 43.993940734755469,
 'extra_trees_OBT': 60.046629152872391,
 'extra_trees_PCA': 38.710884891591277}

# Random Forest Classifier

In [43]:
from sklearn.ensemble import RandomForestClassifier

# Attributes
# estimators_ : list of DecisionTreeClassifier
# classes_ : array of shape = [n_classes] or a list of such arrays
# n_classes_ : int or list
# n_features_ : int
# n_outputs_ : int
# feature_importances_ : array of shape = [n_features]
# oob_score_ : float
# oob_decision_function_ : array of shape = [n_samples, n_classes]
    
# Mean cross-validation score (scaled to percent) of a random forest on
# every training matrix, keyed by dataset label.
rf_dict = {}

for name, dataset in zip(names, l_train):
    rf_clf = RandomForestClassifier(
        n_estimators=10,
        criterion='gini',
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        min_weight_fraction_leaf=0.0,
        max_features='auto',
        max_leaf_nodes=None,
        bootstrap=True,
        oob_score=False,
        n_jobs=-1,
        random_state=None,
        verbose=0,
        warm_start=False,
        class_weight=None,
    )
    fold_scores = cross_val_score(rf_clf, dataset, train_labels)
    rf_dict['random_forrest_{0}'.format(name)] = 100 * fold_scores.mean()

In [50]:
# Register the random-forest scores in the overall results, then display them.
overall_dict.update(random_forrest=rf_dict)
rf_dict

{'random_forrest_AGT': 48.169410711487778,
 'random_forrest_AMT': 54.16189218059845,
 'random_forrest_Cust': 42.478717979674528,
 'random_forrest_OBT': 56.975636870828282,
 'random_forrest_PCA': 42.274185093343}

# KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Fit one k-nearest-neighbours model per k in 1..19 on the AMT training data
# and store its score on the AMT test data as a formatted percentage string.
knn_dict = {}
for k in range(1, 20):
    knn_clf = KNeighborsClassifier(n_neighbors=k)
    knn_mdl = knn_clf.fit(train_AMT_2d, train_labels)
    test_score = knn_mdl.score(test_AMT_2d, test_labels)
    knn_dict[k] = '{0:.2f}%'.format(100 * test_score)
# Once the loop ends, knn_clf / knn_mdl refer to the last model fitted (k = 19).

In [None]:
# The stored scores are strings such as '56.78%'. Compare them by numeric value
# so the winner is not decided by lexicographic string ordering (under which,
# for example, '9.50%' would rank above '56.78%').
max(knn_dict.values(), key=lambda pct: float(pct.rstrip('%')))

In [None]:
# Report how the current knn_mdl (the last model fitted in the loop over k)
# scores on the AMT training and testing data.
knn_train_score = knn_mdl.score(train_AMT_2d, train_labels)
print('Training score is:  {0:.2f}%'.format(100 * knn_train_score))
knn_test_score = knn_mdl.score(test_AMT_2d, test_labels)
print('Test score is: {0:.2f}%'.format(100 * knn_test_score))

# SVM Classifier

In [None]:
# Disabled code: the default-parameter SVC fit on the AMT data is wrapped in a
# string literal, so it is never executed; the string is merely bound to `_`.
_ = '''
svc_clf = SVC()
svc_mdl = svc_clf.fit(train_AMT_2d, train_labels)
'''

In [None]:
# Candidate values for the SVC penalty parameter C and the kernel coefficient gamma.
c = np.array([1., 10., 100., 1000.])
gamma = np.array([0.0001, 0.001, 0.01, 0.1])

# Reference signature:
# sklearn.svm.SVC(C=1.0, kernel='rbf', degree=3, gamma='auto', coef0=0.0, shrinking=True, probability=False,
#                 tol=0.001, cache_size=200, class_weight=None, verbose=False, max_iter=-1,
#                 decision_function_shape=None, ...)

# Test score (scaled to percent) for every (C, gamma) combination, keyed by
# that pair.
svd_dict = {}

for c_val in c:
    for gamma_val in gamma:
        # Hand the grid values to the estimator; with a bare SVC() every key
        # would hold the score of the same default-parameter model.
        svc_clf = SVC(C=c_val, gamma=gamma_val)
        svc_mdl = svc_clf.fit(train_CustGray_2d, train_labels)
        svd_dict[(c_val, gamma_val)] = svc_mdl.score(test_CustGray_2d, test_labels)*100
# Once the loops end, svc_clf / svc_mdl refer to the last combination fitted.

In [None]:
# Highest score stored in svd_dict across all of its (C, gamma) keys.
max(svd_dict.values())

In [None]:
# svc_mdl is fitted on the custom-grayscale training data, so score it on the
# matching custom-grayscale arrays rather than on the AMT features, which come
# from a different preprocessing of the images.
print('Training score is:  {0}%'.format(svc_mdl.score(train_CustGray_2d, train_labels)*100))
print('Test score is: {0}%'.format(svc_mdl.score(test_CustGray_2d, test_labels)*100))

In [None]:
# NOTE(review): `svc_grid`, `train_data` and `test_data` are not assigned in any
# cell visible in this notebook, and `param_grid` is first assigned in the
# logistic-regression cell further down. As written this cell presumably fails
# on a fresh kernel; confirm where these names were meant to come from.

# Print the selected value of every parameter named in param_grid.
best_parameters = svc_grid.best_estimator_.get_params()
for param_name in sorted(param_grid.keys()):
    print('\t{0}: {1}'.format(param_name, best_parameters[param_name]))
    
# Scores and classification reports for svc_mdl on train_data / test_data.
print('Training score is', svc_mdl.score(train_data, train_labels))
print('Test score is', svc_mdl.score(test_data, test_labels))
print('-'*80)
print('Classification report of training data:\n', classification_report(train_labels, svc_mdl.predict(train_data)))
print('-'*80)
print('Classification report of test data:\n', classification_report(test_labels, svc_mdl.predict(test_data)))

# Logistic Regression Classifier

In [None]:
from sklearn.linear_model import LogisticRegression
# Hyper-parameter combinations explored for logistic regression.
param_grid = dict(
    penalty=['l2'],
    class_weight=['balanced'],
    solver=['newton-cg', 'lbfgs', 'sag'],
    multi_class=['ovr', 'multinomial'],
)

# Disabled alternative estimator, kept for reference:
#logr_grid = SGDClassifier(loss='hinge', penalty='l2', alpha=0.001, shuffle=True, verbose=0, 
#                    n_jobs=4, random_state=None, learning_rate='optimal')

# Grid-search over param_grid, fitting on the custom-grayscale training data.
logr_estimator = LogisticRegression(n_jobs=-1)
logr_grid = GridSearchCV(logr_estimator, param_grid)
logr_mdl = logr_grid.fit(train_CustGray_2d, train_labels)

In [None]:
# logr_mdl is fitted on train_CustGray_2d, so evaluate it on the matching
# custom-grayscale arrays. The names train_data / test_data are never assigned
# in this notebook and would raise NameError on a fresh kernel.
# The figures labelled "validation" below are computed on the test split.
print('Training score is', logr_mdl.score(train_CustGray_2d, train_labels))
print('Validation score is', logr_mdl.score(test_CustGray_2d, test_labels))
print('-'*80)
print('Classification report of training data:\n', classification_report(train_labels, logr_mdl.predict(train_CustGray_2d)))
print('-'*80)
print('Classification report of validation data:\n', classification_report(test_labels, logr_mdl.predict(test_CustGray_2d)))