In [2]:
# Standard library
import random
from collections import Counter

# Third-party
import numpy as np

# GridSearchCV used to live in sklearn.grid_search; that module was deprecated
# in scikit-learn 0.18 and removed in 0.20. Prefer the current location and
# fall back to the legacy module so the notebook imports on either generation.
try:
    from sklearn.model_selection import GridSearchCV
except ImportError:
    from sklearn.grid_search import GridSearchCV

from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Suppress every warning for the rest of the session. Note that this also hides
# deprecation and convergence warnings emitted by the models fitted below.
import warnings
warnings.filterwarnings('ignore')

# Load Pickled Labels

In [3]:
# Folder (relative to the notebook) that every array in this notebook is read from.
directory = 'generated_data'

# Read the label vectors for the training and testing splits.
# NOTE(review): the section heading calls these files "pickled"; if they really
# are pickles, recent NumPy releases need allow_pickle=True here -- confirm.
train_labels_path = '{0}/train_labels.dat'.format(directory)
test_labels_path = '{0}/test_labels.dat'.format(directory)
train_labels = np.load(train_labels_path)
test_labels = np.load(test_labels_path)

# Print both shapes as a quick sanity check on the split sizes.
train_labels_shape = train_labels.shape
test_labels_shape = test_labels.shape
print('Training labels shape: {0}'.format(train_labels_shape))
print('Testing labels shape:  {0}'.format(test_labels_shape))

Training labels shape: (7326,)
Testing labels shape:  (2603,)


# Load Custom Conversion Data

In [4]:
# Read the "Custom Conversion" arrays for both splits from `directory`.
custgray_train_path = '{0}/train_CustGray_2d.dat'.format(directory)
custgray_test_path = '{0}/test_CustGray_2d.dat'.format(directory)
train_CustGray_2d = np.load(custgray_train_path)
test_CustGray_2d = np.load(custgray_test_path)

# Report the shape of each array (training first, then testing).
for shape_message, loaded_array in (
    ('Custom Conversion training shape: {0}', train_CustGray_2d),
    ('Custom Conversion testing shape:  {0}', test_CustGray_2d),
):
    print(shape_message.format(loaded_array.shape))

Custom Conversion training shape: (7326, 640)
Custom Conversion testing shape:  (2603, 640)


# Load Otsu's Binarization Threshold Data

In [13]:
# Read the Otsu-threshold arrays for both splits from `directory`.
train_OCAT_2d = np.load(
    '{0}/train_OCAT_2d.dat'.format(directory)
)
test_OCAT_2d = np.load(
    '{0}/test_OCAT_2d.dat'.format(directory)
)

# Report the shape of each array as a sanity check.
otsu_train_shape = train_OCAT_2d.shape
otsu_test_shape = test_OCAT_2d.shape
print('Otsu training shape: {0}'.format(otsu_train_shape))
print('Otsu testing shape:  {0}'.format(otsu_test_shape))

Otsu training shape: (7326, 640)
Otsu testing shape:  (2603, 640)


# Load Adaptive Mean Threshold Data

In [19]:
# Read the adaptive-mean-threshold arrays for both splits from `directory`.
amt_train_path = '{0}/train_AMT_2d.dat'.format(directory)
train_AMT_2d = np.load(amt_train_path)
amt_test_path = '{0}/test_AMT_2d.dat'.format(directory)
test_AMT_2d = np.load(amt_test_path)

# Report the shape of each array as a sanity check.
print('Adaptive mean training shape: {0}'.format(train_AMT_2d.shape))
print('Adaptive mean testing shape:  {0}'.format(test_AMT_2d.shape))

Adaptive mean training shape: (7326, 640)
Adaptive mean testing shape:  (2603, 640)


# KNN

In [23]:
from sklearn.neighbors import KNeighborsClassifier

# Sweep n_neighbors over 1..19 on the adaptive-mean data and record, for each
# value, the test-set accuracy formatted as a percentage string.
# After the loop, knn_clf / knn_mdl still refer to the last model fitted
# (n_neighbors=19), not to the best-scoring one.
knn_dict = {}
for k in range(1, 20):
    knn_clf = KNeighborsClassifier(n_neighbors=k)
    knn_mdl = knn_clf.fit(train_AMT_2d, train_labels)
    knn_test_accuracy = knn_mdl.score(test_AMT_2d, test_labels)
    knn_dict[k] = '{0:.2f}%'.format(knn_test_accuracy*100)

In [33]:
# Best test accuracy across the sweep. The dict values are strings such as
# '63.62%', so a plain max() would compare them character by character
# ('9.50%' would beat '63.62%', and '100.00%' would lose to it). Compare on the
# numeric value instead; the result is still the formatted string.
max(knn_dict.values(), key=lambda pct: float(pct.rstrip('%')))

'63.62%'

In [25]:
# Score the current knn_mdl on both splits of the adaptive-mean data.
# knn_mdl is whichever model the sweep fitted last (n_neighbors=19 if the sweep
# cell ran as written), so the test figure here can be lower than the sweep's best.
knn_train_pct = knn_mdl.score(train_AMT_2d, train_labels)*100
print('Training score is:  {0:.2f}%'.format(knn_train_pct))
knn_test_pct = knn_mdl.score(test_AMT_2d, test_labels)*100
print('Test score is: {0:.2f}%'.format(knn_test_pct))

Training score is:  70.72%
Test score is: 62.89%


# Extra Trees Classifier

In [35]:
from sklearn.ensemble import ExtraTreesClassifier
# Reference signature (kept for convenience):
#ExtraTreesClassifier(n_estimators=10, criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1, 
#                     min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, bootstrap=False, 
#                     oob_score=False, n_jobs=1, random_state=None, verbose=0, warm_start=False, class_weight=None)

# Sweep n_estimators over 1..39 on the Otsu data and record, for each value,
# the test-set accuracy formatted as a percentage string.
# No random_state is passed, so the recorded figures vary from run to run.
# After the loop, extree_clf / extree_mdl still refer to the last model fitted
# (n_estimators=39), not to the best-scoring one.
extree_dict = {}
for n_trees in range(1, 40):
    extree_clf = ExtraTreesClassifier(n_estimators=n_trees)
    extree_mdl = extree_clf.fit(train_OCAT_2d, train_labels)
    extree_test_accuracy = extree_mdl.score(test_OCAT_2d, test_labels)
    extree_dict[n_trees] = '{0:.2f}%'.format(extree_test_accuracy*100)

In [36]:
# Best test accuracy across the sweep. The dict values are strings such as
# '63.73%', so a plain max() would compare them character by character
# ('9.50%' would beat '63.73%', and '100.00%' would lose to it). Compare on the
# numeric value instead; the result is still the formatted string.
max(extree_dict.values(), key=lambda pct: float(pct.rstrip('%')))

'63.73%'

In [37]:
# Score the current extree_mdl on both splits of the Otsu data.
# extree_mdl is whichever model the sweep fitted last (n_estimators=39 if the
# sweep cell ran as written), not necessarily the best-scoring one.
extree_train_pct = extree_mdl.score(train_OCAT_2d, train_labels)*100
print('Training score is:  {0:.2f}%'.format(extree_train_pct))
extree_test_pct = extree_mdl.score(test_OCAT_2d, test_labels)*100
print('Test score is: {0:.2f}%'.format(extree_test_pct))

Training score is:  100.00%
Test score is: 63.31%


# SVM Classifier

In [20]:
# Baseline SVM: default hyper-parameters, trained on the adaptive-mean data.
svc_clf = SVC()
svc_clf.fit(train_AMT_2d, train_labels)
# scikit-learn's fit() returns the estimator itself, so the fitted model and
# the classifier are the same object.
svc_mdl = svc_clf

In [17]:
# Candidate values for the SVC regularisation strength (C) and kernel
# coefficient (gamma).
c = np.array([1., 10., 50., 100., 500., 1000.])
gamma = np.array([0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1])

# Reference signature:
#class sklearn.svm.SVC(C=1.0, kernel='rbf', degree=3, gamma='auto', coef0=0.0, shrinking=True, probability=False,
#            tol=0.001, cache_size=200, class_weight=None, verbose=False, max_iter=-1, decision_function_shape=None,
#            random_state=None)

# The manual sweep below is deliberately disabled by wrapping it in a string
# (it would run len(c) * len(gamma) = 36 SVC fits). The disabled code now
# passes the current C / gamma pair to SVC; previously it built SVC() with
# defaults on every iteration, so every entry of the result dict would have
# been identical once re-enabled.
_ ='''
svd_dict = {}

for x in range(len(c)):
    for y in range(len(gamma)):
        svc_clf = SVC(C=c[x], gamma=gamma[y])
        svc_mdl = svc_clf.fit(train_CustGray_2d, train_labels)
        svd_dict[(c[x],gamma[y])] = svc_mdl.score(test_CustGray_2d, test_labels)*100
'''

In [21]:
# Score the fitted SVC on both splits of the adaptive-mean data.
# The recorded run shows 100% on training versus roughly 19.6% on test, i.e.
# the default-parameter SVC fits the training set perfectly but does not generalise.
svc_train_pct = svc_mdl.score(train_AMT_2d, train_labels)*100
print('Training score is:  {0}%'.format(svc_train_pct))
svc_test_pct = svc_mdl.score(test_AMT_2d, test_labels)*100
print('Test score is: {0}%'.format(svc_test_pct))

Training score is:  100.0%
Test score is: 19.55436035343834%


In [None]:
# Report for an SVC grid search: print the best value found for every searched
# parameter, then the scores and per-class classification reports of svc_mdl.
# NOTE(review): `svc_grid` is not defined in any cell visible in this notebook,
# and `param_grid` is only assigned further down (logistic-regression cell) --
# confirm where the SVC grid search is meant to be created before running this.
best_parameters = svc_grid.best_estimator_.get_params()
for param_name in sorted(param_grid.keys()):
    print('\t{0}: {1}'.format(param_name, best_parameters[param_name]))
    
# NOTE(review): `train_data` / `test_data` are not defined in the visible cells
# either. svc_mdl is fitted on train_AMT_2d above, so presumably the AMT arrays
# are intended here -- confirm.
print('Training score is', svc_mdl.score(train_data, train_labels))
print('Test score is', svc_mdl.score(test_data, test_labels))
print('-'*80)
print('Classification report of training data:\n', classification_report(train_labels, svc_mdl.predict(train_data)))
print('-'*80)
print('Classification report of test data:\n', classification_report(test_labels, svc_mdl.predict(test_data)))

# Logistic Regression Classifier

In [None]:
from sklearn.linear_model import LogisticRegression

# Search space for the logistic-regression grid search: L2 penalty with
# balanced class weights, across three solvers and both multi-class strategies.
param_grid = dict(
    penalty=['l2'],
    class_weight=['balanced'],
    solver=['newton-cg', 'lbfgs', 'sag'],
    multi_class=['ovr', 'multinomial'],
)


# Alternative linear model that was tried and left disabled:
#logr_grid = SGDClassifier(loss='hinge', penalty='l2', alpha=0.001, shuffle=True, verbose=0, 
#                    n_jobs=4, random_state=None, learning_rate='optimal')

# Run the search on the Custom Conversion training data; logr_mdl is the
# object returned by the grid search's fit().
logr_estimator = LogisticRegression(n_jobs=-1)
logr_grid = GridSearchCV(logr_estimator, param_grid)
logr_mdl = logr_grid.fit(train_CustGray_2d, train_labels)

In [None]:
# Scores and per-class reports for the logistic-regression grid search.
# logr_mdl is fitted on train_CustGray_2d, so it is evaluated on the matching
# Custom Conversion arrays. The previous version scored against train_data /
# test_data, which no visible cell in this notebook defines.
print('Training score is', logr_mdl.score(train_CustGray_2d, train_labels))
print('Validation score is', logr_mdl.score(test_CustGray_2d, test_labels))
print('-'*80)
print('Classification report of training data:\n', classification_report(train_labels, logr_mdl.predict(train_CustGray_2d)))
print('-'*80)
print('Classification report of validation data:\n', classification_report(test_labels, logr_mdl.predict(test_CustGray_2d)))

# Decision Tree Classifier

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier

# Single-step pipeline; the step name 'clf' is the prefix every grid key needs.
pipeline = Pipeline([('clf',DecisionTreeClassifier(criterion='entropy',random_state=20000))])

# Search space. Keys must use the '<step>__<param>' form because the estimator
# being searched is the Pipeline, not the tree itself:
#   * 'criterion' is now 'clf__criterion' (the bare key is not a Pipeline
#     parameter and makes the search fail);
#   * min_samples_split starts at 2, since scikit-learn >= 0.18 rejects an
#     integer value of 1.
parameters = {
            'clf__criterion': ['gini', 'entropy'],
            'clf__max_depth': (5, 10, 25, 50, 100),
            'clf__min_samples_split': (2, 3, 4),
            'clf__min_samples_leaf': (1, 2, 3, 4)
            }

# Cross-validated search using all cores, selecting on weighted F1.
# NOTE(review): `train_data` / `test_data` are not defined in any cell visible
# in this notebook -- confirm which of the loaded arrays they should refer to.
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, scoring='f1_weighted')
grid_search.fit(train_data, train_labels)

print('Best score: {0}%'.format(grid_search.best_score_*100))
print('Best parameters set:')

# get_params() on the best pipeline exposes the nested 'clf__*' names, so the
# grid keys can be looked up directly.
best_parameters = grid_search.best_estimator_.get_params()

for param_name in sorted(parameters.keys()):
    print('\t{0}: {1}'.format(param_name, best_parameters[param_name]))

# Per-class report of the refitted best pipeline on the held-out data.
predictions = grid_search.predict(test_data)
print(classification_report(test_labels, predictions))