In [1]:
import os,sys
import numpy as np
import pandas as pd
import ibmseti
import collections
import scipy.io
import matplotlib.pyplot as plt
import commonutils as cu
import PIL
from PIL import Image
import sklearn
import h5py
from sklearn import svm

Using TensorFlow backend.


In [2]:
import keras
from keras import backend as K
# Select the 'tf' image dimension ordering on the Keras backend before any
# model is built (presumably channels-last, i.e. (rows, cols, channels) --
# TODO confirm against the installed Keras version).
K.set_image_dim_ordering('tf')
from keras.applications.vgg16 import VGG16
from keras.preprocessing import image
from keras.applications.vgg16 import preprocess_input
from keras.models import Model
from keras.utils import np_utils

In [3]:
from sklearn.decomposition import PCA

In [4]:
def runLinSVMModel(dataset,C,nDataset,modeltype,printReports=True,gamma=None):
    """Standardise a dataset, fit a support-vector model on it and evaluate it.

    Parameters
    ----------
    dataset : dict
        Must provide 'x_train', 'y_train', 'x_test' and 'y_test'.
        NOTE: the two feature arrays are standardised IN PLACE (no copy is
        made, to keep memory usage down), so the caller's arrays are modified.
    C : float
        Regularisation constant; the estimator is given ``C / nDataset``.
    nDataset : int
        Divisor applied to ``C`` (callers in this notebook pass the number of
        training samples).
    modeltype : str
        'linSVM' -- ``svm.LinearSVC`` with balanced class weights.
        'linSVR' -- ``svm.LinearSVR``; predictions are rounded before the
                    classification metrics are computed.
        'rbfSVM' -- ``svm.SVC`` with balanced class weights and
                    one-vs-rest decision function.
    printReports : bool
        When true, print the reports, confusion matrices, test accuracy,
        test MSE (on the un-rounded predictions) and the correlation between
        test labels and predictions.
    gamma : optional
        Forwarded to ``svm.SVC`` for 'rbfSVM' only.  When None the estimator's
        own default is used.

    Returns
    -------
    dict
        Keys 'train_report', 'train_confmat', 'test_report', 'test_confmat',
        'train_score' and 'test_score' (the scores are whatever the fitted
        estimator's ``score`` method returns).

    Raises
    ------
    ValueError
        If ``modeltype`` is not one of the supported names.  This is checked
        before the data is touched, so a bad call leaves ``dataset`` intact.
    """
    supported = ('linSVM', 'linSVR', 'rbfSVM')
    if modeltype not in supported:
        raise ValueError("Unknown modeltype %r; expected one of: %s"
                         % (modeltype, ', '.join(supported)))

    # Imported explicitly: a bare `import sklearn` does not guarantee that the
    # `sklearn.metrics` submodule has been loaded.
    from sklearn import metrics

    x_train = dataset['x_train']
    y_train = dataset['y_train']
    x_test = dataset['x_test']
    y_test = dataset['y_test']

    # Scaling training and test data with statistics of the training split only
    means = np.mean(x_train,axis=0)
    stddev = np.std(x_train,axis=0)
    # Preventing zero division
    stddev[stddev<1e-3] = 1
    x_train -= means
    x_train /= stddev
    x_test -= means
    x_test /= stddev

    # float() guards against Python 2 integer division silently producing C == 0
    scaled_C = float(C) / nDataset

    if modeltype == 'linSVM':
        model = svm.LinearSVC(C=scaled_C,verbose=True,class_weight='balanced')
    elif modeltype == 'linSVR':
        # Regression variant: real-valued outputs, rounded to labels below
        model = svm.LinearSVR(C=scaled_C,verbose=True)
    else:  # 'rbfSVM'
        svc_kwargs = dict(C=scaled_C,verbose=True,class_weight='balanced',
                          decision_function_shape='ovr')
        # Only override the estimator's default gamma when one was supplied
        if gamma is not None:
            svc_kwargs['gamma'] = gamma
        model = svm.SVC(**svc_kwargs)
    model.fit(x_train, y_train)

    # Raw model outputs are kept for the MSE; rounded ones feed the label metrics
    raw_train = model.predict(x_train)
    raw_test = model.predict(x_test)
    if modeltype == 'linSVR':
        pred_train = np.round(raw_train)
        pred_test = np.round(raw_test)
    else:
        pred_train = raw_train
        pred_test = raw_test

    train_report = metrics.classification_report(y_train,pred_train)
    test_report = metrics.classification_report(y_test,pred_test)

    train_confmat = metrics.confusion_matrix(y_train,pred_train)
    test_confmat = metrics.confusion_matrix(y_test,pred_test)

    if printReports:
        # Single-argument print(...) behaves identically on Python 2 and 3
        print(train_report)
        print(train_confmat)
        print(test_report)
        print(test_confmat)

        print("Classification accuracy: %0.2f" % metrics.accuracy_score(y_test,pred_test))
        print("MSE: %0.2f" % np.mean(np.square(y_test - raw_test)))
        print("Predictions correlation: %0.2f" % np.corrcoef(y_test,pred_test,rowvar=0)[0,1])

    result = {'train_report':train_report,'train_confmat':train_confmat,
              'test_report':test_report,'test_confmat':test_confmat,
              'train_score':model.score(x_train,y_train),
              'test_score':model.score(x_test,y_test) }
    return result

In [5]:
def plot_cumulative_variance(pca):
    """Plot and return the running sum of ``pca.explained_variance_ratio_``.

    ``pca`` is any object exposing an iterable ``explained_variance_ratio_``.
    The cumulative values are drawn with ``plt.plot`` / ``plt.show`` and
    returned as a list (empty when there are no ratios).
    """
    cumulative = []
    for ratio in pca.explained_variance_ratio_:
        # The first entry is the ratio itself; every later entry adds the
        # current ratio to the previous running total.
        cumulative.append(ratio + cumulative[-1] if cumulative else ratio)
    plt.plot(cumulative)
    plt.show()
    return cumulative

In [6]:
# Load the 5-1 activations file restricted to a subset of the classes.
# subsetClasses is handed to the loader as a mapping (presumably
# original label -> new label; see cu.datautils.loadDataset to confirm).
subsetClasses = {
    0.0: 0.0,
    2.0: 1.0,
    3.0: 2.0,
    5.0: 3.0,
}
dataset = cu.datautils.loadDataset(
    "data/activations-5-1.h5",
    subsetClasses=subsetClasses,
)

Dim of data: 114688
Number of training images = 7211
Number of validation images = 1832
Number of test images = 943
Distribution in training images: 
0 - 1471 
1 - 1419 
2 - 2888 
3 - 1433 
4 - 0
Distribution in validation images: 
0 - 343 
1 - 376 
2 - 723 
3 - 390 
4 - 0
Distribution in test images: 
0 - 184 
1 - 202 
2 - 384 
3 - 173 
4 - 0


In [7]:
## Loading in and preparing datasets
# Pull every split out of the loaded dataset dict
x_train, y_train = dataset['x_train'], dataset['y_train']
x_val, y_val = dataset['x_val'], dataset['y_val']
x_test, y_test = dataset['x_test'], dataset['y_test']
num_val = x_val.shape[0]
num_test = x_test.shape[0]
nb_classes = 4

# Per-feature statistics come from the training split only
means = np.mean(x_train, axis=0)
stddev = np.std(x_train, axis=0)
# Preventing zero division
stddev[stddev < 1e-3] = 1
# Standardise all three splits in place with the training statistics
# (the arrays stored in `dataset` are the same objects, so they change too)
for _split in (x_train, x_val, x_test):
    _split -= means
    _split /= stddev

In [8]:
## Parameter search for SVM
# Ten candidate values 10**u with u drawn uniformly from [-3, 0).
# NOTE(review): the draw is unseeded and candidates are compared on the test
# split; consider seeding and selecting on the validation split instead.
C_values = np.power(10, np.random.uniform(-3, 0, 10))
train_scores, test_scores = [], []
for C_val in C_values:
    a = runLinSVMModel(dataset, C_val, len(dataset['y_train']),
                       modeltype='linSVM', printReports=False)
    train_scores.append(a['train_score'])
    test_scores.append(a['test_score'])
    print("Results for C={}:".format(C_val))
    print("Train score={}   Test score={}".format(a['train_score'], a['test_score']))

[LibLinear]Results for C=0.0125764032241:
Train score=0.862432394952   Test score=0.556733828208
[LibLinear]Results for C=0.122401203647:
Train score=1.0   Test score=0.582184517497
[LibLinear]Results for C=0.0806160965178:
Train score=1.0   Test score=0.582184517497
[LibLinear]Results for C=0.00135991310012:
Train score=0.580502010817   Test score=0.498409331919
[LibLinear]Results for C=0.0244526945527:
Train score=0.958674247677   Test score=0.574761399788
[LibLinear]Results for C=0.314272072656:
Train score=1.0   Test score=0.574761399788
[LibLinear]Results for C=0.00166646363381:
Train score=0.596311191236   Test score=0.504772004242
[LibLinear]Results for C=0.297857114389:
Train score=1.0   Test score=0.577942735949
[LibLinear]Results for C=0.530394019778:
Train score=1.0   Test score=0.575821845175
[LibLinear]Results for C=0.150764541357:
Train score=1.0   Test score=0.577942735949


In [None]:
# Fit and report a linear SVM on the currently loaded dataset with C = 0.1
a = runLinSVMModel(
    dataset,
    C=1e-1,
    nDataset=len(dataset['y_train']),
    modeltype='linSVM',
)

In [None]:
# Display the (train score, test score) pair of the last run
a['train_score'], a['test_score']

In [24]:
# 4-19: load the activations file restricted to a subset of the classes.
# subsetClasses is handed to the loader as a mapping (presumably
# original label -> new label; see cu.datautils.loadDataset to confirm).
subsetClasses = {
    0.0: 0.0,
    2.0: 1.0,
    3.0: 2.0,
    5.0: 3.0,
}
dataset = cu.datautils.loadDataset(
    "data/activations-4-19.h5",
    subsetClasses=subsetClasses,
)

Dim of data: 114688
Number of training images = 7208
Number of validation images = 1866
Number of test images = 913
Distribution in training images: 
0 - 1472 
1 - 1431 
2 - 2871 
3 - 1434 
4 - 0
Distribution in validation images: 
0 - 348 
1 - 406 
2 - 742 
3 - 370 
4 - 0
Distribution in test images: 
0 - 178 
1 - 160 
2 - 382 
3 - 193 
4 - 0


In [25]:
## Loading in and preparing datasets
# Pull every split out of the loaded dataset dict
x_train, y_train = dataset['x_train'], dataset['y_train']
x_val, y_val = dataset['x_val'], dataset['y_val']
x_test, y_test = dataset['x_test'], dataset['y_test']
num_val = x_val.shape[0]
num_test = x_test.shape[0]
nb_classes = 4

# Per-feature statistics come from the training split only
means = np.mean(x_train, axis=0)
stddev = np.std(x_train, axis=0)
# Preventing zero division
stddev[stddev < 1e-3] = 1
# Standardise all three splits in place with the training statistics
# (the arrays stored in `dataset` are the same objects, so they change too)
for _split in (x_train, x_val, x_test):
    _split -= means
    _split /= stddev

In [26]:
# Fit and report a linear SVM on the currently loaded dataset with C = 0.1
a = runLinSVMModel(
    dataset,
    C=1e-1,
    nDataset=len(dataset['y_train']),
    modeltype='linSVM',
)

[LibLinear]             precision    recall  f1-score   support

        0.0       0.57      1.00      0.73      1472
        1.0       0.96      0.86      0.91      1431
        2.0       1.00      0.77      0.87      2871
        3.0       0.98      0.78      0.87      1434

avg / total       0.90      0.84      0.85      7208

[[1472    0    0    0]
 [ 197 1234    0    0]
 [ 594   45 2213   19]
 [ 319    0    0 1115]]
             precision    recall  f1-score   support

        0.0       0.48      0.98      0.64       178
        1.0       0.52      0.63      0.57       160
        2.0       0.87      0.54      0.67       382
        3.0       0.89      0.53      0.66       193

avg / total       0.73      0.64      0.64       913

[[174   0   0   4]
 [ 27 101  31   1]
 [ 77  90 207   8]
 [ 87   3   1 102]]
Classification accuracy: 0.64
MSE: 1.42
Predictions correlation: 0.48


In [27]:
# Display the (train score, test score) pair of the last run
a['train_score'], a['test_score']

(0.83712541620421754, 0.63964950711938662)

In [32]:
# Fit a 1000-component PCA on the training features and project both the
# training and the test features onto it.
pca = PCA(n_components=1000)
pca.fit(dataset['x_train'])
dataset_pca = {
    # Training features must come from x_train so that they line up with
    # y_train (previously the test features were transformed here by mistake).
    'x_train': pca.transform(dataset['x_train']),
    # Read from `dataset` rather than a notebook-level x_test variable so the
    # cell does not depend on which preparation cell ran last.
    'x_test': pca.transform(dataset['x_test']),
    'y_train': dataset['y_train'],
    'y_test': dataset['y_test'],
}
# Only a cell's last expression is displayed, so show both shapes together
(dataset_pca['x_train'].shape, dataset_pca['x_test'].shape)

MemoryError: 

In [None]:
# Fit and report a linear SVM on the PCA-projected features with C = 0.1;
# C is still divided by the number of training labels in `dataset`
a = runLinSVMModel(
    dataset_pca,
    C=1e-1,
    nDataset=len(dataset['y_train']),
    modeltype='linSVM',
)

In [None]:
# ## Read in all files from directory and combine them into train/val/test datasets
# dataset = cu.datautils.loadDataset("data/activations-4-19.h5")

In [8]:
# ## Loading in and preparing datasets
# x_train = dataset['x_train']
# y_train = dataset['y_train']
# x_val = dataset['x_val']
# y_val = dataset['y_val']
# x_test = dataset['x_test']
# y_test = dataset['y_test']
# num_val = dataset['x_val'].shape[0]
# num_test = dataset['x_test'].shape[0]
# nb_classes = 7

# # Scaling training and test data
# means = np.mean(x_train,axis=0)
# stddev = np.std(x_train,axis=0)
# # Preventing zero division
# stddev[stddev<1e-3] = 1
# x_train -= means
# x_train /= stddev
# x_val -= means
# x_val /= stddev
# x_test -= means
# x_test /= stddev

MemoryError: 

In [4]:
## Loading in and preparing datasets
# Pull every split out of the loaded dataset dict
x_train, y_train = dataset['x_train'], dataset['y_train']
x_val, y_val = dataset['x_val'], dataset['y_val']
x_test, y_test = dataset['x_test'], dataset['y_test']
num_val = x_val.shape[0]
num_test = x_test.shape[0]
nb_classes = 7

# Per-feature statistics come from the training split only
means = np.mean(x_train, axis=0)
stddev = np.std(x_train, axis=0)
# Preventing zero division
stddev[stddev < 1e-3] = 1
# Standardise all three splits in place with the training statistics
# (the arrays stored in `dataset` are the same objects, so they change too)
for _split in (x_train, x_val, x_test):
    _split -= means
    _split /= stddev