In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn as sk
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.metrics import classification_report, roc_curve
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import svm
import os
from skimage.transform import resize
from skimage import io
import cv2
import imageio
import pickle
import warnings
from tqdm import tqdm
import multiprocessing as multi

In [2]:
###  Load Data from Pickled Files  ###
neg_df = pd.read_pickle('neg_df.pkl')
pos_df = pd.read_pickle('pos_df.pkl')

In [3]:
###  Data splitting and preparing for modeling  ###
   ###  Don't run until dataset is assembled  ###

#make balanced classes by sampling the negative dataset
sliced_neg = neg_df.sample(pos_df.shape[0])
df = pos_df.append(sliced_neg)

In [4]:
# for i in range(3072):
#     df[i]=df[i].astype('float64')
# df.dtypes

In [5]:
X_unscaled = df.drop(columns = 'Edel', axis=1)
y = df.Edel
scaler = StandardScaler()
X = scaler.fit_transform(X_unscaled)
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.3, train_size = 0.7)

In [6]:
###  Run this cell in AWS, takes forever and clogs memory  ###

# options = {'degree':[1,2,3,4,5], 'C':np.logspace(-3,2,25)}
# clf = GridSearchCV(svm.SVC(kernel='poly'), param_grid=options, cv=5, n_jobs=-1)
# clf.fit(X_train,y_train)

In [None]:
polysvc = svm.SVC(kernel = 'poly', degree = 2)
polysvc.fit(X_train,y_train)
print(polysvc.score(X_test,y_test))
print('polysvm', classification_report(y_test, polysvc.predict(X_test)))

In [None]:
pickle.dump(polysvc,'topmodel.pkl')

In [10]:
logreg = LogisticRegression(C=1)
logreg.fit(X_train,y_train)
print(logreg.score(X_test,y_test))
print('logreg', classification_report(y_test, logreg.predict(X_test)))

0.9305348978130337
logreg              precision    recall  f1-score   support

          0       0.90      0.97      0.93      9091
          1       0.96      0.90      0.93      9062

avg / total       0.93      0.93      0.93     18153



In [11]:
lsvc = svm.LinearSVC(C=0.001)
lsvc.fit(X_train,y_train)
print(lsvc.score(X_test,y_test))
print('lsvm', classification_report(y_test, lsvc.predict(X_test)))

0.9345562716906296
lsvm              precision    recall  f1-score   support

          0       0.90      0.98      0.94      9091
          1       0.98      0.89      0.93      9062

avg / total       0.94      0.93      0.93     18153



In [12]:
svc = svm.SVC(kernel = 'rbf', gamma = 0.0001)
svc.fit(X_train,y_train)
print(svc.score(X_test,y_test))
print('rbfsvm', classification_report(y_test, svc.predict(X_test)))

0.9953175783617033
rbfsvm              precision    recall  f1-score   support

          0       0.99      1.00      1.00      9091
          1       1.00      0.99      1.00      9062

avg / total       1.00      1.00      1.00     18153



In [None]:
###  Experimentation with CNNs and Keras  ###
###  Code taken from Keras Documentation  ###

from keras.applications.inception_v3 import InceptionV3
from keras.preprocessing import image
from keras.models import Model
from keras.layers import Dense, GlobalAveragePooling2D
from keras import backend as K

# create the base pre-trained model
base_model = InceptionV3(weights='imagenet', include_top=False)

# add a global spatial average pooling layer
x = base_model.output
x = GlobalAveragePooling2D()(x)
# let's add a fully-connected layer
x = Dense(1024, activation='relu')(x)
# and a logistic layer -- let's say we have 200 classes
predictions = Dense(2, activation='softmax')(x)

# this is the model we will train
model = Model(inputs=base_model.input, outputs=predictions)

# first: train only the top layers (which were randomly initialized)
# i.e. freeze all convolutional InceptionV3 layers
for layer in base_model.layers:
    layer.trainable = False

# compile the model (should be done *after* setting layers to non-trainable)
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')

# train the model on the new data for a few epochs
model.fit_generator()

# at this point, the top layers are well trained and we can start fine-tuning
# convolutional layers from inception V3. We will freeze the bottom N layers
# and train the remaining top layers.

# let's visualize layer names and layer indices to see how many layers
# we should freeze:
for i, layer in enumerate(base_model.layers):
    print(i, layer.name)

# we chose to train the top 2 inception blocks, i.e. we will freeze
# the first 249 layers and unfreeze the rest:
for layer in model.layers[:249]:
    layer.trainable = False
for layer in model.layers[249:]:
    layer.trainable = True

# we need to recompile the model for these modifications to take effect
# we use SGD with a low learning rate
from keras.optimizers import SGD
model.compile(optimizer=SGD(lr=0.0001, momentum=0.9), loss='categorical_crossentropy')

# we train our model again (this time fine-tuning the top 2 inception blocks
# alongside the top Dense layers
model.fit_generator(...)