# Multilayer Perceptron w Pre Processing


In [1]:
# global
import pandas as pd
import numpy as np
import random

# preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# classification
from keras.wrappers.scikit_learn import KerasClassifier
from keras.models import Sequential
from keras.layers import Dense
from sklearn.model_selection import GridSearchCV

import csv
#from sklearn.model_selection import cross_val_score
#from sklearn.model_selection import KFold

import matplotlib.pyplot as plt
import matplotlib as matplotlib
matplotlib.rcParams['backend'] = "Qt4Agg"

seed = 150389

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# import data from files
train_df = pd.read_csv("training.csv", delimiter=",", header=0)
test_df = pd.read_csv("testing.csv", delimiter=",", header=0)
confidence_df = pd.read_csv("annotation_confidence.csv", delimiter=",", header=0)
additional_df = pd.read_csv("additional_training.csv", delimiter=",", header=0)
proportions_df = pd.read_csv("test_proportions.csv", delimiter=",", header=None, dtype="unicode")

In [3]:
# concatenate the training and additional data
all_df = pd.concat([train_df, additional_df], ignore_index=True)

# get lists of confident and unconfident IDs
unconfident_list = confidence_df[confidence_df.confidence != 1]['ID'].tolist()
confident_list = confidence_df[confidence_df.confidence == 1]['ID'].tolist()

In [4]:
confidence = []
with open('../annotation_confidence.csv') as csvfile:
    reader = csv.DictReader(csvfile, delimiter=",")
    #confidence={int(row['ID'])-1:int(float(row['confidence'])) for row in reader} #float(row['confidence'])
    for row in reader:
        confidence.append(float(row['confidence']))
#confidence

# Fill NaN

## Option1:Fill with [ALL] average

## Option2:Fill with [Confidence and Prediction] averages
- Split data two fold based on confidence and prediction
    - Confident prediction 0
    - Confident prediction 1
    - Unconfident prediction 0
    - Unconfident prediction 1
- Fill NaN of each with mean
- Update the additional dataframe with new data (filled nan values)

In [5]:
# get confident all 0 and 1 prediction averages
confident_all_df = all_df[all_df['ID'].isin(confident_list)]
confident_0_all_df = confident_all_df[confident_all_df.prediction == 0]
confident_1_all_df = confident_all_df[confident_all_df.prediction == 1]
confident_0_avg = pd.DataFrame(confident_0_all_df.mean(axis=0)[1:-1]).T
confident_1_avg = pd.DataFrame(confident_1_all_df.mean(axis=0)[1:-1]).T
# get unconfident all 0 and 1 prediction averages
unconfident_all_df =  all_df[all_df['ID'].isin(unconfident_list)]
unconfident_0_all_df = unconfident_all_df[unconfident_all_df.prediction == 0]
unconfident_1_all_df = unconfident_all_df[unconfident_all_df.prediction == 1]
unconfident_0_avg = pd.DataFrame(unconfident_0_all_df.mean(axis=0)[1:-1]).T
unconfident_1_avg = pd.DataFrame(unconfident_1_all_df.mean(axis=0)[1:-1]).T

# fill confident additional data nan values with averages
confident_additional_df = additional_df[additional_df['ID'].isin(confident_list)].copy()
confident_0_additional_df = confident_additional_df[confident_additional_df.prediction == 0]
confident_1_additional_df = confident_additional_df[confident_additional_df.prediction == 1]
confident_0_additional_df = confident_0_additional_df.fillna(value=confident_0_avg.iloc[0])
confident_1_additional_df = confident_1_additional_df.fillna(value=confident_1_avg.iloc[0])
# fill unconfident additional data nan values with averages
unconfident_additional_df = additional_df[additional_df['ID'].isin(unconfident_list)].copy()
unconfident_0_additional_df = unconfident_additional_df[unconfident_additional_df.prediction == 0]
unconfident_1_additional_df = unconfident_additional_df[unconfident_additional_df.prediction == 1]
unconfident_0_additional_df = unconfident_0_additional_df.fillna(value=unconfident_0_avg.iloc[0])
unconfident_1_additional_df = unconfident_1_additional_df.fillna(value=unconfident_1_avg.iloc[0])

# update confident additional dataframe with new confident 0 and 1 values (filled nan values)
confident_additional_df.update(confident_0_additional_df)
confident_additional_df.update(confident_1_additional_df)
# update unconfident additional dataframe with new unconfident 0 and 1 values (filled nan values)
unconfident_additional_df.update(unconfident_0_additional_df)
unconfident_additional_df.update(unconfident_1_additional_df)

# update main additional dataframe with new confident and unconfident values 
additional_df.update(confident_additional_df)
additional_df.update(unconfident_additional_df)

# Update dataset

In [6]:
# re-create main dataframe with filled nan values
all_df = pd.concat([train_df,additional_df], ignore_index=True)

# Gather Data

In [7]:
# true training and testing data
raw_train_X = pd.DataFrame(all_df.drop(['ID', 'prediction'], axis=1))
raw_train_y = pd.DataFrame(all_df['prediction'])
raw_test_X = pd.DataFrame(test_df.drop(['ID'], axis=1))

In [8]:
# splits dataframe into feature sets
def split_features(df):
    CNN_features = pd.DataFrame(df[col] for col in df if 'CNN' in col).T
    GIST_features = pd.DataFrame(df[col] for col in df if 'GIST' in col).T
    return CNN_features, GIST_features

In [9]:
# produces IDs for 20% of the data chosen randomly
count = 0
rand_rows = []
while count <= len(raw_train_X)*0.2:
    r = random.randint(0, len(raw_train_X)-1)
    if(not r in rand_rows): #ensure value not already in array
        rand_rows.append(r)
        count+=1 #only increase if new value found
rand_rows.sort() #visual representation only

# split training data into its own training and testing data to test accuracy
train_train_X  = pd.DataFrame(all_df[~all_df['ID'].isin([rn+1 for rn in rand_rows])].drop(['ID'], axis=1)) #75% of the data randomly selected for training
train_train_y  = pd.DataFrame(train_train_X['prediction']) #80% of the predictions for training
train_train_X = train_train_X.drop(['prediction'], axis=1)

test_train_X   = pd.DataFrame(all_df[all_df['ID'].isin([rn+1 for rn in rand_rows])].drop(['ID'], axis=1)) #25% of the data randomly selected for testing
test_train_y   = pd.DataFrame(test_train_X['prediction']) #20% of the preditions to test accuracy
test_train_X = test_train_X.drop(['prediction'], axis=1)

# split the featuresets of each for preprocessing
train_X_CNN, train_X_GIST = split_features(train_train_X)
test_X_CNN, test_X_GIST = split_features(test_train_X)
raw_train_X_CNN, raw_train_X_GIST = split_features(raw_train_X)
raw_test_X_CNN, raw_test_X_GIST = split_features(raw_test_X)

# Pre-Processing

In [10]:
t1 = train_X_CNN[:2]

# Standardization
sc_cnn=StandardScaler()
train_X_CNN = pd.DataFrame(sc_cnn.fit_transform(train_X_CNN))
test_X_CNN = pd.DataFrame(sc_cnn.transform(test_X_CNN))
sc_gist=StandardScaler()
train_X_GIST = pd.DataFrame(sc_gist.fit_transform(train_X_GIST))
test_X_GIST = pd.DataFrame(sc_gist.transform(test_X_GIST))
raw_sc_cnn=StandardScaler()
raw_train_X_CNN = pd.DataFrame(sc_cnn.fit_transform(raw_train_X_CNN))
raw_test_X_CNN = pd.DataFrame(sc_cnn.transform(raw_test_X_CNN))
raw_sc_gist=StandardScaler()
raw_train_X_GIST = pd.DataFrame(sc_cnn.fit_transform(raw_train_X_GIST))
raw_test_X_GIST = pd.DataFrame(sc_cnn.transform(raw_test_X_GIST))

t2 = train_X_CNN[:2]

variance = 0.95

# Principle Component Analysis - reduce dimensionality
pca_cnn = PCA(variance)
train_X_CNN = pd.DataFrame(pca_cnn.fit_transform(train_X_CNN))
test_X_CNN = pd.DataFrame(pca_cnn.transform(test_X_CNN))
pca_gist = PCA(variance)
train_X_GIST = pd.DataFrame(pca_gist.fit_transform(train_X_GIST))
test_X_GIST = pd.DataFrame(pca_gist.transform(test_X_GIST))
raw_pca_cnn = PCA(variance)
raw_train_X_CNN = pd.DataFrame(raw_pca_cnn.fit_transform(raw_train_X_CNN))
raw_test_X_CNN = pd.DataFrame(raw_pca_cnn.transform(raw_test_X_CNN))
raw_pca_gist = PCA(variance)
raw_train_X_GIST = pd.DataFrame(raw_pca_gist.fit_transform(raw_train_X_GIST))
raw_test_X_GIST = pd.DataFrame(raw_pca_gist.transform(raw_test_X_GIST))

t3 = train_X_CNN[:2]

display(t1, t2, t3)

# Rejoin after standardisation and pca
train_X = pd.concat([train_X_CNN.T, train_X_GIST.T]).T
test_X = pd.concat([test_X_CNN.T, test_X_GIST.T]).T
train_y = train_train_y
test_y = test_train_y
raw_train_X = pd.concat([raw_train_X_CNN.T, raw_train_X_GIST.T]).T
raw_test_X = pd.concat([raw_test_X_CNN.T, raw_test_X_GIST.T]).T

Unnamed: 0,CNNs,CNNs.1,CNNs.2,CNNs.3,CNNs.4,CNNs.5,CNNs.6,CNNs.7,CNNs.8,CNNs.9,...,CNNs.4086,CNNs.4087,CNNs.4088,CNNs.4089,CNNs.4090,CNNs.4091,CNNs.4092,CNNs.4093,CNNs.4094,CNNs.4095
3,0.1349,0.10963,0.0,0.0,0.0,0.15531,0.90697,0.0,0.0,0.05017,...,0.50117,1.2053,0.0,0.018203,0.0,0.0,0.46354,0.034842,0.0,0.0
4,0.0,0.25908,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2483,...,0.3087,0.0,0.0,0.15407,0.0,0.0,0.0,0.10123,0.090716,0.0


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4086,4087,4088,4089,4090,4091,4092,4093,4094,4095
0,-0.018498,-0.312467,-0.296098,-0.365514,-0.350873,-0.101281,1.465069,-0.613027,-0.614757,-0.6756,...,0.417071,2.620707,-0.458957,-0.556035,-0.288868,-0.513946,0.121359,-0.340261,-0.529798,-0.313022
1,-0.499521,0.014733,-0.296098,-0.365514,-0.350873,-0.53872,-0.743173,-0.613027,-0.614757,-0.272096,...,0.012707,-0.620012,-0.458957,-0.164007,-0.288868,-0.513946,-0.7752,-0.132216,-0.242797,-0.313022


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1168,1169,1170,1171,1172,1173,1174,1175,1176,1177
0,2.434089,-0.607244,19.977936,1.257532,7.033781,-19.318415,5.053727,8.468327,-13.054121,-10.514458,...,-0.841866,0.059119,-0.051674,0.541086,-0.814655,-0.334156,0.705097,-0.92682,-0.250778,0.578427
1,-8.244583,8.942845,-21.509884,7.222161,-5.649318,-4.680635,0.570581,-8.577338,3.402354,3.877572,...,0.037243,-0.367154,0.316129,0.493634,0.054989,-0.355752,-0.825594,-0.345571,0.140269,-0.478581


In [11]:
print(train_X.shape)
print(train_y.shape)
print(test_X.shape)
print(test_y.shape)
print(raw_test_X.shape)

(2071, 1472)
(2071, 1)
(519, 1472)
(519, 1)
(2818, 1665)


In [12]:
train_y[:3]

Unnamed: 0,prediction
3,1.0
4,1.0
6,1.0


In [13]:
class_weight = {}
for i in range(0, len(train_y)):
    if int(train_y.iloc[i]) == 0:
        class_weight[i] = {0:confidence[i], 1:1-confidence[i]}
    else:
        class_weight[i] = {0:1-confidence[i], 1:confidence[i]}
#print(confidence[:5])
#print(class_weight)

# Classification

## Option1: Train model on training split

## Option2: Train model on raw training

In [14]:
# create function for MLP classifier
def create_model(neurons1, neurons2):
    model = Sequential()
    model.add(Dense(neurons1, input_dim=len(raw_train_X.iloc[0]), activation='sigmoid'))
    model.add(Dense(neurons2, activation='sigmoid'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer="Adam", metrics=['accuracy'])
    return model
    #model.summary()

# create keras classifier
estimator = KerasClassifier(build_fn=create_model, verbose=0)

In [15]:
epochs = [10]
batch_size = [25]
neurons1 = [64]
neurons2 = [64]
param_grid = dict(epochs=epochs, batch_size=batch_size, neurons1=neurons1, neurons2=neurons2)
grid = GridSearchCV(estimator=estimator, param_grid = param_grid, n_jobs=1, return_train_score=True)

In [16]:
grid_result = grid.fit(raw_train_X, raw_train_y)

In [17]:
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.800000 using {'batch_size': 25, 'epochs': 10, 'neurons1': 64, 'neurons2': 64}
0.800000 (0.007593) with: {'batch_size': 25, 'epochs': 10, 'neurons1': 64, 'neurons2': 64}


In [18]:
prediction = grid.predict(raw_test_X)

In [19]:
submission = pd.DataFrame(test_df['ID'])
submission['prediction'] = [int(x[0]) for x in prediction]
submission[:5]

Unnamed: 0,ID,prediction
0,1,1
1,2,0
2,3,0
3,4,0
4,5,0


In [20]:
#submission.to_csv('outputpredictions.csv', index=False)

In [None]:
#Kaggle results: CameL / cl497