In [85]:
%matplotlib inline
import csv
import numpy as np
from sklearn.metrics import mean_squared_error
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.feature_selection import RFE, f_regression
from sklearn.linear_model import (LinearRegression, Ridge, Lasso, RandomizedLasso)
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import (RandomForestRegressor,RandomForestClassifier, IsolationForest)
from sklearn.neural_network import MLPRegressor
from sklearn.feature_selection import f_regression, f_classif
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
import random as rn
import tensorflow as tf
from keras import backend as K
from sklearn.model_selection import StratifiedKFold
import math
from sklearn.neural_network import MLPClassifier
from sklearn.feature_selection import chi2


#Keras import

from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LeakyReLU
from keras.optimizers import *
from keras.layers import Dropout
from keras import regularizers
from keras import initializers
from keras.losses import *


# Locations of the training features, the training labels and the test features.
TRAIN_FILE_PATH = "data/X_train.csv"
TARGET_FILE_PATH =  "data/y_train.csv"
TEST_FILE_PATH = "data/X_test.csv"

# One seed value is shared by NumPy, Python's `random` module and TensorFlow.
seed=42
np.random.seed(seed)
rn.seed(seed)
# Restrict TensorFlow to a single thread for both intra-op and inter-op work.
# NOTE(review): presumably done so that runs are reproducible — confirm.
session_conf = tf.ConfigProto(intra_op_parallelism_threads=1,
                              inter_op_parallelism_threads=1)



# The below tf.set_random_seed() will make random number generation
# in the TensorFlow backend have a well-defined initial state.
# For further details, see:
# https://www.tensorflow.org/api_docs/python/tf/set_random_seed

tf.set_random_seed(seed)

# Build a session on the default graph with the single-threaded config above
# and register it as the session used by the Keras backend.
sess = tf.Session(graph=tf.get_default_graph(), config=session_conf)
K.set_session(sess)



In [86]:
# Load the train features, train targets and test features.
# The first column of every CSV is removed after loading.

X_train = pd.read_csv(TRAIN_FILE_PATH)
X_train = X_train.drop(columns=X_train.columns[0])

Y_train = pd.read_csv(TARGET_FILE_PATH)
Y_train = Y_train.drop(columns=Y_train.columns[0])

X_test = pd.read_csv(TEST_FILE_PATH)
# NOTE(review): this stores the *label* of the first column, not the values
# in that column — confirm that is what is wanted.
id_test = X_test.columns[0]
X_test = X_test.drop(columns=id_test)

In [87]:
#Helper functions

def from_class_to_vec(y_list):
    """Turn class labels 0 / 1 / 2 into one-hot rows.

    Prints the number of labels received, then maps
    0 -> [1, 0, 0], 1 -> [0, 1, 0] and 2 -> [0, 0, 1].
    A value equal to none of the three classes is skipped, so the result
    can have fewer rows than ``y_list`` has entries.

    Args:
        y_list: sequence of class labels.

    Returns:
        np.ndarray built from the collected one-hot rows.
    """
    print("Length of list passed: ",len(y_list))

    # (class value, one-hot row) pairs, checked in this order for every label.
    one_hot_by_class = (
        (0.0, [1,0,0]),
        (1.0, [0,1,0]),
        (2.0, [0,0,1]),
    )
    encoded_rows = [
        list(one_hot)
        for label in y_list
        for class_value, one_hot in one_hot_by_class
        if label == class_value
    ]
    return np.array(encoded_rows)


def count_labels(labels_list):
    """Print how many entries of ``labels_list`` equal 0, 1 and 2.

    Three lines are printed, in this order: the count of 0s, the count of
    1s and the count of 2s. Values equal to none of the three are ignored.

    Args:
        labels_list: iterable of class labels.
    """
    counts = [0, 0, 0]
    # Iterate over the argument itself. The previous version looped over the
    # global name `labels`, so the parameter was silently ignored and the
    # function failed with NameError when no such global existed.
    for value in labels_list:
        for class_id in (0, 1, 2):
            if value == class_id:
                counts[class_id] += 1

    for count in counts:
        print(count)
    
def make_submission(filename, predictions):
    """Write ``predictions`` next to the test ids as a CSV in ``submissions/``.

    The test CSV at TEST_FILE_PATH is read again, ``predictions`` is stored
    in a new column "y", and only the "id" and "y" columns are written,
    without the DataFrame index.

    Args:
        filename: name of the file to create inside ``submissions/``.
        predictions: one predicted value per row of the test CSV.
    """
    submission = pd.read_csv(TEST_FILE_PATH)
    submission["y"] = predictions

    output_path = "submissions/"+filename
    id_and_prediction = submission[["id", "y"]]
    id_and_prediction.to_csv(output_path, index=False)

In [88]:
# One-hot encode the training labels found in the "y" column of Y_train.
y_labels = Y_train['y'].tolist()
Y_vec_train = from_class_to_vec(y_labels)

print("Probabilistc vector Y shape: ",Y_vec_train.shape)

Length of list passed:  4800
Probabilistc vector Y shape:  (4800, 3)


In [89]:
#Standardize train and test data using the mean and variance of the training set
def scale_data(train, test):
    """Standardize ``train`` and ``test`` with statistics fitted on ``train``.

    Prints the shape of both inputs, fits a StandardScaler on ``train`` only
    and applies that same scaler to both sets.

    Args:
        train: training feature matrix (must expose ``.shape``).
        test: test feature matrix with the same columns as ``train``.

    Returns:
        Tuple ``(train_scaled, test_scaled)`` as returned by
        ``StandardScaler.transform``.
    """
    print("Train shape: ", train.shape)
    print("Test shape: ",test.shape)

    # Fit on the features alone. The previous version also passed the global
    # Y_train as `y`, which StandardScaler ignores; that made the function
    # depend on notebook state for no benefit.
    scaler = StandardScaler().fit(train)
    return scaler.transform(train), scaler.transform(test)

In [90]:
# Standardize the features: scale_data fits the scaler on X_train and applies
# it to both X_train and X_test.
# Disabled variant that used a balanced training set:
#X_train_balanced_scaled, X_test_scaled = scale_data(X_train_balanced, X_test)
X_train_scaled, X_test_scaled = scale_data(X_train, X_test)

Train shape:  (4800, 1000)
Test shape:  (4100, 1000)


In [91]:
import math

# Number of input features given to the first Dense layer; the scaled
# training data printed above has 1000 columns.
input_dimensions = 1000
# L2 penalty factor used for the kernel regularizer of every Dense layer.
lambda_reg = 0.1
# Rate passed to every Dropout layer; a rate of 0 means no units are dropped.
dropout = 0


def coeff_determination(y_true, y_pred):
    """Coefficient of determination built from Keras backend ops.

    Computes ``1 - SS_res / (SS_tot + epsilon)`` where ``SS_res`` is the sum
    of squared differences between ``y_true`` and ``y_pred`` and ``SS_tot``
    is the sum of squared deviations of ``y_true`` from its mean.
    """
    from keras import backend as K
    residual_sq_sum = K.sum(K.square(y_true - y_pred))
    centered_truth = y_true - K.mean(y_true)
    total_sq_sum = K.sum(K.square(centered_truth))
    # K.epsilon() keeps the denominator non-zero when SS_tot is 0.
    ratio = residual_sq_sum / (total_sq_sum + K.epsilon())
    return (1 - ratio)

def weighted_loss(y_true, y_pred):
    """Exponential loss with per-class weights.

    Returns the sum over all entries of ``exp((y_true - y_pred) * w)`` with
    the weights ``w = [2, 1, 2]`` applied along the last axis.
    """
    from keras import backend as K
    # Disabled alternative weighting: [0.7, 1, 0.7]
    class_weights = K.variable(value=[2,1,2])
    weighted_error = (y_true - y_pred) * class_weights
    return K.sum(K.exp(weighted_error))

def bmac(y_true, y_pred):
    """Mean per-class accuracy over 3 classes.

    Both inputs are reduced to class indices with an argmax over the last
    axis and cast to float32, then handed to
    ``tf.metrics.mean_per_class_accuracy`` with ``num_classes=3``.

    NOTE(review): TF1 ``tf.metrics.*`` functions appear to return a
    ``(value, update_op)`` pair rather than a single tensor — confirm before
    passing this function to ``model.compile(metrics=...)``.
    """
    true_classes = K.cast(K.argmax(y_true, axis=-1), dtype='float32')
    predicted_classes = K.cast(K.argmax(y_pred, axis=-1), dtype='float32')

    return tf.metrics.mean_per_class_accuracy(
        true_classes,
        predicted_classes,
        num_classes=3,
    )

"""

def exponetial_loss(y_true, y_pred)
    #loss = sum(loss_vec)
    loss = math.exp(y_true[0]-y_pred[0]*weights[0])+math.exp(y_true[1]-y_pred[1]*weights[1])+math.exp(y_true[2]-y_pred[2]*weights[2])
    return loss
"""

optimizer = Adam(lr=0.001)
print("Input dimensions: ", input_dimensions)

# Architecture of the hidden stack: identical blocks of
# Dense -> LeakyReLU -> Dropout.
hidden_units = 300
num_hidden_layers = 4
# NOTE(review): LeakyReLU with alpha=1 has slope 1 on both sides of zero,
# i.e. it is the identity function, so the hidden stack is a composition of
# linear maps. The value is kept unchanged here so results stay the same;
# confirm whether a smaller alpha (e.g. 0.1) was intended.
leaky_alpha = 1

model = Sequential()
for layer_index in range(num_hidden_layers):
    # Only the first layer declares the input size; every later layer takes
    # its input width from the previous layer's output. The previous version
    # passed input_dim=1000 to all four layers, which was misleading because
    # layers 2-4 actually receive 300 inputs.
    input_kwargs = {"input_dim": input_dimensions} if layer_index == 0 else {}
    model.add(Dense(hidden_units,
                    kernel_regularizer=regularizers.l2(lambda_reg),
                    **input_kwargs))
    model.add(LeakyReLU(alpha=leaky_alpha))
    model.add(Dropout(rate=dropout))

# A disabled variant with two additional 500-unit hidden blocks used to live
# here inside a string literal; extend num_hidden_layers / hidden_units
# instead of copy-pasting layers.

# Output layer: one softmax probability per class.
model.add(Dense(3, kernel_regularizer = regularizers.l2(lambda_reg), activation = 'softmax'))

model.compile(loss=weighted_loss, optimizer=optimizer, metrics=['accuracy'])#[bmac])
    

Input dimensions:  1000


In [92]:
num_epochs = 20

# Train on the standardized features against the one-hot targets, holding
# out the last 10% of the rows for validation.
# NOTE(review): steps_per_epoch / validation_steps are combined here with
# in-memory arrays and no batch_size — confirm that 50 steps per epoch is the
# batching that is actually intended.
model.fit(
    x=X_train_scaled,
    y=Y_vec_train,
    epochs=num_epochs,
    verbose=1,
    validation_split=0.1,
    shuffle=True,
    steps_per_epoch=50,
    initial_epoch=0,
    validation_steps=5,
)

Train on 4320 samples, validate on 480 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f43002d4470>

In [93]:
p = model.predict(X_test_scaled)

# The predicted class of each test row is the index of its largest output.
labels = [np.argmax(vec) for vec in p]
print(len(labels))
count_labels(labels)
print(labels)


4100
920
2554
626
[0, 0, 1, 1, 0, 1, 1, 2, 1, 1, 0, 1, 1, 1, 1, 2, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 2, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 2, 1, 1, 2, 0, 0, 1, 1, 1, 1, 1, 1, 2, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 2, 0, 1, 0, 1, 2, 0, 2, 1, 1, 1, 0, 2, 2, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 2, 1, 0, 1, 1, 2, 1, 1, 1, 2, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 2, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 2, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 2, 1, 1, 0, 0, 1, 1, 2, 1, 0, 2, 1, 1, 2, 1, 2, 0, 0, 1, 0, 2, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 2, 0, 1, 0, 1, 1, 1, 1, 1, 2, 1, 1, 2, 2, 1, 1, 2, 1, 0, 1, 1, 0, 1, 1, 2, 1, 2, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 2, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 2, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 

In [38]:
# Write the predicted labels to submissions/<filename> via make_submission.
# NOTE(review): this cell's execution count (38) is lower than that of the
# cells above (85-93), so its last run predates them — re-run the notebook
# top to bottom before trusting the written file.
make_submission("NN_4layer_300units_weight09109_077val_acc.csv", labels)
