In [47]:
%matplotlib inline
# packages to load 
# Check the versions of libraries
# Python version
import warnings
warnings.filterwarnings('ignore')
import sys
print('Python: {}'.format(sys.version))

import scipy
print('scipy: {}'.format(scipy.__version__))

import csv
import numpy as np
from sklearn.metrics import mean_squared_error
import pandas as pd
from matplotlib import pyplot as plt

import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold

from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score

# Importing metrics for evaluation
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
import random as rn
import tensorflow as tf
from keras import backend as K
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LeakyReLU
from keras.optimizers import Adam
from keras.layers import Dropout
from keras import regularizers
from keras import initializers
from keras.callbacks import (TensorBoard, EarlyStopping)

import math

TRAIN_FILE_PATH = "data/X_train.csv"
TARGET_FILE_PATH =  "data/y_train.csv"
TEST_FILE_PATH = "data/X_test.csv"

seed=42
np.random.seed(seed)
rn.seed(seed)
session_conf = tf.ConfigProto(intra_op_parallelism_threads=1,
                              inter_op_parallelism_threads=1)



# The below tf.set_random_seed() will make random number generation
# in the TensorFlow backend have a well-defined initial state.
# For further details, see:
# https://www.tensorflow.org/api_docs/python/tf/set_random_seed

tf.set_random_seed(seed)

sess = tf.Session(graph=tf.get_default_graph(), config=session_conf)
K.set_session(sess)



Python: 3.6.6 |Anaconda, Inc.| (default, Jun 28 2018, 17:14:51) 
[GCC 7.2.0]
scipy: 1.1.0


In [2]:
#Load train and test set

train_data = pd.read_csv(TRAIN_FILE_PATH)
train_data.drop("id", axis=1, inplace=True)

Y_train = pd.read_csv(TARGET_FILE_PATH)
Y_train.drop("id", axis=1, inplace = True)

test_data =  pd.read_csv(TEST_FILE_PATH)
id_test = test_data.columns[0]
test_data.drop("id", axis=1, inplace=True)

In [57]:
## ================ FUNCTION DEFS ================ ##

#Zero mean unit variance for train and test data
def scale_data(train, test):
    
    print("Train shape: ", train.shape)
    print("Test shape: ",test.shape)
    
    scaler = StandardScaler().fit(train, Y_train)
    train = scaler.transform(train)
    test = scaler.transform(test)
   
    return train, test

def make_submission(filename, predictions):
    test_data =  pd.read_csv(TEST_FILE_PATH)
    test_data["y"] = predictions
    test_data[["id", "y"]].to_csv("submissions/"+filename, index= False)
    
def as_keras_metric(method):
    import functools
    from keras import backend as K
    import tensorflow as tf
    @functools.wraps(method)
    def wrapper(self, args, **kwargs):
        """ Wrapper for turning tensorflow metrics into keras metrics """
        value, update_op = method(self, args, **kwargs)
        K.get_session().run(tf.local_variables_initializer())
        with tf.control_dependencies([update_op]):
            value = tf.identity(value)
        return value
    return wrapper

def weighted_categorical_crossentropy(weights):
    """
    A weighted version of keras.objectives.categorical_crossentropy
    
    Variables:
        weights: numpy array of shape (C,) where C is the number of classes
    
    Usage:
        weights = np.array([0.5,2,10]) # Class one at 0.5, class 2 twice the normal weights, class 3 10x.
        loss = weighted_categorical_crossentropy(weights)
        model.compile(loss=loss,optimizer='adam')
    """
    
    weights = K.variable(weights)
        
    def loss(y_true, y_pred):
        # scale predictions so that the class probas of each sample sum to 1
        y_pred /= K.sum(y_pred, axis=-1, keepdims=True)
        # clip to prevent NaN's and Inf's
        y_pred = K.clip(y_pred, K.epsilon(), 1 - K.epsilon())
        # calc
        loss = y_true * K.log(y_pred) * weights
        loss = -K.sum(loss, -1)
        return loss
    
    return loss

@as_keras_metric
def bmac_metric(Y_true, Y_pred):
    return tf.metrics.mean_per_class_accuracy(Y_true, Y_pred, 3)


# define baseline model
def baseline_model(weights):
    # create model
    optimizer = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)
    #model.add(LeakyReLU(alpha=0.1))
    # model.add(Dropout(rate = dropout))

    #model.add(Dense(1, kernel_regularizer = regularizers.l1(lambda_reg)))
    
    
    nn = Sequential()
    nn.add(Dense(512, input_dim=1000, activation='relu'))
    nn.add(Dense(256, activation='relu'))
    nn.add(Dense(128, activation='relu'))
    nn.add(Dense(64,activation='relu'))
    nn.add(Dense(3, activation='softmax'))
    # Compile model
   

    nn.compile(loss=weighted_categorical_crossentropy(weights), 
               optimizer=optimizer, 
               metrics=[bmac_metric])
    nn.summary()
    return nn

In [4]:
X_train_scaled, X_test_scaled = scale_data(train_data, test_data)

Train shape:  (4800, 1000)
Test shape:  (4100, 1000)


# Softmax

In [39]:
BMAC = []

class_weights = np.array([])

# encode class values as integers
encoder = OneHotEncoder(handle_unknown='ignore')
encoder.fit(Y_train.values)
encoded_Y = encoder.transform(Y_train.values).toarray()

num_epochs = 25




kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)

for train, test in kfold.split(X_train_scaled, Y_train.values):
    X = X_train_scaled[train]
    Y = encoded_Y[train]
    
    # compute the class weights
    class_weights = np.array([np.sum(Y_train.values == 0) / X_train_scaled.shape[0], 
                          np.sum(Y_train.values == 1) / X_train_scaled.shape[0], 
                          np.sum(Y_train.values == 2) / X_train_scaled.shape[0] ])
    
    model = baseline_model(class_weights)
    # Fit the model
    model.fit(x=X, y=Y, epochs=num_epochs, verbose=0, validation_split=0.05, shuffle=True, 
              steps_per_epoch=100, initial_epoch=0, validation_steps=5)
    
    X_test = X_train_scaled[test]
    Y_true = Y_train.values[test]
    Y_pred = [[p] for p in  model.predict_classes(X_test)]
    
    cur_BMAC = balanced_accuracy_score(Y_true, Y_pred)
    print(f"Current BMAC score {cur_BMAC}")
    BMAC.append(cur_BMAC)
    
print("AVG: BMAC score: %.4f (+/- %.4f)" % (np.mean(BMAC), np.std(BMAC)))

[0.125 0.75  0.125]
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_43 (Dense)             (None, 20)                20020     
_________________________________________________________________
dense_44 (Dense)             (None, 3)                 63        
Total params: 20,083
Trainable params: 20,083
Non-trainable params: 0
_________________________________________________________________
Train on 4104 samples, validate on 216 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
Current BMAC score 0.5601851851851851
[0.125 0.75  0.125]
_________________________________________________________________
Layer (type)                 Output Shape           

Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
Current BMAC score 0.6203703703703703
[0.125 0.75  0.125]
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_47 (Dense)             (None, 20)                20020     
_________________________________________________________________
dense_48 (Dense)             (None, 3)                 63        
Total params: 20,083
Trainable params: 20,083
Non-trainable params: 0
_________________________________________________________________
Train on 4104 samples, validate on 216 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
Current BMAC score 0.5722222222222222
[0.125 0.75  0.125]
________________________________

Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
Current BMAC score 0.6222222222222222
[0.125 0.75  0.125]
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_51 (Dense)             (None, 20)                20020     
_________________________________________________________________
dense_52 (Dense)             (None, 3)                 63        
Total params: 20,083
Trainable params: 20,083
Non-trainable params: 0
_________________________________________________________________
Train on 4104 samples, validate on 216 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
Current BMAC score 0.610185185

Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
Current BMAC score 0.6342592592592592
[0.125 0.75  0.125]
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_55 (Dense)             (None, 20)                20020     
_________________________________________________________________
dense_56 (Dense)             (None, 3)                 63        
Total params: 20,083
Trainable params: 20,083
Non-trainable params: 0
_________________________________________________________________
Train on 4104 samples, validate on 216 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 

Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
Current BMAC score 0.6064814814814815
[0.125 0.75  0.125]
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_59 (Dense)             (None, 20)                20020     
_________________________________________________________________
dense_60 (Dense)             (None, 3)                 63        
Total params: 20,083
Trainable params: 20,083
Non-trainable params: 0
_________________________________________________________________
Train on 4104 samples, validate on 216 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/

Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
Current BMAC score 0.5703703703703704
AVG: BMAC score: 0.6006 (+/- 0.0258)
Process finsihed!


In [62]:
# use all data for training
class_weights = np.array([np.sum(Y_train.values == 0) / X_train_scaled.shape[0], 
                          np.sum(Y_train.values == 1) / X_train_scaled.shape[0], 
                          np.sum(Y_train.values == 2) / X_train_scaled.shape[0] ])

model = baseline_model(class_weights)

# tf board call back
tbCallBack = TensorBoard(log_dir='./logs', histogram_freq=0, write_graph=True, write_images=True)
earlyStop = EarlyStopping(monitor='val_bmac_metric', min_delta=0.0001, patience=50, verbose=1, mode='auto')


# Fit the model
model.fit(x=X_train_scaled, y=encoded_Y, epochs=60, verbose=1, validation_split=0.05, shuffle=True, 
          steps_per_epoch=100, initial_epoch=0, validation_steps=5, 
         callbacks=[tbCallBack])

pred = model.predict_classes(X_test_scaled)
make_submission('ax_SOFTMAX.csv', pred)
print("Process finsihed!")

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_111 (Dense)            (None, 512)               512512    
_________________________________________________________________
dense_112 (Dense)            (None, 256)               131328    
_________________________________________________________________
dense_113 (Dense)            (None, 128)               32896     
_________________________________________________________________
dense_114 (Dense)            (None, 64)                8256      
_________________________________________________________________
dense_115 (Dense)            (None, 3)                 195       
Total params: 685,187
Trainable params: 685,187
Non-trainable params: 0
_________________________________________________________________
Train on 4560 samples, validate on 240 samples
Epoch 1/60
Epoch 2/60
Epoch 3/60
Epoch 4/60
Epoch 5/60
Epoch 6/60
Epoch 7/60
Epoch 8/60
E

Epoch 48/60
Epoch 49/60
Epoch 50/60
Epoch 51/60
Epoch 52/60
Epoch 53/60
Epoch 54/60
Epoch 55/60
Epoch 56/60
Epoch 57/60
Epoch 58/60
Epoch 59/60
Epoch 60/60
Process finsihed!


# K-Nearest Neighbours

In [21]:
# K-Nearest Neighbours
from sklearn.neighbors import KNeighborsClassifier
BMAC = []

kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
for train, test in kfold.split(X_train_scaled, Y_train.values):
    X = X_train_scaled[train]
    Y = Y_train.values[train]
    model = KNeighborsClassifier(n_neighbors=8)
    model.fit(X, Y)
    
    X_test = X_train_scaled[test]
    Y_true = Y_train.values[test]
    Y_pred = model.predict(X_test)

    # Summary of the predictions made by the classifier
    #print(classification_report(X_test, Y_pred))
    #print(confusion_matrix(Y_test, Y_pred))
    # Accuracy score
    cur_BMAC = balanced_accuracy_score(Y_true, Y_pred)
    BMAC.append(cur_BMAC)
    
print("AVG: BMAC score: %.4f (+/- %.4f)" % (np.mean(BMAC), np.std(BMAC)))
pred = model.predict(X_test_scaled)
make_submission('ax_knn_8.csv', pred)

Balanced Multi Class Accuracy is 0.5481481481481482
Balanced Multi Class Accuracy is 0.5481481481481482
Balanced Multi Class Accuracy is 0.5675925925925926
Balanced Multi Class Accuracy is 0.5750000000000001
Balanced Multi Class Accuracy is 0.5944444444444444
Balanced Multi Class Accuracy is 0.5805555555555556
Balanced Multi Class Accuracy is 0.5425925925925926
Balanced Multi Class Accuracy is 0.5472222222222222
Balanced Multi Class Accuracy is 0.5907407407407407
Balanced Multi Class Accuracy is 0.5157407407407408
AVG: BMAC score: 0.5610 (+/- 0.0235)


# Naive Bayes

In [25]:
from sklearn.naive_bayes import GaussianNB
BMAC = []

kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
for train, test in kfold.split(X_train_scaled, Y_train.values):
    X = X_train_scaled[train]
    Y = Y_train.values[train]
    model = GaussianNB()
    model.fit(X, Y)
    
    X_test = X_train_scaled[test]
    Y_true = Y_train.values[test]
    Y_pred = model.predict(X_test)

    # Summary of the predictions made by the classifier
    #print(classification_report(X_test, Y_pred))
    #print(confusion_matrix(Y_test, Y_pred))
    # Accuracy score
    cur_BMAC = balanced_accuracy_score(Y_true, Y_pred)
    BMAC.append(cur_BMAC)
    
print("AVG: BMAC score: %.4f (+/- %.4f)" % (np.mean(BMAC), np.std(BMAC)))
pred = model.predict(X_test_scaled)
make_submission('ax_naive_bayes.csv', pred)

Balanced Multi Class Accuracy is 0.575925925925926
Balanced Multi Class Accuracy is 0.6388888888888888
Balanced Multi Class Accuracy is 0.6055555555555556
Balanced Multi Class Accuracy is 0.6175925925925926
Balanced Multi Class Accuracy is 0.6166666666666667
Balanced Multi Class Accuracy is 0.6694444444444444
Balanced Multi Class Accuracy is 0.5953703703703704
Balanced Multi Class Accuracy is 0.5638888888888889
Balanced Multi Class Accuracy is 0.6092592592592593
Balanced Multi Class Accuracy is 0.6277777777777778
AVG: BMAC score: 0.6120 (+/- 0.0288)


# Support Vector Machine

In [28]:
from sklearn.svm import SVC
BMAC = []

kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
for train, test in kfold.split(X_train_scaled, Y_train.values):
    X = X_train_scaled[train]
    Y = Y_train.values[train]
    model = SVC()
    model.fit(X, Y)
    
    X_test = X_train_scaled[test]
    Y_true = Y_train.values[test]
    Y_pred = model.predict(X_test)

    # Summary of the predictions made by the classifier
    #print(classification_report(X_test, Y_pred))
    #print(confusion_matrix(Y_test, Y_pred))
    # Accuracy score
    cur_BMAC = balanced_accuracy_score(Y_true, Y_pred)
    BMAC.append(cur_BMAC)
    
print("AVG: BMAC score: %.4f (+/- %.4f)" % (np.mean(BMAC), np.std(BMAC)))
pred = model.predict(X_test_scaled)
make_submission('ax_SVC.csv', pred)

AVG: BMAC score: 0.5726 (+/- 0.0234)


# MLP Classifier

In [30]:
from sklearn.neural_network import MLPClassifier
BMAC = []

kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
for train, test in kfold.split(X_train_scaled, Y_train.values):
    X = X_train_scaled[train]
    Y = Y_train.values[train]
    model = MLPClassifier()
    model.fit(X, Y)
    
    X_test = X_train_scaled[test]
    Y_true = Y_train.values[test]
    Y_pred = model.predict(X_test)

    # Summary of the predictions made by the classifier
    #print(classification_report(X_test, Y_pred))
    #print(confusion_matrix(Y_test, Y_pred))
    # Accuracy score
    cur_BMAC = balanced_accuracy_score(Y_true, Y_pred)
    BMAC.append(cur_BMAC)
    
print("AVG: BMAC score: %.4f (+/- %.4f)" % (np.mean(BMAC), np.std(BMAC)))
pred = model.predict(X_test_scaled)
make_submission('ax_MLP.csv', pred)

AVG: BMAC score: 0.6327 (+/- 0.0214)


# RandomForestClassifier

In [33]:
from sklearn.ensemble import RandomForestClassifier
BMAC = []

kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
for train, test in kfold.split(X_train_scaled, Y_train.values):
    X = X_train_scaled[train]
    Y = Y_train.values[train]
    model = RandomForestClassifier()
    model.fit(X, Y)
    
    X_test = X_train_scaled[test]
    Y_true = Y_train.values[test]
    Y_pred = model.predict(X_test)

    # Summary of the predictions made by the classifier
    #print(classification_report(X_test, Y_pred))
    #print(confusion_matrix(Y_test, Y_pred))
    # Accuracy score
    cur_BMAC = balanced_accuracy_score(Y_true, Y_pred)
    BMAC.append(cur_BMAC)
    
print("AVG: BMAC score: %.4f (+/- %.4f)" % (np.mean(BMAC), np.std(BMAC)))
pred = model.predict(X_test_scaled)
make_submission('ax_RFC.csv', pred)

AVG: BMAC score: 0.5089 (+/- 0.0234)
