In [15]:
%matplotlib inline
import csv
import numpy as np
from sklearn.metrics import mean_squared_error
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.feature_selection import RFE, f_regression
from sklearn.linear_model import (LinearRegression, Ridge, Lasso, RandomizedLasso)
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import (RandomForestRegressor, IsolationForest)
from sklearn.neural_network import MLPRegressor
from sklearn.feature_selection import f_regression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
import random as rn
import tensorflow as tf
from keras import backend as K
from sklearn.model_selection import StratifiedKFold
import math

from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LeakyReLU
from keras.optimizers import Adam
from keras.layers import Dropout
from keras import regularizers
from keras import initializers

import warnings
warnings.filterwarnings("ignore")

# Relative paths of the CSV inputs read by the cells below.
TRAIN_FILE_PATH = "data/X_train.csv"
TARGET_FILE_PATH =  "data/y_train.csv"
TEST_FILE_PATH = "data/X_test.csv"

# One seed shared by NumPy, Python's `random` module and TensorFlow.
seed=42
np.random.seed(seed)
rn.seed(seed)
# Session config that limits TensorFlow to one thread for both
# intra-op and inter-op parallelism.
session_conf = tf.ConfigProto(intra_op_parallelism_threads=1,
                              inter_op_parallelism_threads=1)



# The below tf.set_random_seed() will make random number generation
# in the TensorFlow backend have a well-defined initial state.
# For further details, see:
# https://www.tensorflow.org/api_docs/python/tf/set_random_seed

tf.set_random_seed(seed)

# Build a session on the default graph with the config above and register it
# as the session Keras uses.
sess = tf.Session(graph=tf.get_default_graph(), config=session_conf)
K.set_session(sess)



# Define all functions we need

In [16]:
# Impute missing values in train and test with the training-set column means
def fill_NaN(train, test):
    """Replace NaNs in both frames with the column means of ``train``.

    The means are computed on ``train`` only and reused for ``test``, so no
    statistic of the test set enters the imputation.

    Args:
        train: DataFrame used to compute the per-column means.
        test: DataFrame filled with the means computed on ``train``.

    Returns:
        Tuple ``(train, test)`` of new, filled DataFrames; the inputs are
        left untouched.
    """
    print("Filling all nan values in the data with the mean")
    column_means = train.mean()
    filled_train, filled_test = (
        frame.fillna(column_means) for frame in (train, test)
    )
    return filled_train, filled_test

def remove_outliers(X_train, Y_train):
    """Drop the rows that an IsolationForest does not label as inliers.

    The forest is fitted on ``X_train``; only rows with a positive label are
    kept, in both the features and the targets.

    Side effect: an ``"outliers"`` column holding the forest's labels is
    added in place to the ``X_train`` and ``Y_train`` objects passed in.
    Only the returned frames have that column removed again.

    Args:
        X_train: feature DataFrame the forest is fitted on.
        Y_train: target DataFrame with one row per row of ``X_train``.

    Returns:
        Tuple ``(X_train, Y_train)`` restricted to the kept rows.
    """
    forest = IsolationForest(
        behaviour='new',
        max_samples=100,
        random_state=np.random.RandomState(seed),
        contamination='auto',
    )
    labels = forest.fit_predict(X_train)
    n_flagged = (labels < 0).sum()
    print("Removed {} Outliers".format(n_flagged))

    # Tag both inputs with the labels (this mutates the caller's frames).
    X_train["outliers"] = labels
    Y_train["outliers"] = labels

    # Filter on the tag, then strip it from the frames handed back.
    kept_X = X_train[X_train["outliers"] > 0].drop(["outliers"], axis=1)
    kept_Y = Y_train[Y_train["outliers"] > 0].drop(["outliers"], axis=1)
    return kept_X, kept_Y

# Zero mean, unit variance for train and test data
def scale_data(train, test):
    """Standardise ``train`` and ``test`` with statistics fitted on ``train``.

    A ``StandardScaler`` is fitted on ``train`` only and then applied to both
    inputs, so the test set is scaled with the training mean and variance.

    The scaler is fitted without a target: ``StandardScaler.fit`` ignores
    ``y``, and passing the notebook-level ``Y_train`` made this function
    depend on a global that may not exist (or may not match ``train``).

    Args:
        train: array-like of shape (n_samples, n_features) used for fitting.
        test: array-like with the same columns as ``train``.

    Returns:
        Tuple ``(train, test)`` with the transformed data as returned by
        ``StandardScaler.transform``.
    """
    scaler = StandardScaler().fit(train)
    return scaler.transform(train), scaler.transform(test)

def coeff_determination(y_true, y_pred):
    """Keras metric: coefficient of determination (R^2) of a batch.

    Computes ``1 - SS_res / (SS_tot + epsilon)`` with Keras backend ops, where
    ``SS_res`` is the sum of squared residuals and ``SS_tot`` the sum of
    squared deviations of ``y_true`` from its mean. ``K.epsilon()`` guards
    against division by zero when ``y_true`` is constant.

    Args:
        y_true: backend tensor of target values.
        y_pred: backend tensor of predictions.

    Returns:
        Backend tensor holding the R^2 value.
    """
    from keras import backend as K

    residual_sum_squares = K.sum(K.square(y_true - y_pred))
    deviation_from_mean = y_true - K.mean(y_true)
    total_sum_squares = K.sum(K.square(deviation_from_mean))
    return 1 - residual_sum_squares / (total_sum_squares + K.epsilon())

# Load the training and test data

In [2]:
# Read the training features, the target and the test features. The first
# column of every file is dropped; for the test file its name is kept in
# id_test before it is removed.
train_data = pd.read_csv(TRAIN_FILE_PATH)
train_data = train_data.drop(columns=[train_data.columns[0]])

Y_train = pd.read_csv(TARGET_FILE_PATH)
Y_train = Y_train.drop(columns=[Y_train.columns[0]])

test_data = pd.read_csv(TEST_FILE_PATH)
id_test = test_data.columns[0]
test_data = test_data.drop(columns=[id_test])

# Handle NaN values

In [4]:
# Impute NaNs in both sets with the column means of the training set.
train_data_mean, test_data_mean = fill_NaN(train_data, test_data)

Filling all nan values in the data with the mean


# Handle possible outliers

In [8]:
# Work on copies: remove_outliers tags the frames it receives with an
# "outliers" column. Passing the originals left that column behind in
# train_data_mean, so re-running this cell fed it to the forest as a feature.
train_data_mean_no_outlier, Y_train_no_outlier = remove_outliers(
    train_data_mean.copy(), Y_train.copy()
)

Removed 3 Outliers


# Scale the data

In [11]:
# Standardise both sets with a scaler fitted on the outlier-free training rows.
X_train_scaled, X_test_scaled = scale_data(train_data_mean_no_outlier, test_data_mean)

# Extract feature importance of Random Forest & take the union with the top f_regression features

In [13]:
# How many top-ranked features to keep from each of the two rankings.
n_features_fr = 150
n_features_rf = 100

# scikit-learn expects a 1-d target. Flatten the target frame once instead of
# relying on an implicit column-vector conversion (assumes a single target
# column, which f_regression requires anyway).
y_selection = np.ravel(Y_train_no_outlier)

# Ranking 1: univariate F-statistic of each feature against the target.
features_scores = f_regression(X_train_scaled, y_selection)[0]
indices_fr = np.argsort(features_scores)[-n_features_fr:][::-1]

# Ranking 2: feature_importances_ of a seeded random forest.
rng = np.random.RandomState(seed)
rf = RandomForestRegressor(n_jobs=-1, n_estimators=50, random_state=rng)
rf.fit(X_train_scaled, y_selection)
indices_rf = np.argsort(rf.feature_importances_)[-n_features_rf:][::-1]

# Keep every feature chosen by either ranking (union of column positions).
indices = list(np.union1d(indices_rf, indices_fr))
print(f"Reduced to {len(indices)} features")

# reduce the train and test set
# NOTE(review): the subset is taken from train_data_mean, i.e. it still
# contains the rows dropped by remove_outliers; the outlier-free data is only
# used for the two rankings above. Confirm this is intended.
selected_columns = train_data_mean.columns[indices]
X_train_subset = train_data_mean[selected_columns]
X_test_subset = test_data_mean[selected_columns]

X_train_subset, X_test_subset = fill_NaN(X_train_subset, X_test_subset)
X_train_subset, X_test_subset = scale_data(X_train_subset, X_test_subset)

Reduced to 179 features
Filling all nan values in the data with the mean


# Create NN model

In [21]:
# Targets for the final fit: all training rows, matching X_train_subset.
Y = Y_train
dropout = 0.1
n_hidden_layers = 8
hidden_units = 30

# create model: a stack of identical Dense -> LeakyReLU -> Dropout stages
model = Sequential()
for layer_idx in range(n_hidden_layers):
    # Only the first layer needs the input size; later layers infer it from
    # the previous layer's output (the old input_dim=72 there had no effect).
    input_kwargs = {"input_dim": len(indices)} if layer_idx == 0 else {}
    # kernel_initializer is the Keras 2 name of the legacy `init` argument.
    model.add(Dense(hidden_units,
                    kernel_regularizer=regularizers.l2(1),
                    kernel_initializer='RandomUniform',
                    **input_kwargs))
    model.add(LeakyReLU(alpha=0.1))
    model.add(Dropout(rate=dropout))

# Single linear output unit for the regression target.
model.add(Dense(1, kernel_initializer='RandomUniform'))

# Compile model
optimizer = Adam(lr=0.005, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)
model.compile(loss='mean_squared_error', optimizer=optimizer, metrics=[coeff_determination])

# Fit the model
print("Start fitting ...")
model.fit(x=X_train_subset, y=Y, epochs=80, verbose=0, validation_split=0.1, shuffle=True, steps_per_epoch=50, initial_epoch=0, validation_steps=5)

# calculate predictions
print("calculate predictions")
predictions = model.predict(X_test_subset)

Start fitting ...
calculate predictions


# Create final submission CSV

In [22]:
# Prepare predictions to be written to csv
filename = "final.csv"
# Read only the id column into a dedicated frame, so the preprocessed
# test_data used by the earlier cells is not overwritten by this cell.
submission = pd.read_csv(TEST_FILE_PATH, usecols=["id"])
# model.predict returns one column per output unit; flatten to one value per row.
submission["y"] = np.ravel(predictions)
submission[["id", "y"]].to_csv("submissions/" + filename, index=False)