In [None]:
import pandas as pd
import numpy as np

from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score

import tensorflow as tf
from keras.layers import Input, Dense, Flatten, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam

import os

# Uncomment the next line to set CUDA_VISIBLE_DEVICES to '-1'.
#os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

# Determinism-related environment flags, set once in a single update.
os.environ.update({
    'TF_CUDNN_DETERMINISM': '1',
    'TF_DETERMINISTIC_OPS': '1',
})

# Single seed shared by NumPy, TensorFlow and the scikit-learn calls below.
random_state = 1508

for seed_everything in (np.random.seed, tf.random.set_seed):
    seed_everything(random_state)

In [None]:
# read data
# Load the feature and target tables and strip the 'id' column from each.
X, y = (
    pd.read_csv(csv_path).drop('id', axis=1)
    for csv_path in ('X_train.csv', 'y_train.csv')
)

print(X.shape, y.shape)

In [None]:
# missing values
# Replace every NaN with the median of the column it sits in.
column_medians = X.median()
X = X.fillna(value=column_medians)

In [None]:
# outliers
iso = IsolationForest(contamination='auto', max_samples=100, random_state=random_state)
iso_preds = iso.fit_predict(X)

# Keep only the rows the forest labelled 1, and apply the same row mask to
# the targets so X and y stay aligned.
keep_rows = iso_preds == 1
X, y = X[keep_rows], y[keep_rows]

print(X.shape, y.shape)

In [None]:
# select columns
# Attach the target as a temporary 'y' column so one corr() call yields the
# correlation of every feature with the target.
X_corr = X.copy()
X_corr['y'] = y.copy()

corr = X_corr.corr()['y'].dropna()

# A feature is kept when its absolute correlation with the target exceeds the
# threshold; the 'y' row itself is excluded from the selection.
corr_thresh = 0.1
is_strong = corr.abs() > corr_thresh
is_feature = corr.index != 'y'
corr_features = corr[is_strong & is_feature].index.tolist()

X = X[corr_features]

In [None]:
# data split
# Hold out 10% of the rows; the split is reproducible through random_state.
split_parts = train_test_split(X, y, random_state=random_state, test_size=0.1)
X_train, X_test, y_train, y_test = split_parts
print(X_train.shape, y_test.shape)

In [None]:
# normalize
# Feature statistics are estimated on the training split only and then reused
# for the held-out split. Standardising the held-out rows with their own
# mean/std would put them on a different scale than the one the model is
# fitted on.
X_train_mean = X_train.mean()
X_train_std = X_train.std()
X_train_norm = (X_train - X_train_mean) / X_train_std
X_test_norm = (X_test - X_train_mean) / X_train_std

# Target statistics also come from the training split so nothing from the
# held-out rows leaks into the scaling. mean()/std() return a Series indexed by
# column label, so the single target column is taken by position with .iloc
# (plain [0] on a label-indexed Series is a deprecated positional lookup).
y_mean = y_train.mean().iloc[0]
y_std = y_train.std().iloc[0]
y_train_norm = (y_train - y_mean) / y_std
y_test_norm = (y_test - y_mean) / y_std

In [None]:
# model

# Fully connected regressor. The first layer is as wide as the number of
# selected features; the remaining (units, activation) pairs are stacked in
# order and followed by a single-unit output layer.
layer_specs = [
    (X_train.shape[1], 'relu'),
    (512, 'relu'),
    (256, 'relu'),
    (128, 'relu'),
    (64, 'linear'),
]

model = Sequential()
for units, activation in layer_specs:
    model.add(Dense(units, activation=activation))
model.add(Dense(1))

model.compile(optimizer='adam', loss='mean_absolute_error')
model.fit(X_train_norm, y_train_norm, epochs=40)

# Predictions come out in normalised target units; map them back to the
# original scale before scoring.
y_pred = model.predict(X_test_norm) * y_std + y_mean

# saving the score
score = r2_score(y_test, y_pred)

print(score)

In [None]:
# generate submission

X_sub = pd.read_csv('X_test.csv')

# Keep the ids aside for the output file; they are not model features.
X_sub_id = X_sub.id
X_sub = X_sub.drop('id', axis=1)

print(X_sub.shape)

# missing values
X_sub = X_sub.fillna(X_sub.median())

# select columns
X_sub = X_sub[corr_features]

# normalize
# Use the mean/std of the training split (the values the model was fitted
# with), not the statistics of the submission set itself; otherwise the inputs
# land on a different scale than the one the network learned.
X_sub_norm = (X_sub - X_train.mean()) / X_train.std()

# Predict in normalised target units, then map back to the original scale.
y_sub_pred = model.predict(X_sub_norm)
y_sub_pred = y_sub_pred * y_std + y_mean

# Build the frame column by column so the ids keep their original dtype.
# Stacking ids and float predictions into one array would promote the ids to
# float and write them as e.g. "0.0" instead of "0".
df = pd.DataFrame({
    'id': X_sub_id.to_numpy(),
    'y': np.asarray(y_sub_pred).ravel(),
})
df.to_csv('submission.csv', index=False, header=True, sep=',')

print(df.shape)