In [7]:
import pandas as pd; pd.set_option('display.max_columns', 100)
import numpy as np


import re

from functools import partial
from scipy.stats import mode

import matplotlib.pyplot as plt; plt.style.use('ggplot')
import seaborn as sns
import plotly.express as px

from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder, FunctionTransformer, PowerTransformer, PolynomialFeatures, RobustScaler
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.impute import KNNImputer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split, GridSearchCV, RepeatedStratifiedKFold, cross_val_score, cross_val_predict, RepeatedKFold
from sklearn.metrics import roc_auc_score, roc_curve, RocCurveDisplay, cohen_kappa_score, log_loss, f1_score, median_absolute_error, accuracy_score
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.feature_selection import RFE, RFECV
from sklearn.isotonic import IsotonicRegression
from sklearn.calibration import CalibrationDisplay
from sklearn.inspection import PartialDependenceDisplay, permutation_importance
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor, GradientBoostingRegressor, ExtraTreesRegressor, VotingRegressor, StackingRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVR
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklego.linear_model import LADRegression

import tensorflow as tf
from tensorflow import keras
import tensorflow_probability as tfp

In [8]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
sub = pd.read_csv("sample_submission.csv")

In [9]:
X = train.drop(columns = ['ID', 'y'], axis = 1)
Y = train['y']

test = test.drop(columns="ID", axis = 1)
skf = RepeatedKFold(n_splits = 10, n_repeats = 1, random_state = 42)
kf = KFold(n_splits = 10, shuffle = True, random_state = 1)

In [10]:
def loss_fn(y_true, y_pred):
    
    return tfp.stats.percentile(tf.abs(y_true - y_pred), q = 50)

In [11]:
def create_model():

    input_layer = tf.keras.Input(shape = (12, ))
    x = tf.keras.layers.BatchNormalization(epsilon = 0.00001)(input_layer)
    x = tf.keras.layers.Dense(16, activation = 'relu')(x)
    x = tf.keras.layers.Dense(32, activation = 'relu')(x)
    output_layer = tf.keras.layers.Dense(1)(x)    

    model = tf.keras.Model(inputs = input_layer, outputs = output_layer)

    model.compile(optimizer = tf.keras.optimizers.Adam(0.013, beta_1 = 0.5),
                  loss = loss_fn)

    return model

callbacks_list = [
        tf.keras.callbacks.EarlyStopping(
            monitor = 'loss', patience = 30, verbose = 0, mode = 'min', restore_best_weights = True),
        tf.keras.callbacks.ReduceLROnPlateau(
            monitor = 'lr', factor = 0.8, patience = 3, min_lr = 0.00001),
        tf.keras.callbacks.TerminateOnNaN()
    ] 

In [12]:
tf_scores, tf_preds = list(), list()

for i, (train_idx, test_idx) in enumerate(skf.split(X, Y)):

    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    Y_train, Y_test = Y.iloc[train_idx], Y.iloc[test_idx]

    print('--------------------------------------------------------------')
    
    x_train = X_train.copy()
    x_test = X_test.copy()
    test_cv = test.copy()
    
    ##########
    ## LGBM ##
    ##########
    
    LGBM_md = LGBMRegressor().fit(X_train, Y_train)

    x_train.loc[:, 'LGBM_1'] = LGBM_md.predict(X_train)
    x_test.loc[:, 'LGBM_1'] = LGBM_md.predict(X_test)
    test_cv.loc[:, 'LGBM_1'] = LGBM_md.predict(test)
    
    ########
    ## NN ##
    ########
    
    nn_md = create_model()
    nn_md.fit(x_train, Y_train,
              epochs = 100, 
              verbose = 0, 
              callbacks = callbacks_list)
    
    tf_md_pred = nn_md.predict(x_test, verbose = 0)
    tf_md_pred_test = nn_md.predict(test_cv, verbose = 0)
    tf_preds.append(tf_md_pred_test)
    
    tf_score = median_absolute_error(Y_test, tf_md_pred)
    
    tf_scores.append(tf_score)
    print('Fold', i, '==> TF oof median absolute error score is ==>', tf_score)

print(f"The TF oof median absolute error is {np.mean(tf_scores)}")

--------------------------------------------------------------
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000912 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2805
[LightGBM] [Info] Number of data points in the train set: 36106, number of used features: 11
[LightGBM] [Info] Start training from score 83.635067
Fold 0 ==> TF oof median absolute error score is ==> 0.9013832092285128
--------------------------------------------------------------
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009898 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2805
[LightGBM] [Info] Number of data points in the train set: 36106, number of used features: 11
[LightGBM] [Info] Start training from score 83.636859
Fold 1 ==> TF oof median absolute error score is ==> 0.9335693359374986
--------------------------------------------

In [13]:
tf_oof_preds = pd.DataFrame(np.concatenate(tf_preds, axis = 1))

submission_4 = sub.copy()
submission_4['y'] = tf_oof_preds.apply(np.median, axis = 1)
submission_4.head()

Unnamed: 0,ID,y
0,TEST_0000,83.490631
1,TEST_0001,82.553543
2,TEST_0002,90.023483
3,TEST_0003,90.564728
4,TEST_0004,82.456024


In [14]:
submission_4.to_csv('TF_sub.csv', index = False)