In [1]:
import sys
sys.path.append("../src")

%matplotlib inline
from time import time, sleep
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
import pandas as pd

from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.impute import KNNImputer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV

In [2]:
df_x_train = pd.read_csv("../data/X_train.csv", index_col="id")
df_y_train = pd.read_csv("../data/y_train.csv", index_col="id")

In [3]:
# Split the data
X_train_val, X_test, y_train_val, y_test = train_test_split(df_x_train, df_y_train, test_size=0.15, random_state=1)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.15, random_state=1) 

In [4]:
print(f"#Features: {df_x_train.shape[1]}\n#observations: {df_x_train.shape[0]}")

#Features: 832
#observations: 1212


## 1) Variable selection

In [5]:
fs_model = XGBRegressor(
    n_estimators = 1000,
    learning_rate = 0.015,
    random_state = 42
)

fs = SelectFromModel(estimator=fs_model).fit(X_train, y_train)

In [6]:
dim_rm = X_train.shape[0] - fs.transform(X_train).shape[1]
print(f'Removed features {dim_rm}')

Removed features 671


## 2) Model evaluation

In [7]:
selector_model = XGBRegressor(
    n_estimators = 1000,
    learning_rate = 0.02,
    random_state = 42
)

pipe = Pipeline([
('scaler', preprocessing.RobustScaler())
, ('imputer', KNNImputer())
, ('feature_selector', SelectFromModel(selector_model))
, ('regression_model', XGBRegressor())
])

pipe.set_params(
    imputer__missing_values=np.nan, 
    imputer__n_neighbors=5,
    regression_model__n_jobs=9,
    regression_model__n_estimators=800,
    regression_model__learning_rate=0.002,
    regression_model__subsample=0.6
)

# Train model
pipe.fit(X_train, np.array(y_train).ravel())

In [8]:
# Evaluate model on validation set
train_pred = pipe.predict(X_train)
val_pred = pipe.predict(X_val)

validation_score = round(r2_score(y_val, val_pred), 3)
train_score = round(r2_score(y_train, train_pred), 3)
print(f"Training score {train_score}")
print(f"Validation score {validation_score}") # 0.628 / 0.413 with response capper at 60, 80

Training score 0.784
Validation score 0.474


In [9]:
# Distribution of predictions and ground truth in training data
pd.concat([pd.Series(np.array(y_train).ravel()).describe(), pd.Series(train_pred).describe()], axis=1)

Unnamed: 0,0,1
count,875.0,875.0
mean,69.858286,69.896515
std,9.628953,5.831247
min,43.0,56.388222
25%,64.0,65.660183
50%,70.0,70.566734
75%,77.0,74.469601
max,97.0,82.451622


In [10]:
# Distribution of predictions and response in validation data
pd.concat([pd.Series(np.array(y_val).ravel()).describe(), 
           pd.Series(val_pred).describe()], axis=1)

Unnamed: 0,0,1
count,155.0,155.0
mean,69.735484,69.65287
std,9.611328,4.997076
min,48.0,59.050407
25%,64.5,65.871391
50%,70.0,69.229111
75%,76.0,73.828186
max,95.0,79.106949


In [11]:
_d = {'y': np.array(y_val).ravel(), 'pred': val_pred}
df_eval_val = pd.DataFrame(data=_d)
df_eval_val['res'] = df_eval_val['y'] - df_eval_val['pred']
df_eval_val['abs_res'] = df_eval_val['res'].apply(lambda x: abs(x))
df_eval_val.sort_values(by='abs_res', ascending=False, inplace=True)

In [12]:
# Analysis of residuals on validation data
df_eval_val.head(10)

Unnamed: 0,y,pred,res,abs_res
13,50.0,74.12574,-24.12574,24.12574
71,87.0,66.922058,20.077942,20.077942
154,89.0,69.925087,19.074913,19.074913
136,95.0,77.441048,17.558952,17.558952
51,48.0,64.810303,-16.810303,16.810303
52,94.0,77.576225,16.423775,16.423775
152,52.0,68.255524,-16.255524,16.255524
45,53.0,68.754494,-15.754494,15.754494
37,87.0,72.732285,14.267715,14.267715
119,53.0,66.017754,-13.017754,13.017754


In [13]:
df_eval_val['abs_res'].describe()

count    155.000000
mean       5.458330
std        4.317067
min        0.080528
25%        2.435486
50%        4.510811
75%        7.250212
max       24.125740
Name: abs_res, dtype: float64

## 3) Model testing

In [14]:
from sklearn.pipeline import Pipeline           
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.feature_selection import SelectFromModel
from time import time, sleep

start_t = time()

selector_model = XGBRegressor(
    n_estimators = 850,
    learning_rate = 0.015,
    random_state = 42,
)

pipe = Pipeline([
('scaler', preprocessing.RobustScaler())
, ('imputer', KNNImputer())
, ('feature_selector', SelectFromModel(selector_model))
, ('regression_model', XGBRegressor())
])

pipe.set_params(
    imputer__missing_values=np.nan, 
    imputer__n_neighbors=5,
)

# Set grid / solution space
parameters = {
    'regression_model__n_estimators': [800],
    'regression_model__learning_rate': [0.002],
    'regression_model__subsample': [0.6]
}

# Train grided model
grided_model = GridSearchCV(pipe, parameters, scoring='r2', cv=25, n_jobs=9, return_train_score=True)
grided_model.fit(df_x_train, np.array(df_y_train).ravel())

print(f"Elapsed time {round((time()-start_t)/60, 3)} min")

Elapsed time 25.986 min


In [15]:
# Collect evaluation data into a data frame
df_cv_results = pd.DataFrame(grided_model.cv_results_)
df_model_params = df_cv_results.apply(lambda x: pd.Series(x["params"]), axis=1)
df_cv_results[df_model_params.columns] = df_model_params
df_cv_results.sort_values(by="rank_test_score", inplace=True)

In [17]:
df_cv_results[[
    'mean_test_score', 
    'mean_train_score', 
    'std_test_score',
    'std_train_score',
    'mean_fit_time', 
    'std_fit_time'
]]

Unnamed: 0,mean_test_score,mean_train_score,std_test_score,std_train_score,mean_fit_time,std_fit_time
0,0.455022,0.765513,0.103933,0.00264,481.290682,44.298103
