In [None]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.decomposition import PCA, FastICA
from sklearn.linear_model import ElasticNetCV, RidgeCV
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tpot import TPOTRegressor

### Read & split data

In [3]:
# Load the raw training table from the data directory next to the notebook folder.
dataset = pd.read_csv(filepath_or_buffer=r"../data/train.csv")

In [4]:
# Target column first, then every remaining column except the row identifier as features.
y = dataset["y"]
X = dataset.drop(columns=["ID", "y"])

In [6]:
# Hold out 33% of the rows for validation; the fixed seed keeps the split reproducible.
split_parts = train_test_split(X, y, test_size=0.33, random_state=42)
X_train, X_val, y_train, y_val = split_parts

### Label-encode categorical features

In [None]:
# Work on explicit copies: the frames returned by train_test_split are slices of X,
# and assigning columns into them can trigger pandas' SettingWithCopyWarning.
X_train = X_train.copy()
X_val = X_val.copy()

# Replace every object-dtype column with integer codes.
for c in X_train.columns:
    if X_train[c].dtype == "object":
        lbl = LabelEncoder()
        # The encoder is fitted on the values of both splits so that a category
        # appearing only in the validation rows still receives a code.
        lbl.fit(pd.concat([X_train[c], X_val[c]]))
        X_train[c] = lbl.transform(X_train[c])
        X_val[c] = lbl.transform(X_val[c])

In [8]:
# Report the row/column counts of both splits after encoding.
shape_report = "Shape train: {}\nShape test: {}"
print(shape_report.format(X_train.shape, X_val.shape))

Shape train: (2820, 376)
Shape test: (1389, 376)


In [9]:
# Preview the first five encoded training rows.
X_train.head(n=5)

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6,X8,X10,X11,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
2218,8,23,38,2,3,15,3,8,0,0,...,0,0,1,0,0,0,0,0,0,0
1688,17,13,16,3,3,4,10,11,0,0,...,0,0,0,0,0,0,0,0,0,0
2242,38,4,16,2,3,15,9,20,0,0,...,0,0,1,0,0,0,0,0,0,0
582,45,10,16,3,3,8,9,5,0,0,...,0,0,1,0,0,0,0,0,0,0
2132,42,22,16,2,3,16,9,24,0,0,...,1,0,0,0,0,0,0,0,0,0


In [10]:
# Sanity check: list any columns that are still object-typed after encoding
# (an empty Series means every column was converted).
train_dtypes = X_train.dtypes
train_dtypes[train_dtypes == "object"]

Series([], dtype: object)

# Baseline

### Add decomposed components: PCA / ICA etc.

In [11]:
n_comp = 10

# Both decompositions are fitted on the training split only; the validation split is
# projected with the already-fitted transformers.
pca = PCA(n_components=n_comp, random_state=42)
ica = FastICA(n_components=n_comp, random_state=42)

pca_results_train = pca.fit_transform(X_train)
pca_results_val = pca.transform(X_val)

ica_results_train = ica.fit_transform(X_train)
ica_results_val = ica.transform(X_val)

# Augmented copies of the splits; the original frames stay untouched.
X_train_pca = X_train.copy()
X_val_pca = X_val.copy()

# Append the components as columns pca_1, ica_1, pca_2, ica_2, ... (1-based names).
for idx in range(n_comp):
    suffix = str(idx + 1)
    for prefix, train_block, val_block in (
        ("pca_", pca_results_train, pca_results_val),
        ("ica_", ica_results_train, ica_results_val),
    ):
        X_train_pca[prefix + suffix] = train_block[:, idx]
        X_val_pca[prefix + suffix] = val_block[:, idx]

In [12]:
# Report the shapes of the splits after the PCA/ICA columns were appended.
augmented_report = "Shape train_pca: {}\nShape test_pca: {}"
print(augmented_report.format(X_train_pca.shape, X_val_pca.shape))

Shape train_pca: (2820, 396)
Shape test_pca: (1389, 396)


### Preparing Regressor

In [None]:
# Booster parameters shared by the cross-validation run and the final fit.
# - The former "n_trees" key was dropped: it is not an XGBoost booster parameter, so it
#   had no effect; the number of boosting rounds is controlled by `num_boost_round`.
# - "silent" was replaced by its successor "verbosity" (0 = silent, 1 = warnings).
xgb_params = {
    "eta": 0.005,
    "max_depth": 4,
    "subsample": 0.95,
    "objective": "reg:squarederror",
    "eval_metric": "rmse",
    # Start boosting from the mean of the training target.
    "base_score": np.mean(y_train),
    "verbosity": 0,
}

# NOTE(review): the model is trained on X_train / X_val, not on the PCA/ICA-augmented
# frames X_train_pca / X_val_pca built earlier — confirm this is intended.
dtrain = xgb.DMatrix(X_train, y_train)
dtest = xgb.DMatrix(X_val)

# Cross-validate with early stopping to choose the number of boosting rounds.
cv_result = xgb.cv(
    xgb_params,
    dtrain,
    num_boost_round=700,
    early_stopping_rounds=50,
    verbose_eval=50,
    show_stdv=False,
)

# One row of cv_result per boosting round that was kept.
num_boost_rounds = len(cv_result)

# Final fit on the full training split with the selected round count; warnings are
# re-enabled for this run.
model = xgb.train(
    dict(xgb_params, verbosity=1), dtrain, num_boost_round=num_boost_rounds
)

In [14]:
# Score the fitted booster on the validation split.
xgb_preds = model.predict(dtest)
xgb_r2 = r2_score(y_val, xgb_preds)
print("xgboost baseline R^2 score: {}".format(xgb_r2))

xgboost baseline R^2 score: 0.49724882549769867


## TPOT

In [19]:
# Search settings for TPOT: 5 generations, population of 20, cv=5, fixed seed.
tpot_settings = {
    "generations": 5,
    "population_size": 20,
    "cv": 5,
    "random_state": 42,
    "verbosity": 2,
}
pipeline_optimizer = TPOTRegressor(**tpot_settings)
pipeline_optimizer.fit(X_train, y_train)

Optimization Progress:   0%|          | 0/120 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: -62.48977397940307

Generation 2 - Current best internal CV score: -62.48977397940307

Generation 3 - Current best internal CV score: -62.48977397940307

Generation 4 - Current best internal CV score: -61.877732723023904

Generation 5 - Current best internal CV score: -61.877732723023904

Best pipeline: ElasticNetCV(MinMaxScaler(RandomForestRegressor(input_matrix, bootstrap=True, max_features=0.1, min_samples_leaf=19, min_samples_split=19, n_estimators=100)), l1_ratio=1.0, tol=1e-05)


TPOTRegressor(generations=5, population_size=20, random_state=42, verbosity=2)

In [21]:
# Score the pipeline selected by TPOT on the validation split.
tpot_preds = pipeline_optimizer.predict(X_val)
tpot_r2 = r2_score(y_val, tpot_preds)
print("TPOTRegressor R^2 score: {}".format(tpot_r2))

TPOTRegressor R^2 score: 0.4815584928375357


## ElasticNetCV

In [15]:
# Elastic-net regression with library-default settings, fitted on the training split.
model_elastic = ElasticNetCV()
model_elastic.fit(X=X_train, y=y_train)

ElasticNetCV()

In [16]:
# Score the elastic-net model on the validation split.
elastic_preds = model_elastic.predict(X_val)
elastic_r2 = r2_score(y_val, elastic_preds)
print("ElasticNetCV R^2 score: {}".format(elastic_r2))

ElasticNetCV R^2 score: 0.46590571785721


## RidgeCV

In [17]:
# Ridge regression with library-default settings, fitted on the training split.
model_ridge = RidgeCV()
model_ridge.fit(X=X_train, y=y_train)

RidgeCV(alphas=array([ 0.1,  1. , 10. ]))

In [18]:
# Score the ridge model on the validation split.
ridge_preds = model_ridge.predict(X_val)
ridge_r2 = r2_score(y_val, ridge_preds)
print("RidgeCV R^2 score: {}".format(ridge_r2))

RidgeCV R^2 score: 0.4727112144198513
