In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn

In [None]:
from utils import fig_ax, load_data, save_results

In [None]:
# Load the data through the project helper and unpack its three return values.
# NOTE(review): judging by the names these are train/validation features,
# train/validation targets and test features; the exact types are not visible
# here (each one is later converted with .to_numpy()) — confirm in utils.load_data.
df_train_val, s_train_val, df_test = load_data()

In [None]:
# Convert the three pandas objects to plain NumPy arrays in one pass; the
# original pandas objects are left untouched (df_test.index is needed later).
X_train_val, y_train_val, X_test = (
    pandas_obj.to_numpy() for pandas_obj in (df_train_val, s_train_val, df_test)
)

## Basic Regressors

In [None]:
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.pipeline import Pipeline

from sklearn.decomposition import PCA

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn import svm
from sklearn.ensemble import GradientBoostingRegressor

# Sub-grids handed to GridSearchCV: each dict selects one regressor for the
# pipeline's "reg" step together with the parameter values to try for it and
# for the "pca" step. The trailing numbers are the mean test scores noted for
# that configuration.
linear_baseline_grid = {
    "pca__n_components": [50],
    "reg": [LinearRegression()],  # our baseline: mean test score of -0.059320
}
nu_svr_grid = {
    "pca__n_components": [250],
    "reg": [svm.NuSVR(cache_size=1000)],  # -0.046590; public score 0.0328
    "reg__nu": [1],
    "reg__C": [0.3],
}
search_params = [linear_baseline_grid, nu_svr_grid]

# Earlier experiments, kept for their recorded scores. To re-run one, add the
# dict to the list above. Most of these regressors are not imported by the
# import lines of this cell, so the matching import has to be added as well.
#
# Template for a new experiment (fill in "reg"):
# {
#     "pca__n_components": [50, 100, 150, 200, 250, "mle"],
#     "reg": []
# },
# {
#     "pca__n_components": [50],
#     "reg": [ExtraTreesRegressor()]  # -0.047374
# },
# {
#     "pca__n_components": [250],
#     "reg": [GaussianProcessRegressor()]  # -0.049241
# },
# {
#     "pca__n_components": [50],
#     "reg": [RandomForestRegressor()],  # -0.049370
#     "reg__n_estimators": [100]
# },
# {
#     "pca__n_components": [50],
#     "reg": [Ridge()]  # -0.059013
# },
# {
#     "pca__n_components": [250],
#     "reg": [svm.SVR(cache_size=1000)],  # -0.046590
#     "reg__epsilon": [3.6e-5],
#     "reg__C": [0.3]
# },
# {
#     "pca__n_components": [150],
#     "reg": [GradientBoostingRegressor()],  # -0.051251
#     "reg__learning_rate": [0.1]
# },
# {
#     "pca__n_components": [250],
#     "reg": [SGDRegressor()]  # -0.060203
# },
# {
#     "pca__n_components": [150],
#     "reg": [Lasso()],
#     "reg__alpha": np.linspace(0.25, 4, 10)  # -0.099515
# },


In [None]:
# Two-stage pipeline: PCA for dimensionality reduction, then a regressor.
# The step names "pca" and "reg" are the prefixes used by the "pca__..." and
# "reg__..." keys of the parameter grid.
pipeline_steps = [
    ("pca", PCA(n_components=150)),  # could also use "mle"
    ("reg", LinearRegression()),
]
pipe = Pipeline(steps=pipeline_steps)

In [None]:
# Cross-validation scheme: 5 shuffled folds with a fixed seed, so the folds
# (and therefore the scores) are the same on every run.
# The splitter object itself is passed to GridSearchCV instead of the one-shot
# generator returned by .split(): a generator is consumed by the first fit, so
# fitting the same search object again would find it exhausted.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

search = GridSearchCV(
    pipe,
    search_params,
    cv=kf,
    scoring="neg_mean_squared_error",  # negated MSE: values closer to 0 are better
    n_jobs=-1,  # use all available cores
    pre_dispatch="2*n_jobs",
    verbose=1,
)

# Run the exhaustive search, then display the best mean cross-validated score.
search.fit(X_train_val, y_train_val)
search.best_score_

In [None]:
# Summarise the search as a table, hiding the per-fold test scores and the
# timing standard deviations to keep it readable.
# The number of per-fold columns is read from the fitted search rather than
# hard-coded, so this cell keeps working if the number of CV folds changes.
per_split_score_columns = [
    f"split{fold}_test_score" for fold in range(search.n_splits_)
]
results = pd.DataFrame(search.cv_results_).drop(
    columns=per_split_score_columns + ["std_fit_time", "std_score_time"]
)
# results.sort_values("rank_test_score", inplace=True)
results

In [None]:
# Display the parameter combination that obtained the best mean CV score.
search.best_params_

In [None]:
# Pull the two fitted steps out of the winning pipeline by step name, then
# display the total fraction of variance retained by the kept PCA components.
best_pipeline = search.best_estimator_
pca = best_pipeline.named_steps["pca"]
regressor = best_pipeline.named_steps["reg"]
pca.explained_variance_ratio_.sum()

In [None]:
# Compute predictions for the test set. GridSearchCV.predict delegates to the
# best pipeline found by the search (refit on the data passed to fit, since
# `refit` is left at its default).
y_pred = search.predict(X_test)

In [None]:
# Hand the predictions and the test-set index to the project helper.
# NOTE(review): save_results comes from utils; where and in which format it
# writes is not visible here — confirm before relying on the output location.
save_results(y_pred, df_test.index)