In [None]:
import numpy as np
import pandas as pd

In [None]:
from utils import fig_ax, load_data, save_results

In [None]:
# Load the data through the project helper and unpack its three return values.
# NOTE(review): presumably (train+val features, train+val targets, test features) as
# pandas objects, judging by the names and the `.to_numpy()` calls that follow —
# confirm against utils.load_data.
df_train_val, s_train_val, df_test = load_data()

In [None]:
# Work on plain numpy arrays from here on: convert the train/val features,
# the train/val targets and the test features in one pass, in that order.
X_train_val, y_train_val, X_test = (
    frame.to_numpy() for frame in (df_train_val, s_train_val, df_test)
)

## Basic Regressors

In [None]:
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.pipeline import Pipeline

from sklearn.decomposition import PCA

In [None]:
from sklearn.linear_model import LinearRegression, Ridge, SGDRegressor, Lasso
from sklearn import svm
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor
from sklearn.gaussian_process import GaussianProcessRegressor

# Parameter grid for the grid search: a list of dicts, one per regressor to try.
# Each dict fixes the number of PCA components ("pca__n_components") and the
# estimator placed in the pipeline's "reg" step, plus optional "reg__<param>" values.
# The trailing numbers are the mean cross-validation test scores noted by the authors
# (the search below scores with "neg_mean_squared_error", so closer to 0 is better).
search_params = [
    {
        "pca__n_components": [50],
        "reg": [LinearRegression()]  # our baseline: mean test score of -0.059320
    },
    {
        "pca__n_components": [150],
        "reg": [Lasso(alpha=0.01)],  # -0.066543
    },
    {
        "pca__n_components": [50],
        "reg": [Ridge()]  # -0.059013
    },
    {
        "pca__n_components": [250],
        "reg": [SGDRegressor()]  # -0.060203
    },
    {
        "pca__n_components": [250],
        "reg": [GaussianProcessRegressor()]  # -0.049241
    },
    {
        "pca__n_components": [250],
        "reg": [svm.SVR(epsilon=3.6e-5, C=0.3, cache_size=1000)],  # -0.046590
    },
    {
        "pca__n_components": [250],
        "reg": [svm.NuSVR(nu=1, C=0.3, cache_size=1000)],  # -0.046590; public score 0.0328
        # the two entries below repeat the values already passed to the constructor
        "reg__nu": [1],
        "reg__C": [0.3]
    },
    {
        "pca__n_components": [50],
        "reg": [RandomForestRegressor(n_estimators=1000)],  # -0.048581
    },
    {
        "pca__n_components": [50],
        "reg": [ExtraTreesRegressor(n_estimators=1000)]  # -0.046971

    },
    {
        "pca__n_components": [150],
        "reg": [GradientBoostingRegressor(learning_rate=0.1)],  # -0.051251
    },

    # NB: we tested parameter grids for each algorithm, for example the one below.
    # (Estimator parameters must carry the "reg__" step prefix to be valid keys
    # for this pipeline, as in the NuSVR entry above.)
    # {
    #     "pca__n_components": [50, 100, 150, 200, 250, "mle"],
    #     "reg": [svm.SVR(cache_size=1000)],
    #     "reg__epsilon": np.logspace(-5, 0),
    #     "reg__C": np.logspace(-5, 0),
    # },
]


In [None]:
# Two-step pipeline: PCA for dimensionality reduction followed by a regressor.
# The values given here are only starting points; every entry of `search_params`
# overrides both the PCA size and the "reg" step.
pipe = Pipeline(
    steps=[
        ("pca", PCA(n_components=150)),  # "mle" would also be a valid n_components
        ("reg", LinearRegression()),
    ]
)

In [None]:
# 5-fold shuffled cross-validation with a fixed seed.
# The KFold splitter itself is handed to the search instead of the one-shot
# generator returned by `.split(...)`: the folds are identical, but the splitter
# is not exhausted after the first fit, so `search` can be re-fitted and `kf`
# reused from other cells without silently running out of folds.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

search = GridSearchCV(
    estimator=pipe,
    param_grid=search_params,
    cv=kf,
    scoring="neg_mean_squared_error",  # negated MSE: closer to 0 is better
    n_jobs=-1,
    pre_dispatch="2*n_jobs",
    verbose=1,
)

search.fit(X_train_val, y_train_val)
# best mean cross-validated score over all candidates (displayed as cell output)
search.best_score_

In [None]:
# Turn the cross-validation report into a DataFrame and display it without the
# per-fold scores and the timing/dispersion columns, so the table stays readable.
results = pd.DataFrame(search.cv_results_)

results.drop(
    columns=[
        *(f"split{fold}_test_score" for fold in range(5)),  # one column per CV fold
        "std_fit_time",
        "mean_score_time",
        "std_score_time",
        "std_test_score",
    ]
)


In [None]:
# Save a compact summary of the search to CSV, best mean test score first.
cols_to_export = [
    "param_reg",
    "param_pca__n_components",
    "mean_fit_time",
    "mean_test_score",
]
results = results.sort_values(by="mean_test_score", ascending=False)
results.loc[:, cols_to_export].to_csv("results.csv", index=False)

In [None]:
# Parameter combination selected by the grid search (displayed as cell output).
search.best_params_

In [None]:
# Fetch the two fitted steps of the winning pipeline by their step names and
# show the total share of variance kept by its PCA stage.
pca = search.best_estimator_.named_steps["pca"]
regressor = search.best_estimator_.named_steps["reg"]
pca.explained_variance_ratio_.sum()

In [None]:
# Compute test-set predictions with the estimator selected by the grid search
# (`refit` is left at its default, so the search exposes `predict`).
y_pred = search.predict(X_test)

In [None]:
# Hand the predictions and the test index to the project helper for saving.
# NOTE(review): output format and destination are defined in utils.save_results.
save_results(y_pred, df_test.index)

## Deep Learning

In [None]:
import os
from sklearn.model_selection import train_test_split
import torch
from torchvision import datasets, transforms
from torch.utils.data import DataLoader,TensorDataset
from torch.optim import Adam
from torch import nn
import torch.nn.functional as F
from tqdm.notebook import tqdm
from copy import deepcopy

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

### Define Dataloader

In [None]:
# Load the data again (same call as in the regressors section) so that the
# deep-learning section starts from freshly loaded inputs.
# NOTE(review): presumably returns pandas objects — confirm against utils.load_data.
df_train_val, s_train_val, df_test = load_data()

In [None]:
# Convert the freshly loaded train/val features, train/val targets and test
# features to numpy arrays, in that order.
X_train_val, y_train_val, X_test = (
    frame.to_numpy() for frame in (df_train_val, s_train_val, df_test)
)

In [None]:
from sklearn.decomposition import PCA
# Hold out 20% of the samples for validation; the fixed seed makes the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.2, random_state=42)
# Fit the 150-component PCA on the training split only, then apply that same
# projection to the validation split (the validation data never influences the fit).
# Note: `pca` is rebound here and is reused later to transform the test set.
pca = PCA(n_components= 150)
X_train = pca.fit_transform(X_train)
X_val = pca.transform(X_val)

In [None]:
# Number of features per sample after PCA; the networks below size their
# first layer from this same value.
print(X_train.shape[1])

In [None]:
# One sample per batch. Features and targets are converted to float32 tensors,
# paired into datasets, and served by shuffling loaders with one worker each.
batch_size = 1

train_dataset = TensorDataset(
    torch.tensor(X_train, dtype=torch.float32),
    torch.tensor(y_train, dtype=torch.float32),
)
val_dataset = TensorDataset(
    torch.tensor(X_val, dtype=torch.float32),
    torch.tensor(y_val, dtype=torch.float32),
)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=1)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True, num_workers=1)

In [None]:
# Sanity check: number of validation samples (displayed as cell output).
len(val_dataset)

### Define network

In [None]:
class Regression(nn.Module):
    """Single linear layer: a plain linear-regression baseline.

    Args:
        in_features: Number of input features. When omitted, the width of the
            notebook-level ``X_train`` array is used (the original behaviour),
            so ``Regression()`` keeps working unchanged.
    """

    def __init__(self, in_features=None):
        super().__init__()
        if in_features is None:
            # Backward-compatible default: size the layer from the training data.
            in_features = X_train.shape[1]
        self.fc1 = nn.Linear(in_features, 1)

    def forward(self, x):
        """Map a ``(batch, in_features)`` tensor to ``(batch, 1)`` predictions."""
        return self.fc1(x)


class Net(nn.Module):
    """Fully connected network: three ReLU hidden layers and a scalar output.

    Args:
        in_features: Number of input features. When omitted, the width of the
            notebook-level ``X_train`` array is used (the original behaviour),
            so ``Net()`` keeps working unchanged.
        hidden_dim: Width of each hidden layer (1024 as in the original).
    """

    def __init__(self, in_features=None, hidden_dim=1024):
        super().__init__()
        if in_features is None:
            # Backward-compatible default: size the first layer from the training data.
            in_features = X_train.shape[1]
        self.fc1 = nn.Linear(in_features, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, hidden_dim)
        self.fc4 = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        """Map a ``(batch, in_features)`` tensor to ``(batch, 1)`` predictions."""
        for hidden_layer in (self.fc1, self.fc2, self.fc3):
            x = F.relu(hidden_layer(x))
        # no activation on the output layer: this is a regression head
        return self.fc4(x)

class Transformers_model(nn.Module):
    """Transformer-encoder regressor that treats every input feature as a token.

    Each scalar feature is projected to ``d_model`` dimensions and given a learned
    per-position embedding; the token sequence goes through the encoder, the
    encoded tokens are mean-pooled and a linear head yields one value per sample.

    The previous version passed ``x.unsqueeze(-1)`` (last dimension 1) directly
    to an encoder built for ``d_model=512``. The encoder requires the last input
    dimension to equal ``d_model``, so that forward pass could not run; it also
    used the sequence-first layout and returned one value per token.

    Args:
        d_model: Token embedding size used by the encoder.
        nhead: Number of attention heads (must divide ``d_model``).
        num_layers: Number of stacked encoder layers.
        n_features: Number of input features (= sequence length). When omitted,
            the width of the notebook-level ``X_train`` array is used.
    """

    def __init__(self, d_model=512, nhead=8, num_layers=6, n_features=None):
        super().__init__()
        if n_features is None:
            n_features = X_train.shape[1]
        # scalar feature -> d_model-dimensional token
        self.embed = nn.Linear(1, d_model)
        # learned position embedding, so the model can tell the features apart
        self.pos_embed = nn.Parameter(torch.randn(1, n_features, d_model) * 0.02)
        encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead, batch_first=True)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.fc1 = nn.Linear(d_model, 1)

    def forward(self, x):
        """Map a ``(batch, n_features)`` tensor to ``(batch, 1)`` predictions."""
        tokens = self.embed(x.unsqueeze(-1)) + self.pos_embed  # (batch, n_features, d_model)
        encoded = self.transformer_encoder(tokens)
        pooled = encoded.mean(dim=1)  # average over the feature tokens
        return self.fc1(pooled)


### Training Loop

In [None]:
def validation(model, val_loader):
    """Return the mean squared error of ``model`` over ``val_loader``.

    Args:
        model: Network to evaluate; it is switched to eval mode.
        val_loader: DataLoader yielding ``(features, target)`` batches.

    Returns:
        The summed squared error over all batches divided by the number of
        samples in ``val_loader.dataset``.
    """
    model.eval()
    val_criterion = torch.nn.MSELoss(reduction='sum')
    total_loss = 0.0
    with torch.no_grad(), tqdm(val_loader, unit="batch") as batches:
        for data, target in batches:
            data, target = data.to(device), target.to(device)
            # The network returns (batch, 1) while the targets are (batch,).
            # Align the shapes explicitly: otherwise MSELoss broadcasts both to
            # (batch, batch) and the loss is wrong for any batch size above 1.
            output = model(data).reshape(target.shape)
            total_loss += val_criterion(output, target).item()
    # Normalise by the loader that was actually passed in, not by a global dataset.
    return total_loss / len(val_loader.dataset)

In [None]:
# Model to train; Regression() and Transformers_model() are the alternatives.
model = Net()
criterion = torch.nn.MSELoss(reduction='sum')
learning_rate = 0.01

optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

model.to(device)
best_val_loss = float("inf")
best_weights = None
for epoch in range(1, 50):
    # Inverse-time decay of the learning rate. The new value has to be written
    # into the optimiser: only rebinding `learning_rate` (as before) left SGD
    # at its initial rate while the progress bar showed the decayed one.
    learning_rate = 0.01 / (epoch + 1)
    for param_group in optimizer.param_groups:
        param_group["lr"] = learning_rate

    total_loss = 0.0
    samples_seen = 0
    model.train()  # validation() switches to eval mode at the end of every epoch
    with tqdm(train_loader, unit="batch") as tepoch:
        tepoch.set_description(f"Epoch {epoch}")
        for data, target in tepoch:
            data, target = data.to(device), target.to(device)
            optimizer.zero_grad()
            # (batch, 1) -> target's shape; valid for any batch size, not only 1
            output = model(data).reshape(target.shape)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            samples_seen += target.numel()
            # running mean training loss over the samples seen so far in this epoch
            # (previously divided by the size of the validation set)
            tepoch.set_postfix({'Loss': total_loss / samples_seen, 'Learning Rate': learning_rate})
    val_loss = validation(model, val_loader)
    if val_loss < best_val_loss:
        # keep a detached copy of the best weights seen so far
        best_val_loss = val_loss
        best_weights = deepcopy(model.state_dict())
    print(f"Val loss: {val_loss}\tBest val loss: {best_val_loss}")


In [None]:
# Quick sanity check: print the shape of the first training batch, then stop.
# Note that this rebinds the notebook-level `data` and `target` names.
for data, target in train_loader:
    print(data.shape)
    break

In [None]:
# compute predictions:
# Restore the weights with the best validation loss. `best_weights` stays None
# when no epoch produced a comparable loss (e.g. NaN); in that case the final
# weights are kept instead of crashing in load_state_dict.
if best_weights is not None:
    model.load_state_dict(best_weights)
model.eval()  # make inference mode explicit instead of relying on the last validation() call

# Project the test set with the PCA fitted on the training split, then predict
# in chunks without tracking gradients (no autograd graph is needed here).
test_transformed = pca.transform(X_test)
test_tensor = torch.as_tensor(test_transformed, dtype=torch.float32)
with torch.no_grad():
    pred_chunks = [
        model(chunk.to(device)).reshape(-1).cpu()
        for chunk in test_tensor.split(1024)
    ]
# 1-D float64 array, one prediction per test row (same layout as before)
y_pred = torch.cat(pred_chunks).numpy().astype(np.float64)

In [None]:
# Hand the network's predictions and the test index to the project helper for saving.
# NOTE(review): this presumably overwrites the file written for the regressors
# above — confirm against utils.save_results.
save_results(y_pred, df_test.index)