In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from skopt import BayesSearchCV
from skopt.space import Real
import joblib

np.set_printoptions(precision=5)

In [2]:
# utility functions:
def load_data(directory, datafilename):
  f = open(f'{directory}/{datafilename}.csv', 'r')
  data = np.array(
      [[float(item2) for item2 in item.split(',') if len(item2)>0]
       for item in f.read().split('\n')[1:] if len(item)>0]  # includes the ID column also
  )
  return data

def split_data(data, train_fr, valid_fr):
  np.random.seed(1)
  np.random.shuffle(data)
  train_data = data[:int(data.shape[0]*train_fr), :]
  valid_data = data[int(data.shape[0]*train_fr):int(data.shape[0]*(train_fr+valid_fr)), :]
  test_data = data[int(data.shape[0]*(train_fr+valid_fr)):, :]
  return train_data, valid_data, test_data

In [3]:
'''
training over D7:
'''
data = load_data(directory='D7', datafilename=f'trainD7')
ID_train, X_train, y_train = data[:, 0], data[:, 1:-4], data[:, -4:]


scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)

model = Ridge()
param_space = {
  'alpha': Real(1e-6, 1e+6, 'log-uniform'),  # default: 1
}
opt = BayesSearchCV(
  estimator=model, search_spaces=param_space, n_iter=50, cv=3,
  n_jobs=-1, n_points=5, verbose=3, scoring='neg_mean_squared_error'
)
opt.fit(X_train, y_train)

results_df = pd.DataFrame(opt.cv_results_)
results_df.to_csv(f'multOutRidge_optResults.csv', index=False)
best_params = opt.best_params_
model = Ridge(**best_params)
model.fit(X_train, y_train)

joblib.dump(model, f'multOutRidge.joblib', compress=3)
joblib.dump(scaler, f'multOutRidge_scaler.joblib', compress=3)

Fitting 3 folds for each of 5 candidates, totalling 15 fits
Fitting 3 folds for each of 5 candidates, totalling 15 fits
Fitting 3 folds for each of 5 candidates, totalling 15 fits
Fitting 3 folds for each of 5 candidates, totalling 15 fits
Fitting 3 folds for each of 5 candidates, totalling 15 fits
Fitting 3 folds for each of 5 candidates, totalling 15 fits
Fitting 3 folds for each of 5 candidates, totalling 15 fits
Fitting 3 folds for each of 5 candidates, totalling 15 fits
Fitting 3 folds for each of 5 candidates, totalling 15 fits
Fitting 3 folds for each of 5 candidates, totalling 15 fits


['multOutRidge_scaler.joblib']

In [6]:
'''
evaluation over D8 and D9:
'''

model = joblib.load(f'final_multOutRidge.joblib')
scaler = joblib.load(f'final_multOutRidge_scaler.joblib')


data = load_data(directory='D8', datafilename=f'testD8')
ID, X1, y1 = data[:, 0], data[:, 1:-4], data[:, -4:]
data = load_data(directory='D9', datafilename=f'testD9')
ID, X2, y2 = data[:, 0], data[:, 1:-4], data[:, -4:]

X1 = scaler.fit_transform(X1)
X2 = scaler.fit_transform(X2)


print('\nEvaluation over D8:')
X_test, y_test = X1, y1
y_pred = model.predict(X_test)
mse_raw = mean_squared_error(y_test, y_pred, multioutput='raw_values')
print(f"Mean Squared Error (raw): {mse_raw}")
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error (combined): {mse:.2e}")
r2 = r2_score(y_test, y_pred)
print("R² Score:", r2)

print('\nEvaluation over D9:')
X_test, y_test = X2, y2
y_pred = model.predict(X_test)
mse_raw = mean_squared_error(y_test, y_pred, multioutput='raw_values')
print(f"Mean Squared Error (raw): {mse_raw}")
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error (combined): {mse:.2e}")
r2 = r2_score(y_test, y_pred)
print("R² Score:", r2)


Evaluation over D8:
Mean Squared Error (raw): [0.0005  0.00022 0.00029 0.00029]
Mean Squared Error (combined): 3.25e-04
R² Score: 0.9831810121372193

Evaluation over D9:
Mean Squared Error (raw): [0.0058  0.02377 0.00337 0.00499]
Mean Squared Error (combined): 9.48e-03
R² Score: 0.5105436675131789
