In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from scipy.stats import pearsonr
from skopt.space import Real
from skopt.space import Integer
from skopt import BayesSearchCV
import joblib

In [2]:
def load_data(directory, datafilename):
  """Load `{directory}/{datafilename}.csv` into a 2-D float NumPy array.

  If the file does not exist, a generation script is run first
  (`D0D1_generation.py` when the directory name contains 'D0' or 'D1',
  otherwise `{directory}_generation.py`) and the read is retried once.

  The first line of the file is treated as a header and skipped. Empty
  lines and empty comma-separated fields (e.g. from a trailing comma) are
  ignored. Every remaining field is converted with `float`, so all columns
  are returned, including the leading ID column.

  Args:
    directory: folder that holds the CSV file (also used to pick the
      generation script).
    datafilename: file name without the `.csv` extension.

  Returns:
    np.ndarray with one row per non-empty data line.

  Raises:
    FileNotFoundError: if the file is still missing after the generation
      script has been run.
    ValueError: if a non-empty field cannot be parsed as a float.
  """
  # Local import: `os` is not imported in the notebook's import cell, so the
  # fallback branch below would otherwise fail with a NameError.
  import os

  path = f'{directory}/{datafilename}.csv'

  def _read_text():
    # Context manager guarantees the handle is closed, even if read() fails.
    with open(path, 'r') as f:
      return f.read()

  try:
    text = _read_text()
  except FileNotFoundError:
    print('data file is missing. generating now.')
    if 'D0' in directory or 'D1' in directory:
      os.system('python D0D1_generation.py')
    else:
      os.system(f'python {directory}_generation.py')
    print('file generated.')
    # Retry once; a second FileNotFoundError propagates to the caller.
    text = _read_text()

  data = np.array(
      [[float(item2) for item2 in item.split(',') if len(item2)>0]
       for item in text.split('\n')[1:] if len(item)>0]  # includes the ID column also
  )
  return data

def split_data(data, train_fr, valid_fr):
  """Shuffle the rows of `data` and cut them into train / valid / test parts.

  The global NumPy RNG is re-seeded with 1 before shuffling, so the row
  order (and therefore the split) is the same on every call for a given
  input. The shuffle is done in place: the caller's array is reordered,
  and the returned parts are slices of it.

  Args:
    data: 2-D array, one sample per row.
    train_fr: fraction of rows assigned to the training part.
    valid_fr: fraction of rows assigned to the validation part; whatever
      remains after train + valid goes to the test part.

  Returns:
    Tuple `(train_data, valid_data, test_data)`.
  """
  np.random.seed(1)
  np.random.shuffle(data)

  # Row boundaries between the three consecutive parts.
  n_rows = data.shape[0]
  train_end = int(n_rows*train_fr)
  valid_end = int(n_rows*(train_fr+valid_fr))

  row_ranges = (
      slice(None, train_end),       # train
      slice(train_end, valid_end),  # valid
      slice(valid_end, None),       # test
  )
  return tuple(data[rows, :] for rows in row_ranges)

In [3]:
# Train, tune and evaluate one Ridge regressor per class (1, 2, 3) of the D6 data.
# For each class: load the CSV, split it, tune `alpha` with BayesSearchCV,
# refit on the training split, save the model, and print test-split metrics.
for rn_class in [1, 2, 3]:
  print(f'\n\n----------------------class-{rn_class}--------------------\n\n')

  # load data:
  data = load_data(directory='D6', datafilename=f'trainD6class{rn_class}')
  # 80% of the rows go to train and the remainder to test; valid_fr=0.00
  # leaves the validation split empty.
  train_data, valid_data, test_data = split_data(data, train_fr=0.80, valid_fr=0.00)
  # Column 0 is taken as the ID, the last column as the target y, and
  # everything in between as the feature matrix X.
  ID_train, X_train, y_train = train_data[:, 0], train_data[:, 1:-1], train_data[:, -1]
  ID_valid, X_valid, y_valid = valid_data[:, 0], valid_data[:, 1:-1], valid_data[:, -1]
  ID_test, X_test, y_test = test_data[:, 0], test_data[:, 1:-1], test_data[:, -1]

  # Hyperparameter search: only Ridge's regularization strength is tuned,
  # sampled on a log scale between 1e-6 and 1e+6.
  model = Ridge()
  param_space = {
    'alpha': Real(1e-6, 1e+6, 'log-uniform'),  # default: 1
  }
  # 20 search iterations evaluated 5 candidates at a time with 2-fold CV,
  # scored by negative mean squared error, using all available cores.
  # NOTE(review): no random_state is passed here; reproducibility presumably
  # relies on the global NumPy seed set inside split_data — confirm.
  opt = BayesSearchCV(
    estimator=model, search_spaces=param_space, n_iter=20, cv=2,
    n_jobs=-1, n_points=5, verbose=3, scoring='neg_mean_squared_error'
  )
  opt.fit(X_train, y_train)
  # Persist the full search history for this class.
  results_df = pd.DataFrame(opt.cv_results_)
  results_df.to_csv(f'SNR_Ridge_class{rn_class}_results_df.csv', index=False)
  # Fit a fresh Ridge with the best parameters on the whole training split
  # and save it to disk (compressed joblib file).
  best_params = opt.best_params_
  model = Ridge(**best_params)
  model.fit(X_train, y_train)
  joblib.dump(model, f'SNR_Ridge_class{rn_class}_Exp.joblib', compress=3)

  # test model:
  # sort data for visualization:
  # (rows are reordered by ascending target value; ID_test is not reordered)
  sorted_indices = np.argsort(y_test)
  X_test = X_test[sorted_indices]
  y_test = y_test[sorted_indices]
    
  print('\nModel Evaluation:')
  y_pred = model.predict(X_test)
  mse = mean_squared_error(y_test, y_pred)
  print("Mean Squared Error:", mse)
  r2 = r2_score(y_test, y_pred)
  print("R² Score:", r2)
  correlation, _ = pearsonr(y_test, y_pred)
  print("Correlation coefficient:", correlation)
  # Percentage deviation of each prediction relative to the true value.
  # It is computed but not used further in this cell.
  # NOTE(review): divides by y_test, so a zero target would yield inf/nan — confirm targets are non-zero.
  deviation = 100*(y_test-y_pred)/y_test



----------------------class-1--------------------


Fitting 2 folds for each of 5 candidates, totalling 10 fits
Fitting 2 folds for each of 5 candidates, totalling 10 fits
Fitting 2 folds for each of 5 candidates, totalling 10 fits
Fitting 2 folds for each of 5 candidates, totalling 10 fits

Model Evaluation:
Mean Squared Error: 0.0042322732751173065
R² Score: 0.9497986330285532
Correlation coefficient: 0.9746641681146285


----------------------class-2--------------------


Fitting 2 folds for each of 5 candidates, totalling 10 fits
Fitting 2 folds for each of 5 candidates, totalling 10 fits
Fitting 2 folds for each of 5 candidates, totalling 10 fits
Fitting 2 folds for each of 5 candidates, totalling 10 fits

Model Evaluation:
Mean Squared Error: 0.004444249563937363
R² Score: 0.9555412045109575
Correlation coefficient: 0.9775315420738702


----------------------class-3--------------------


Fitting 2 folds for each of 5 candidates, totalling 10 fits
Fitting 2 folds for each of 5 c