In [44]:
import pandas as pd
import numpy as np
import sklearn
from sklearn import model_selection
import matplotlib.pyplot as plt
import torch
import scipy.stats as stats
import torch.nn as nn

In [70]:
here = np.load('lasso.npy', allow_pickle=True)
here.shape

(1494, 10)

In [45]:
# Lets grab our data from the pca transformation
X_numpy = np.load('pca_transformed_data_v1.npy', allow_pickle=True)
y_numpy = np.load('label.npy', allow_pickle=True)

In [46]:
X = torch.from_numpy(X_numpy.astype(np.float32)).squeeze()
y = torch.from_numpy(y_numpy.astype(np.float32))

In [38]:
#lets split our data
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, test_size=0.2,
                                                                           random_state= 51)

In [39]:
#send the data into gpu
device = 'mps' if torch.backends.mps.is_available() else 'cpu'
X_train,X_test, y_train,y_test = X_train.to(device), X_test.to(device), y_train.to(device), y_test.to(device)

In [40]:
# lets create a model for hyperparameter tuning
class NeuralNetworkWithHyper(nn.Module):
    def __init__(self,hidden_layers, num_hidden_units, input_size=18):
        super().__init__()
        self.input_size = input_size
        self.hidden_layers = hidden_layers
        self.num_hidden_units = num_hidden_units
        
        # Create a list to hold the layers
        self.layers = nn.ModuleList()
        
        #add the input layer
        self.layers.append(nn.Linear(input_size, num_hidden_units))
        self.layers.append(nn.ReLU())
       
        #add hidden layers
        for i in range(hidden_layers - 1):
            self.layers.append(nn.Linear(num_hidden_units, num_hidden_units))
            self.layers.append(nn.ReLU())
        #add the output layer
        self.layers.append(nn.Linear(num_hidden_units, 1))
        
        #add forward function
    def forward(self, x):
        for layer in self.layers:
            x = layer(x)
        return x

### Study #1 Neural Network

In [58]:
#Lets set our loss function and optimizer
import optuna

def objective(trial):
    
    #define the search space
    learning_rate = trial.suggest_float("learning_rate", 1e-4, 1e-3)
    hidden_layers = trial.suggest_int("hidden_layers", 1, 2)
    num_hidden_units = trial.suggest_int("num_hidden_units", 26, 46)
    epoch_count = trial.suggest_int("epoch_count", 9000, 12000)
    weight_decay = trial.suggest_float("weight_decay", 1e-4, 1e-3)
    
    #create the model
    model = NeuralNetworkWithHyper(hidden_layers, 
                                   num_hidden_units 
                                   ).to('mps')
    optimizer = torch.optim.SGD(model.parameters(), learning_rate, weight_decay)
    loss_fn = torch.nn.L1Loss()
    
    #testing loop
    for epoch in range(epoch_count):
        #convert into train
        model.train()

        #fit data
        train_pred = model(X_train)

        #calculate loss
        train_loss = loss_fn(train_pred, y_train)


        #zero gradients
        optimizer.zero_grad()

        #back propagation
        train_loss.backward()

        #gradient descent
        optimizer.step()

        #testing
        model.eval()
        with torch.inference_mode():
            test_pred = model(X_test)
            test_loss = loss_fn(test_pred, y_test)
            spear_pred = test_pred.to('cpu')
            
    #spearman correlation coefficient       
    validation_metric = stats.spearmanr(y_test.to('cpu'), spear_pred).statistic
    
    #save model weights
    trial.set_user_attr('model_weights', model.state_dict())
    
    return validation_metric

#Create an Optuna Study
study = optuna.create_study(direction='maximize')

#Perform hyperparameter optimization
study.optimize(objective, n_trials=100)

#Print best spearman correlation coefficient
best_validation_metric = study.best_value

[I 2023-11-09 11:42:26,658] A new study created in memory with name: no-name-a48ce8f0-568f-4599-b3dd-8e905ecea162
[I 2023-11-09 11:43:09,675] Trial 0 finished with value: 0.20064380379922508 and parameters: {'learning_rate': 0.0006320056287082589, 'hidden_layers': 2, 'num_hidden_units': 32, 'epoch_count': 9912, 'weight_decay': 0.000728828372857686}. Best is trial 0 with value: 0.20064380379922508.
[I 2023-11-09 11:43:51,389] Trial 1 finished with value: 0.23450287357228772 and parameters: {'learning_rate': 0.0008314739688211142, 'hidden_layers': 2, 'num_hidden_units': 41, 'epoch_count': 9575, 'weight_decay': 0.00014672286486376343}. Best is trial 1 with value: 0.23450287357228772.
[I 2023-11-09 11:44:44,729] Trial 2 finished with value: 0.21946230419124352 and parameters: {'learning_rate': 0.0006654309421044862, 'hidden_layers': 2, 'num_hidden_units': 34, 'epoch_count': 11597, 'weight_decay': 0.000312678003760284}. Best is trial 1 with value: 0.23450287357228772.
[I 2023-11-09 11:45:27

In [59]:
 best_trial = study.best_trial
score = best_trial.value
best_params = best_trial.params
print(f'Best score: {score} \nBest Hyperparameters: ')
for key,value in best_params.items():
    print(f'{key}: {value}')

Best score: 0.3062523624712202 
Best Hyperparameters: 
learning_rate: 0.0009046786299808608
hidden_layers: 2
num_hidden_units: 44
epoch_count: 9664
weight_decay: 0.0008425189043585393


In [60]:
# Save model 
from pathlib import Path
best_model_weights = best_trial.user_attrs.get('model_weights', None)
# 1. Create models directory
MODEL_PATH = Path('models')
MODEL_PATH.mkdir(parents=True, exist_ok=True)

# 2. Create model save path
MODEL_NAME = "Neural_1.pth"
MODEL_SAVE_PATH = MODEL_PATH / MODEL_NAME

# 3. Save state_dict
print(f'Save model to: {MODEL_SAVE_PATH}')
torch.save(obj=best_model_weights, f=MODEL_SAVE_PATH)

Save model to: models/Neural_1.pth


### Study #2 Neural Network

In [80]:
def objective(trial):
    #define the search space
    learning_rate= trial.suggest_float("learning_rate", 1e-4, 1e-3)
    hidden_layers= trial.suggest_int("hidden_layers", 1, 1)
    num_hidden_units= trial.suggest_int("num_hidden_units", 16, 46)
    epoch_count= trial.suggest_int("epoch_count", 9000, 11000)
    weight_decay = trial.suggest_float("weight_decay", 1e-4, 1e-3)
    
    #create the model
    model = NeuralNetworkWithHyper(hidden_layers, 
                                   num_hidden_units,
                                  ).to('mps')
    optimizer = torch.optim.SGD(model.parameters(), learning_rate, weight_decay
                               )
    loss_fn = torch.nn.L1Loss()
    
    #testing loop
    for epoch in range(epoch_count):
        #convert into train
        model.train()

        #fit data
        train_pred = model(X_train)

        #calculate loss
        train_loss = loss_fn(train_pred, y_train)


        #zero gradients
        optimizer.zero_grad()

        #back propagation
        train_loss.backward()

        #gradient descent
        optimizer.step()

        #testing
        model.eval()
        with torch.inference_mode():
            test_pred = model(X_test)
            test_loss = loss_fn(test_pred, y_test)
            spear_pred = test_pred.to('cpu')
     
    #save model weights
    trial.set_user_attr('model_weights', model.state_dict())
    
    #spearman correlation evaluation        
    validation_metric = stats.spearmanr(y_test.to('cpu'), spear_pred).statistic
    return validation_metric

#Create an Optuna Study
study = optuna.create_study(direction='maximize')

#Perform hyperparameter optimization
study.optimize(objective, n_trials=100)

#Print best spearman correlation coefficient
best_validation_metric = study.best_value

[I 2023-11-13 11:04:20,901] A new study created in memory with name: no-name-5e266c47-670f-43f1-88ff-568da9a9776c
[I 2023-11-13 11:04:43,107] Trial 0 finished with value: 0.23844960050271136 and parameters: {'learning_rate': 0.0006639474453701077, 'hidden_layers': 1, 'num_hidden_units': 37, 'epoch_count': 9011, 'weight_decay': 0.000135004972249129}. Best is trial 0 with value: 0.23844960050271136.
[I 2023-11-13 11:05:07,671] Trial 1 finished with value: 0.19286190787982227 and parameters: {'learning_rate': 0.0007870254513215061, 'hidden_layers': 1, 'num_hidden_units': 31, 'epoch_count': 10250, 'weight_decay': 0.0008154235315091069}. Best is trial 0 with value: 0.23844960050271136.
[I 2023-11-13 11:05:34,404] Trial 2 finished with value: 0.2145840572266244 and parameters: {'learning_rate': 0.0009662799090055908, 'hidden_layers': 1, 'num_hidden_units': 37, 'epoch_count': 10822, 'weight_decay': 0.00012248821239373354}. Best is trial 0 with value: 0.23844960050271136.
[I 2023-11-13 11:06:0

In [78]:
best_trial = study.best_trial
best_params = best_trial.params
best_score = best_trial.value
print(f'Best Score: {best_score} \nHyperparameters: ')
for key,value in best_params.items():
    print(f'{key}: {value}')

Best Score: 0.3472488958701495 
Hyperparameters: 
learning_rate: 0.0009532446966730183
hidden_layers: 1
num_hidden_units: 28
epoch_count: 10079
weight_decay: 0.0004377217621912963


In [79]:
from pathlib import Path
best_model_weights = best_trial.user_attrs.get('model_weights', None)
# 1. Create models directory
MODEL_PATH = Path('models')
MODEL_PATH.mkdir(parents=True, exist_ok=True)

# 2. Create model save path
MODEL_NAME = "Neural_2.pth"
MODEL_SAVE_PATH = MODEL_PATH / MODEL_NAME

# 3. Save state_dict
print(f'Save model to: {MODEL_SAVE_PATH}')
torch.save(obj=best_model_weights, f=MODEL_SAVE_PATH)

Save model to: models/Neural_2.pth


### Study #3 Random Forest

In [24]:
X_train, X_test, y_train, y_test = X_train.to('cpu'), X_test.to('cpu'), y_train.to('cpu').squeeze(), y_test.to('cpu').squeeze()

In [25]:
X_train.numpy(), X_test.numpy(), y_train.numpy(), y_test.numpy()

(array([[-1.8840996 , -1.2298406 , -0.08687859, ...,  0.21039778,
         -0.24490474, -1.0782467 ],
        [-3.0310247 , -1.0652962 , -0.6179891 , ...,  0.06913669,
         -0.13375966, -0.15979725],
        [-2.4773939 , -0.24320649, -2.6922863 , ...,  0.37434563,
         -0.5727486 ,  0.52930886],
        ...,
        [ 4.008305  , -3.0478313 , -0.4981839 , ..., -0.25590926,
         -0.03921561,  0.6463667 ],
        [-2.3074343 ,  2.9922462 ,  2.0321922 , ..., -0.24898356,
          0.5775757 ,  0.80868113],
        [ 3.7405992 , -0.9258261 ,  0.52959573, ..., -0.14529   ,
         -0.31167406,  0.21507934]], dtype=float32),
 array([[-2.7682261 , -0.8817636 ,  2.15579   , ...,  0.8664899 ,
          0.33826873,  0.36335376],
        [ 1.9744093 ,  5.584851  , -1.1742501 , ..., -1.0187863 ,
         -0.7615747 ,  0.53597295],
        [ 1.2637563 , -2.2719896 ,  0.30258185, ...,  1.1502167 ,
          0.83102214,  0.27453494],
        ...,
        [-0.25826934,  1.5601684 ,  1.5

In [26]:
y_train.shape, X_train.shape

(torch.Size([1195]), torch.Size([1195, 18]))

In [30]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

def objective(trial):
    
    #define the search space
    n_estimators= trial.suggest_int("n_estimators", 100, 1000)
    max_depth = trial.suggest_int('max_depth', 5, 30)
    min_samples_split = trial.suggest_float('min_samples_split', 0.1, 1.0)
    min_samples_leaf = trial.suggest_float('min_samples_leaf', 0.1, 0.5)
    
    #create the model
    model = RandomForestRegressor(n_estimators,
                                 max_depth=max_depth,
                                 min_samples_split=min_samples_split,
                                 min_samples_leaf=min_samples_leaf,
                                 random_state=42)
    
    #training & testing loop
    model.fit(X_train, y_train)
    model_predictions = model.predict(X_test)
    
    #make predictions on data set
    mse = mean_squared_error(y_test, model_predictions)
    return mse


#Create an Optuna Study
study_3 = optuna.create_study(direction='minimize')

#Perform hyperparameter optimization
study_3.optimize(objective, n_trials=100)

#Print best spearman correlation coefficient
best_validation_metric = study_3.best_value

[I 2023-11-08 07:10:18,089] A new study created in memory with name: no-name-c96795f6-48c6-4108-9300-6d195490255d
[I 2023-11-08 07:10:18,461] Trial 0 finished with value: 0.766444713967228 and parameters: {'n_estimators': 876, 'max_depth': 7, 'min_samples_split': 0.7474505613834166, 'min_samples_leaf': 0.27800876599230734}. Best is trial 0 with value: 0.766444713967228.
[I 2023-11-08 07:10:18,674] Trial 1 finished with value: 0.7663947576396374 and parameters: {'n_estimators': 539, 'max_depth': 30, 'min_samples_split': 0.16564730785898213, 'min_samples_leaf': 0.3284254470655301}. Best is trial 1 with value: 0.7663947576396374.
[I 2023-11-08 07:10:18,968] Trial 2 finished with value: 0.7413064114503799 and parameters: {'n_estimators': 211, 'max_depth': 14, 'min_samples_split': 0.465179543553217, 'min_samples_leaf': 0.1605275779823189}. Best is trial 2 with value: 0.7413064114503799.
[I 2023-11-08 07:10:19,095] Trial 3 finished with value: 0.7664167687107659 and parameters: {'n_estimator