In [16]:
import numpy as np

def calculate_iqs_unphased(true_genotypes, imputed_dosages):
    # Convert imputed dosages to discrete values
    imputed_discrete = np.round(imputed_dosages).astype(int)

    # Clip the imputed discrete values to be within the range of 0 to 2
    imputed_discrete = np.clip(imputed_discrete, 0, 2)

    # Create a contingency table
    contingency_table = np.zeros((3, 3), dtype=int)

    # Fill the contingency table
    for true_geno, imputed_geno in zip(true_genotypes, imputed_discrete):
        for true_allele, imputed_allele in zip(true_geno, imputed_geno):
            contingency_table[int(true_allele), int(imputed_allele)] += 1

    # Calculate the total number of genotypes
    total_genotypes = np.sum(contingency_table)

    # Calculate observed proportion of agreement (Po)
    observed_agreement = np.trace(contingency_table) / total_genotypes

    # Calculate marginal sums
    row_marginals = np.sum(contingency_table, axis=1)
    col_marginals = np.sum(contingency_table, axis=0)

    # Calculate chance agreement (Pc)
    chance_agreement = np.sum((row_marginals * col_marginals) / (total_genotypes ** 2))

    # Calculate IQS
    if chance_agreement == 1:  # To prevent division by zero in case of perfect chance agreement
        iqs_score = 0
    else:
        iqs_score = (observed_agreement - chance_agreement) / (1 - chance_agreement)

    return iqs_score

# Example usage:
true_genotypes = np.array([[0, 1, 2], [1, 2, 0], [2, 0, 1]])
imputed_dosages = np.array([[0.1, 1.2, 1.9], [1.0, 1.8, 0.3], [2.0, 0.5, 1.4]])

iqs_score = calculate_iqs_unphased(true_genotypes, imputed_dosages)
print(f"IQS Score: {iqs_score}")


IQS Score: 1.0


# Linear Regression Model

In [13]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import r2_score as sklearn_r2_score, accuracy_score
import optuna
from matplotlib import pyplot as plt
import os
import csv

# Load and preprocess the data
data_directory = '../../Data/Filtered_unphased_training_data_union_final/'
start = 1

# Initialize lists to store the performance metrics for each chromosome
r2_scores = []
iqs_scores = []
accuracy_scores = []

# Create folders for saving files
output_folder = "../../Data/model_results_unphased_all_PRS/linear_regression/"
model_folder = output_folder + "models_unphased/"
csv_folder = output_folder + "csv_files/"

os.makedirs(model_folder, exist_ok=True)
os.makedirs(csv_folder, exist_ok=True)

for chromosome_number in range(start, 23):
    # Create subfolders for the current chromosome
    chr_model_folder = model_folder + f"chr{chromosome_number}/"
    chr_csv_folder = csv_folder + f"chr{chromosome_number}/"

    os.makedirs(chr_model_folder, exist_ok=True)
    os.makedirs(chr_csv_folder, exist_ok=True)

    file_name = data_directory + f"23AndMe_PRS313_merged_chr{chromosome_number}_matching_combined.parquet"
    data = pd.read_parquet(file_name)

    # Split the data into features and target
    X = torch.tensor(data.filter(regex='^(?!.*PRS313_)').values, dtype=torch.float32)
    y = torch.tensor(data.filter(regex='PRS313_').values, dtype=torch.float32)

    print("Total SNPs: ", data.shape[1])
    print("PRS313 SNPs: ", y.shape[1])
    print("Total SNPs used for Training: ", X.shape[1])

    # Split the data into train-validation and test sets
    X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Define the linear regression model with L1 regularization
    class LinearRegression(nn.Module):
        def __init__(self, input_dim, output_dim, l1_coef=0.0):
            super(LinearRegression, self).__init__()
            self.linear = nn.Linear(input_dim, output_dim)
            self.l1_coef = l1_coef

        def forward(self, x):
            out = self.linear(x)
            return out

        def l1_loss(self):
            return self.l1_coef * torch.norm(self.linear.weight, p=1)

    # Set the device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Set the hyperparameters for tuning
    input_dim = X_train_val.shape[1]
    output_dim = y_train_val.shape[1]
    num_epochs = 500

    # Define the objective function for Optuna with cross-validation and early stopping
    def objective(trial):
        learning_rate = trial.suggest_float('learning_rate', 1e-6, 1e-1, log=True)
        l1_coef = trial.suggest_float('l1_coef', 1e-10, 1e-1, log=True)
        patience = trial.suggest_int('patience', 5, 20)
        batch_size = trial.suggest_categorical('batch_size', [32, 64, 128, 256])
        lr_factor = trial.suggest_float('lr_factor', 0.1, 0.9)
        num_epochs = trial.suggest_int('num_epochs', 100, 500)

        model = LinearRegression(input_dim, output_dim, l1_coef).to(device)
        optimizer = optim.Adam(model.parameters(), lr=learning_rate)
        criterion = nn.MSELoss()
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=lr_factor, patience=patience, verbose=False)

        kf = KFold(n_splits=5, shuffle=True, random_state=42)
        fold_losses = []

        for fold, (train_idx, val_idx) in enumerate(kf.split(X_train_val)):
            X_train, X_val = X_train_val[train_idx], X_train_val[val_idx]
            y_train, y_val = y_train_val[train_idx], y_train_val[val_idx]

            train_dataset = TensorDataset(X_train, y_train)
            train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    
            best_val_loss = float('inf')
            counter = 0

            for epoch in range(num_epochs):
                train_loss = 0.0
                for batch_X, batch_y in train_loader:
                    batch_X, batch_y = batch_X.to(device), batch_y.to(device)

                    outputs = model(batch_X)
                    loss = criterion(outputs, batch_y) + model.l1_loss()

                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()

                    train_loss += loss.item()

                train_loss /= len(train_loader)

                val_dataset = TensorDataset(X_val, y_val)
                val_loader = DataLoader(val_dataset, batch_size=batch_size)

                with torch.no_grad():
                    val_loss = 0.0
                    for batch_X, batch_y in val_loader:
                        batch_X, batch_y = batch_X.to(device), batch_y.to(device)
                        outputs = model(batch_X)
                        loss = criterion(outputs, batch_y) + model.l1_loss()
                        val_loss += loss.item()

                    val_loss /= len(val_loader)
                    scheduler.step(val_loss)

                    if val_loss < best_val_loss:
                        best_val_loss = val_loss
                        counter = 0
                    else:
                        counter += 1

                    if counter >= patience:
                        break

            fold_losses.append(best_val_loss)

        return np.mean(fold_losses)
    # Create the "optuna_studies" folder if it doesn't exist
    os.makedirs("optuna_studies", exist_ok=True)

    # Create an Optuna study and optimize the hyperparameters
    study_name = f"unphased_full_23andMe_chr{chromosome_number}_study_linear_regression"
    storage_name = f"sqlite:///optuna_studies/{study_name}.db"

    # Check if the study exists
    current_dir = os.getcwd()
    study_exists = os.path.exists(current_dir + f"/optuna_studies/{study_name}.db")

    if study_exists:
        # Load the existing study
        study = optuna.load_study(study_name=study_name, storage=storage_name)
    else:
        # Create a new study
        study = optuna.create_study(direction='minimize', study_name=study_name, storage=storage_name)

    study.optimize(objective, n_trials=1, n_jobs=-1)

    # Print the best hyperparameters and best value
    print(f"Chr {chromosome_number} - Best hyperparameters: {study.best_params}")
    print(f"Chr {chromosome_number} - Best value: {study.best_value:.4f}")

    # Train the final model with the best hyperparameters and early stopping
    best_learning_rate = study.best_params['learning_rate']
    best_l1_coef = study.best_params['l1_coef']
    best_patience = study.best_params['patience']
    best_batch_size = study.best_params['batch_size']
    best_lr_factor = study.best_params['lr_factor']
    best_num_epochs = study.best_params['num_epochs']

    model = LinearRegression(input_dim, output_dim, best_l1_coef).to(device)
    optimizer = optim.Adam(model.parameters(), lr=best_learning_rate)
    criterion = nn.MSELoss()
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=best_lr_factor, patience=best_patience, verbose=False)

    train_dataset = TensorDataset(X_train_val, y_train_val)
    train_loader = DataLoader(train_dataset, batch_size=best_batch_size, shuffle=True)

    best_train_loss = float('inf')
    counter = 0

    for epoch in range(best_num_epochs):
        train_loss = 0.0
        for batch_X, batch_y in train_loader:
            batch_X, batch_y = batch_X.to(device), batch_y.to(device)

            outputs = model(batch_X)
            loss = criterion(outputs, batch_y) + model.l1_loss()

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            train_loss += loss.item()

        train_loss /= len(train_loader)

        if train_loss < best_train_loss:
            best_train_loss = train_loss
            counter = 0
        else:
            counter += 1

        if counter >= best_patience:
            break

        scheduler.step(train_loss)

    # Save the final model
    model_save_path = chr_model_folder + f'final_model_chr{chromosome_number}.pth'
    torch.save(model.state_dict(), model_save_path)
    print(f"Final model saved at: {model_save_path}")

    # Evaluate the final model on the test set
    with torch.no_grad():
        test_outputs = model(X_test.to(device))
        test_r2 = sklearn_r2_score(y_test.cpu().numpy(), test_outputs.cpu().numpy())
        test_iqs = calculate_iqs_unphased(y_test.cpu().numpy(), test_outputs.cpu().numpy())
        test_accuracy = ((y_test == test_outputs.round()).sum() / (test_outputs.shape[0] * test_outputs.shape[1])).numpy()

        # Append performance metrics to the lists
        r2_scores.append(test_r2)
        iqs_scores.append(test_iqs)
        accuracy_scores.append(test_accuracy)

        # Calculate individual R^2 scores for each SNP
        individual_r2_scores = sklearn_r2_score(y_test.cpu().numpy(), test_outputs.cpu().numpy(), multioutput='raw_values')

        # Calculate individual IQS scores for each SNP
        individual_iqs_scores = np.array([calculate_iqs_unphased(y_test.cpu().numpy()[:, i].reshape(-1, 1), test_outputs.cpu().numpy()[:, i].reshape(-1, 1)) for i in range(y_test.shape[1])])

        # Get the names of the SNPs from the original dataframe
        snp_names = data.filter(regex='PRS').columns

        # Save individual R^2 scores to a CSV file
        csv_file = chr_csv_folder + f'individual_r2_scores_chr{chromosome_number}.csv'

        with open(csv_file, 'w', newline='') as file:
            writer = csv.writer(file)
            writer.writerow(['SNP', 'R2 Score'])
            for snp, r2_score in zip(snp_names, individual_r2_scores):
                writer.writerow([snp, r2_score])

        print(f"Individual R^2 scores saved at: {csv_file}")

        # Save individual IQS scores to a CSV file
        iqs_csv_file = chr_csv_folder + f'individual_iqs_scores_chr{chromosome_number}.csv'

        with open(iqs_csv_file, 'w', newline='') as file:
            writer = csv.writer(file)
            writer.writerow(['SNP', 'IQS Score'])
            for snp, iqs_score in zip(snp_names, individual_iqs_scores):
                writer.writerow([snp, iqs_score])

        print(f"Individual IQS scores saved at: {iqs_csv_file}")

    # Create a DataFrame to store the performance metrics for each chromosome
    performance_df = pd.DataFrame({
        'Chromosome': list(range(start, chromosome_number + 1)),
        'R2 Score': r2_scores,
        'IQS Score': iqs_scores,
        'Accuracy Score': accuracy_scores
    })

    # Save the performance metrics to a CSV file
    performance_csv_file = csv_folder + 'performance_metrics.csv'
    performance_df.to_csv(performance_csv_file, index=False)
    print(f"Performance metrics saved at: {performance_csv_file}")

Total SNPs:  929
PRS313 SNPs:  30
Total SNPs used for Training:  899


[I 2024-05-23 14:21:51,261] Trial 86 finished with value: 0.10622426457703113 and parameters: {'learning_rate': 0.005190997400957485, 'l1_coef': 4.3763132874855275e-08, 'patience': 18, 'batch_size': 128, 'lr_factor': 0.33527929693949376, 'num_epochs': 435}. Best is trial 49 with value: 0.10217271894216537.


Chr 1 - Best hyperparameters: {'learning_rate': 0.004525680413232911, 'l1_coef': 1.4557748222420537e-06, 'patience': 20, 'batch_size': 256, 'lr_factor': 0.2721642150080666, 'num_epochs': 367}
Chr 1 - Best value: 0.1022
Final model saved at: ../../Data/model_results_unphased_all_PRS/linear_regression/models_unphased/chr1/final_model_chr1.pth
INPUTS [[0. 2. 0. ... 2. 1. 0.]
 [0. 2. 1. ... 2. 0. 0.]
 [0. 1. 0. ... 2. 0. 0.]
 ...
 [0. 2. 0. ... 2. 1. 0.]
 [0. 1. 0. ... 2. 1. 0.]
 [0. 0. 0. ... 2. 1. 0.]] [[ 0.21295525  1.1346838   0.02625206 ...  2.3359337  -0.04398361
   0.04737796]
 [ 0.84804386  2.4524505   1.5537536  ...  2.359358    0.11665091
  -0.02905133]
 [ 0.41301873  1.6406057  -0.12913103 ...  1.702893   -0.39570338
  -0.00645405]
 ...
 [ 0.3019755   1.7119876   0.0105901  ...  2.058869    0.85444266
   0.0405995 ]
 [-0.2700336   1.7962674  -0.04037734 ...  1.9678326   1.0011257
   0.24637793]
 [ 0.36895612  1.7164937  -0.05849757 ...  1.4100678   0.7813533
   0.22586088]]
ROUN

KeyboardInterrupt: 

In [26]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score as sklearn_r2_score
import os
import csv

# Load and preprocess the data
data_directory = '../../Data/Filtered_unphased_training_data_union_final/'
start = 1

# Initialize lists to store the performance metrics for each chromosome
r2_scores = []
iqs_scores = []
accuracy_scores = []

# Folders for saved models and CSV files
output_folder = "../../Data/model_results_unphased_all_PRS/linear_regression/"
model_folder = output_folder + "models_unphased/"
csv_folder = output_folder + "csv_files/"

for chromosome_number in range(start, 23):
    # Paths for the current chromosome
    chr_model_folder = model_folder + f"chr{chromosome_number}/"
    chr_csv_folder = csv_folder + f"chr{chromosome_number}/"

    file_name = data_directory + f"23AndMe_PRS313_merged_chr{chromosome_number}_matching_combined.parquet"
    data = pd.read_parquet(file_name)

    # Split the data into features and target
    X = torch.tensor(data.filter(regex='^(?!.*PRS313_)').values, dtype=torch.float32)
    y = torch.tensor(data.filter(regex='PRS313_').values, dtype=torch.float32)

    print("Total SNPs: ", data.shape[1])
    print("PRS313 SNPs: ", y.shape[1])
    print("Total SNPs used for Training: ", X.shape[1])

    # Split the data into train-validation and test sets
    X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Define the linear regression model with L1 regularization
    class LinearRegression(nn.Module):
        def __init__(self, input_dim, output_dim, l1_coef=0.0):
            super(LinearRegression, self).__init__()
            self.linear = nn.Linear(input_dim, output_dim)
            self.l1_coef = l1_coef

        def forward(self, x):
            out = self.linear(x)
            return out

        def l1_loss(self):
            return self.l1_coef * torch.norm(self.linear.weight, p=1)

    # Set the device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Load the final model
    model = LinearRegression(X_train_val.shape[1], y_train_val.shape[1])
    model_save_path = chr_model_folder + f'final_model_chr{chromosome_number}.pth'
    model.load_state_dict(torch.load(model_save_path, map_location=device))
    model.to(device)
    model.eval()

    # Evaluate the final model on the test set
    with torch.no_grad():
        test_outputs = model(X_test.to(device))
        test_r2 = sklearn_r2_score(y_test.cpu().numpy(), test_outputs.cpu().numpy())
        test_iqs = calculate_iqs_unphased(y_test.cpu().numpy(), test_outputs.cpu().numpy())
        test_accuracy = ((y_test == test_outputs.round()).sum() / (test_outputs.shape[0] * test_outputs.shape[1])).numpy()

        # Append performance metrics to the lists
        r2_scores.append(test_r2)
        iqs_scores.append(test_iqs)
        accuracy_scores.append(test_accuracy)

        # Calculate individual R^2 scores for each SNP
        individual_r2_scores = sklearn_r2_score(y_test.cpu().numpy(), test_outputs.cpu().numpy(), multioutput='raw_values')

        # Calculate individual IQS scores for each SNP
        individual_iqs_scores = np.array([calculate_iqs_unphased(y_test.cpu().numpy()[:, i].reshape(-1, 1), test_outputs.cpu().numpy()[:, i].reshape(-1, 1)) for i in range(y_test.shape[1])])

        # Get the names of the SNPs from the original dataframe
        snp_names = data.filter(regex='PRS').columns

        # Save individual R^2 scores to a CSV file
        csv_file = chr_csv_folder + f'individual_r2_scores_chr{chromosome_number}.csv'

        with open(csv_file, 'w', newline='') as file:
            writer = csv.writer(file)
            writer.writerow(['SNP', 'R2 Score'])
            for snp, r2_score in zip(snp_names, individual_r2_scores):
                writer.writerow([snp, r2_score])

        print(f"Individual R^2 scores saved at: {csv_file}")

        # Save individual IQS scores to a CSV file
        iqs_csv_file = chr_csv_folder + f'individual_iqs_scores_chr{chromosome_number}.csv'

        with open(iqs_csv_file, 'w', newline='') as file:
            writer = csv.writer(file)
            writer.writerow(['SNP', 'IQS Score'])
            for snp, iqs_score in zip(snp_names, individual_iqs_scores):
                writer.writerow([snp, iqs_score])

        print(f"Individual IQS scores saved at: {iqs_csv_file}")

# Create a DataFrame to store the performance metrics for each chromosome
performance_df = pd.DataFrame({
    'Chromosome': list(range(start, 23)),
    'R2 Score': r2_scores,
    'IQS Score': iqs_scores,
    'Accuracy Score': accuracy_scores
})

# Save the performance metrics to a CSV file
performance_csv_file = csv_folder + 'performance_metrics.csv'
performance_df.to_csv(performance_csv_file, index=False)
print(f"Performance metrics saved at: {performance_csv_file}")


Total SNPs:  929
PRS313 SNPs:  30
Total SNPs used for Training:  899
Individual R^2 scores saved at: ../../Data/model_results_unphased_all_PRS/linear_regression/csv_files/chr1/individual_r2_scores_chr1.csv
Individual IQS scores saved at: ../../Data/model_results_unphased_all_PRS/linear_regression/csv_files/chr1/individual_iqs_scores_chr1.csv
Total SNPs:  629
PRS313 SNPs:  21
Total SNPs used for Training:  608
Individual R^2 scores saved at: ../../Data/model_results_unphased_all_PRS/linear_regression/csv_files/chr2/individual_r2_scores_chr2.csv
Individual IQS scores saved at: ../../Data/model_results_unphased_all_PRS/linear_regression/csv_files/chr2/individual_iqs_scores_chr2.csv
Total SNPs:  963
PRS313 SNPs:  16
Total SNPs used for Training:  947
Individual R^2 scores saved at: ../../Data/model_results_unphased_all_PRS/linear_regression/csv_files/chr3/individual_r2_scores_chr3.csv
Individual IQS scores saved at: ../../Data/model_results_unphased_all_PRS/linear_regression/csv_files/chr3

# Multinomial Logistic Regression Model

In [7]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import r2_score as sklearn_r2_score, accuracy_score
import optuna
from matplotlib import pyplot as plt
import os
import csv

# Load and preprocess the data
data_directory = '../../Data/Filtered_unphased_training_data_union_final/'
start = 1

# Initialize lists to store the performance metrics for each chromosome
r2_scores = []
iqs_scores = []
accuracy_scores = []

# Create folders for saving files
output_folder = "../../Data/model_results_unphased_all_PRS/logistic_regression/"
model_folder = output_folder + "models_unphased/"
csv_folder = output_folder + "csv_files/"

os.makedirs(model_folder, exist_ok=True)
os.makedirs(csv_folder, exist_ok=True)

for chromosome_number in range(start, 23):
    # Create subfolders for the current chromosome
    chr_model_folder = model_folder + f"chr{chromosome_number}/"
    chr_csv_folder = csv_folder + f"chr{chromosome_number}/"

    os.makedirs(chr_model_folder, exist_ok=True)
    os.makedirs(chr_csv_folder, exist_ok=True)

    file_name = data_directory + f"23AndMe_PRS313_merged_chr{chromosome_number}_matching_combined.parquet"
    data = pd.read_parquet(file_name)

    # Split the data into features and target
    X = torch.tensor(data.filter(regex='^(?!.*PRS313_)').values, dtype=torch.float32)
    y = torch.tensor(data.filter(regex='PRS313_').values, dtype=torch.long)

    print("Total SNPs: ", data.shape[1])
    print("PRS313 SNPs: ", y.shape[1])
    print("Total SNPs used for Training: ", X.shape[1])

    # Split the data into train-validation and test sets
    X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Define the multinomial logistic regression model with L1 regularization
    class MulticlassLogisticRegression(nn.Module):
        def __init__(self, input_dim, output_dim, num_classes=3, l1_coef=0.0):
            super(MulticlassLogisticRegression, self).__init__()
            self.num_classes = num_classes
            self.linear = nn.ModuleList([nn.Linear(input_dim, output_dim) for _ in range(num_classes)])
            self.l1_coef = l1_coef

        def forward(self, x):
            out = torch.stack([linear(x) for linear in self.linear], dim=-1)
            out = nn.functional.softmax(out, dim=-1)
            return out

        def l1_loss(self):
            return self.l1_coef * sum(torch.norm(linear.weight, p=1) for linear in self.linear)

    # Set the device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Set the hyperparameters for tuning
    input_dim = X_train_val.shape[1]
    output_dim = y_train_val.shape[1]
    num_epochs = 500

    # Define the objective function for Optuna with cross-validation and early stopping
    def objective(trial):
        learning_rate = trial.suggest_float('learning_rate', 1e-6, 1e-1, log=True)
        l1_coef = trial.suggest_float('l1_coef', 1e-10, 1e-1, log=True)
        patience = trial.suggest_int('patience', 5, 20)
        batch_size = trial.suggest_categorical('batch_size', [32, 64, 128, 256])
        lr_factor = trial.suggest_float('lr_factor', 0.1, 0.9)
        num_epochs = trial.suggest_int('num_epochs', 100, 500)

        model = MulticlassLogisticRegression(input_dim, output_dim, l1_coef=l1_coef).to(device)
        optimizer = optim.Adam(model.parameters(), lr=learning_rate)
        criterion = nn.CrossEntropyLoss()
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=lr_factor, patience=patience, verbose=False)

        kf = KFold(n_splits=5, shuffle=True, random_state=42)
        fold_losses = []

        for fold, (train_idx, val_idx) in enumerate(kf.split(X_train_val)):
            X_train, X_val = X_train_val[train_idx], X_train_val[val_idx]
            y_train, y_val = y_train_val[train_idx], y_train_val[val_idx]

            train_dataset = TensorDataset(X_train, y_train)
            train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    
            best_val_loss = float('inf')
            counter = 0

            for epoch in range(num_epochs):
                train_loss = 0.0
                for batch_X, batch_y in train_loader:
                    batch_X, batch_y = batch_X.to(device), batch_y.to(device)

                    outputs = model(batch_X)
                    loss = criterion(outputs.transpose(1, 2), batch_y) + model.l1_loss()

                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()

                    train_loss += loss.item()

                train_loss /= len(train_loader)

                val_dataset = TensorDataset(X_val, y_val)
                val_loader = DataLoader(val_dataset, batch_size=batch_size)

                with torch.no_grad():
                    val_loss = 0.0
                    for batch_X, batch_y in val_loader:
                        batch_X, batch_y = batch_X.to(device), batch_y.to(device)
                        outputs = model(batch_X)
                        loss = criterion(outputs.transpose(1, 2), batch_y) + model.l1_loss()
                        val_loss += loss.item()

                    val_loss /= len(val_loader)
                    scheduler.step(val_loss)

                    if val_loss < best_val_loss:
                        best_val_loss = val_loss
                        counter = 0
                    else:
                        counter += 1

                    if counter >= patience:
                        break

            fold_losses.append(best_val_loss)

        return np.mean(fold_losses)
    
    # Create the "optuna_studies" folder if it doesn't exist
    os.makedirs("optuna_studies", exist_ok=True)

    # Create an Optuna study and optimize the hyperparameters
    study_name = f"unphased_full_23andMe_chr{chromosome_number}_study_logistic_regression"
    storage_name = f"sqlite:///optuna_studies/{study_name}.db"

    # Check if the study exists
    current_dir = os.getcwd()
    study_exists = os.path.exists(current_dir + f"/optuna_studies/{study_name}.db")

    if study_exists:
        # Load the existing study
        study = optuna.load_study(study_name=study_name, storage=storage_name)
    else:
        # Create a new study
        study = optuna.create_study(direction='minimize', study_name=study_name, storage=storage_name)

    study.optimize(objective, n_trials=50, n_jobs=-1)

    # Print the best hyperparameters and best value
    print(f"Chr {chromosome_number} - Best hyperparameters: {study.best_params}")
    print(f"Chr {chromosome_number} - Best value: {study.best_value:.4f}")

    # Train the final model with the best hyperparameters and early stopping
    best_learning_rate = study.best_params['learning_rate']
    best_l1_coef = study.best_params['l1_coef']
    best_patience = study.best_params['patience']
    best_batch_size = study.best_params['batch_size']
    best_lr_factor = study.best_params['lr_factor']
    best_num_epochs = study.best_params['num_epochs']

    model = MulticlassLogisticRegression(input_dim, output_dim, l1_coef=best_l1_coef).to(device)
    optimizer = optim.Adam(model.parameters(), lr=best_learning_rate)
    criterion = nn.CrossEntropyLoss()
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=best_lr_factor, patience=best_patience, verbose=False)

    train_dataset = TensorDataset(X_train_val, y_train_val)
    train_loader = DataLoader(train_dataset, batch_size=best_batch_size, shuffle=True)

    best_train_loss = float('inf')
    counter = 0

    for epoch in range(best_num_epochs):
        train_loss = 0.0
        for batch_X, batch_y in train_loader:
            batch_X, batch_y = batch_X.to(device), batch_y.to(device)

            outputs = model(batch_X)
            loss = criterion(outputs.transpose(1, 2), batch_y) + model.l1_loss()

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            train_loss += loss.item()

        train_loss /= len(train_loader)

        if train_loss < best_train_loss:
            best_train_loss = train_loss
            counter = 0
        else:
            counter += 1

        if counter >= best_patience:
            break

        scheduler.step(train_loss)

    # Save the final model
    model_save_path = chr_model_folder + f'final_model_chr{chromosome_number}.pth'
    torch.save(model.state_dict(), model_save_path)
    print(f"Final model saved at: {model_save_path}")

    # Evaluate the final model on the test set
    with torch.no_grad():
        test_outputs = model(X_test.to(device))
        test_outputs = test_outputs.argmax(dim=-1)
        test_r2 = sklearn_r2_score(y_test.cpu().numpy(), test_outputs.cpu().numpy())
        test_iqs = calculate_iqs_unphased(y_test.cpu().numpy(), test_outputs.cpu().numpy())

        test_accuracy = ((y_test == test_outputs).sum() / (test_outputs.shape[0] * test_outputs.shape[1])).numpy()

        # Append performance metrics to the lists
        r2_scores.append(test_r2)
        iqs_scores.append(test_iqs)
        accuracy_scores.append(test_accuracy)

        # Calculate individual R^2 scores for each SNP
        individual_r2_scores = sklearn_r2_score(y_test.cpu().numpy(), test_outputs.cpu().numpy(), multioutput='raw_values')

        # Calculate individual IQS scores for each SNP
        individual_iqs_scores = np.array([calculate_iqs_unphased(y_test.cpu().numpy()[:, i].reshape(-1, 1), test_outputs.cpu().numpy()[:, i].reshape(-1, 1)) for i in range(y_test.shape[1])])

        # Get the names of the SNPs from the original dataframe
        snp_names = data.filter(regex='PRS').columns

        # Save individual R^2 scores to a CSV file
        csv_file = chr_csv_folder + f'individual_r2_scores_chr{chromosome_number}.csv'

        with open(csv_file, 'w', newline='') as file:
            writer = csv.writer(file)
            writer.writerow(['SNP', 'R2 Score'])
            for snp, r2_score in zip(snp_names, individual_r2_scores):
                writer.writerow([snp, r2_score])

        print(f"Individual R^2 scores saved at: {csv_file}")

        # Save individual IQS scores to a CSV file
        iqs_csv_file = chr_csv_folder + f'individual_iqs_scores_chr{chromosome_number}.csv'

        with open(iqs_csv_file, 'w', newline='') as file:
            writer = csv.writer(file)
            writer.writerow(['SNP', 'IQS Score'])
            for snp, iqs_score in zip(snp_names, individual_iqs_scores):
                writer.writerow([snp, iqs_score])

        print(f"Individual IQS scores saved at: {iqs_csv_file}")

    # Create a DataFrame to store the performance metrics for each chromosome
    performance_df = pd.DataFrame({
        'Chromosome': list(range(start, chromosome_number + 1)),
        'R2 Score': r2_scores,
        'IQS Score': iqs_scores,
        'Accuracy Score': accuracy_scores
    })

    # Save the performance metrics to a CSV file
    performance_csv_file = csv_folder + 'performance_metrics.csv'
    performance_df.to_csv(performance_csv_file, index=False)
    print(f"Performance metrics saved at: {performance_csv_file}")

Total SNPs:  929
PRS313 SNPs:  30
Total SNPs used for Training:  899


[I 2024-05-23 00:33:59,250] Trial 22 finished with value: 0.733684065937996 and parameters: {'learning_rate': 0.002220891376392196, 'l1_coef': 2.7694209571735434e-08, 'patience': 16, 'batch_size': 128, 'lr_factor': 0.3090166357776345, 'num_epochs': 359}. Best is trial 12 with value: 0.6769969142400301.
[I 2024-05-23 00:34:00,673] Trial 17 finished with value: 0.745872899889946 and parameters: {'learning_rate': 0.002282542929228127, 'l1_coef': 1.7678189359217486e-08, 'patience': 16, 'batch_size': 128, 'lr_factor': 0.2898999112507365, 'num_epochs': 365}. Best is trial 12 with value: 0.6769969142400301.
[I 2024-05-23 00:34:04,619] Trial 20 finished with value: 0.7228652864694596 and parameters: {'learning_rate': 0.00217582920731858, 'l1_coef': 2.533053034976939e-08, 'patience': 16, 'batch_size': 128, 'lr_factor': 0.3031199406133188, 'num_epochs': 355}. Best is trial 12 with value: 0.6769969142400301.
[I 2024-05-23 00:34:11,111] Trial 21 finished with value: 0.7277507990598678 and paramete

Chr 1 - Best hyperparameters: {'learning_rate': 0.0003659720828647503, 'l1_coef': 2.2917141824697583e-08, 'patience': 18, 'batch_size': 32, 'lr_factor': 0.3973506534926021, 'num_epochs': 386}
Chr 1 - Best value: 0.6770
Final model saved at: ../../Data/model_results_unphased_all_PRS/logistic_regression/models_unphased/chr1/final_model_chr1.pth
Individual R^2 scores saved at: ../../Data/model_results_unphased_all_PRS/logistic_regression/csv_files/chr1/individual_r2_scores_chr1.csv
Individual IQS scores saved at: ../../Data/model_results_unphased_all_PRS/logistic_regression/csv_files/chr1/individual_iqs_scores_chr1.csv
Performance metrics saved at: ../../Data/model_results_unphased_all_PRS/logistic_regression/csv_files/performance_metrics.csv
Total SNPs:  629
PRS313 SNPs:  21
Total SNPs used for Training:  608


  chi_squared = np.sum((contingency_table - expected_counts) ** 2 / expected_counts)
[I 2024-05-23 01:06:23,934] Trial 12 finished with value: 1.6472469466073172 and parameters: {'learning_rate': 0.07619744115418815, 'l1_coef': 0.0025746994774138057, 'patience': 12, 'batch_size': 64, 'lr_factor': 0.8290381863688072, 'num_epochs': 130}. Best is trial 2 with value: 0.7424067212985113.
[I 2024-05-23 01:07:25,388] Trial 17 finished with value: 0.9643306629998344 and parameters: {'learning_rate': 0.03319278550826355, 'l1_coef': 7.475199555377638e-07, 'patience': 10, 'batch_size': 64, 'lr_factor': 0.46163221633999585, 'num_epochs': 161}. Best is trial 2 with value: 0.7424067212985113.
[I 2024-05-23 01:08:26,448] Trial 14 finished with value: 1.814862275123596 and parameters: {'learning_rate': 0.02812313145521612, 'l1_coef': 0.03017157290208268, 'patience': 14, 'batch_size': 256, 'lr_factor': 0.20656640855761435, 'num_epochs': 111}. Best is trial 2 with value: 0.7424067212985113.
[I 2024-05-2

Chr 2 - Best hyperparameters: {'learning_rate': 0.001813751389499825, 'l1_coef': 7.793207352687452e-08, 'patience': 9, 'batch_size': 32, 'lr_factor': 0.6069847013701387, 'num_epochs': 423}
Chr 2 - Best value: 0.6719
Final model saved at: ../../Data/model_results_unphased_all_PRS/logistic_regression/models_unphased/chr2/final_model_chr2.pth
Individual R^2 scores saved at: ../../Data/model_results_unphased_all_PRS/logistic_regression/csv_files/chr2/individual_r2_scores_chr2.csv
Individual IQS scores saved at: ../../Data/model_results_unphased_all_PRS/logistic_regression/csv_files/chr2/individual_iqs_scores_chr2.csv
Performance metrics saved at: ../../Data/model_results_unphased_all_PRS/logistic_regression/csv_files/performance_metrics.csv
Total SNPs:  963
PRS313 SNPs:  16
Total SNPs used for Training:  947


  chi_squared = np.sum((contingency_table - expected_counts) ** 2 / expected_counts)
[I 2024-05-23 01:28:51,153] Trial 11 finished with value: 0.8762602269649505 and parameters: {'learning_rate': 0.059531359725103664, 'l1_coef': 3.004383384943592e-06, 'patience': 6, 'batch_size': 256, 'lr_factor': 0.19610770016997386, 'num_epochs': 227}. Best is trial 2 with value: 0.6965232074260712.
[I 2024-05-23 01:29:05,881] Trial 14 finished with value: 0.8786559939384461 and parameters: {'learning_rate': 0.05911358604497834, 'l1_coef': 5.599094683443257e-07, 'patience': 7, 'batch_size': 256, 'lr_factor': 0.19793238738417537, 'num_epochs': 256}. Best is trial 2 with value: 0.6965232074260712.
[I 2024-05-23 01:29:30,541] Trial 8 finished with value: 0.9674327043386606 and parameters: {'learning_rate': 0.05416042859861109, 'l1_coef': 0.0005462608969859107, 'patience': 6, 'batch_size': 32, 'lr_factor': 0.6626547979039249, 'num_epochs': 191}. Best is trial 2 with value: 0.6965232074260712.
[I 2024-05-

Chr 3 - Best hyperparameters: {'learning_rate': 0.0005975393104293414, 'l1_coef': 1.8829824683234204e-09, 'patience': 19, 'batch_size': 64, 'lr_factor': 0.8284959194436654, 'num_epochs': 341}
Chr 3 - Best value: 0.6159
Final model saved at: ../../Data/model_results_unphased_all_PRS/logistic_regression/models_unphased/chr3/final_model_chr3.pth
Individual R^2 scores saved at: ../../Data/model_results_unphased_all_PRS/logistic_regression/csv_files/chr3/individual_r2_scores_chr3.csv
Individual IQS scores saved at: ../../Data/model_results_unphased_all_PRS/logistic_regression/csv_files/chr3/individual_iqs_scores_chr3.csv
Performance metrics saved at: ../../Data/model_results_unphased_all_PRS/logistic_regression/csv_files/performance_metrics.csv
Total SNPs:  1261
PRS313 SNPs:  11
Total SNPs used for Training:  1250


  chi_squared = np.sum((contingency_table - expected_counts) ** 2 / expected_counts)
[I 2024-05-23 02:00:33,174] Trial 11 finished with value: 2.4931845903396606 and parameters: {'learning_rate': 0.011245290580186019, 'l1_coef': 0.039325845432566106, 'patience': 15, 'batch_size': 256, 'lr_factor': 0.8565831216668033, 'num_epochs': 103}. Best is trial 2 with value: 0.708555577482496.
[I 2024-05-23 02:01:02,844] Trial 7 finished with value: 1.0018303215503692 and parameters: {'learning_rate': 0.007727287316716521, 'l1_coef': 0.004856178056202063, 'patience': 19, 'batch_size': 256, 'lr_factor': 0.108760016441415, 'num_epochs': 257}. Best is trial 2 with value: 0.708555577482496.
[I 2024-05-23 02:01:45,801] Trial 14 finished with value: 0.9755349576473236 and parameters: {'learning_rate': 0.04068351138985178, 'l1_coef': 0.006803986206437246, 'patience': 20, 'batch_size': 256, 'lr_factor': 0.10579023596064392, 'num_epochs': 312}. Best is trial 2 with value: 0.708555577482496.
[I 2024-05-23 

Chr 4 - Best hyperparameters: {'learning_rate': 0.00039798697327114734, 'l1_coef': 7.559889040032319e-08, 'patience': 8, 'batch_size': 64, 'lr_factor': 0.7301102557055886, 'num_epochs': 197}
Chr 4 - Best value: 0.6802


  chi_squared = np.sum((contingency_table - expected_counts) ** 2 / expected_counts)
[I 2024-05-23 02:26:19,283] A new study created in RDB with name: unphased_full_23andMe_chr5_study_logistic_regression


Final model saved at: ../../Data/model_results_unphased_all_PRS/logistic_regression/models_unphased/chr4/final_model_chr4.pth
Individual R^2 scores saved at: ../../Data/model_results_unphased_all_PRS/logistic_regression/csv_files/chr4/individual_r2_scores_chr4.csv
Individual IQS scores saved at: ../../Data/model_results_unphased_all_PRS/logistic_regression/csv_files/chr4/individual_iqs_scores_chr4.csv
Performance metrics saved at: ../../Data/model_results_unphased_all_PRS/logistic_regression/csv_files/performance_metrics.csv
Total SNPs:  1289
PRS313 SNPs:  34
Total SNPs used for Training:  1255


[I 2024-05-23 02:27:21,202] Trial 3 finished with value: 0.9277197003364563 and parameters: {'learning_rate': 0.05518587130574987, 'l1_coef': 1.6028956002860537e-09, 'patience': 11, 'batch_size': 256, 'lr_factor': 0.37386405450917326, 'num_epochs': 346}. Best is trial 3 with value: 0.9277197003364563.
[I 2024-05-23 02:27:37,772] Trial 2 finished with value: 0.7720308808180001 and parameters: {'learning_rate': 0.0029887488601876497, 'l1_coef': 2.1965620760789075e-10, 'patience': 5, 'batch_size': 32, 'lr_factor': 0.15267135289836695, 'num_epochs': 439}. Best is trial 2 with value: 0.7720308808180001.
[I 2024-05-23 02:27:48,502] Trial 1 finished with value: 0.8217635035514832 and parameters: {'learning_rate': 7.817552333659174e-05, 'l1_coef': 2.5149723674784266e-07, 'patience': 8, 'batch_size': 128, 'lr_factor': 0.5902199729484313, 'num_epochs': 109}. Best is trial 2 with value: 0.7720308808180001.
[I 2024-05-23 02:28:03,345] Trial 7 finished with value: 1.2789726495742797 and parameters:

Chr 5 - Best hyperparameters: {'learning_rate': 0.0002151788812086546, 'l1_coef': 9.172285960688132e-10, 'patience': 13, 'batch_size': 32, 'lr_factor': 0.5201558908781635, 'num_epochs': 472}
Chr 5 - Best value: 0.6736


  chi_squared = np.sum((contingency_table - expected_counts) ** 2 / expected_counts)
[I 2024-05-23 03:10:25,115] A new study created in RDB with name: unphased_full_23andMe_chr6_study_logistic_regression


Final model saved at: ../../Data/model_results_unphased_all_PRS/logistic_regression/models_unphased/chr5/final_model_chr5.pth
Individual R^2 scores saved at: ../../Data/model_results_unphased_all_PRS/logistic_regression/csv_files/chr5/individual_r2_scores_chr5.csv
Individual IQS scores saved at: ../../Data/model_results_unphased_all_PRS/logistic_regression/csv_files/chr5/individual_iqs_scores_chr5.csv
Performance metrics saved at: ../../Data/model_results_unphased_all_PRS/logistic_regression/csv_files/performance_metrics.csv
Total SNPs:  639
PRS313 SNPs:  20
Total SNPs used for Training:  619


[I 2024-05-23 03:13:16,491] Trial 3 finished with value: 0.8941717880112785 and parameters: {'learning_rate': 0.03801522976806271, 'l1_coef': 0.00021693737321302141, 'patience': 15, 'batch_size': 64, 'lr_factor': 0.29432075164849697, 'num_epochs': 408}. Best is trial 3 with value: 0.8941717880112785.
[I 2024-05-23 03:13:32,124] Trial 1 finished with value: 0.8724246233701706 and parameters: {'learning_rate': 0.0012349034493688996, 'l1_coef': 0.0007052941734562021, 'patience': 12, 'batch_size': 128, 'lr_factor': 0.6686653510126879, 'num_epochs': 158}. Best is trial 1 with value: 0.8724246233701706.
[I 2024-05-23 03:13:34,479] Trial 8 finished with value: 2.8905107002991897 and parameters: {'learning_rate': 0.023940212974579947, 'l1_coef': 0.08500791924257381, 'patience': 12, 'batch_size': 32, 'lr_factor': 0.5551303884607407, 'num_epochs': 390}. Best is trial 1 with value: 0.8724246233701706.
[I 2024-05-23 03:14:01,770] Trial 10 finished with value: 0.9867209076881409 and parameters: {'l

Chr 6 - Best hyperparameters: {'learning_rate': 0.00022597360973217369, 'l1_coef': 5.909997824309047e-10, 'patience': 13, 'batch_size': 32, 'lr_factor': 0.532536940963171, 'num_epochs': 374}
Chr 6 - Best value: 0.6921


  chi_squared = np.sum((contingency_table - expected_counts) ** 2 / expected_counts)
[I 2024-05-23 03:43:47,956] A new study created in RDB with name: unphased_full_23andMe_chr7_study_logistic_regression


Final model saved at: ../../Data/model_results_unphased_all_PRS/logistic_regression/models_unphased/chr6/final_model_chr6.pth
Individual R^2 scores saved at: ../../Data/model_results_unphased_all_PRS/logistic_regression/csv_files/chr6/individual_r2_scores_chr6.csv
Individual IQS scores saved at: ../../Data/model_results_unphased_all_PRS/logistic_regression/csv_files/chr6/individual_iqs_scores_chr6.csv
Performance metrics saved at: ../../Data/model_results_unphased_all_PRS/logistic_regression/csv_files/performance_metrics.csv
Total SNPs:  465
PRS313 SNPs:  14
Total SNPs used for Training:  451


[I 2024-05-23 03:45:22,278] Trial 0 finished with value: 0.755464893579483 and parameters: {'learning_rate': 0.00768282871367125, 'l1_coef': 7.302863992907644e-08, 'patience': 15, 'batch_size': 256, 'lr_factor': 0.741545578577424, 'num_epochs': 258}. Best is trial 0 with value: 0.755464893579483.
[I 2024-05-23 03:45:24,852] Trial 3 finished with value: 0.7537903666496277 and parameters: {'learning_rate': 0.007789656489822903, 'l1_coef': 2.2554315741102037e-08, 'patience': 16, 'batch_size': 256, 'lr_factor': 0.7089128354911581, 'num_epochs': 207}. Best is trial 3 with value: 0.7537903666496277.
[I 2024-05-23 03:45:48,505] Trial 9 finished with value: 0.739409139752388 and parameters: {'learning_rate': 0.0004377182685502889, 'l1_coef': 1.3809648905258546e-08, 'patience': 16, 'batch_size': 128, 'lr_factor': 0.5874415016831616, 'num_epochs': 163}. Best is trial 9 with value: 0.739409139752388.
[I 2024-05-23 03:46:10,211] Trial 5 finished with value: 0.7089438429245583 and parameters: {'lea

Chr 7 - Best hyperparameters: {'learning_rate': 0.0015564356529101401, 'l1_coef': 1.0715102665777893e-07, 'patience': 11, 'batch_size': 32, 'lr_factor': 0.26891139915486995, 'num_epochs': 353}
Chr 7 - Best value: 0.6721


  chi_squared = np.sum((contingency_table - expected_counts) ** 2 / expected_counts)
[I 2024-05-23 04:20:21,554] A new study created in RDB with name: unphased_full_23andMe_chr8_study_logistic_regression


Final model saved at: ../../Data/model_results_unphased_all_PRS/logistic_regression/models_unphased/chr7/final_model_chr7.pth
Individual R^2 scores saved at: ../../Data/model_results_unphased_all_PRS/logistic_regression/csv_files/chr7/individual_r2_scores_chr7.csv
Individual IQS scores saved at: ../../Data/model_results_unphased_all_PRS/logistic_regression/csv_files/chr7/individual_iqs_scores_chr7.csv
Performance metrics saved at: ../../Data/model_results_unphased_all_PRS/logistic_regression/csv_files/performance_metrics.csv
Total SNPs:  454
PRS313 SNPs:  21
Total SNPs used for Training:  433


[I 2024-05-23 04:21:22,163] Trial 3 finished with value: 0.969415831565857 and parameters: {'learning_rate': 0.012018709560853219, 'l1_coef': 0.003117091899813757, 'patience': 7, 'batch_size': 128, 'lr_factor': 0.24378659252839388, 'num_epochs': 240}. Best is trial 3 with value: 0.969415831565857.
[I 2024-05-23 04:21:37,619] Trial 6 finished with value: 0.8478156447410583 and parameters: {'learning_rate': 0.01667804012715168, 'l1_coef': 2.369671618699824e-06, 'patience': 19, 'batch_size': 256, 'lr_factor': 0.2760323751345609, 'num_epochs': 307}. Best is trial 6 with value: 0.8478156447410583.
[I 2024-05-23 04:21:47,549] Trial 7 finished with value: 0.7764997541904449 and parameters: {'learning_rate': 0.006827578446725621, 'l1_coef': 1.166749818008427e-10, 'patience': 15, 'batch_size': 256, 'lr_factor': 0.7454650135814935, 'num_epochs': 471}. Best is trial 7 with value: 0.7764997541904449.
[I 2024-05-23 04:21:48,527] Trial 4 finished with value: 0.952445125579834 and parameters: {'learn

Chr 8 - Best hyperparameters: {'learning_rate': 0.0015038688916002082, 'l1_coef': 1.0467031640055659e-07, 'patience': 11, 'batch_size': 32, 'lr_factor': 0.5706429757133247, 'num_epochs': 170}
Chr 8 - Best value: 0.7282


  chi_squared = np.sum((contingency_table - expected_counts) ** 2 / expected_counts)
[I 2024-05-23 04:58:47,556] A new study created in RDB with name: unphased_full_23andMe_chr9_study_logistic_regression


Final model saved at: ../../Data/model_results_unphased_all_PRS/logistic_regression/models_unphased/chr8/final_model_chr8.pth
Individual R^2 scores saved at: ../../Data/model_results_unphased_all_PRS/logistic_regression/csv_files/chr8/individual_r2_scores_chr8.csv
Individual IQS scores saved at: ../../Data/model_results_unphased_all_PRS/logistic_regression/csv_files/chr8/individual_iqs_scores_chr8.csv
Performance metrics saved at: ../../Data/model_results_unphased_all_PRS/logistic_regression/csv_files/performance_metrics.csv
Total SNPs:  404
PRS313 SNPs:  14
Total SNPs used for Training:  390


[I 2024-05-23 05:02:13,774] Trial 4 finished with value: 0.7749934166669845 and parameters: {'learning_rate': 0.00021270511256418585, 'l1_coef': 1.2233759985017966e-05, 'patience': 6, 'batch_size': 128, 'lr_factor': 0.5724853514870011, 'num_epochs': 373}. Best is trial 4 with value: 0.7749934166669845.
[I 2024-05-23 05:03:02,681] Trial 7 finished with value: 0.8270303964614868 and parameters: {'learning_rate': 0.00027753357450268886, 'l1_coef': 0.00026972103227476235, 'patience': 6, 'batch_size': 128, 'lr_factor': 0.8374079049380806, 'num_epochs': 453}. Best is trial 4 with value: 0.7749934166669845.
[I 2024-05-23 05:03:16,766] Trial 1 finished with value: 0.7851683882566599 and parameters: {'learning_rate': 8.224134563426584e-05, 'l1_coef': 1.195948800075714e-07, 'patience': 11, 'batch_size': 32, 'lr_factor': 0.5488202133798293, 'num_epochs': 197}. Best is trial 4 with value: 0.7749934166669845.
[I 2024-05-23 05:03:27,556] Trial 0 finished with value: 0.7338852579777058 and parameters

Chr 9 - Best hyperparameters: {'learning_rate': 0.00047974507044833283, 'l1_coef': 2.169265599662327e-08, 'patience': 17, 'batch_size': 32, 'lr_factor': 0.29558580994695216, 'num_epochs': 495}
Chr 9 - Best value: 0.7072


  chi_squared = np.sum((contingency_table - expected_counts) ** 2 / expected_counts)
[I 2024-05-23 05:28:07,282] A new study created in RDB with name: unphased_full_23andMe_chr10_study_logistic_regression


Final model saved at: ../../Data/model_results_unphased_all_PRS/logistic_regression/models_unphased/chr9/final_model_chr9.pth
Individual R^2 scores saved at: ../../Data/model_results_unphased_all_PRS/logistic_regression/csv_files/chr9/individual_r2_scores_chr9.csv
Individual IQS scores saved at: ../../Data/model_results_unphased_all_PRS/logistic_regression/csv_files/chr9/individual_iqs_scores_chr9.csv
Performance metrics saved at: ../../Data/model_results_unphased_all_PRS/logistic_regression/csv_files/performance_metrics.csv
Total SNPs:  608
PRS313 SNPs:  18
Total SNPs used for Training:  590


[I 2024-05-23 05:30:11,452] Trial 7 finished with value: 2.7592803120613096 and parameters: {'learning_rate': 0.09241951751028042, 'l1_coef': 0.010173613598039984, 'patience': 16, 'batch_size': 256, 'lr_factor': 0.6888787740522878, 'num_epochs': 106}. Best is trial 7 with value: 2.7592803120613096.
[I 2024-05-23 05:30:23,474] Trial 6 finished with value: 1.0573031693696975 and parameters: {'learning_rate': 0.005401511989987147, 'l1_coef': 0.030002285153082616, 'patience': 9, 'batch_size': 128, 'lr_factor': 0.16174068349319218, 'num_epochs': 429}. Best is trial 6 with value: 1.0573031693696975.
[I 2024-05-23 05:30:33,297] Trial 4 finished with value: 8.76876319371737 and parameters: {'learning_rate': 0.0644391813135624, 'l1_coef': 0.0675139528703857, 'patience': 13, 'batch_size': 32, 'lr_factor': 0.6759059960410283, 'num_epochs': 490}. Best is trial 6 with value: 1.0573031693696975.
[I 2024-05-23 05:30:56,236] Trial 10 finished with value: 0.7538234889507294 and parameters: {'learning_r

Chr 10 - Best hyperparameters: {'learning_rate': 0.0014297627276479063, 'l1_coef': 5.751396258572685e-07, 'patience': 16, 'batch_size': 32, 'lr_factor': 0.5495149199689441, 'num_epochs': 391}
Chr 10 - Best value: 0.6794


  chi_squared = np.sum((contingency_table - expected_counts) ** 2 / expected_counts)
[I 2024-05-23 06:05:44,833] A new study created in RDB with name: unphased_full_23andMe_chr11_study_logistic_regression


Final model saved at: ../../Data/model_results_unphased_all_PRS/logistic_regression/models_unphased/chr10/final_model_chr10.pth
Individual R^2 scores saved at: ../../Data/model_results_unphased_all_PRS/logistic_regression/csv_files/chr10/individual_r2_scores_chr10.csv
Individual IQS scores saved at: ../../Data/model_results_unphased_all_PRS/logistic_regression/csv_files/chr10/individual_iqs_scores_chr10.csv
Performance metrics saved at: ../../Data/model_results_unphased_all_PRS/logistic_regression/csv_files/performance_metrics.csv
Total SNPs:  919
PRS313 SNPs:  19
Total SNPs used for Training:  900


[I 2024-05-23 06:06:32,489] Trial 2 finished with value: 0.9868497763361249 and parameters: {'learning_rate': 0.04212134888797371, 'l1_coef': 3.9274155203967995e-05, 'patience': 5, 'batch_size': 64, 'lr_factor': 0.8082387699370034, 'num_epochs': 316}. Best is trial 2 with value: 0.9868497763361249.
[I 2024-05-23 06:07:41,391] Trial 8 finished with value: 0.730195426940918 and parameters: {'learning_rate': 0.005429112649009058, 'l1_coef': 1.6177027169406028e-07, 'patience': 17, 'batch_size': 256, 'lr_factor': 0.614085616813263, 'num_epochs': 195}. Best is trial 8 with value: 0.730195426940918.
[I 2024-05-23 06:07:53,463] Trial 3 finished with value: 0.6941146135330201 and parameters: {'learning_rate': 0.0013217720859759767, 'l1_coef': 1.180256392364302e-07, 'patience': 11, 'batch_size': 128, 'lr_factor': 0.2534027365830232, 'num_epochs': 184}. Best is trial 3 with value: 0.6941146135330201.
[I 2024-05-23 06:08:01,709] Trial 0 finished with value: 0.7457976043224335 and parameters: {'lea

Chr 11 - Best hyperparameters: {'learning_rate': 0.0002962230706837187, 'l1_coef': 3.075206801283935e-09, 'patience': 11, 'batch_size': 32, 'lr_factor': 0.39523935573010427, 'num_epochs': 489}
Chr 11 - Best value: 0.6390


  chi_squared = np.sum((contingency_table - expected_counts) ** 2 / expected_counts)
[I 2024-05-23 06:42:20,116] A new study created in RDB with name: unphased_full_23andMe_chr12_study_logistic_regression


Final model saved at: ../../Data/model_results_unphased_all_PRS/logistic_regression/models_unphased/chr11/final_model_chr11.pth
Individual R^2 scores saved at: ../../Data/model_results_unphased_all_PRS/logistic_regression/csv_files/chr11/individual_r2_scores_chr11.csv
Individual IQS scores saved at: ../../Data/model_results_unphased_all_PRS/logistic_regression/csv_files/chr11/individual_iqs_scores_chr11.csv
Performance metrics saved at: ../../Data/model_results_unphased_all_PRS/logistic_regression/csv_files/performance_metrics.csv
Total SNPs:  641
PRS313 SNPs:  17
Total SNPs used for Training:  624


[I 2024-05-23 06:43:21,562] Trial 1 finished with value: 0.8408280730247497 and parameters: {'learning_rate': 0.022623939934942532, 'l1_coef': 1.7020997140797295e-09, 'patience': 18, 'batch_size': 256, 'lr_factor': 0.7614753515256472, 'num_epochs': 388}. Best is trial 1 with value: 0.8408280730247497.
[I 2024-05-23 06:43:37,818] Trial 9 finished with value: 0.8957166612148285 and parameters: {'learning_rate': 0.025136787150366068, 'l1_coef': 0.0010696011678808964, 'patience': 12, 'batch_size': 256, 'lr_factor': 0.772329093850791, 'num_epochs': 308}. Best is trial 1 with value: 0.8408280730247497.
[I 2024-05-23 06:44:11,364] Trial 2 finished with value: 0.8070221781730652 and parameters: {'learning_rate': 0.011305424631940576, 'l1_coef': 2.715060730658116e-10, 'patience': 17, 'batch_size': 64, 'lr_factor': 0.6108290958293232, 'num_epochs': 356}. Best is trial 2 with value: 0.8070221781730652.
[I 2024-05-23 06:44:17,094] Trial 10 finished with value: 0.7559893369674683 and parameters: {'

Chr 12 - Best hyperparameters: {'learning_rate': 0.0006155078074723796, 'l1_coef': 3.8774472895354083e-07, 'patience': 7, 'batch_size': 64, 'lr_factor': 0.7007255392256676, 'num_epochs': 268}
Chr 12 - Best value: 0.7075


  chi_squared = np.sum((contingency_table - expected_counts) ** 2 / expected_counts)
[I 2024-05-23 07:16:41,195] A new study created in RDB with name: unphased_full_23andMe_chr13_study_logistic_regression


Final model saved at: ../../Data/model_results_unphased_all_PRS/logistic_regression/models_unphased/chr12/final_model_chr12.pth
Individual R^2 scores saved at: ../../Data/model_results_unphased_all_PRS/logistic_regression/csv_files/chr12/individual_r2_scores_chr12.csv
Individual IQS scores saved at: ../../Data/model_results_unphased_all_PRS/logistic_regression/csv_files/chr12/individual_iqs_scores_chr12.csv
Performance metrics saved at: ../../Data/model_results_unphased_all_PRS/logistic_regression/csv_files/performance_metrics.csv
Total SNPs:  112
PRS313 SNPs:  5
Total SNPs used for Training:  107


[I 2024-05-23 07:17:59,634] Trial 9 finished with value: 0.8241600375909073 and parameters: {'learning_rate': 0.02913786817115034, 'l1_coef': 0.014049887133628135, 'patience': 6, 'batch_size': 32, 'lr_factor': 0.8461631236005889, 'num_epochs': 454}. Best is trial 9 with value: 0.8241600375909073.
[I 2024-05-23 07:18:03,145] Trial 1 finished with value: 0.6552356064319611 and parameters: {'learning_rate': 0.004877840654457323, 'l1_coef': 2.975729152584155e-05, 'patience': 8, 'batch_size': 128, 'lr_factor': 0.7870862888839673, 'num_epochs': 363}. Best is trial 1 with value: 0.6552356064319611.
[I 2024-05-23 07:18:04,558] Trial 0 finished with value: 0.6513668230601719 and parameters: {'learning_rate': 0.0038384317177056755, 'l1_coef': 4.066016467461078e-10, 'patience': 7, 'batch_size': 64, 'lr_factor': 0.1931670550618998, 'num_epochs': 495}. Best is trial 0 with value: 0.6513668230601719.
[I 2024-05-23 07:18:42,441] Trial 6 finished with value: 0.6522824672552255 and parameters: {'learni

Chr 13 - Best hyperparameters: {'learning_rate': 0.02452264812309542, 'l1_coef': 1.6825247098958544e-07, 'patience': 18, 'batch_size': 64, 'lr_factor': 0.6085456184030417, 'num_epochs': 453}
Chr 13 - Best value: 0.6425


  chi_squared = np.sum((contingency_table - expected_counts) ** 2 / expected_counts)
[I 2024-05-23 07:54:59,293] A new study created in RDB with name: unphased_full_23andMe_chr14_study_logistic_regression


Final model saved at: ../../Data/model_results_unphased_all_PRS/logistic_regression/models_unphased/chr13/final_model_chr13.pth
Individual R^2 scores saved at: ../../Data/model_results_unphased_all_PRS/logistic_regression/csv_files/chr13/individual_r2_scores_chr13.csv
Individual IQS scores saved at: ../../Data/model_results_unphased_all_PRS/logistic_regression/csv_files/chr13/individual_iqs_scores_chr13.csv
Performance metrics saved at: ../../Data/model_results_unphased_all_PRS/logistic_regression/csv_files/performance_metrics.csv
Total SNPs:  164
PRS313 SNPs:  8
Total SNPs used for Training:  156


[I 2024-05-23 07:56:14,672] Trial 0 finished with value: 0.7962499499320984 and parameters: {'learning_rate': 0.001529138659454937, 'l1_coef': 6.604992067547774e-05, 'patience': 15, 'batch_size': 256, 'lr_factor': 0.2180695235894331, 'num_epochs': 106}. Best is trial 0 with value: 0.7962499499320984.
[I 2024-05-23 07:56:29,530] Trial 1 finished with value: 0.7576070725917816 and parameters: {'learning_rate': 0.005564733034604567, 'l1_coef': 3.3056141780205077e-09, 'patience': 19, 'batch_size': 256, 'lr_factor': 0.5398223844700974, 'num_epochs': 125}. Best is trial 1 with value: 0.7576070725917816.
[I 2024-05-23 07:57:23,515] Trial 9 finished with value: 0.765370374917984 and parameters: {'learning_rate': 0.0014000061217567356, 'l1_coef': 1.3875062281694439e-08, 'patience': 11, 'batch_size': 256, 'lr_factor': 0.29036852712560235, 'num_epochs': 381}. Best is trial 1 with value: 0.7576070725917816.
[I 2024-05-23 07:57:54,341] Trial 6 finished with value: 0.7572415278508113 and parameters:

Chr 14 - Best hyperparameters: {'learning_rate': 0.013418255778350308, 'l1_coef': 2.2264136461525066e-05, 'patience': 18, 'batch_size': 32, 'lr_factor': 0.38397927743701044, 'num_epochs': 435}
Chr 14 - Best value: 0.7535


  chi_squared = np.sum((contingency_table - expected_counts) ** 2 / expected_counts)
[I 2024-05-23 08:19:10,002] A new study created in RDB with name: unphased_full_23andMe_chr15_study_logistic_regression


Final model saved at: ../../Data/model_results_unphased_all_PRS/logistic_regression/models_unphased/chr14/final_model_chr14.pth
Individual R^2 scores saved at: ../../Data/model_results_unphased_all_PRS/logistic_regression/csv_files/chr14/individual_r2_scores_chr14.csv
Individual IQS scores saved at: ../../Data/model_results_unphased_all_PRS/logistic_regression/csv_files/chr14/individual_iqs_scores_chr14.csv
Performance metrics saved at: ../../Data/model_results_unphased_all_PRS/logistic_regression/csv_files/performance_metrics.csv
Total SNPs:  200
PRS313 SNPs:  7
Total SNPs used for Training:  193


[I 2024-05-23 08:19:56,021] Trial 9 finished with value: 0.7004202893802097 and parameters: {'learning_rate': 0.015674177707208754, 'l1_coef': 9.372450031975693e-10, 'patience': 7, 'batch_size': 64, 'lr_factor': 0.4177496754239264, 'num_epochs': 231}. Best is trial 9 with value: 0.7004202893802097.
[I 2024-05-23 08:21:22,315] Trial 8 finished with value: 0.7491939097642899 and parameters: {'learning_rate': 0.0001672836925532961, 'l1_coef': 1.9964233179089297e-07, 'patience': 8, 'batch_size': 128, 'lr_factor': 0.25447372161156356, 'num_epochs': 216}. Best is trial 9 with value: 0.7004202893802097.
[I 2024-05-23 08:21:31,284] Trial 7 finished with value: 0.9459374053137642 and parameters: {'learning_rate': 0.07819935290297857, 'l1_coef': 0.006483446741401309, 'patience': 18, 'batch_size': 64, 'lr_factor': 0.38229536993411084, 'num_epochs': 235}. Best is trial 9 with value: 0.7004202893802097.
[I 2024-05-23 08:21:38,467] Trial 6 finished with value: 0.7884516775608063 and parameters: {'le

Chr 15 - Best hyperparameters: {'learning_rate': 0.009888091592593224, 'l1_coef': 7.464983038107919e-07, 'patience': 20, 'batch_size': 256, 'lr_factor': 0.6547700087831452, 'num_epochs': 299}
Chr 15 - Best value: 0.6778


  chi_squared = np.sum((contingency_table - expected_counts) ** 2 / expected_counts)
[I 2024-05-23 08:59:06,769] A new study created in RDB with name: unphased_full_23andMe_chr16_study_logistic_regression


Final model saved at: ../../Data/model_results_unphased_all_PRS/logistic_regression/models_unphased/chr15/final_model_chr15.pth
Individual R^2 scores saved at: ../../Data/model_results_unphased_all_PRS/logistic_regression/csv_files/chr15/individual_r2_scores_chr15.csv
Individual IQS scores saved at: ../../Data/model_results_unphased_all_PRS/logistic_regression/csv_files/chr15/individual_iqs_scores_chr15.csv
Performance metrics saved at: ../../Data/model_results_unphased_all_PRS/logistic_regression/csv_files/performance_metrics.csv
Total SNPs:  361
PRS313 SNPs:  14
Total SNPs used for Training:  347


[I 2024-05-23 09:00:59,786] Trial 0 finished with value: 0.69707270860672 and parameters: {'learning_rate': 0.0018920934899933782, 'l1_coef': 2.9108171228366007e-06, 'patience': 7, 'batch_size': 256, 'lr_factor': 0.19056401661879177, 'num_epochs': 233}. Best is trial 0 with value: 0.69707270860672.
[I 2024-05-23 09:01:16,600] Trial 7 finished with value: 0.6740764945745468 and parameters: {'learning_rate': 0.0019651437478597554, 'l1_coef': 2.406786095016464e-08, 'patience': 12, 'batch_size': 128, 'lr_factor': 0.6556104120779929, 'num_epochs': 193}. Best is trial 7 with value: 0.6740764945745468.
[I 2024-05-23 09:02:29,388] Trial 11 finished with value: 0.8744076933179581 and parameters: {'learning_rate': 0.05368696544032561, 'l1_coef': 3.6899636070861323e-09, 'patience': 11, 'batch_size': 64, 'lr_factor': 0.11709738612968518, 'num_epochs': 219}. Best is trial 7 with value: 0.6740764945745468.
[I 2024-05-23 09:02:46,850] Trial 2 finished with value: 0.6817042396618771 and parameters: {'

Chr 16 - Best hyperparameters: {'learning_rate': 0.0020175034839549004, 'l1_coef': 1.649422460381251e-07, 'patience': 19, 'batch_size': 64, 'lr_factor': 0.7240335076638379, 'num_epochs': 248}
Chr 16 - Best value: 0.6604


  chi_squared = np.sum((contingency_table - expected_counts) ** 2 / expected_counts)
[I 2024-05-23 09:28:05,729] A new study created in RDB with name: unphased_full_23andMe_chr17_study_logistic_regression


Final model saved at: ../../Data/model_results_unphased_all_PRS/logistic_regression/models_unphased/chr16/final_model_chr16.pth
Individual R^2 scores saved at: ../../Data/model_results_unphased_all_PRS/logistic_regression/csv_files/chr16/individual_r2_scores_chr16.csv
Individual IQS scores saved at: ../../Data/model_results_unphased_all_PRS/logistic_regression/csv_files/chr16/individual_iqs_scores_chr16.csv
Performance metrics saved at: ../../Data/model_results_unphased_all_PRS/logistic_regression/csv_files/performance_metrics.csv
Total SNPs:  188
PRS313 SNPs:  9
Total SNPs used for Training:  179


[I 2024-05-23 09:29:05,925] Trial 1 finished with value: 0.8422214090824127 and parameters: {'learning_rate': 0.05662851731080405, 'l1_coef': 0.0071650759766779286, 'patience': 8, 'batch_size': 128, 'lr_factor': 0.3947102172561444, 'num_epochs': 411}. Best is trial 1 with value: 0.8422214090824127.
[I 2024-05-23 09:29:56,150] Trial 6 finished with value: 0.6812757061077999 and parameters: {'learning_rate': 0.010181808270071877, 'l1_coef': 1.9861497695612812e-05, 'patience': 18, 'batch_size': 32, 'lr_factor': 0.8029672443423692, 'num_epochs': 396}. Best is trial 6 with value: 0.6812757061077999.
[I 2024-05-23 09:30:50,557] Trial 0 finished with value: 0.6873753160238266 and parameters: {'learning_rate': 0.00018693240812657676, 'l1_coef': 1.3847744667082167e-06, 'patience': 10, 'batch_size': 128, 'lr_factor': 0.5221406960042503, 'num_epochs': 268}. Best is trial 6 with value: 0.6812757061077999.
[I 2024-05-23 09:31:10,572] Trial 3 finished with value: 0.6768181204795838 and parameters: {

Chr 17 - Best hyperparameters: {'learning_rate': 0.0032661505291700322, 'l1_coef': 7.96191891885754e-09, 'patience': 13, 'batch_size': 128, 'lr_factor': 0.8078850819388251, 'num_epochs': 218}
Chr 17 - Best value: 0.6711
Final model saved at: ../../Data/model_results_unphased_all_PRS/logistic_regression/models_unphased/chr17/final_model_chr17.pth
Individual R^2 scores saved at: ../../Data/model_results_unphased_all_PRS/logistic_regression/csv_files/chr17/individual_r2_scores_chr17.csv
Individual IQS scores saved at: ../../Data/model_results_unphased_all_PRS/logistic_regression/csv_files/chr17/individual_iqs_scores_chr17.csv
Performance metrics saved at: ../../Data/model_results_unphased_all_PRS/logistic_regression/csv_files/performance_metrics.csv
Total SNPs:  277
PRS313 SNPs:  9
Total SNPs used for Training:  268


  chi_squared = np.sum((contingency_table - expected_counts) ** 2 / expected_counts)
[I 2024-05-23 09:54:12,563] A new study created in RDB with name: unphased_full_23andMe_chr18_study_logistic_regression
[I 2024-05-23 09:55:16,679] Trial 8 finished with value: 0.7049856305122375 and parameters: {'learning_rate': 0.0034400693195760956, 'l1_coef': 2.395671659507723e-06, 'patience': 5, 'batch_size': 256, 'lr_factor': 0.29884066012745725, 'num_epochs': 164}. Best is trial 8 with value: 0.7049856305122375.
[I 2024-05-23 09:55:21,420] Trial 9 finished with value: 0.7008734434843064 and parameters: {'learning_rate': 0.012841773534334038, 'l1_coef': 2.2181286380772192e-06, 'patience': 13, 'batch_size': 128, 'lr_factor': 0.24594562671118833, 'num_epochs': 169}. Best is trial 9 with value: 0.7008734434843064.
[I 2024-05-23 09:55:41,407] Trial 1 finished with value: 0.8334831297397614 and parameters: {'learning_rate': 0.027388054586999284, 'l1_coef': 4.413596227351853e-10, 'patience': 18, 'batch

Chr 18 - Best hyperparameters: {'learning_rate': 0.003265608490544992, 'l1_coef': 8.369409692394013e-09, 'patience': 17, 'batch_size': 32, 'lr_factor': 0.7976113624125284, 'num_epochs': 427}
Chr 18 - Best value: 0.6563
Final model saved at: ../../Data/model_results_unphased_all_PRS/logistic_regression/models_unphased/chr18/final_model_chr18.pth
Individual R^2 scores saved at: ../../Data/model_results_unphased_all_PRS/logistic_regression/csv_files/chr18/individual_r2_scores_chr18.csv
Individual IQS scores saved at: ../../Data/model_results_unphased_all_PRS/logistic_regression/csv_files/chr18/individual_iqs_scores_chr18.csv
Performance metrics saved at: ../../Data/model_results_unphased_all_PRS/logistic_regression/csv_files/performance_metrics.csv
Total SNPs:  288
PRS313 SNPs:  7
Total SNPs used for Training:  281


  chi_squared = np.sum((contingency_table - expected_counts) ** 2 / expected_counts)
[I 2024-05-23 10:30:50,307] A new study created in RDB with name: unphased_full_23andMe_chr19_study_logistic_regression
[I 2024-05-23 10:31:29,325] Trial 4 finished with value: 0.7989994525909424 and parameters: {'learning_rate': 0.02013670021681599, 'l1_coef': 3.172563198433545e-08, 'patience': 8, 'batch_size': 256, 'lr_factor': 0.7155551029031453, 'num_epochs': 338}. Best is trial 4 with value: 0.7989994525909424.
[I 2024-05-23 10:31:47,856] Trial 2 finished with value: 0.9927492550441197 and parameters: {'learning_rate': 0.0453372432982793, 'l1_coef': 2.7933911540607385e-10, 'patience': 12, 'batch_size': 64, 'lr_factor': 0.22644295340642753, 'num_epochs': 324}. Best is trial 4 with value: 0.7989994525909424.
[I 2024-05-23 10:32:17,419] Trial 3 finished with value: 0.9738792487553187 and parameters: {'learning_rate': 0.00358814636719523, 'l1_coef': 0.013658386387999533, 'patience': 7, 'batch_size': 6

Chr 19 - Best hyperparameters: {'learning_rate': 0.0005699600577620382, 'l1_coef': 2.1247931949559007e-08, 'patience': 9, 'batch_size': 32, 'lr_factor': 0.7547222609319668, 'num_epochs': 339}
Chr 19 - Best value: 0.6213


  chi_squared = np.sum((contingency_table - expected_counts) ** 2 / expected_counts)
[I 2024-05-23 11:11:59,756] A new study created in RDB with name: unphased_full_23andMe_chr20_study_logistic_regression


Final model saved at: ../../Data/model_results_unphased_all_PRS/logistic_regression/models_unphased/chr19/final_model_chr19.pth
Individual R^2 scores saved at: ../../Data/model_results_unphased_all_PRS/logistic_regression/csv_files/chr19/individual_r2_scores_chr19.csv
Individual IQS scores saved at: ../../Data/model_results_unphased_all_PRS/logistic_regression/csv_files/chr19/individual_iqs_scores_chr19.csv
Performance metrics saved at: ../../Data/model_results_unphased_all_PRS/logistic_regression/csv_files/performance_metrics.csv
Total SNPs:  62
PRS313 SNPs:  4
Total SNPs used for Training:  58


[I 2024-05-23 11:14:40,395] Trial 8 finished with value: 0.6391972184181214 and parameters: {'learning_rate': 0.011987920813509946, 'l1_coef': 2.3213538860849817e-06, 'patience': 15, 'batch_size': 256, 'lr_factor': 0.3687000334218582, 'num_epochs': 457}. Best is trial 8 with value: 0.6391972184181214.
[I 2024-05-23 11:14:56,507] Trial 1 finished with value: 0.7809470415115356 and parameters: {'learning_rate': 0.061876952241310845, 'l1_coef': 0.0606669214515346, 'patience': 11, 'batch_size': 128, 'lr_factor': 0.18851297502125358, 'num_epochs': 214}. Best is trial 8 with value: 0.6391972184181214.
[I 2024-05-23 11:16:41,849] Trial 9 finished with value: 0.705214011669159 and parameters: {'learning_rate': 7.521276198884805e-05, 'l1_coef': 4.907221442378259e-10, 'patience': 6, 'batch_size': 128, 'lr_factor': 0.4064474947532004, 'num_epochs': 183}. Best is trial 8 with value: 0.6391972184181214.
[I 2024-05-23 11:16:58,044] Trial 10 finished with value: 0.6395760398644667 and parameters: {'l

Chr 20 - Best hyperparameters: {'learning_rate': 0.009741141887525059, 'l1_coef': 5.086408968461716e-08, 'patience': 19, 'batch_size': 32, 'lr_factor': 0.2989080662232163, 'num_epochs': 470}
Chr 20 - Best value: 0.6340
Final model saved at: ../../Data/model_results_unphased_all_PRS/logistic_regression/models_unphased/chr20/final_model_chr20.pth
Individual R^2 scores saved at: ../../Data/model_results_unphased_all_PRS/logistic_regression/csv_files/chr20/individual_r2_scores_chr20.csv
Individual IQS scores saved at: ../../Data/model_results_unphased_all_PRS/logistic_regression/csv_files/chr20/individual_iqs_scores_chr20.csv
Performance metrics saved at: ../../Data/model_results_unphased_all_PRS/logistic_regression/csv_files/performance_metrics.csv
Total SNPs:  44
PRS313 SNPs:  4
Total SNPs used for Training:  40


  chi_squared = np.sum((contingency_table - expected_counts) ** 2 / expected_counts)
[I 2024-05-23 11:47:18,628] A new study created in RDB with name: unphased_full_23andMe_chr21_study_logistic_regression
[I 2024-05-23 11:48:18,055] Trial 6 finished with value: 0.7160479247570037 and parameters: {'learning_rate': 0.045724692439618665, 'l1_coef': 7.881446682339455e-09, 'patience': 15, 'batch_size': 256, 'lr_factor': 0.30953500093983594, 'num_epochs': 132}. Best is trial 6 with value: 0.7160479247570037.
[I 2024-05-23 11:49:27,296] Trial 5 finished with value: 0.7142757599170391 and parameters: {'learning_rate': 0.02669869306219022, 'l1_coef': 8.604820920110499e-08, 'patience': 16, 'batch_size': 32, 'lr_factor': 0.645036029775653, 'num_epochs': 262}. Best is trial 5 with value: 0.7142757599170391.
[I 2024-05-23 11:52:00,179] Trial 3 finished with value: 0.7493791733469282 and parameters: {'learning_rate': 0.0026946779906731424, 'l1_coef': 0.00523662900845539, 'patience': 10, 'batch_size'

Chr 21 - Best hyperparameters: {'learning_rate': 0.06417463659123403, 'l1_coef': 1.4994401968392755e-08, 'patience': 20, 'batch_size': 64, 'lr_factor': 0.8953661415766634, 'num_epochs': 117}
Chr 21 - Best value: 0.6684
Final model saved at: ../../Data/model_results_unphased_all_PRS/logistic_regression/models_unphased/chr21/final_model_chr21.pth
Individual R^2 scores saved at: ../../Data/model_results_unphased_all_PRS/logistic_regression/csv_files/chr21/individual_r2_scores_chr21.csv
Individual IQS scores saved at: ../../Data/model_results_unphased_all_PRS/logistic_regression/csv_files/chr21/individual_iqs_scores_chr21.csv
Performance metrics saved at: ../../Data/model_results_unphased_all_PRS/logistic_regression/csv_files/performance_metrics.csv
Total SNPs:  443
PRS313 SNPs:  11
Total SNPs used for Training:  432


  chi_squared = np.sum((contingency_table - expected_counts) ** 2 / expected_counts)
  chi_squared = np.sum((contingency_table - expected_counts) ** 2 / expected_counts)
[I 2024-05-23 12:21:01,776] Trial 22 finished with value: 0.9262316722136278 and parameters: {'learning_rate': 0.05079781889732127, 'l1_coef': 0.006007062333265984, 'patience': 10, 'batch_size': 32, 'lr_factor': 0.1757661398292724, 'num_epochs': 306}. Best is trial 3 with value: 0.7342786401510238.
[I 2024-05-23 12:21:10,050] Trial 23 finished with value: 0.696631726196834 and parameters: {'learning_rate': 0.0004549303724999769, 'l1_coef': 3.038022393227062e-06, 'patience': 9, 'batch_size': 64, 'lr_factor': 0.6520813926376375, 'num_epochs': 102}. Best is trial 23 with value: 0.696631726196834.
[I 2024-05-23 12:21:29,468] Trial 21 finished with value: 0.7450952172279358 and parameters: {'learning_rate': 0.039819600609845504, 'l1_coef': 3.9580278352835204e-05, 'patience': 13, 'batch_size': 256, 'lr_factor': 0.17897184287

Chr 22 - Best hyperparameters: {'learning_rate': 0.0008842867535788688, 'l1_coef': 3.953662605180574e-10, 'patience': 15, 'batch_size': 32, 'lr_factor': 0.7809338408689392, 'num_epochs': 364}
Chr 22 - Best value: 0.6481
Final model saved at: ../../Data/model_results_unphased_all_PRS/logistic_regression/models_unphased/chr22/final_model_chr22.pth
Individual R^2 scores saved at: ../../Data/model_results_unphased_all_PRS/logistic_regression/csv_files/chr22/individual_r2_scores_chr22.csv
Individual IQS scores saved at: ../../Data/model_results_unphased_all_PRS/logistic_regression/csv_files/chr22/individual_iqs_scores_chr22.csv
Performance metrics saved at: ../../Data/model_results_unphased_all_PRS/logistic_regression/csv_files/performance_metrics.csv


  chi_squared = np.sum((contingency_table - expected_counts) ** 2 / expected_counts)


In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score as sklearn_r2_score, accuracy_score
import os
import csv

# Load and preprocess the data
data_directory = '../../Data/Filtered_unphased_training_data_union_final/'
start = 1

# Initialize lists to store the performance metrics for each chromosome
r2_scores = []
iqs_scores = []
accuracy_scores = []

# Folders for saved models and CSV files
output_folder = "../../Data/model_results_unphased_all_PRS/logistic_regression/"
model_folder = output_folder + "models_unphased/"
csv_folder = output_folder + "csv_files/"

def calculate_iqs_unphased(y_true, y_pred):
    # Dummy implementation of IQS calculation, replace with the actual function
    return np.mean(np.abs(y_true - y_pred))

for chromosome_number in range(start, 23):
    # Paths for the current chromosome
    chr_model_folder = model_folder + f"chr{chromosome_number}/"
    chr_csv_folder = csv_folder + f"chr{chromosome_number}/"

    file_name = data_directory + f"23AndMe_PRS313_merged_chr{chromosome_number}_matching_combined.parquet"
    data = pd.read_parquet(file_name)

    # Split the data into features and target
    X = torch.tensor(data.filter(regex='^(?!.*PRS313_)').values, dtype=torch.float32)
    y = torch.tensor(data.filter(regex='PRS313_').values, dtype=torch.long)

    print("Total SNPs: ", data.shape[1])
    print("PRS313 SNPs: ", y.shape[1])
    print("Total SNPs used for Training: ", X.shape[1])

    # Split the data into train-validation and test sets
    X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Define the multinomial logistic regression model with L1 regularization
    class MulticlassLogisticRegression(nn.Module):
        def __init__(self, input_dim, output_dim, num_classes=3, l1_coef=0.0):
            super(MulticlassLogisticRegression, self).__init__()
            self.num_classes = num_classes
            self.linear = nn.ModuleList([nn.Linear(input_dim, output_dim) for _ in range(num_classes)])
            self.l1_coef = l1_coef

        def forward(self, x):
            out = torch.stack([linear(x) for linear in self.linear], dim=-1)
            out = nn.functional.softmax(out, dim=-1)
            return out

        def l1_loss(self):
            return self.l1_coef * sum(torch.norm(linear.weight, p=1) for linear in self.linear)

    # Set the device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Load the final model
    model = MulticlassLogisticRegression(X_train_val.shape[1], y_train_val.shape[1])
    model_save_path = chr_model_folder + f'final_model_chr{chromosome_number}.pth'
    model.load_state_dict(torch.load(model_save_path, map_location=device))
    model.to(device)
    model.eval()

    # Evaluate the final model on the test set
    with torch.no_grad():
        test_outputs = model(X_test.to(device))
        test_outputs = test_outputs.argmax(dim=-1)
        test_r2 = sklearn_r2_score(y_test.cpu().numpy(), test_outputs.cpu().numpy())
        test_iqs = calculate_iqs_unphased(y_test.cpu().numpy(), test_outputs.cpu().numpy())
        test_accuracy = ((y_test == test_outputs).sum() / (test_outputs.shape[0] * test_outputs.shape[1])).numpy()

        # Append performance metrics to the lists
        r2_scores.append(test_r2)
        iqs_scores.append(test_iqs)
        accuracy_scores.append(test_accuracy)

        # Calculate individual R^2 scores for each SNP
        individual_r2_scores = sklearn_r2_score(y_test.cpu().numpy(), test_outputs.cpu().numpy(), multioutput='raw_values')

        # Calculate individual IQS scores for each SNP
        individual_iqs_scores = np.array([calculate_iqs_unphased(y_test.cpu().numpy()[:, i].reshape(-1, 1), test_outputs.cpu().numpy()[:, i].reshape(-1, 1)) for i in range(y_test.shape[1])])

        # Get the names of the SNPs from the original dataframe
        snp_names = data.filter(regex='PRS').columns

        # Save individual R^2 scores to a CSV file
        csv_file = chr_csv_folder + f'individual_r2_scores_chr{chromosome_number}.csv'

        with open(csv_file, 'w', newline='') as file:
            writer = csv.writer(file)
            writer.writerow(['SNP', 'R2 Score'])
            for snp, r2_score in zip(snp_names, individual_r2_scores):
                writer.writerow([snp, r2_score])

        print(f"Individual R^2 scores saved at: {csv_file}")

        # Save individual IQS scores to a CSV file
        iqs_csv_file = chr_csv_folder + f'individual_iqs_scores_chr{chromosome_number}.csv'

        with open(iqs_csv_file, 'w', newline='') as file:
            writer = csv.writer(file)
            writer.writerow(['SNP', 'IQS Score'])
            for snp, iqs_score in zip(snp_names, individual_iqs_scores):
                writer.writerow([snp, iqs_score])

        print(f"Individual IQS scores saved at: {iqs_csv_file}")

# Create a DataFrame to store the performance metrics for each chromosome
performance_df = pd.DataFrame({
    'Chromosome': list(range(start, 23)),
    'R2 Score': r2_scores,
    'IQS Score': iqs_scores,
    'Accuracy Score': accuracy_scores
})

# Save the performance metrics to a CSV file
performance_csv_file = csv_folder + 'performance_metrics.csv'
performance_df.to_csv(performance_csv_file, index=False)
print(f"Performance metrics saved at: {performance_csv_file}")


In [25]:
# Loop through all the training datasets and document the PRS313 SNPs in each dataset. Save this to a CSV file.

import pandas as pd
import os

data_directory = '../../Data/Filtered_unphased_training_data_union_final/'
output_folder = "../../Data/model_results_unphased_all_PRS/logistic_regression/csv_files/"

# Create the output folder if it doesn't exist
os.makedirs(output_folder, exist_ok=True)

# Initialize a list to store the PRS313 SNPs in each dataset
prs313_snps = []

for chromosome_number in range(1, 23):
    file_name = data_directory + \
        f"23AndMe_PRS313_merged_chr{chromosome_number}_matching_combined.parquet"
    data = pd.read_parquet(file_name)

    prs313_snps.append(data.filter(regex='PRS313_').columns)

# Create a DataFrame to store the PRS313 SNPs in each dataset
prs313_df = pd.DataFrame({
    'Chromosome': list(range(1, 23)),
    'PRS313 SNPs': prs313_snps,
    "Number of PRS313 SNPs": [len(snps) for snps in prs313_snps]
})

# Save the PRS313 SNPs to a CSV file
prs313_csv_file = output_folder + 'prs313_snps.csv'
prs313_df.to_csv(prs313_csv_file, index=False)
print(f"PRS313 SNPs saved at: {prs313_csv_file}")

# Print the total number of PRS313 SNPs in all datasets
total_prs313_snps = sum(prs313_df["Number of PRS313 SNPs"])
print(f"Total number of PRS313 SNPs: {total_prs313_snps}")



PRS313 SNPs saved at: ../../Data/model_results_unphased_all_PRS/logistic_regression/csv_files/prs313_snps.csv
Total number of PRS313 SNPs: 313
