# Logistic Regression Model Unphased

In [2]:
import numpy as np

def calculate_iqs_unphased(true_genotypes, imputed_genotypes):
    """
    Calculate the Imputation Quality Score (IQS) for ununphased genotypes.

    Args:
        true_genotypes (numpy.ndarray): 2D array of true genotypes, where each genotype is represented by values 0, 1, or 2.
        imputed_genotypes (numpy.ndarray): 2D array of imputed genotypes, where each genotype is represented by floating-point values between 0 and 2.
        threshold (float): Threshold for converting imputed probabilities to discrete genotypes (default: 0.5).

    Returns:
        float: Imputation Quality Score (IQS).
    """
    # Check if the shapes of true and imputed genotypes are the same
    if true_genotypes.shape != imputed_genotypes.shape:
        raise ValueError("Shape of true genotypes and imputed genotypes must be the same.")

    # Convert imputed probabilities to discrete genotypes based on the threshold
    imputed_discrete = np.round(imputed_genotypes).astype(int)

    # Create a contingency table
    contingency_table = np.zeros((3, 3), dtype=int)

    # Fill the contingency table
    for true_geno, imputed_geno in zip(true_genotypes, imputed_discrete):
        for true_allele, imputed_allele in zip(true_geno, imputed_geno):
            contingency_table[int(true_allele), int(imputed_allele)] += 1

    # Calculate the total number of alleles
    total_alleles = np.sum(contingency_table)

    # Calculate the observed agreement (Po)
    po = np.sum(np.diag(contingency_table)) / total_alleles

    # Calculate the expected agreement by chance (Pc)
    true_counts = np.sum(contingency_table, axis=1)
    imputed_counts = np.sum(contingency_table, axis=0)
    pc = np.sum(true_counts * imputed_counts) / (total_alleles ** 2)

    # Calculate the Imputation Quality Score (IQS)
    iqs = (po - pc) / (1 - pc)

    return iqs

In [4]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, roc_curve
from sklearn.metrics import r2_score as sklearn_r2_score
import optuna
from matplotlib import pyplot as plt
import os
import csv

# Load and preprocess the data
data_directory = '../../Data/Filtered_unphased_training_data/'
start = 1

# Initialize lists to store the performance metrics for each chromosome
accuracies = []
precisions = []
recalls = []
false_positive_rates = []
auc_rocs = []
r2_scores = []
iqs_scores = []

# Create folders for saving files
output_folder = "../../Data/model_results_unphased/logistic_regression/"
model_folder = output_folder + "models_unphased/"
csv_folder = output_folder + "csv_files/"
curve_folder = output_folder + "roc_curves/"

os.makedirs(model_folder, exist_ok=True)
os.makedirs(csv_folder, exist_ok=True)
os.makedirs(curve_folder, exist_ok=True)

for chromosome_number in range(start, 23):
    # Create subfolders for the current chromosome
    chr_model_folder = model_folder + f"chr{chromosome_number}/"
    chr_csv_folder = csv_folder + f"chr{chromosome_number}/"
    chr_curve_folder = curve_folder + f"chr{chromosome_number}/"

    os.makedirs(chr_model_folder, exist_ok=True)
    os.makedirs(chr_csv_folder, exist_ok=True)
    os.makedirs(chr_curve_folder, exist_ok=True)

    file_name = data_directory + f"23AndMe_PRS313_merged_chr{chromosome_number}_matching_combined.parquet"
    data = pd.read_parquet(file_name)


    # Split the data into features and target
    X = torch.tensor(data.filter(regex='^(?!.*Unknown)').values, dtype=torch.float32)
    y = torch.tensor(data.filter(regex='Unknown').values, dtype=torch.float32)

    print("Unknown PRS313 SNPs: ", y.shape[1])
    print("Known PRS313 SNPs: ", data[[col for col in data.columns if "PRS313_Known" in col]].shape[1])
    print("23AndMe SNPs with LD to Unknown PRS313 SNPs: ", data[[col for col in data.columns if "PRS313_" not in col]].shape[1])
    print("Total SNPs used for Training: ", X.shape[1])

    # Split the data into train-validation and test sets
    X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Define the logistic regression model with lasso regularization
    class LogisticRegression(nn.Module):
        def __init__(self, input_dim, output_dim, l1_coef=0.0):
            super(LogisticRegression, self).__init__()
            self.linear = nn.Linear(input_dim, output_dim)
            self.sigmoid = nn.Sigmoid()
            self.l1_coef = l1_coef

        def forward(self, x):
            out = self.linear(x)
            out = self.sigmoid(out)
            return out

        def l1_loss(self):
            return self.l1_coef * torch.norm(self.linear.weight, p=1)
        
    # Set the device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Set the hyperparameters for tuning
    input_dim = X_train_val.shape[1]
    output_dim = y_train_val.shape[1]
    num_epochs = 500
    batch_size = 128

    # Define the objective function for Optuna with cross-validation and early stopping
    def objective(trial):
        learning_rate = trial.suggest_float('learning_rate', 1e-4, 1e-1, log=True)
        l1_coef = trial.suggest_float('l1_coef', 1e-5, 1e-1, log=True)
        patience = trial.suggest_int('patience', 5, 20)
        batch_size = trial.suggest_categorical('batch_size', [32, 64, 128, 256])

        model = LogisticRegression(input_dim, output_dim, l1_coef).to(device)
        optimizer = optim.Adam(model.parameters(), lr=learning_rate)
        criterion = nn.BCELoss()
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=5, verbose=False)

        skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
        fold_losses = []

        for fold, (train_idx, val_idx) in enumerate(skf.split(X_train_val, y_train_val.argmax(dim=1))):
            X_train, X_val = X_train_val[train_idx], X_train_val[val_idx]
            y_train, y_val = y_train_val[train_idx], y_train_val[val_idx]

            train_dataset = TensorDataset(X_train, y_train)
            train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

            best_val_loss = float('inf')
            counter = 0

            for epoch in range(num_epochs):
                train_loss = 0.0
                for batch_X, batch_y in train_loader:
                    batch_X, batch_y = batch_X.to(device), batch_y.to(device)

                    outputs = model(batch_X)
                    loss = criterion(outputs, batch_y) + model.l1_loss()

                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()

                    train_loss += loss.item()

                train_loss /= len(train_loader)

                val_dataset = TensorDataset(X_val, y_val)
                val_loader = DataLoader(val_dataset, batch_size=batch_size)

                with torch.no_grad():
                    val_loss = 0.0
                    for batch_X, batch_y in val_loader:
                        batch_X, batch_y = batch_X.to(device), batch_y.to(device)
                        outputs = model(batch_X)
                        loss = criterion(outputs, batch_y) + model.l1_loss()
                        val_loss += loss.item()

                    val_loss /= len(val_loader)
                    scheduler.step(val_loss)

                    if val_loss < best_val_loss:
                        best_val_loss = val_loss
                        counter = 0
                    else:
                        counter += 1

                    if counter >= patience:
                        # print(f"Early stopping at epoch {epoch+1}")
                        break

            fold_losses.append(best_val_loss)

        return np.mean(fold_losses)

    # Create the "optuna_studies" folder if it doesn't exist
    os.makedirs("optuna_studies", exist_ok=True)

    # Create an Optuna study and optimize the hyperparameters
    study_name = f"chr{chromosome_number}_study"
    storage_name = f"sqlite:///optuna_studies/{study_name}.db"

    # Check if the study exists

    current_dir = os.getcwd()
    study_exists = os.path.exists(current_dir + f"/optuna_studies/{study_name}.db")
    
    if study_exists:
        # Load the existing study
        study = optuna.load_study(study_name=study_name, storage=storage_name)
    else:
        # Create a new study
        study = optuna.create_study(direction='minimize', study_name=study_name, storage=storage_name)

    study.optimize(objective, n_trials=25, n_jobs=-1)

    # Print the best hyperparameters and best value
    print(f"Chr {chromosome_number} - Best hyperparameters: {study.best_params}")
    print(f"Chr {chromosome_number} - Best value: {study.best_value:.4f}")

    # Train the final model with the best hyperparameters and early stopping
    best_learning_rate = study.best_params['learning_rate']
    best_l1_coef = study.best_params['l1_coef']
    best_patience = study.best_params['patience']
    best_batch_size = study.best_params['batch_size']

    model = LogisticRegression(input_dim, output_dim, best_l1_coef).to(device)
    optimizer = optim.Adam(model.parameters(), lr=best_learning_rate)
    criterion = nn.BCELoss()
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=5, verbose=False)

    train_dataset = TensorDataset(X_train_val, y_train_val)
    train_loader = DataLoader(train_dataset, batch_size=best_batch_size, shuffle=True)

    best_train_loss = float('inf')
    counter = 0

    for epoch in range(num_epochs):
        train_loss = 0.0
        for batch_X, batch_y in train_loader:
            batch_X, batch_y = batch_X.to(device), batch_y.to(device)

            outputs = model(batch_X)
            loss = criterion(outputs, batch_y) + model.l1_loss()

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            train_loss += loss.item()

        train_loss /= len(train_loader)
        print(f"Epoch [{epoch+1}/{num_epochs}], Train Loss: {train_loss:.4f}")

        if train_loss < best_train_loss:
            best_train_loss = train_loss
            counter = 0
        else:
            counter += 1

        if counter >= best_patience:
            print(f"Early stopping at epoch {epoch+1}")
            break

        scheduler.step(train_loss)

    # Save the final model
    model_save_path = chr_model_folder + f'final_model_chr{chromosome_number}.pth'
    torch.save(model.state_dict(), model_save_path)
    print(f"Final model saved at: {model_save_path}")

    # Evaluate the final model on the test set
    with torch.no_grad():
        test_outputs = model(X_test.to(device))
        test_preds = (test_outputs > 0.5).float()
        test_accuracy = float(((test_preds > 0.5) == y_test).float().mean())
        test_precision = precision_score(y_test.cpu().numpy(), test_preds.cpu().numpy(), average='micro')
        test_recall = recall_score(y_test.cpu().numpy(), test_preds.cpu().numpy(), average='micro')
        test_f1 = f1_score(y_test.cpu().numpy(), test_preds.cpu().numpy(), average='micro')
        test_roc_auc = roc_auc_score(y_test.cpu().numpy(), test_outputs.cpu().numpy(), average='micro')
        test_r2 = sklearn_r2_score(y_test.cpu().numpy(), test_outputs.cpu().numpy())
        test_iqs = calculate_iqs_unphased(y_test.cpu().numpy(), test_outputs.cpu().numpy())

        # Calculate false positive rate
        cm = confusion_matrix(y_test.cpu().numpy().ravel(), test_preds.cpu().numpy().ravel())
        tn, fp, fn, tp = cm.ravel()
        test_fpr = fp / (fp + tn)

        # Append performance metrics to the lists
        accuracies.append(test_accuracy)
        precisions.append(test_precision)
        recalls.append(test_recall)
        false_positive_rates.append(test_fpr)
        auc_rocs.append(test_roc_auc)
        r2_scores.append(test_r2)
        iqs_scores.append(test_iqs)

        # Calculate individual R^2 scores for each SNP
        individual_r2_scores = sklearn_r2_score(y_test.cpu().numpy(), test_outputs.cpu().numpy(), multioutput='raw_values')

        # Calculate individual IQS scores for each SNP
        individual_iqs_scores = np.array([calculate_iqs_unphased(y_test.cpu().numpy()[:, i].reshape(-1, 1), test_outputs.cpu().numpy()[:, i].reshape(-1, 1)) for i in range(y_test.shape[1])])

        # Get the names of the SNPs from the original dataframe
        snp_names = data.filter(regex='Unknown').columns

        # Save individual R^2 scores to a CSV file
        csv_file = chr_csv_folder + f'individual_r2_scores_chr{chromosome_number}.csv'

        with open(csv_file, 'w', newline='') as file:
            writer = csv.writer(file)
            writer.writerow(['SNP', 'R2 Score'])
            for snp, r2_score in zip(snp_names, individual_r2_scores):
                writer.writerow([snp, r2_score])

        print(f"Individual R^2 scores saved at: {csv_file}")

        # Save individual IQS scores to a CSV file
        iqs_csv_file = chr_csv_folder + f'individual_iqs_scores_chr{chromosome_number}.csv'

        with open(iqs_csv_file, 'w', newline='') as file:
            writer = csv.writer(file)
            writer.writerow(['SNP', 'IQS Score'])
            for snp, iqs_score in zip(snp_names, individual_iqs_scores):
                writer.writerow([snp, iqs_score])

        print(f"Individual IQS scores saved at: {iqs_csv_file}")

        # Save individual AUC ROC curves for each SNP
        for i, snp in enumerate(snp_names):
            try: 
                fpr, tpr, _ = roc_curve(y_test.cpu().numpy()[:, i], test_outputs.cpu().numpy()[:, i])
                plt.figure()
                plt.plot(fpr, tpr, label=f'AUC ROC = {roc_auc_score(y_test.cpu().numpy()[:, i], test_outputs.cpu().numpy()[:, i]):.4f}')
                plt.xlabel('False Positive Rate')
                plt.ylabel('True Positive Rate')
                plt.title(f'AUC ROC Curve - {snp}')
                plt.legend()
                
                curve_file = chr_curve_folder + f'auc_roc_curve_{snp}_chr{chromosome_number}.png'
                plt.savefig(curve_file)
                plt.close()
            except ValueError:
                # Save a placeholder image if there is insufficient data
                plt.figure()
                plt.axis('off')
                plt.text(0.5, 0.5, "Insufficient data for ROC curve", ha='center', va='center')
                curve_file = chr_curve_folder + f'auc_roc_curve_{snp}_chr{chromosome_number}.png'
                plt.savefig(curve_file)
                plt.close()

                print(f"Skipping SNP {snp} due to insufficient data")


        print(f"Individual AUC ROC curves saved in: {curve_folder}")

        # Create a DataFrame to store the performance metrics for each chromosome
        performance_df = pd.DataFrame({
            'Chromosome': list(range(start, chromosome_number + 1)),
            'Accuracy': accuracies,
            'Precision': precisions,
            'Recall': recalls,
            'False Positive Rate': false_positive_rates,
            'AUC ROC': auc_rocs,
            'R2 Score': r2_scores,
            'IQS Score': iqs_scores
        })

        # Save the performance metrics to a CSV file
        performance_csv_file = csv_folder + 'performance_metrics.csv'
        performance_df.to_csv(performance_csv_file, index=False)
        print(f"Performance metrics saved at: {performance_csv_file}")

Unknown PRS313 SNPs:  30
Known PRS313 SNPs:  10
23AndMe SNPs with LD to Unknown PRS313 SNPs:  850
Total SNPs used for Training:  850


[I 2024-04-30 10:11:23,397] Trial 136 finished with value: 0.24072570720544229 and parameters: {'learning_rate': 0.06244280451709654, 'l1_coef': 1.2869733065346573e-05, 'patience': 16, 'batch_size': 32}. Best is trial 32 with value: 0.08649516483912101.


Chr 1 - Best hyperparameters: {'learning_rate': 0.01823591759267881, 'l1_coef': 1.0049346589997416e-05, 'patience': 9, 'batch_size': 32}
Chr 1 - Best value: 0.0865
Epoch [1/500], Train Loss: 0.6092
Epoch [2/500], Train Loss: 0.3647
Epoch [3/500], Train Loss: 0.3238
Epoch [4/500], Train Loss: 0.2944
Epoch [5/500], Train Loss: 0.2686
Epoch [6/500], Train Loss: 0.2754
Epoch [7/500], Train Loss: 0.2642
Epoch [8/500], Train Loss: 0.2579
Epoch [9/500], Train Loss: 0.2536
Epoch [10/500], Train Loss: 0.2070
Epoch [11/500], Train Loss: 0.1987
Epoch [12/500], Train Loss: 0.1979
Epoch [13/500], Train Loss: 0.1946
Epoch [14/500], Train Loss: 0.1905
Epoch [15/500], Train Loss: 0.1923
Epoch [16/500], Train Loss: 0.1934
Epoch [17/500], Train Loss: 0.1864
Epoch [18/500], Train Loss: 0.1898
Epoch [19/500], Train Loss: 0.1831
Epoch [20/500], Train Loss: 0.1852
Epoch [21/500], Train Loss: 0.1829
Epoch [22/500], Train Loss: 0.1796
Epoch [23/500], Train Loss: 0.1805
Epoch [24/500], Train Loss: 0.1769
Epoch

[I 2024-04-30 10:11:28,262] Trial 53 finished with value: 8.442827844619751 and parameters: {'learning_rate': 0.08866024484499739, 'l1_coef': 1.9153174210294036e-05, 'patience': 7, 'batch_size': 128}. Best is trial 50 with value: 0.11959010179226218.


Chr 2 - Best hyperparameters: {'learning_rate': 0.008236314117298912, 'l1_coef': 1.0089572204728918e-05, 'patience': 10, 'batch_size': 32}
Chr 2 - Best value: 0.1196
Epoch [1/500], Train Loss: 0.4278
Epoch [2/500], Train Loss: 0.3145
Epoch [3/500], Train Loss: 0.2757
Epoch [4/500], Train Loss: 0.2468
Epoch [5/500], Train Loss: 0.2339
Epoch [6/500], Train Loss: 0.2202
Epoch [7/500], Train Loss: 0.2112
Epoch [8/500], Train Loss: 0.2029
Epoch [9/500], Train Loss: 0.1988
Epoch [10/500], Train Loss: 0.1932
Epoch [11/500], Train Loss: 0.1907
Epoch [12/500], Train Loss: 0.1849
Epoch [13/500], Train Loss: 0.1821
Epoch [14/500], Train Loss: 0.1792
Epoch [15/500], Train Loss: 0.1770
Epoch [16/500], Train Loss: 0.1747
Epoch [17/500], Train Loss: 0.1721
Epoch [18/500], Train Loss: 0.1711
Epoch [19/500], Train Loss: 0.1684
Epoch [20/500], Train Loss: 0.1653
Epoch [21/500], Train Loss: 0.1664
Epoch [22/500], Train Loss: 0.1672
Epoch [23/500], Train Loss: 0.1641
Epoch [24/500], Train Loss: 0.1623
Epo

KeyboardInterrupt: 

# ALL PRS Masked

In [6]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, roc_curve
from sklearn.metrics import r2_score as sklearn_r2_score
import optuna
from matplotlib import pyplot as plt
import os
import csv

# Load and preprocess the data
data_directory = '../../Data/Filtered_unphased_training_data/'
start = 1

# Initialize lists to store the performance metrics for each chromosome
accuracies = []
precisions = []
recalls = []
false_positive_rates = []
auc_rocs = []
r2_scores = []
iqs_scores = []

# Create folders for saving files
output_folder = "../../Data/model_results_unphased_all_PRS/logistic_regression/"
model_folder = output_folder + "models_unphased/"
csv_folder = output_folder + "csv_files/"
curve_folder = output_folder + "roc_curves/"

os.makedirs(model_folder, exist_ok=True)
os.makedirs(csv_folder, exist_ok=True)
os.makedirs(curve_folder, exist_ok=True)

for chromosome_number in range(start, 23):
    # Create subfolders for the current chromosome
    chr_model_folder = model_folder + f"chr{chromosome_number}/"
    chr_csv_folder = csv_folder + f"chr{chromosome_number}/"
    chr_curve_folder = curve_folder + f"chr{chromosome_number}/"

    os.makedirs(chr_model_folder, exist_ok=True)
    os.makedirs(chr_csv_folder, exist_ok=True)
    os.makedirs(chr_curve_folder, exist_ok=True)

    file_name = data_directory + f"23AndMe_PRS313_merged_chr{chromosome_number}_matching_combined.parquet"
    data = pd.read_parquet(file_name)


    # Split the data into features and target
    X = torch.tensor(data.filter(regex='^(?!.*PRS313_)').values, dtype=torch.float32)
    y = torch.tensor(data.filter(regex='PRS313_').values, dtype=torch.float32)


    print("Total SNPs: ", data.shape[1])
    print("PRS313 SNPs: ", y.shape[1])
    # print("Known PRS313 SNPs: ", data[[col for col in data.columns if "PRS313_Known" in col]].shape[1])
    # print("23AndMe SNPs with LD to Unknown PRS313 SNPs: ", data[[col for col in data.columns if "PRS313_" not in col]].shape[1])
    print("Total SNPs used for Training: ", X.shape[1])

    # Split the data into train-validation and test sets
    X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Define the logistic regression model with lasso regularization
    class LogisticRegression(nn.Module):
        def __init__(self, input_dim, output_dim, l1_coef=0.0):
            super(LogisticRegression, self).__init__()
            self.linear = nn.Linear(input_dim, output_dim)
            self.sigmoid = nn.Sigmoid()
            self.l1_coef = l1_coef

        def forward(self, x):
            out = self.linear(x)
            out = self.sigmoid(out)
            return out

        def l1_loss(self):
            return self.l1_coef * torch.norm(self.linear.weight, p=1)
        
    # Set the device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Set the hyperparameters for tuning
    input_dim = X_train_val.shape[1]
    output_dim = y_train_val.shape[1]
    num_epochs = 500
    batch_size = 128

    # Define the objective function for Optuna with cross-validation and early stopping
    def objective(trial):
        learning_rate = trial.suggest_float('learning_rate', 1e-4, 1e-1, log=True)
        l1_coef = trial.suggest_float('l1_coef', 1e-5, 1e-1, log=True)
        patience = trial.suggest_int('patience', 5, 20)
        batch_size = trial.suggest_categorical('batch_size', [32, 64, 128, 256])

        model = LogisticRegression(input_dim, output_dim, l1_coef).to(device)
        optimizer = optim.Adam(model.parameters(), lr=learning_rate)
        criterion = nn.BCELoss()
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=5, verbose=False)

        skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
        fold_losses = []

        for fold, (train_idx, val_idx) in enumerate(skf.split(X_train_val, y_train_val.argmax(dim=1))):
            X_train, X_val = X_train_val[train_idx], X_train_val[val_idx]
            y_train, y_val = y_train_val[train_idx], y_train_val[val_idx]

            train_dataset = TensorDataset(X_train, y_train)
            train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

            best_val_loss = float('inf')
            counter = 0

            for epoch in range(num_epochs):
                train_loss = 0.0
                for batch_X, batch_y in train_loader:
                    batch_X, batch_y = batch_X.to(device), batch_y.to(device)

                    outputs = model(batch_X)
                    loss = criterion(outputs, batch_y) + model.l1_loss()

                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()

                    train_loss += loss.item()

                train_loss /= len(train_loader)

                val_dataset = TensorDataset(X_val, y_val)
                val_loader = DataLoader(val_dataset, batch_size=batch_size)

                with torch.no_grad():
                    val_loss = 0.0
                    for batch_X, batch_y in val_loader:
                        batch_X, batch_y = batch_X.to(device), batch_y.to(device)
                        outputs = model(batch_X)
                        loss = criterion(outputs, batch_y) + model.l1_loss()
                        val_loss += loss.item()

                    val_loss /= len(val_loader)
                    scheduler.step(val_loss)

                    if val_loss < best_val_loss:
                        best_val_loss = val_loss
                        counter = 0
                    else:
                        counter += 1

                    if counter >= patience:
                        # print(f"Early stopping at epoch {epoch+1}")
                        break

            fold_losses.append(best_val_loss)

        return np.mean(fold_losses)

    # Create the "optuna_studies" folder if it doesn't exist
    os.makedirs("optuna_studies", exist_ok=True)

    # Create an Optuna study and optimize the hyperparameters
    study_name = f"chr{chromosome_number}_study"
    storage_name = f"sqlite:///optuna_studies/{study_name}.db"

    # Check if the study exists

    current_dir = os.getcwd()
    study_exists = os.path.exists(current_dir + f"/optuna_studies/{study_name}.db")
    
    if study_exists:
        # Load the existing study
        study = optuna.load_study(study_name=study_name, storage=storage_name)
    else:
        # Create a new study
        study = optuna.create_study(direction='minimize', study_name=study_name, storage=storage_name)

    study.optimize(objective, n_trials=1, n_jobs=-1)

    # Print the best hyperparameters and best value
    print(f"Chr {chromosome_number} - Best hyperparameters: {study.best_params}")
    print(f"Chr {chromosome_number} - Best value: {study.best_value:.4f}")

    # Train the final model with the best hyperparameters and early stopping
    best_learning_rate = study.best_params['learning_rate']
    best_l1_coef = study.best_params['l1_coef']
    best_patience = study.best_params['patience']
    best_batch_size = study.best_params['batch_size']

    model = LogisticRegression(input_dim, output_dim, best_l1_coef).to(device)
    optimizer = optim.Adam(model.parameters(), lr=best_learning_rate)
    criterion = nn.BCELoss()
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=5, verbose=False)

    train_dataset = TensorDataset(X_train_val, y_train_val)
    train_loader = DataLoader(train_dataset, batch_size=best_batch_size, shuffle=True)

    best_train_loss = float('inf')
    counter = 0

    for epoch in range(num_epochs):
        train_loss = 0.0
        for batch_X, batch_y in train_loader:
            batch_X, batch_y = batch_X.to(device), batch_y.to(device)

            outputs = model(batch_X)
            loss = criterion(outputs, batch_y) + model.l1_loss()

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            train_loss += loss.item()

        train_loss /= len(train_loader)
        # print(f"Epoch [{epoch+1}/{num_epochs}], Train Loss: {train_loss:.4f}")

        if train_loss < best_train_loss:
            best_train_loss = train_loss
            counter = 0
        else:
            counter += 1

        if counter >= best_patience:
            # print(f"Early stopping at epoch {epoch+1}")
            break

        scheduler.step(train_loss)

    # Save the final model
    model_save_path = chr_model_folder + f'final_model_chr{chromosome_number}.pth'
    torch.save(model.state_dict(), model_save_path)
    print(f"Final model saved at: {model_save_path}")

    # Evaluate the final model on the test set
    with torch.no_grad():
        test_outputs = model(X_test.to(device))
        test_preds = (test_outputs > 0.5).float()
        test_accuracy = float(((test_preds > 0.5) == y_test).float().mean())
        test_precision = precision_score(y_test.cpu().numpy(), test_preds.cpu().numpy(), average='micro')
        test_recall = recall_score(y_test.cpu().numpy(), test_preds.cpu().numpy(), average='micro')
        test_f1 = f1_score(y_test.cpu().numpy(), test_preds.cpu().numpy(), average='micro')
        test_roc_auc = roc_auc_score(y_test.cpu().numpy(), test_outputs.cpu().numpy(), average='micro')
        test_r2 = sklearn_r2_score(y_test.cpu().numpy(), test_outputs.cpu().numpy())
        test_iqs = calculate_iqs_unphased(y_test.cpu().numpy(), test_outputs.cpu().numpy())

        # Calculate false positive rate
        cm = confusion_matrix(y_test.cpu().numpy().ravel(), test_preds.cpu().numpy().ravel())
        tn, fp, fn, tp = cm.ravel()
        test_fpr = fp / (fp + tn)

        # Append performance metrics to the lists
        accuracies.append(test_accuracy)
        precisions.append(test_precision)
        recalls.append(test_recall)
        false_positive_rates.append(test_fpr)
        auc_rocs.append(test_roc_auc)
        r2_scores.append(test_r2)
        iqs_scores.append(test_iqs)

        # Calculate individual R^2 scores for each SNP
        individual_r2_scores = sklearn_r2_score(y_test.cpu().numpy(), test_outputs.cpu().numpy(), multioutput='raw_values')

        # Calculate individual IQS scores for each SNP
        individual_iqs_scores = np.array([calculate_iqs_unphased(y_test.cpu().numpy()[:, i].reshape(-1, 1), test_outputs.cpu().numpy()[:, i].reshape(-1, 1)) for i in range(y_test.shape[1])])

        # Get the names of the SNPs from the original dataframe
        snp_names = data.filter(regex='Unknown').columns

        # Save individual R^2 scores to a CSV file
        csv_file = chr_csv_folder + f'individual_r2_scores_chr{chromosome_number}.csv'

        with open(csv_file, 'w', newline='') as file:
            writer = csv.writer(file)
            writer.writerow(['SNP', 'R2 Score'])
            for snp, r2_score in zip(snp_names, individual_r2_scores):
                writer.writerow([snp, r2_score])

        print(f"Individual R^2 scores saved at: {csv_file}")

        # Save individual IQS scores to a CSV file
        iqs_csv_file = chr_csv_folder + f'individual_iqs_scores_chr{chromosome_number}.csv'

        with open(iqs_csv_file, 'w', newline='') as file:
            writer = csv.writer(file)
            writer.writerow(['SNP', 'IQS Score'])
            for snp, iqs_score in zip(snp_names, individual_iqs_scores):
                writer.writerow([snp, iqs_score])

        print(f"Individual IQS scores saved at: {iqs_csv_file}")

        # Save individual AUC ROC curves for each SNP
        for i, snp in enumerate(snp_names):
            try: 
                fpr, tpr, _ = roc_curve(y_test.cpu().numpy()[:, i], test_outputs.cpu().numpy()[:, i])
                plt.figure()
                plt.plot(fpr, tpr, label=f'AUC ROC = {roc_auc_score(y_test.cpu().numpy()[:, i], test_outputs.cpu().numpy()[:, i]):.4f}')
                plt.xlabel('False Positive Rate')
                plt.ylabel('True Positive Rate')
                plt.title(f'AUC ROC Curve - {snp}')
                plt.legend()
                
                curve_file = chr_curve_folder + f'auc_roc_curve_{snp}_chr{chromosome_number}.png'
                plt.savefig(curve_file)
                plt.close()
            except ValueError:
                # Save a placeholder image if there is insufficient data
                plt.figure()
                plt.axis('off')
                plt.text(0.5, 0.5, "Insufficient data for ROC curve", ha='center', va='center')
                curve_file = chr_curve_folder + f'auc_roc_curve_{snp}_chr{chromosome_number}.png'
                plt.savefig(curve_file)
                plt.close()

                print(f"Skipping SNP {snp} due to insufficient data")


        print(f"Individual AUC ROC curves saved in: {curve_folder}")

        # Create a DataFrame to store the performance metrics for each chromosome
        performance_df = pd.DataFrame({
            'Chromosome': list(range(start, chromosome_number + 1)),
            'Accuracy': accuracies,
            'Precision': precisions,
            'Recall': recalls,
            'False Positive Rate': false_positive_rates,
            'AUC ROC': auc_rocs,
            'R2 Score': r2_scores,
            'IQS Score': iqs_scores
        })

        # Save the performance metrics to a CSV file
        performance_csv_file = csv_folder + 'performance_metrics.csv'
        performance_df.to_csv(performance_csv_file, index=False)
        print(f"Performance metrics saved at: {performance_csv_file}")

Total SNPs:  880
PRS313 SNPs:  30
Total SNPs used for Training:  850


[I 2024-04-30 10:13:55,609] Trial 138 finished with value: 0.18939957183140976 and parameters: {'learning_rate': 0.024010253494540893, 'l1_coef': 1.3182032660444333e-05, 'patience': 7, 'batch_size': 32}. Best is trial 32 with value: 0.08649516483912101.


Chr 1 - Best hyperparameters: {'learning_rate': 0.01823591759267881, 'l1_coef': 1.0049346589997416e-05, 'patience': 9, 'batch_size': 32}
Chr 1 - Best value: 0.0865
Epoch [1/500], Train Loss: 0.5700
Epoch [2/500], Train Loss: 0.3623
Epoch [3/500], Train Loss: 0.3167
Epoch [4/500], Train Loss: 0.2940
Epoch [5/500], Train Loss: 0.2642
Epoch [6/500], Train Loss: 0.2285
Epoch [7/500], Train Loss: 0.2209
Epoch [8/500], Train Loss: 0.2134
Epoch [9/500], Train Loss: 0.2108
Epoch [10/500], Train Loss: 0.2026
Epoch [11/500], Train Loss: 0.1973
Epoch [12/500], Train Loss: 0.1928
Epoch [13/500], Train Loss: 0.1903
Epoch [14/500], Train Loss: 0.1890
Epoch [15/500], Train Loss: 0.1922
Epoch [16/500], Train Loss: 0.1922
Epoch [17/500], Train Loss: 0.1944
Epoch [18/500], Train Loss: 0.1883
Epoch [19/500], Train Loss: 0.1875
Epoch [20/500], Train Loss: 0.1876
Epoch [21/500], Train Loss: 0.1903
Epoch [22/500], Train Loss: 0.1797
Epoch [23/500], Train Loss: 0.1770
Epoch [24/500], Train Loss: 0.1848
Epoch

[I 2024-04-30 10:14:02,317] Trial 54 finished with value: 0.17957539604260372 and parameters: {'learning_rate': 0.019514807663622482, 'l1_coef': 1.9827302273015474e-05, 'patience': 9, 'batch_size': 32}. Best is trial 50 with value: 0.11959010179226218.


Chr 2 - Best hyperparameters: {'learning_rate': 0.008236314117298912, 'l1_coef': 1.0089572204728918e-05, 'patience': 10, 'batch_size': 32}
Chr 2 - Best value: 0.1196
Epoch [1/500], Train Loss: 0.4341
Epoch [2/500], Train Loss: 0.3183
Epoch [3/500], Train Loss: 0.2745
Epoch [4/500], Train Loss: 0.2505
Epoch [5/500], Train Loss: 0.2330
Epoch [6/500], Train Loss: 0.2221
Epoch [7/500], Train Loss: 0.2127
Epoch [8/500], Train Loss: 0.2058
Epoch [9/500], Train Loss: 0.1977
Epoch [10/500], Train Loss: 0.1920
Epoch [11/500], Train Loss: 0.1893
Epoch [12/500], Train Loss: 0.1855
Epoch [13/500], Train Loss: 0.1825
Epoch [14/500], Train Loss: 0.1787
Epoch [15/500], Train Loss: 0.1767
Epoch [16/500], Train Loss: 0.1767
Epoch [17/500], Train Loss: 0.1718
Epoch [18/500], Train Loss: 0.1712
Epoch [19/500], Train Loss: 0.1713
Epoch [20/500], Train Loss: 0.1701
Epoch [21/500], Train Loss: 0.1661
Epoch [22/500], Train Loss: 0.1652
Epoch [23/500], Train Loss: 0.1638
Epoch [24/500], Train Loss: 0.1626
Epo

  iqs = (po - pc) / (1 - pc)


Individual R^2 scores saved at: ../../Data/model_results_unphased_all_PRS/logistic_regression/csv_files/chr2/individual_r2_scores_chr2.csv
Individual IQS scores saved at: ../../Data/model_results_unphased_all_PRS/logistic_regression/csv_files/chr2/individual_iqs_scores_chr2.csv




Skipping SNP chr2_217955896_GA_G_PRS313_Unknown_combined due to insufficient data
Individual AUC ROC curves saved in: ../../Data/model_results_unphased_all_PRS/logistic_regression/roc_curves/
Performance metrics saved at: ../../Data/model_results_unphased_all_PRS/logistic_regression/csv_files/performance_metrics.csv
Total SNPs:  947
PRS313 SNPs:  16
Total SNPs used for Training:  931


[I 2024-04-30 10:14:09,398] Trial 52 finished with value: 0.1572665859013796 and parameters: {'learning_rate': 0.0011689403608019387, 'l1_coef': 1.751091597056413e-05, 'patience': 16, 'batch_size': 128}. Best is trial 11 with value: 0.04525891537112849.


Chr 3 - Best hyperparameters: {'learning_rate': 0.01168727761615579, 'l1_coef': 1.1084312581961695e-05, 'patience': 15, 'batch_size': 64}
Chr 3 - Best value: 0.0453
Epoch [1/500], Train Loss: 0.5049
Epoch [2/500], Train Loss: 0.2854
Epoch [3/500], Train Loss: 0.2288
Epoch [4/500], Train Loss: 0.2009
Epoch [5/500], Train Loss: 0.1854
Epoch [6/500], Train Loss: 0.1680
Epoch [7/500], Train Loss: 0.1581
Epoch [8/500], Train Loss: 0.1535
Epoch [9/500], Train Loss: 0.1421
Epoch [10/500], Train Loss: 0.1342
Epoch [11/500], Train Loss: 0.1327
Epoch [12/500], Train Loss: 0.1274
Epoch [13/500], Train Loss: 0.1213
Epoch [14/500], Train Loss: 0.1199
Epoch [15/500], Train Loss: 0.1184
Epoch [16/500], Train Loss: 0.1148
Epoch [17/500], Train Loss: 0.1140
Epoch [18/500], Train Loss: 0.1125
Epoch [19/500], Train Loss: 0.1081
Epoch [20/500], Train Loss: 0.1035
Epoch [21/500], Train Loss: 0.1030
Epoch [22/500], Train Loss: 0.1058
Epoch [23/500], Train Loss: 0.1004
Epoch [24/500], Train Loss: 0.0990
Epoc

[I 2024-04-30 10:14:16,702] Trial 52 finished with value: 0.15921089889911505 and parameters: {'learning_rate': 0.0011741904885725096, 'l1_coef': 6.940710723028401e-05, 'patience': 17, 'batch_size': 32}. Best is trial 6 with value: 0.05639927589467593.


Chr 4 - Best hyperparameters: {'learning_rate': 0.010022005750497007, 'l1_coef': 1.0999251330619296e-05, 'patience': 13, 'batch_size': 64}
Chr 4 - Best value: 0.0564
Epoch [1/500], Train Loss: 0.5581
Epoch [2/500], Train Loss: 0.3583
Epoch [3/500], Train Loss: 0.2870
Epoch [4/500], Train Loss: 0.2443
Epoch [5/500], Train Loss: 0.2133
Epoch [6/500], Train Loss: 0.1965
Epoch [7/500], Train Loss: 0.1809
Epoch [8/500], Train Loss: 0.1709
Epoch [9/500], Train Loss: 0.1611
Epoch [10/500], Train Loss: 0.1525
Epoch [11/500], Train Loss: 0.1429
Epoch [12/500], Train Loss: 0.1360
Epoch [13/500], Train Loss: 0.1312
Epoch [14/500], Train Loss: 0.1269
Epoch [15/500], Train Loss: 0.1247
Epoch [16/500], Train Loss: 0.1187
Epoch [17/500], Train Loss: 0.1143
Epoch [18/500], Train Loss: 0.1102
Epoch [19/500], Train Loss: 0.1097
Epoch [20/500], Train Loss: 0.1051
Epoch [21/500], Train Loss: 0.1052
Epoch [22/500], Train Loss: 0.1000
Epoch [23/500], Train Loss: 0.0959
Epoch [24/500], Train Loss: 0.0953
Epo

[I 2024-04-30 10:14:23,896] Trial 52 finished with value: 0.17206094094685148 and parameters: {'learning_rate': 0.0016840216950022215, 'l1_coef': 1.1820460713931181e-05, 'patience': 7, 'batch_size': 64}. Best is trial 34 with value: 0.07268482106072562.


Chr 5 - Best hyperparameters: {'learning_rate': 0.010757867162206202, 'l1_coef': 1.0440017068267564e-05, 'patience': 9, 'batch_size': 64}
Chr 5 - Best value: 0.0727
Epoch [1/500], Train Loss: 0.6085
Epoch [2/500], Train Loss: 0.3667
Epoch [3/500], Train Loss: 0.3008
Epoch [4/500], Train Loss: 0.2636
Epoch [5/500], Train Loss: 0.2407
Epoch [6/500], Train Loss: 0.2259
Epoch [7/500], Train Loss: 0.2077
Epoch [8/500], Train Loss: 0.1954
Epoch [9/500], Train Loss: 0.1863
Epoch [10/500], Train Loss: 0.1785
Epoch [11/500], Train Loss: 0.1731
Epoch [12/500], Train Loss: 0.1693
Epoch [13/500], Train Loss: 0.1611
Epoch [14/500], Train Loss: 0.1574
Epoch [15/500], Train Loss: 0.1541
Epoch [16/500], Train Loss: 0.1515
Epoch [17/500], Train Loss: 0.1498
Epoch [18/500], Train Loss: 0.1456
Epoch [19/500], Train Loss: 0.1405
Epoch [20/500], Train Loss: 0.1392
Epoch [21/500], Train Loss: 0.1400
Epoch [22/500], Train Loss: 0.1350
Epoch [23/500], Train Loss: 0.1319
Epoch [24/500], Train Loss: 0.1315
Epoc

[I 2024-04-30 10:14:37,414] Trial 52 finished with value: 0.1935613960027695 and parameters: {'learning_rate': 0.00022440703253433187, 'l1_coef': 3.637667356288873e-05, 'patience': 11, 'batch_size': 64}. Best is trial 47 with value: 0.14764729972396579.


Chr 6 - Best hyperparameters: {'learning_rate': 0.0005159914294373423, 'l1_coef': 1.0687801532020404e-05, 'patience': 8, 'batch_size': 64}
Chr 6 - Best value: 0.1476
Epoch [1/500], Train Loss: 0.5457
Epoch [2/500], Train Loss: 0.4890
Epoch [3/500], Train Loss: 0.4712
Epoch [4/500], Train Loss: 0.4564
Epoch [5/500], Train Loss: 0.4434
Epoch [6/500], Train Loss: 0.4314
Epoch [7/500], Train Loss: 0.4214
Epoch [8/500], Train Loss: 0.4116
Epoch [9/500], Train Loss: 0.4025
Epoch [10/500], Train Loss: 0.3952
Epoch [11/500], Train Loss: 0.3869
Epoch [12/500], Train Loss: 0.3794
Epoch [13/500], Train Loss: 0.3720
Epoch [14/500], Train Loss: 0.3654
Epoch [15/500], Train Loss: 0.3590
Epoch [16/500], Train Loss: 0.3525
Epoch [17/500], Train Loss: 0.3469
Epoch [18/500], Train Loss: 0.3417
Epoch [19/500], Train Loss: 0.3358
Epoch [20/500], Train Loss: 0.3316
Epoch [21/500], Train Loss: 0.3265
Epoch [22/500], Train Loss: 0.3224
Epoch [23/500], Train Loss: 0.3187
Epoch [24/500], Train Loss: 0.3148
Epo

[I 2024-04-30 10:14:51,258] Trial 51 finished with value: 0.23524432549109825 and parameters: {'learning_rate': 0.00030856565095044357, 'l1_coef': 3.202290357425731e-05, 'patience': 13, 'batch_size': 32}. Best is trial 22 with value: 0.10665186093403742.


Chr 7 - Best hyperparameters: {'learning_rate': 0.00022929815822353796, 'l1_coef': 1.1733499979155006e-05, 'patience': 12, 'batch_size': 32}
Chr 7 - Best value: 0.1067
Epoch [1/500], Train Loss: 0.6075
Epoch [2/500], Train Loss: 0.5603
Epoch [3/500], Train Loss: 0.5434
Epoch [4/500], Train Loss: 0.5298
Epoch [5/500], Train Loss: 0.5181
Epoch [6/500], Train Loss: 0.5089
Epoch [7/500], Train Loss: 0.5002
Epoch [8/500], Train Loss: 0.4922
Epoch [9/500], Train Loss: 0.4848
Epoch [10/500], Train Loss: 0.4786
Epoch [11/500], Train Loss: 0.4722
Epoch [12/500], Train Loss: 0.4665
Epoch [13/500], Train Loss: 0.4609
Epoch [14/500], Train Loss: 0.4560
Epoch [15/500], Train Loss: 0.4508
Epoch [16/500], Train Loss: 0.4457
Epoch [17/500], Train Loss: 0.4414
Epoch [18/500], Train Loss: 0.4371
Epoch [19/500], Train Loss: 0.4328
Epoch [20/500], Train Loss: 0.4288
Epoch [21/500], Train Loss: 0.4244
Epoch [22/500], Train Loss: 0.4206
Epoch [23/500], Train Loss: 0.4171
Epoch [24/500], Train Loss: 0.4133
E

[I 2024-04-30 10:15:05,927] Trial 51 finished with value: 0.2571044263931421 and parameters: {'learning_rate': 0.0010547016944786214, 'l1_coef': 2.3292680367119465e-05, 'patience': 10, 'batch_size': 32}. Best is trial 17 with value: 0.11601511778739784.


Chr 8 - Best hyperparameters: {'learning_rate': 0.0012197976599892346, 'l1_coef': 1.0397855148759009e-05, 'patience': 6, 'batch_size': 32}
Chr 8 - Best value: 0.1160
Epoch [1/500], Train Loss: 0.5153
Epoch [2/500], Train Loss: 0.4592
Epoch [3/500], Train Loss: 0.4331
Epoch [4/500], Train Loss: 0.4125
Epoch [5/500], Train Loss: 0.3956
Epoch [6/500], Train Loss: 0.3817
Epoch [7/500], Train Loss: 0.3689
Epoch [8/500], Train Loss: 0.3582
Epoch [9/500], Train Loss: 0.3496
Epoch [10/500], Train Loss: 0.3410
Epoch [11/500], Train Loss: 0.3344
Epoch [12/500], Train Loss: 0.3277
Epoch [13/500], Train Loss: 0.3214
Epoch [14/500], Train Loss: 0.3159
Epoch [15/500], Train Loss: 0.3114
Epoch [16/500], Train Loss: 0.3065
Epoch [17/500], Train Loss: 0.3019
Epoch [18/500], Train Loss: 0.2979
Epoch [19/500], Train Loss: 0.2947
Epoch [20/500], Train Loss: 0.2915
Epoch [21/500], Train Loss: 0.2884
Epoch [22/500], Train Loss: 0.2857
Epoch [23/500], Train Loss: 0.2819
Epoch [24/500], Train Loss: 0.2796
Epo

[I 2024-04-30 10:15:12,925] Trial 51 finished with value: 0.26292283878876616 and parameters: {'learning_rate': 0.022038030748484788, 'l1_coef': 1.6185418086062093e-05, 'patience': 18, 'batch_size': 32}. Best is trial 21 with value: 0.11073745116591453.


Chr 9 - Best hyperparameters: {'learning_rate': 0.05822594013574142, 'l1_coef': 1.5307383394547932e-05, 'patience': 14, 'batch_size': 256}
Chr 9 - Best value: 0.1107
Epoch [1/500], Train Loss: 1.6723
Epoch [2/500], Train Loss: 1.0683
Epoch [3/500], Train Loss: 0.8159
Epoch [4/500], Train Loss: 0.6909
Epoch [5/500], Train Loss: 0.6291
Epoch [6/500], Train Loss: 0.5992
Epoch [7/500], Train Loss: 0.5878
Epoch [8/500], Train Loss: 0.5733
Epoch [9/500], Train Loss: 0.5676
Epoch [10/500], Train Loss: 0.5642
Epoch [11/500], Train Loss: 0.5599
Epoch [12/500], Train Loss: 0.5592
Epoch [13/500], Train Loss: 0.5513
Epoch [14/500], Train Loss: 0.5481
Epoch [15/500], Train Loss: 0.5450
Epoch [16/500], Train Loss: 0.5464
Epoch [17/500], Train Loss: 0.5419
Epoch [18/500], Train Loss: 0.5405
Epoch [19/500], Train Loss: 0.5413
Epoch [20/500], Train Loss: 0.5364
Epoch [21/500], Train Loss: 0.5393
Epoch [22/500], Train Loss: 0.5392
Epoch [23/500], Train Loss: 0.5354
Epoch [24/500], Train Loss: 0.5359
Epo

[I 2024-04-30 10:15:22,097] Trial 51 finished with value: 0.20796320821557726 and parameters: {'learning_rate': 0.0001569302693108911, 'l1_coef': 1.6432141602439407e-05, 'patience': 12, 'batch_size': 64}. Best is trial 31 with value: 0.10043132305145264.


Chr 10 - Best hyperparameters: {'learning_rate': 0.00022093387007935644, 'l1_coef': 1.0430280711765954e-05, 'patience': 5, 'batch_size': 32}
Chr 10 - Best value: 0.1004
Epoch [1/500], Train Loss: 0.5688
Epoch [2/500], Train Loss: 0.4924
Epoch [3/500], Train Loss: 0.4696
Epoch [4/500], Train Loss: 0.4545
Epoch [5/500], Train Loss: 0.4430
Epoch [6/500], Train Loss: 0.4335
Epoch [7/500], Train Loss: 0.4253
Epoch [8/500], Train Loss: 0.4180
Epoch [9/500], Train Loss: 0.4115
Epoch [10/500], Train Loss: 0.4054
Epoch [11/500], Train Loss: 0.3994
Epoch [12/500], Train Loss: 0.3937
Epoch [13/500], Train Loss: 0.3883
Epoch [14/500], Train Loss: 0.3834
Epoch [15/500], Train Loss: 0.3784
Epoch [16/500], Train Loss: 0.3740
Epoch [17/500], Train Loss: 0.3693
Epoch [18/500], Train Loss: 0.3650
Epoch [19/500], Train Loss: 0.3611
Epoch [20/500], Train Loss: 0.3570
Epoch [21/500], Train Loss: 0.3531
Epoch [22/500], Train Loss: 0.3493
Epoch [23/500], Train Loss: 0.3455
Epoch [24/500], Train Loss: 0.3425


[I 2024-04-30 10:15:36,250] Trial 51 finished with value: 0.08456481844186783 and parameters: {'learning_rate': 0.010028189689892582, 'l1_coef': 1.0142119414700108e-05, 'patience': 5, 'batch_size': 256}. Best is trial 23 with value: 0.05782480022081963.


Chr 11 - Best hyperparameters: {'learning_rate': 0.0002329404200995808, 'l1_coef': 1.2601501185021566e-05, 'patience': 9, 'batch_size': 32}
Chr 11 - Best value: 0.0578
Epoch [1/500], Train Loss: 0.5187
Epoch [2/500], Train Loss: 0.4539
Epoch [3/500], Train Loss: 0.4337
Epoch [4/500], Train Loss: 0.4192
Epoch [5/500], Train Loss: 0.4074
Epoch [6/500], Train Loss: 0.3965
Epoch [7/500], Train Loss: 0.3867
Epoch [8/500], Train Loss: 0.3774
Epoch [9/500], Train Loss: 0.3688
Epoch [10/500], Train Loss: 0.3608
Epoch [11/500], Train Loss: 0.3533
Epoch [12/500], Train Loss: 0.3460
Epoch [13/500], Train Loss: 0.3390
Epoch [14/500], Train Loss: 0.3327
Epoch [15/500], Train Loss: 0.3264
Epoch [16/500], Train Loss: 0.3208
Epoch [17/500], Train Loss: 0.3150
Epoch [18/500], Train Loss: 0.3096
Epoch [19/500], Train Loss: 0.3045
Epoch [20/500], Train Loss: 0.2996
Epoch [21/500], Train Loss: 0.2947
Epoch [22/500], Train Loss: 0.2900
Epoch [23/500], Train Loss: 0.2858
Epoch [24/500], Train Loss: 0.2812
E

[I 2024-04-30 10:15:54,609] Trial 51 finished with value: 0.5704960245352525 and parameters: {'learning_rate': 0.01954404097687797, 'l1_coef': 0.07575838536636692, 'patience': 10, 'batch_size': 32}. Best is trial 29 with value: 0.05914152525365353.


Chr 12 - Best hyperparameters: {'learning_rate': 0.013368341073967381, 'l1_coef': 1.0168876770412278e-05, 'patience': 9, 'batch_size': 128}
Chr 12 - Best value: 0.0591
Epoch [1/500], Train Loss: 0.5868
Epoch [2/500], Train Loss: 0.4398
Epoch [3/500], Train Loss: 0.3777
Epoch [4/500], Train Loss: 0.3438
Epoch [5/500], Train Loss: 0.3232
Epoch [6/500], Train Loss: 0.3042
Epoch [7/500], Train Loss: 0.2893
Epoch [8/500], Train Loss: 0.2795
Epoch [9/500], Train Loss: 0.2699
Epoch [10/500], Train Loss: 0.2642
Epoch [11/500], Train Loss: 0.2541
Epoch [12/500], Train Loss: 0.2484
Epoch [13/500], Train Loss: 0.2422
Epoch [14/500], Train Loss: 0.2385
Epoch [15/500], Train Loss: 0.2337
Epoch [16/500], Train Loss: 0.2293
Epoch [17/500], Train Loss: 0.2281
Epoch [18/500], Train Loss: 0.2246
Epoch [19/500], Train Loss: 0.2214
Epoch [20/500], Train Loss: 0.2177
Epoch [21/500], Train Loss: 0.2155
Epoch [22/500], Train Loss: 0.2126
Epoch [23/500], Train Loss: 0.2117
Epoch [24/500], Train Loss: 0.2096
E

[I 2024-04-30 10:15:57,691] Trial 51 finished with value: 0.10817819599594389 and parameters: {'learning_rate': 0.037741773677323495, 'l1_coef': 1.7889572827328185e-05, 'patience': 12, 'batch_size': 64}. Best is trial 51 with value: 0.10817819599594389.


Chr 13 - Best hyperparameters: {'learning_rate': 0.037741773677323495, 'l1_coef': 1.7889572827328185e-05, 'patience': 12, 'batch_size': 64}
Chr 13 - Best value: 0.1082
Epoch [1/500], Train Loss: 0.2751
Epoch [2/500], Train Loss: 0.1735
Epoch [3/500], Train Loss: 0.1582
Epoch [4/500], Train Loss: 0.1513
Epoch [5/500], Train Loss: 0.1365
Epoch [6/500], Train Loss: 0.1307
Epoch [7/500], Train Loss: 0.1266
Epoch [8/500], Train Loss: 0.1233
Epoch [9/500], Train Loss: 0.1209
Epoch [10/500], Train Loss: 0.1228
Epoch [11/500], Train Loss: 0.1212
Epoch [12/500], Train Loss: 0.1159
Epoch [13/500], Train Loss: 0.1232
Epoch [14/500], Train Loss: 0.1184
Epoch [15/500], Train Loss: 0.1159
Epoch [16/500], Train Loss: 0.1152
Epoch [17/500], Train Loss: 0.1141
Epoch [18/500], Train Loss: 0.1142
Epoch [19/500], Train Loss: 0.1187
Epoch [20/500], Train Loss: 0.1180
Epoch [21/500], Train Loss: 0.1137
Epoch [22/500], Train Loss: 0.1121
Epoch [23/500], Train Loss: 0.1126
Epoch [24/500], Train Loss: 0.1136
E

[I 2024-04-30 10:15:59,973] Trial 51 finished with value: 0.2314489848911762 and parameters: {'learning_rate': 0.09224445915934029, 'l1_coef': 1.4485510348257023e-05, 'patience': 18, 'batch_size': 128}. Best is trial 36 with value: 0.07342382765242031.


Chr 14 - Best hyperparameters: {'learning_rate': 0.04900486554871739, 'l1_coef': 1.160024455652371e-05, 'patience': 20, 'batch_size': 64}
Chr 14 - Best value: 0.0734
Epoch [1/500], Train Loss: 0.4748
Epoch [2/500], Train Loss: 0.3071
Epoch [3/500], Train Loss: 0.2859
Epoch [4/500], Train Loss: 0.2684
Epoch [5/500], Train Loss: 0.2610
Epoch [6/500], Train Loss: 0.2629
Epoch [7/500], Train Loss: 0.2560
Epoch [8/500], Train Loss: 0.2585
Epoch [9/500], Train Loss: 0.2524
Epoch [10/500], Train Loss: 0.2487
Epoch [11/500], Train Loss: 0.2505
Epoch [12/500], Train Loss: 0.2492
Epoch [13/500], Train Loss: 0.2445
Epoch [14/500], Train Loss: 0.2521
Epoch [15/500], Train Loss: 0.2490
Epoch [16/500], Train Loss: 0.2439
Epoch [17/500], Train Loss: 0.2529
Epoch [18/500], Train Loss: 0.2416
Epoch [19/500], Train Loss: 0.2475
Epoch [20/500], Train Loss: 0.2469
Epoch [21/500], Train Loss: 0.2424
Epoch [22/500], Train Loss: 0.2506
Epoch [23/500], Train Loss: 0.2476
Epoch [24/500], Train Loss: 0.2396
Epo

[I 2024-04-30 10:16:02,921] Trial 51 finished with value: 0.17873024745629382 and parameters: {'learning_rate': 0.005160721839857431, 'l1_coef': 5.685922594991786e-05, 'patience': 9, 'batch_size': 32}. Best is trial 7 with value: 0.07289292319462849.


Chr 15 - Best hyperparameters: {'learning_rate': 0.00031046985754389273, 'l1_coef': 1.7067190785696914e-05, 'patience': 5, 'batch_size': 32}
Chr 15 - Best value: 0.0729
Epoch [1/500], Train Loss: 0.5827
Epoch [2/500], Train Loss: 0.4825
Epoch [3/500], Train Loss: 0.4524
Epoch [4/500], Train Loss: 0.4352
Epoch [5/500], Train Loss: 0.4220
Epoch [6/500], Train Loss: 0.4113
Epoch [7/500], Train Loss: 0.4029
Epoch [8/500], Train Loss: 0.3940
Epoch [9/500], Train Loss: 0.3864
Epoch [10/500], Train Loss: 0.3789
Epoch [11/500], Train Loss: 0.3723
Epoch [12/500], Train Loss: 0.3661
Epoch [13/500], Train Loss: 0.3604
Epoch [14/500], Train Loss: 0.3548
Epoch [15/500], Train Loss: 0.3492
Epoch [16/500], Train Loss: 0.3446
Epoch [17/500], Train Loss: 0.3401
Epoch [18/500], Train Loss: 0.3354
Epoch [19/500], Train Loss: 0.3307
Epoch [20/500], Train Loss: 0.3272
Epoch [21/500], Train Loss: 0.3227
Epoch [22/500], Train Loss: 0.3190
Epoch [23/500], Train Loss: 0.3151
Epoch [24/500], Train Loss: 0.3122


[I 2024-04-30 10:16:10,928] Trial 51 finished with value: 0.1415694100516183 and parameters: {'learning_rate': 0.029438412179662383, 'l1_coef': 1.6960842966286154e-05, 'patience': 9, 'batch_size': 64}. Best is trial 19 with value: 0.08846894302047217.


Chr 16 - Best hyperparameters: {'learning_rate': 0.0003280120292913132, 'l1_coef': 1.0215158769106902e-05, 'patience': 14, 'batch_size': 32}
Chr 16 - Best value: 0.0885
Epoch [1/500], Train Loss: 0.5473
Epoch [2/500], Train Loss: 0.4663
Epoch [3/500], Train Loss: 0.4454
Epoch [4/500], Train Loss: 0.4300
Epoch [5/500], Train Loss: 0.4172
Epoch [6/500], Train Loss: 0.4063
Epoch [7/500], Train Loss: 0.3964
Epoch [8/500], Train Loss: 0.3875
Epoch [9/500], Train Loss: 0.3792
Epoch [10/500], Train Loss: 0.3717
Epoch [11/500], Train Loss: 0.3644
Epoch [12/500], Train Loss: 0.3576
Epoch [13/500], Train Loss: 0.3511
Epoch [14/500], Train Loss: 0.3451
Epoch [15/500], Train Loss: 0.3396
Epoch [16/500], Train Loss: 0.3335
Epoch [17/500], Train Loss: 0.3285
Epoch [18/500], Train Loss: 0.3234
Epoch [19/500], Train Loss: 0.3188
Epoch [20/500], Train Loss: 0.3143
Epoch [21/500], Train Loss: 0.3098
Epoch [22/500], Train Loss: 0.3055
Epoch [23/500], Train Loss: 0.3014
Epoch [24/500], Train Loss: 0.2979


[I 2024-04-30 10:16:22,459] Trial 51 finished with value: 0.1804597333073616 and parameters: {'learning_rate': 0.011811863607846414, 'l1_coef': 1.6778229036348106e-05, 'patience': 12, 'batch_size': 32}. Best is trial 25 with value: 0.03733763907636915.


Chr 17 - Best hyperparameters: {'learning_rate': 0.01875699494745296, 'l1_coef': 1.0107606908789018e-05, 'patience': 16, 'batch_size': 64}
Chr 17 - Best value: 0.0373
Epoch [1/500], Train Loss: 0.3377
Epoch [2/500], Train Loss: 0.2443
Epoch [3/500], Train Loss: 0.2226
Epoch [4/500], Train Loss: 0.2117
Epoch [5/500], Train Loss: 0.2055
Epoch [6/500], Train Loss: 0.1992
Epoch [7/500], Train Loss: 0.1962
Epoch [8/500], Train Loss: 0.1924
Epoch [9/500], Train Loss: 0.1888
Epoch [10/500], Train Loss: 0.1890
Epoch [11/500], Train Loss: 0.1890
Epoch [12/500], Train Loss: 0.1866
Epoch [13/500], Train Loss: 0.1858
Epoch [14/500], Train Loss: 0.1839
Epoch [15/500], Train Loss: 0.1860
Epoch [16/500], Train Loss: 0.1824
Epoch [17/500], Train Loss: 0.1837
Epoch [18/500], Train Loss: 0.1816
Epoch [19/500], Train Loss: 0.1815
Epoch [20/500], Train Loss: 0.1821
Epoch [21/500], Train Loss: 0.1816
Epoch [22/500], Train Loss: 0.1780
Epoch [23/500], Train Loss: 0.1783
Epoch [24/500], Train Loss: 0.1773
Ep

[I 2024-04-30 10:16:25,521] Trial 51 finished with value: 0.17057546686667663 and parameters: {'learning_rate': 0.06684500720555978, 'l1_coef': 1.9180233950342432e-05, 'patience': 16, 'batch_size': 32}. Best is trial 29 with value: 0.08387428413216884.


Chr 18 - Best hyperparameters: {'learning_rate': 0.06940442887866152, 'l1_coef': 2.1499468833674833e-05, 'patience': 17, 'batch_size': 32}
Chr 18 - Best value: 0.0839
Epoch [1/500], Train Loss: 0.8155
Epoch [2/500], Train Loss: 0.2518
Epoch [3/500], Train Loss: 0.2495
Epoch [4/500], Train Loss: 0.2272
Epoch [5/500], Train Loss: 0.2233
Epoch [6/500], Train Loss: 0.2243
Epoch [7/500], Train Loss: 0.2282
Epoch [8/500], Train Loss: 0.2406
Epoch [9/500], Train Loss: 0.2411
Epoch [10/500], Train Loss: 0.2508
Epoch [11/500], Train Loss: 0.2332
Epoch [12/500], Train Loss: 0.1764
Epoch [13/500], Train Loss: 0.1663
Epoch [14/500], Train Loss: 0.1625
Epoch [15/500], Train Loss: 0.1635
Epoch [16/500], Train Loss: 0.1604
Epoch [17/500], Train Loss: 0.1602
Epoch [18/500], Train Loss: 0.1590
Epoch [19/500], Train Loss: 0.1590
Epoch [20/500], Train Loss: 0.1595
Epoch [21/500], Train Loss: 0.1592
Epoch [22/500], Train Loss: 0.1583
Epoch [23/500], Train Loss: 0.1603
Epoch [24/500], Train Loss: 0.1573
Ep

[I 2024-04-30 10:16:28,992] Trial 51 finished with value: 0.05365990317211702 and parameters: {'learning_rate': 0.0541193007360449, 'l1_coef': 1.6601253217205744e-05, 'patience': 12, 'batch_size': 32}. Best is trial 27 with value: 0.04425642937421799.


Chr 19 - Best hyperparameters: {'learning_rate': 0.07098016523175216, 'l1_coef': 1.0298921090840162e-05, 'patience': 11, 'batch_size': 128}
Chr 19 - Best value: 0.0443
Epoch [1/500], Train Loss: 0.7952
Epoch [2/500], Train Loss: 0.2552
Epoch [3/500], Train Loss: 0.1461
Epoch [4/500], Train Loss: 0.1086
Epoch [5/500], Train Loss: 0.0903
Epoch [6/500], Train Loss: 0.0807
Epoch [7/500], Train Loss: 0.0741
Epoch [8/500], Train Loss: 0.0708
Epoch [9/500], Train Loss: 0.0670
Epoch [10/500], Train Loss: 0.0639
Epoch [11/500], Train Loss: 0.0611
Epoch [12/500], Train Loss: 0.0579
Epoch [13/500], Train Loss: 0.0558
Epoch [14/500], Train Loss: 0.0560
Epoch [15/500], Train Loss: 0.0539
Epoch [16/500], Train Loss: 0.0540
Epoch [17/500], Train Loss: 0.0525
Epoch [18/500], Train Loss: 0.0517
Epoch [19/500], Train Loss: 0.0524
Epoch [20/500], Train Loss: 0.0489
Epoch [21/500], Train Loss: 0.0474
Epoch [22/500], Train Loss: 0.0459
Epoch [23/500], Train Loss: 0.0494
Epoch [24/500], Train Loss: 0.0459
E

[I 2024-04-30 10:16:31,392] Trial 51 finished with value: 0.16642819259847913 and parameters: {'learning_rate': 0.059122094780524326, 'l1_coef': 2.4162510707354773e-05, 'patience': 13, 'batch_size': 64}. Best is trial 23 with value: 0.11483677476644516.


Chr 20 - Best hyperparameters: {'learning_rate': 0.011727770284572503, 'l1_coef': 1.0588262176994207e-05, 'patience': 9, 'batch_size': 128}
Chr 20 - Best value: 0.1148
Epoch [1/500], Train Loss: 0.3474
Epoch [2/500], Train Loss: 0.2138
Epoch [3/500], Train Loss: 0.2090
Epoch [4/500], Train Loss: 0.2010
Epoch [5/500], Train Loss: 0.1963
Epoch [6/500], Train Loss: 0.1947
Epoch [7/500], Train Loss: 0.1913
Epoch [8/500], Train Loss: 0.1887
Epoch [9/500], Train Loss: 0.1870
Epoch [10/500], Train Loss: 0.1861
Epoch [11/500], Train Loss: 0.1843
Epoch [12/500], Train Loss: 0.1822
Epoch [13/500], Train Loss: 0.1826
Epoch [14/500], Train Loss: 0.1808
Epoch [15/500], Train Loss: 0.1802
Epoch [16/500], Train Loss: 0.1789
Epoch [17/500], Train Loss: 0.1780
Epoch [18/500], Train Loss: 0.1777
Epoch [19/500], Train Loss: 0.1777
Epoch [20/500], Train Loss: 0.1761
Epoch [21/500], Train Loss: 0.1748
Epoch [22/500], Train Loss: 0.1748
Epoch [23/500], Train Loss: 0.1744
Epoch [24/500], Train Loss: 0.1752
E

[I 2024-04-30 10:16:35,114] Trial 51 finished with value: 0.20866690851174868 and parameters: {'learning_rate': 0.00425629587460915, 'l1_coef': 2.5980568836531632e-05, 'patience': 19, 'batch_size': 32}. Best is trial 27 with value: 0.0400828163006476.


Chr 21 - Best hyperparameters: {'learning_rate': 0.07634654140233994, 'l1_coef': 1.331717466479133e-05, 'patience': 19, 'batch_size': 64}
Chr 21 - Best value: 0.0401
Epoch [1/500], Train Loss: 0.4141
Epoch [2/500], Train Loss: 0.2711
Epoch [3/500], Train Loss: 0.2404
Epoch [4/500], Train Loss: 0.2337
Epoch [5/500], Train Loss: 0.2256
Epoch [6/500], Train Loss: 0.2236
Epoch [7/500], Train Loss: 0.2190
Epoch [8/500], Train Loss: 0.2169
Epoch [9/500], Train Loss: 0.2185
Epoch [10/500], Train Loss: 0.2199
Epoch [11/500], Train Loss: 0.2182
Epoch [12/500], Train Loss: 0.2169
Epoch [13/500], Train Loss: 0.2207
Epoch [14/500], Train Loss: 0.2147
Epoch [15/500], Train Loss: 0.2141
Epoch [16/500], Train Loss: 0.2201
Epoch [17/500], Train Loss: 0.2209
Epoch [18/500], Train Loss: 0.2192
Epoch [19/500], Train Loss: 0.2154
Epoch [20/500], Train Loss: 0.2151
Epoch [21/500], Train Loss: 0.2159
Epoch [22/500], Train Loss: 0.2031
Epoch [23/500], Train Loss: 0.2027
Epoch [24/500], Train Loss: 0.2028
Epo

[I 2024-04-30 10:16:37,198] Trial 51 finished with value: 0.20031897425651551 and parameters: {'learning_rate': 0.027286468996185014, 'l1_coef': 1.610250127616247e-05, 'patience': 10, 'batch_size': 64}. Best is trial 40 with value: 0.06794291579952608.


Chr 22 - Best hyperparameters: {'learning_rate': 0.02943043957881216, 'l1_coef': 1.0232605917875621e-05, 'patience': 12, 'batch_size': 32}
Chr 22 - Best value: 0.0679
Epoch [1/500], Train Loss: 0.3877
Epoch [2/500], Train Loss: 0.2737
Epoch [3/500], Train Loss: 0.2511
Epoch [4/500], Train Loss: 0.2488
Epoch [5/500], Train Loss: 0.2414
Epoch [6/500], Train Loss: 0.2249
Epoch [7/500], Train Loss: 0.2303
Epoch [8/500], Train Loss: 0.2174
Epoch [9/500], Train Loss: 0.2177
Epoch [10/500], Train Loss: 0.2222
Epoch [11/500], Train Loss: 0.2170
Epoch [12/500], Train Loss: 0.2150
Epoch [13/500], Train Loss: 0.2105
Epoch [14/500], Train Loss: 0.2217
Epoch [15/500], Train Loss: 0.2207
Epoch [16/500], Train Loss: 0.2137
Epoch [17/500], Train Loss: 0.2130
Epoch [18/500], Train Loss: 0.2163
Epoch [19/500], Train Loss: 0.2125
Epoch [20/500], Train Loss: 0.1824
Epoch [21/500], Train Loss: 0.1783
Epoch [22/500], Train Loss: 0.1776
Epoch [23/500], Train Loss: 0.1772
Epoch [24/500], Train Loss: 0.1764
Ep

  iqs = (po - pc) / (1 - pc)


Individual AUC ROC curves saved in: ../../Data/model_results_unphased_all_PRS/logistic_regression/roc_curves/
Performance metrics saved at: ../../Data/model_results_unphased_all_PRS/logistic_regression/csv_files/performance_metrics.csv


<Figure size 640x480 with 0 Axes>

In [8]:
# Loop through all the training datasets and document the PRS313 SNPs in each dataset. Save this to a CSV file.

import pandas as pd
import os

data_directory = '../../Data/Filtered_unphased_training_data/'
output_folder = "../../Data/model_results_unphased_all_PRS/logistic_regression/csv_files/"

# Create the output folder if it doesn't exist
os.makedirs(output_folder, exist_ok=True)

# Initialize a list to store the PRS313 SNPs in each dataset
prs313_snps = []

for chromosome_number in range(1, 23):
    file_name = data_directory + \
        f"23AndMe_PRS313_merged_chr{chromosome_number}_matching_combined.parquet"
    data = pd.read_parquet(file_name)

    prs313_snps.append(data.filter(regex='PRS313_').columns)

# Create a DataFrame to store the PRS313 SNPs in each dataset
prs313_df = pd.DataFrame({
    'Chromosome': list(range(1, 23)),
    'PRS313 SNPs': prs313_snps,
    "Number of PRS313 SNPs": [len(snps) for snps in prs313_snps]
})

# Save the PRS313 SNPs to a CSV file
prs313_csv_file = output_folder + 'prs313_snps.csv'
prs313_df.to_csv(prs313_csv_file, index=False)
print(f"PRS313 SNPs saved at: {prs313_csv_file}")

# Print the total number of PRS313 SNPs in all datasets
total_prs313_snps = sum(prs313_df["Number of PRS313 SNPs"])
print(f"Total number of PRS313 SNPs: {total_prs313_snps}")



PRS313 SNPs saved at: ../../Data/model_results_unphased_all_PRS/logistic_regression/csv_files/prs313_snps.csv
Total number of PRS313 SNPs: 314


In [9]:
prs313_df["Number of PRS313 SNPs"]

0     30
1     21
2     16
3     11
4     34
5     20
6     14
7     21
8     14
9     18
10    19
11    17
12     5
13     8
14     7
15    14
16     9
17     9
18     7
19     4
20     5
21    11
Name: Number of PRS313 SNPs, dtype: int64