# Logistic Regression Model

In [2]:
import numpy as np

def calculate_iqs(true_genotypes, imputed_dosages):
    # Convert imputed dosages to discrete values
    imputed_discrete = np.round(imputed_dosages).astype(int)

    # Clip the imputed discrete values to be within the range of 0 to 2
    imputed_discrete = np.clip(imputed_discrete, 0, 2)

    # Create a contingency table
    contingency_table = np.zeros((3, 3), dtype=int)

    # Fill the contingency table
    for true_geno, imputed_geno in zip(true_genotypes, imputed_discrete):
        for true_allele, imputed_allele in zip(true_geno, imputed_geno):
            contingency_table[int(true_allele), int(imputed_allele)] += 1

    # Calculate the total number of genotypes
    total_genotypes = np.sum(contingency_table)

    # Calculate observed proportion of agreement (Po)
    observed_agreement = np.trace(contingency_table) / total_genotypes

    # Calculate marginal sums
    row_marginals = np.sum(contingency_table, axis=1)
    col_marginals = np.sum(contingency_table, axis=0)

    # Calculate chance agreement (Pc)
    chance_agreement = np.sum((row_marginals * col_marginals) / (total_genotypes ** 2))

    # Calculate IQS
    if chance_agreement == 1:  # To prevent division by zero in case of perfect chance agreement
        iqs_score = 0
    else:
        iqs_score = (observed_agreement - chance_agreement) / (1 - chance_agreement)

    return iqs_score

# Example usage:
true_genotypes = np.array([[0, 1, 2], [1, 2, 0], [2, 0, 1]])
imputed_dosages = np.array([[0.1, 1.2, 1.9], [1.0, 1.8, 0.3], [2.0, 0.5, 1.4]])

iqs_score = calculate_iqs(true_genotypes, imputed_dosages)
print(f"IQS Score: {iqs_score}")


IQS Score: 1.0


In [3]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, roc_curve
from sklearn.metrics import r2_score as sklearn_r2_score
import optuna
from matplotlib import pyplot as plt
import os
import csv

# Load and preprocess the data
data_directory = '../../../Data/Filtered_split_training_data/'
start = 6

# Initialize lists to store the performance metrics for each chromosome
accuracies = []
precisions = []
recalls = []
false_positive_rates = []
auc_rocs = []
r2_scores = []
iqs_scores = []

# Create folders for saving files
output_folder = "../../../Data/model_results/logistic_regression/"
model_folder = output_folder + "models/"
csv_folder = output_folder + "csv_files/"
curve_folder = output_folder + "roc_curves/"

os.makedirs(model_folder, exist_ok=True)
os.makedirs(csv_folder, exist_ok=True)
os.makedirs(curve_folder, exist_ok=True)

for chromosome_number in range(start, 23):
    # Create subfolders for the current chromosome
    chr_model_folder = model_folder + f"chr{chromosome_number}/"
    chr_csv_folder = csv_folder + f"chr{chromosome_number}/"
    chr_curve_folder = curve_folder + f"chr{chromosome_number}/"

    os.makedirs(chr_model_folder, exist_ok=True)
    os.makedirs(chr_csv_folder, exist_ok=True)
    os.makedirs(chr_curve_folder, exist_ok=True)

    file_name = data_directory + f"23AndMe_PRS313_merged_chr{chromosome_number}_matching_split.parquet"
    data = pd.read_parquet(file_name)


    # # Split the data into features and target
    # X = torch.tensor(data.filter(regex='^(?!.*Unknown)').values, dtype=torch.float32)
    # y = torch.tensor(data.filter(regex='Unknown').values, dtype=torch.float32)

    # print("Unknown PRS313 SNPs: ", y.shape[1])
    # print("Known PRS313 SNPs: ", data[[col for col in data.columns if "PRS313_Known" in col]].shape[1])
    # print("23AndMe SNPs with LD to Unknown PRS313 SNPs: ", data[[col for col in data.columns if "PRS313_" not in col]].shape[1])
    # print("Total SNPs used for Training: ", X.shape[1])

    # Split the data into features and target
    X = torch.tensor(data.filter(regex='^(?!.*PRS313_)').values, dtype=torch.float32)
    y = torch.tensor(data.filter(regex='PRS313_').values, dtype=torch.float32)

    print("Total SNPs: ", data.shape[1])
    print("PRS313 SNPs: ", y.shape[1])
    print("Total SNPs used for Training: ", X.shape[1])


    # Split the data into train-validation and test sets
    X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Define the logistic regression model with lasso regularization
    class LogisticRegression(nn.Module):
        def __init__(self, input_dim, output_dim, l1_coef=0.0):
            super(LogisticRegression, self).__init__()
            self.linear = nn.Linear(input_dim, output_dim)
            self.sigmoid = nn.Sigmoid()
            self.l1_coef = l1_coef

        def forward(self, x):
            out = self.linear(x)
            out = self.sigmoid(out)
            return out

        def l1_loss(self):
            return self.l1_coef * torch.norm(self.linear.weight, p=1)
        
    # Set the device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Set the hyperparameters for tuning
    input_dim = X_train_val.shape[1]
    output_dim = y_train_val.shape[1]
    num_epochs = 500
    batch_size = 128

    # Define the objective function for Optuna with cross-validation and early stopping
    def objective(trial):
        learning_rate = trial.suggest_float('learning_rate', 1e-4, 1e-1, log=True)
        l1_coef = trial.suggest_float('l1_coef', 1e-5, 1e-1, log=True)
        patience = trial.suggest_int('patience', 5, 20)
        batch_size = trial.suggest_categorical('batch_size', [32, 64, 128, 256])

        model = LogisticRegression(input_dim, output_dim, l1_coef).to(device)
        optimizer = optim.Adam(model.parameters(), lr=learning_rate)
        criterion = nn.BCELoss()
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=5, verbose=False)

        skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
        fold_losses = []

        for fold, (train_idx, val_idx) in enumerate(skf.split(X_train_val, y_train_val.argmax(dim=1))):
            X_train, X_val = X_train_val[train_idx], X_train_val[val_idx]
            y_train, y_val = y_train_val[train_idx], y_train_val[val_idx]

            train_dataset = TensorDataset(X_train, y_train)
            train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

            best_val_loss = float('inf')
            counter = 0

            for epoch in range(num_epochs):
                train_loss = 0.0
                for batch_X, batch_y in train_loader:
                    batch_X, batch_y = batch_X.to(device), batch_y.to(device)

                    outputs = model(batch_X)
                    loss = criterion(outputs, batch_y) + model.l1_loss()

                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()

                    train_loss += loss.item()

                train_loss /= len(train_loader)

                val_dataset = TensorDataset(X_val, y_val)
                val_loader = DataLoader(val_dataset, batch_size=batch_size)

                with torch.no_grad():
                    val_loss = 0.0
                    for batch_X, batch_y in val_loader:
                        batch_X, batch_y = batch_X.to(device), batch_y.to(device)
                        outputs = model(batch_X)
                        loss = criterion(outputs, batch_y) + model.l1_loss()
                        val_loss += loss.item()

                    val_loss /= len(val_loader)
                    scheduler.step(val_loss)

                    if val_loss < best_val_loss:
                        best_val_loss = val_loss
                        counter = 0
                    else:
                        counter += 1

                    if counter >= patience:
                        # print(f"Early stopping at epoch {epoch+1}")
                        break

            fold_losses.append(best_val_loss)

        return np.mean(fold_losses)

    # Create the "optuna_studies" folder if it doesn't exist
    os.makedirs("optuna_studies", exist_ok=True)

    # Create an Optuna study and optimize the hyperparameters
    study_name = f"chr{chromosome_number}_study"
    storage_name = f"sqlite:///optuna_studies/{study_name}.db"

    # Check if the study exists

    current_dir = os.getcwd()
    study_exists = os.path.exists(current_dir + f"/optuna_studies/{study_name}.db")
    
    if study_exists:
        # Load the existing study
        study = optuna.load_study(study_name=study_name, storage=storage_name)
    else:
        # Create a new study
        study = optuna.create_study(direction='minimize', study_name=study_name, storage=storage_name)

    study.optimize(objective, n_trials=10, n_jobs=-1)

    # Print the best hyperparameters and best value
    print(f"Chr {chromosome_number} - Best hyperparameters: {study.best_params}")
    print(f"Chr {chromosome_number} - Best value: {study.best_value:.4f}")

    # Train the final model with the best hyperparameters and early stopping
    best_learning_rate = study.best_params['learning_rate']
    best_l1_coef = study.best_params['l1_coef']
    best_patience = study.best_params['patience']
    best_batch_size = study.best_params['batch_size']

    model = LogisticRegression(input_dim, output_dim, best_l1_coef).to(device)
    optimizer = optim.Adam(model.parameters(), lr=best_learning_rate)
    criterion = nn.BCELoss()
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=5, verbose=False)

    train_dataset = TensorDataset(X_train_val, y_train_val)
    train_loader = DataLoader(train_dataset, batch_size=best_batch_size, shuffle=True)

    best_train_loss = float('inf')
    counter = 0

    for epoch in range(num_epochs):
        train_loss = 0.0
        for batch_X, batch_y in train_loader:
            batch_X, batch_y = batch_X.to(device), batch_y.to(device)

            outputs = model(batch_X)
            loss = criterion(outputs, batch_y) + model.l1_loss()

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            train_loss += loss.item()

        train_loss /= len(train_loader)
        print(f"Epoch [{epoch+1}/{num_epochs}], Train Loss: {train_loss:.4f}")

        if train_loss < best_train_loss:
            best_train_loss = train_loss
            counter = 0
        else:
            counter += 1

        if counter >= best_patience:
            print(f"Early stopping at epoch {epoch+1}")
            break

        scheduler.step(train_loss)

    # Save the final model
    model_save_path = chr_model_folder + f'final_model_chr{chromosome_number}.pth'
    torch.save(model.state_dict(), model_save_path)
    print(f"Final model saved at: {model_save_path}")

    # Evaluate the final model on the test set
    with torch.no_grad():
        test_outputs = model(X_test.to(device))
        test_preds = (test_outputs > 0.5).float()
        test_accuracy = float(((test_preds > 0.5) == y_test).float().mean())
        test_precision = precision_score(y_test.cpu().numpy(), test_preds.cpu().numpy(), average='micro')
        test_recall = recall_score(y_test.cpu().numpy(), test_preds.cpu().numpy(), average='micro')
        test_f1 = f1_score(y_test.cpu().numpy(), test_preds.cpu().numpy(), average='micro')
        test_roc_auc = roc_auc_score(y_test.cpu().numpy(), test_outputs.cpu().numpy(), average='micro')
        test_r2 = sklearn_r2_score(y_test.cpu().numpy(), test_outputs.cpu().numpy())
        test_iqs = calculate_iqs(y_test.cpu().numpy(), test_outputs.cpu().numpy())

        # Calculate false positive rate
        cm = confusion_matrix(y_test.cpu().numpy().ravel(), test_preds.cpu().numpy().ravel())
        tn, fp, fn, tp = cm.ravel()
        test_fpr = fp / (fp + tn)

        # Append performance metrics to the lists
        accuracies.append(test_accuracy)
        precisions.append(test_precision)
        recalls.append(test_recall)
        false_positive_rates.append(test_fpr)
        auc_rocs.append(test_roc_auc)
        r2_scores.append(test_r2)
        iqs_scores.append(test_iqs)

        # Calculate individual R^2 scores for each SNP
        individual_r2_scores = sklearn_r2_score(y_test.cpu().numpy(), test_outputs.cpu().numpy(), multioutput='raw_values')

        # Calculate individual IQS scores for each SNP
        individual_iqs_scores = np.array([calculate_iqs(y_test.cpu().numpy()[:, i].reshape(-1, 1), test_outputs.cpu().numpy()[:, i].reshape(-1, 1)) for i in range(y_test.shape[1])])

        # Get the names of the SNPs from the original dataframe
        snp_names = data.filter(regex='Unknown').columns

        # Save individual R^2 scores to a CSV file
        csv_file = chr_csv_folder + f'individual_r2_scores_chr{chromosome_number}.csv'

        with open(csv_file, 'w', newline='') as file:
            writer = csv.writer(file)
            writer.writerow(['SNP', 'R2 Score'])
            for snp, r2_score in zip(snp_names, individual_r2_scores):
                writer.writerow([snp, r2_score])

        print(f"Individual R^2 scores saved at: {csv_file}")

        # Save individual IQS scores to a CSV file
        iqs_csv_file = chr_csv_folder + f'individual_iqs_scores_chr{chromosome_number}.csv'

        with open(iqs_csv_file, 'w', newline='') as file:
            writer = csv.writer(file)
            writer.writerow(['SNP', 'IQS Score'])
            for snp, iqs_score in zip(snp_names, individual_iqs_scores):
                writer.writerow([snp, iqs_score])

        print(f"Individual IQS scores saved at: {iqs_csv_file}")

        # Save individual AUC ROC curves for each SNP
        for i, snp in enumerate(snp_names):
            try: 
                fpr, tpr, _ = roc_curve(y_test.cpu().numpy()[:, i], test_outputs.cpu().numpy()[:, i])
                plt.figure()
                plt.plot(fpr, tpr, label=f'AUC ROC = {roc_auc_score(y_test.cpu().numpy()[:, i], test_outputs.cpu().numpy()[:, i]):.4f}')
                plt.xlabel('False Positive Rate')
                plt.ylabel('True Positive Rate')
                plt.title(f'AUC ROC Curve - {snp}')
                plt.legend()
                
                curve_file = chr_curve_folder + f'auc_roc_curve_{snp}_chr{chromosome_number}.png'
                plt.savefig(curve_file)
                plt.close()
            except ValueError:
                # Save a placeholder image if there is insufficient data
                plt.figure()
                plt.axis('off')
                plt.text(0.5, 0.5, "Insufficient data for ROC curve", ha='center', va='center')
                curve_file = chr_curve_folder + f'auc_roc_curve_{snp}_chr{chrom0osome_number}.png'
                plt.savefig(curve_file)
                plt.close()

                print(f"Skipping SNP {snp} due to insufficient data")


        print(f"Individual AUC ROC curves saved in: {curve_folder}")

        # Create a DataFrame to store the performance metrics for each chromosome
        performance_df = pd.DataFrame({
            'Chromosome': list(range(start, chromosome_number + 1)),
            'Accuracy': accuracies,
            'Precision': precisions,
            'Recall': recalls,
            'False Positive Rate': false_positive_rates,
            'AUC ROC': auc_rocs,
            'R2 Score': r2_scores,
            'IQS Score': iqs_scores
        })

        # Save the performance metrics to a CSV file
        performance_csv_file = csv_folder + 'performance_metrics.csv'
        performance_df.to_csv(performance_csv_file, index=False)
        print(f"Performance metrics saved at: {performance_csv_file}")

Total SNPs:  1714
PRS313 SNPs:  40
Total SNPs used for Training:  1674


[I 2024-05-30 13:49:12,711] Trial 10 finished with value: 0.22871639476372643 and parameters: {'learning_rate': 0.009375212909338682, 'l1_coef': 4.393958333523146e-05, 'patience': 5, 'batch_size': 32}. Best is trial 0 with value: 0.17545781341882855.
[I 2024-05-30 13:49:51,343] Trial 4 finished with value: 0.552144455909729 and parameters: {'learning_rate': 0.010779792685448014, 'l1_coef': 0.04157067353646598, 'patience': 13, 'batch_size': 256}. Best is trial 0 with value: 0.17545781341882855.
[I 2024-05-30 13:49:56,615] Trial 6 finished with value: 0.552363109588623 and parameters: {'learning_rate': 0.002239794865613517, 'l1_coef': 0.004876956611674172, 'patience': 20, 'batch_size': 256}. Best is trial 0 with value: 0.17545781341882855.
[I 2024-05-30 13:49:58,433] Trial 1 finished with value: 0.5473843425512314 and parameters: {'learning_rate': 0.03835060051676701, 'l1_coef': 0.07468179533000352, 'patience': 19, 'batch_size': 128}. Best is trial 0 with value: 0.17545781341882855.
[I 2

Chr 6 - Best hyperparameters: {'learning_rate': 0.08707712544870178, 'l1_coef': 3.1310975887339915e-05, 'patience': 15, 'batch_size': 32}
Chr 6 - Best value: 0.1576
Epoch [1/500], Train Loss: 21.1955
Epoch [2/500], Train Loss: 14.4986
Epoch [3/500], Train Loss: 13.1855
Epoch [4/500], Train Loss: 12.6010
Epoch [5/500], Train Loss: 9.6711
Epoch [6/500], Train Loss: 8.9244
Epoch [7/500], Train Loss: 8.4733
Epoch [8/500], Train Loss: 9.2435
Epoch [9/500], Train Loss: 10.3568
Epoch [10/500], Train Loss: 9.0054
Epoch [11/500], Train Loss: 8.3581
Epoch [12/500], Train Loss: 8.6895
Epoch [13/500], Train Loss: 8.5220
Epoch [14/500], Train Loss: 8.4626
Epoch [15/500], Train Loss: 7.5776
Epoch [16/500], Train Loss: 9.1982
Epoch [17/500], Train Loss: 8.2002
Epoch [18/500], Train Loss: 7.6238
Epoch [19/500], Train Loss: 6.3554
Epoch [20/500], Train Loss: 5.7224
Epoch [21/500], Train Loss: 5.9113
Epoch [22/500], Train Loss: 6.9852
Epoch [23/500], Train Loss: 7.4490
Epoch [24/500], Train Loss: 6.5283

[I 2024-05-30 13:51:12,199] A new study created in RDB with name: chr7_study
[I 2024-05-30 13:52:00,389] Trial 1 finished with value: 0.45895434767007826 and parameters: {'learning_rate': 0.06789925410251739, 'l1_coef': 0.003303112958291555, 'patience': 10, 'batch_size': 128}. Best is trial 1 with value: 0.45895434767007826.
[I 2024-05-30 13:52:06,570] Trial 7 finished with value: 4.659018802642822 and parameters: {'learning_rate': 0.07885376895907296, 'l1_coef': 2.875412588146054e-05, 'patience': 20, 'batch_size': 64}. Best is trial 1 with value: 0.45895434767007826.
[I 2024-05-30 13:52:10,194] Trial 3 finished with value: 0.12991864010691642 and parameters: {'learning_rate': 0.019064635865524475, 'l1_coef': 5.3120990680199706e-05, 'patience': 15, 'batch_size': 128}. Best is trial 3 with value: 0.12991864010691642.
[I 2024-05-30 13:52:11,318] Trial 4 finished with value: 0.5769830822944642 and parameters: {'learning_rate': 0.0016518075391926082, 'l1_coef': 0.03236144996571162, 'patien

Chr 7 - Best hyperparameters: {'learning_rate': 0.04237722490218332, 'l1_coef': 5.949874007250977e-05, 'patience': 8, 'batch_size': 32}
Chr 7 - Best value: 0.1294
Epoch [1/500], Train Loss: 0.9855
Epoch [2/500], Train Loss: 0.3288
Epoch [3/500], Train Loss: 0.2752
Epoch [4/500], Train Loss: 0.2819
Epoch [5/500], Train Loss: 0.2769
Epoch [6/500], Train Loss: 0.2739
Epoch [7/500], Train Loss: 0.2853
Epoch [8/500], Train Loss: 0.2996
Epoch [9/500], Train Loss: 0.2838
Epoch [10/500], Train Loss: 0.2973
Epoch [11/500], Train Loss: 0.3025
Epoch [12/500], Train Loss: 0.2958
Epoch [13/500], Train Loss: 0.2171
Epoch [14/500], Train Loss: 0.1765
Epoch [15/500], Train Loss: 0.1625
Epoch [16/500], Train Loss: 0.1547
Epoch [17/500], Train Loss: 0.1507
Epoch [18/500], Train Loss: 0.1475
Epoch [19/500], Train Loss: 0.1453
Epoch [20/500], Train Loss: 0.1438
Epoch [21/500], Train Loss: 0.1431
Epoch [22/500], Train Loss: 0.1424
Epoch [23/500], Train Loss: 0.1418
Epoch [24/500], Train Loss: 0.1403
Epoch 

[I 2024-05-30 13:53:19,970] A new study created in RDB with name: chr8_study


Individual AUC ROC curves saved in: ../../../Data/model_results/logistic_regression/roc_curves/
Performance metrics saved at: ../../../Data/model_results/logistic_regression/csv_files/performance_metrics.csv
Total SNPs:  1614
PRS313 SNPs:  42
Total SNPs used for Training:  1572


[I 2024-05-30 13:53:41,452] Trial 8 finished with value: 17.74535541534424 and parameters: {'learning_rate': 0.0627810549564541, 'l1_coef': 1.369331145091169e-05, 'patience': 13, 'batch_size': 256}. Best is trial 8 with value: 17.74535541534424.
[I 2024-05-30 13:54:06,393] Trial 4 finished with value: 0.5366251051425934 and parameters: {'learning_rate': 0.08806299222279251, 'l1_coef': 0.008663605059545164, 'patience': 13, 'batch_size': 256}. Best is trial 4 with value: 0.5366251051425934.
[I 2024-05-30 13:54:08,626] Trial 6 finished with value: 0.5366137862205506 and parameters: {'learning_rate': 0.058834991339273834, 'l1_coef': 0.013411984672026773, 'patience': 13, 'batch_size': 256}. Best is trial 6 with value: 0.5366137862205506.
[I 2024-05-30 13:54:13,646] Trial 3 finished with value: 0.536030201613903 and parameters: {'learning_rate': 0.09166841542245097, 'l1_coef': 0.02900524354645823, 'patience': 9, 'batch_size': 128}. Best is trial 3 with value: 0.536030201613903.
[I 2024-05-30