In [2]:
import numpy as np

def calculate_iqs_unphased(true_genotypes, imputed_dosages):
    """Compute the Imputation Quality Score (IQS) for unphased genotypes.

    The score is a chance-corrected agreement between true and imputed
    genotype classes: ``(Po - Pc) / (1 - Pc)``, where ``Po`` is the observed
    proportion of agreement and ``Pc`` is the agreement expected from the
    marginal class frequencies of a 3x3 contingency table.

    Parameters
    ----------
    true_genotypes : array-like
        True genotype calls. Values are truncated to integers and must lie in
        {0, 1, 2}. Any shape is accepted (scalars per SNP, 1-D vectors, 2-D
        sample x SNP matrices, ...).
    imputed_dosages : array-like
        Imputed dosages with the same shape as ``true_genotypes``. They are
        rounded with ``np.round`` and clipped to the range [0, 2].

    Returns
    -------
    float
        The IQS. ``0`` is returned when the chance agreement equals 1 (both
        inputs fall into a single class), and ``nan`` when the input is empty.

    Raises
    ------
    ValueError
        If the two inputs have different shapes, or a true genotype is
        outside {0, 1, 2}.
    """
    true_arr = np.asarray(true_genotypes)
    dosage_arr = np.asarray(imputed_dosages)

    # A shape mismatch used to be silently truncated by zip(); fail loudly.
    if true_arr.shape != dosage_arr.shape:
        raise ValueError(
            f"Shape mismatch: true_genotypes {true_arr.shape} vs "
            f"imputed_dosages {dosage_arr.shape}"
        )

    # Nothing to compare: the score is undefined.
    if true_arr.size == 0:
        return float("nan")

    # Truncate toward zero, matching int() on each element.
    true_classes = true_arr.astype(int).ravel()
    # Negative values would otherwise wrap around via negative indexing.
    if true_classes.min() < 0 or true_classes.max() > 2:
        raise ValueError("true_genotypes must only contain the classes 0, 1 and 2")

    # Convert imputed dosages to discrete classes within 0..2
    imputed_classes = np.clip(np.round(dosage_arr).astype(int), 0, 2).ravel()

    # Rows index the true class, columns the imputed class.
    # np.add.at accumulates correctly for repeated index pairs.
    contingency_table = np.zeros((3, 3), dtype=int)
    np.add.at(contingency_table, (true_classes, imputed_classes), 1)

    total_genotypes = np.sum(contingency_table)

    # Observed proportion of agreement (Po)
    observed_agreement = np.trace(contingency_table) / total_genotypes

    # Chance agreement (Pc) from the marginal sums
    row_marginals = np.sum(contingency_table, axis=1)
    col_marginals = np.sum(contingency_table, axis=0)
    chance_agreement = np.sum((row_marginals * col_marginals) / (total_genotypes ** 2))

    # Pc == 1 happens when everything sits in one diagonal cell; avoid 0/0.
    if chance_agreement == 1:
        return 0

    return (observed_agreement - chance_agreement) / (1 - chance_agreement)

# Example usage: a 3x3 genotype matrix where every dosage rounds to the
# true genotype.
true_genotypes = np.array([
    [0, 1, 2],
    [1, 2, 0],
    [2, 0, 1],
])
imputed_dosages = np.array([
    [0.1, 1.2, 1.9],
    [1.0, 1.8, 0.3],
    [2.0, 0.5, 1.4],
])

iqs_score = calculate_iqs_unphased(true_genotypes, imputed_dosages)
print(f"IQS Score: {iqs_score}")


IQS Score: 1.0


# Linear Regression Model

In [6]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import r2_score as sklearn_r2_score, accuracy_score
import optuna
from matplotlib import pyplot as plt
import os
import csv

# Location of the per-chromosome parquet files and the first chromosome to
# process (the loop below runs from `start` through chromosome 22).
data_directory = '../../../Data/Filtered_unphased_training_data_union_final/'
start = 8

# Test-set metrics, one entry appended per processed chromosome
r2_scores, iqs_scores, accuracy_scores = [], [], []

# Output tree: <output_folder>/models_unphased/ and <output_folder>/csv_files/
output_folder = "../../../Data/model_results_unphased_all_PRS/linear_regression/"
csv_folder = output_folder + "csv_files/"
model_folder = output_folder + "models_unphased/"

# Make sure both output folders exist before the loop writes into them
os.makedirs(model_folder, exist_ok=True)
os.makedirs(csv_folder, exist_ok=True)

for chromosome_number in range(start, 23):
    # Per-chromosome output folders for the saved model and the metric CSVs
    chr_suffix = f"chr{chromosome_number}/"
    chr_model_folder = model_folder + chr_suffix
    chr_csv_folder = csv_folder + chr_suffix
    os.makedirs(chr_model_folder, exist_ok=True)
    os.makedirs(chr_csv_folder, exist_ok=True)

    # Read this chromosome's merged table
    file_name = data_directory + f"23AndMe_PRS313_merged_chr{chromosome_number}_matching_combined.parquet"
    data = pd.read_parquet(file_name)

    # Targets are the columns whose name contains 'PRS313_'; every other
    # column is an input feature.
    y = torch.tensor(data.filter(regex='PRS313_').values, dtype=torch.float32)
    X = torch.tensor(data.filter(regex='^(?!.*PRS313_)').values, dtype=torch.float32)

    print("Total SNPs: ", data.shape[1])
    print("PRS313 SNPs: ", y.shape[1])
    print("Total SNPs used for Training: ", X.shape[1])

    # Hold out 20% of the rows as the test set (fixed seed)
    X_train_val, X_test, y_train_val, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    # Multi-output linear regression with an optional L1 weight penalty
    class LinearRegression(nn.Module):
        """A single affine layer mapping ``input_dim`` features to ``output_dim`` targets."""

        def __init__(self, input_dim, output_dim, l1_coef=0.0):
            super().__init__()
            self.l1_coef = l1_coef
            self.linear = nn.Linear(input_dim, output_dim)

        def forward(self, x):
            """Return the linear predictions for ``x``."""
            return self.linear(x)

        def l1_loss(self):
            """Return ``l1_coef`` times the L1 norm of the weight matrix (bias excluded)."""
            weights = self.linear.weight
            return self.l1_coef * weights.norm(p=1)

    # Set the device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Problem dimensions, taken from the train/validation split
    input_dim = X_train_val.shape[1]
    output_dim = y_train_val.shape[1]
    # NOTE(review): not read below -- `objective` samples its own epoch budget
    # and the final training uses the tuned value. Kept in case other cells use it.
    num_epochs = 500

    # Define the objective function for Optuna with cross-validation and early stopping
    def objective(trial):
        """Return the mean best validation loss over a 5-fold cross-validation.

        Hyperparameters are sampled from ``trial``. Every fold trains a
        freshly initialised model, so no fold is validated with weights that
        were already fitted on its validation rows. Training of a fold stops
        early once the validation loss has failed to improve for ``patience``
        consecutive epochs. The loss is MSE plus the model's L1 penalty.
        """
        learning_rate = trial.suggest_float('learning_rate', 1e-6, 1e-1, log=True)
        l1_coef = trial.suggest_float('l1_coef', 1e-10, 1e-1, log=True)
        patience = trial.suggest_int('patience', 5, 20)
        batch_size = trial.suggest_categorical('batch_size', [32, 64, 128, 256])
        lr_factor = trial.suggest_float('lr_factor', 0.1, 0.9)
        num_epochs = trial.suggest_int('num_epochs', 100, 500)

        criterion = nn.MSELoss()
        kf = KFold(n_splits=5, shuffle=True, random_state=42)
        fold_losses = []

        for train_idx, val_idx in kf.split(X_train_val):
            # A new model, optimizer and scheduler per fold: sharing them
            # across folds leaks training information into later folds.
            model = LinearRegression(input_dim, output_dim, l1_coef).to(device)
            optimizer = optim.Adam(model.parameters(), lr=learning_rate)
            # `verbose` is left at its default (False).
            scheduler = optim.lr_scheduler.ReduceLROnPlateau(
                optimizer, mode='min', factor=lr_factor, patience=patience)

            # Both loaders are built once per fold instead of once per epoch.
            train_loader = DataLoader(
                TensorDataset(X_train_val[train_idx], y_train_val[train_idx]),
                batch_size=batch_size, shuffle=True)
            val_loader = DataLoader(
                TensorDataset(X_train_val[val_idx], y_train_val[val_idx]),
                batch_size=batch_size)

            best_val_loss = float('inf')
            counter = 0

            for epoch in range(num_epochs):
                model.train()
                for batch_X, batch_y in train_loader:
                    batch_X, batch_y = batch_X.to(device), batch_y.to(device)

                    outputs = model(batch_X)
                    loss = criterion(outputs, batch_y) + model.l1_loss()

                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()

                model.eval()
                val_loss = 0.0
                with torch.no_grad():
                    for batch_X, batch_y in val_loader:
                        batch_X, batch_y = batch_X.to(device), batch_y.to(device)
                        outputs = model(batch_X)
                        loss = criterion(outputs, batch_y) + model.l1_loss()
                        val_loss += loss.item()

                val_loss /= len(val_loader)
                scheduler.step(val_loss)

                if val_loss < best_val_loss:
                    best_val_loss = val_loss
                    counter = 0
                else:
                    counter += 1

                if counter >= patience:
                    break

            fold_losses.append(best_val_loss)

        return np.mean(fold_losses)
    # Create the "optuna_studies" folder if it doesn't exist
    os.makedirs("optuna_studies", exist_ok=True)

    # One SQLite-backed study per chromosome
    study_name = f"unphased_full_23andMe_chr{chromosome_number}_study_linear_regression"
    storage_name = f"sqlite:///optuna_studies/{study_name}.db"

    # load_if_exists=True returns the stored study when it already exists and
    # creates it otherwise, so no separate file-existence check is needed
    # (that check also failed when the .db file existed without the study).
    study = optuna.create_study(
        direction='minimize',
        study_name=study_name,
        storage=storage_name,
        load_if_exists=True,
    )

    # Each run adds 25 more trials to the (possibly resumed) study.
    # NOTE(review): n_jobs=-1 runs trials in parallel threads against one
    # SQLite file and one device -- confirm this is intended.
    study.optimize(objective, n_trials=25, n_jobs=-1)

    # Report the outcome of the hyperparameter search
    print(f"Chr {chromosome_number} - Best hyperparameters: {study.best_params}")
    print(f"Chr {chromosome_number} - Best value: {study.best_value:.4f}")

    # Unpack the winning trial's settings
    best_params = study.best_params
    best_learning_rate = best_params['learning_rate']
    best_l1_coef = best_params['l1_coef']
    best_patience = best_params['patience']
    best_batch_size = best_params['batch_size']
    best_lr_factor = best_params['lr_factor']
    best_num_epochs = best_params['num_epochs']

    # Final model, fitted on the whole train+validation split
    model = LinearRegression(input_dim, output_dim, best_l1_coef).to(device)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=best_learning_rate)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer,
        mode='min',
        factor=best_lr_factor,
        patience=best_patience,
        verbose=False,
    )

    train_dataset = TensorDataset(X_train_val, y_train_val)
    train_loader = DataLoader(train_dataset, batch_size=best_batch_size, shuffle=True)

    # Early stopping here monitors the *training* loss (no validation split
    # is held out at this stage).
    best_train_loss, counter = float('inf'), 0

    for epoch in range(best_num_epochs):
        batch_losses = []
        for batch_X, batch_y in train_loader:
            batch_X = batch_X.to(device)
            batch_y = batch_y.to(device)

            loss = criterion(model(batch_X), batch_y) + model.l1_loss()

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            batch_losses.append(loss.item())

        # Mean loss over the batches of this epoch
        train_loss = sum(batch_losses) / len(train_loader)

        if train_loss < best_train_loss:
            best_train_loss, counter = train_loss, 0
        else:
            counter += 1

        # Stop before stepping the scheduler once patience is exhausted
        if counter >= best_patience:
            break

        scheduler.step(train_loss)

    # Save the final model
    model_save_path = chr_model_folder + f'final_model_chr{chromosome_number}.pth'
    torch.save(model.state_dict(), model_save_path)
    print(f"Final model saved at: {model_save_path}")

    # Evaluate the final model on the test set
    model.eval()
    with torch.no_grad():
        # Move predictions to the CPU once: y_test lives on the CPU, and
        # comparing it with a CUDA tensor (or calling .numpy() on one) fails.
        test_outputs = model(X_test.to(device)).cpu()

    # Convert once and reuse the arrays for every metric
    y_test_np = y_test.cpu().numpy()
    test_outputs_np = test_outputs.numpy()

    test_r2 = sklearn_r2_score(y_test_np, test_outputs_np)
    test_iqs = calculate_iqs_unphased(y_test_np, test_outputs_np)
    # Fraction of genotype calls where the rounded prediction equals the truth
    test_accuracy = float((y_test_np == np.round(test_outputs_np)).mean())

    # Append performance metrics to the lists
    r2_scores.append(test_r2)
    iqs_scores.append(test_iqs)
    accuracy_scores.append(test_accuracy)

    # Calculate individual R^2 scores for each SNP
    individual_r2_scores = sklearn_r2_score(y_test_np, test_outputs_np, multioutput='raw_values')

    # Calculate individual IQS scores for each SNP (one column at a time)
    individual_iqs_scores = np.array([
        calculate_iqs_unphased(y_test_np[:, i].reshape(-1, 1), test_outputs_np[:, i].reshape(-1, 1))
        for i in range(y_test_np.shape[1])
    ])

    # Use the same column filter as the target tensor so names and scores
    # stay aligned even if a feature column happens to contain 'PRS'.
    snp_names = data.filter(regex='PRS313_').columns

    # Save individual R^2 scores to a CSV file
    csv_file = chr_csv_folder + f'individual_r2_scores_chr{chromosome_number}.csv'

    with open(csv_file, 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(['SNP', 'R2 Score'])
        writer.writerows((snp, float(score)) for snp, score in zip(snp_names, individual_r2_scores))

    print(f"Individual R^2 scores saved at: {csv_file}")

    # Save individual IQS scores to a CSV file
    iqs_csv_file = chr_csv_folder + f'individual_iqs_scores_chr{chromosome_number}.csv'

    with open(iqs_csv_file, 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(['SNP', 'IQS Score'])
        writer.writerows((snp, float(score)) for snp, score in zip(snp_names, individual_iqs_scores))

    print(f"Individual IQS scores saved at: {iqs_csv_file}")

    # Create a DataFrame to store the performance metrics for each chromosome
    # processed so far; it is rewritten after every chromosome so partial
    # results survive an interrupted run.
    performance_df = pd.DataFrame({
        'Chromosome': list(range(start, chromosome_number + 1)),
        'R2 Score': r2_scores,
        'IQS Score': iqs_scores,
        'Accuracy Score': accuracy_scores
    })

    # Save the performance metrics to a CSV file
    performance_csv_file = csv_folder + 'performance_metrics.csv'
    performance_df.to_csv(performance_csv_file, index=False)
    print(f"Performance metrics saved at: {performance_csv_file}")

Total SNPs:  799
PRS313 SNPs:  21
Total SNPs used for Training:  778




In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from torchmetrics.classification import BinaryAUROC
from sklearn.metrics import r2_score as sklearn_r2_score
from sklearn.metrics import roc_auc_score
import os
import csv

# Location of the per-chromosome parquet files and the first chromosome to
# evaluate (the loop below runs from `start` through chromosome 22).
data_directory = '../../../Data/Filtered_unphased_training_data_union_final/'
start = 1

# Test-set metrics, one entry appended per evaluated chromosome
r2_scores, iqs_scores, accuracy_scores, auc_scores = [], [], [], []

# Where the trained models are read from and the CSV reports are written to
output_folder = "../../../Data/model_results_unphased_all_PRS/linear_regression/"
csv_folder = output_folder + "csv_files/"
model_folder = output_folder + "models_unphased/"


def safe_auc_metric(y_pred, y_true):
    """Return the AUROC of ``y_pred`` against binary ``y_true``, or NaN when undefined.

    AUROC needs both classes in ``y_true``. When the labels are all negative
    or all positive the score is undefined, so NaN is returned and the caller
    can skip it (e.g. with ``np.nanmean``).

    The score is computed by the module-level ``auc_metric`` callable, which
    must be defined before this function is called with two-class labels.

    Parameters
    ----------
    y_pred : tensor or array
        Predicted scores/labels, forwarded unchanged to ``auc_metric``.
    y_true : tensor or array
        Binary ground-truth labels (0 = negative, non-zero = positive).
    """
    if y_true.sum() == 0:
        print("All true labels are negative. Returning NaN.")
        # np.nan rather than the np.NaN alias, which NumPy 2.0 removed.
        return np.nan
    if bool((y_true != 0).all()):
        print("All true labels are positive. Returning NaN.")
        return np.nan
    return auc_metric(y_pred, y_true)


for chromosome_number in range(start, 23):
    # Per-chromosome folders holding the saved model and the metric CSVs
    chr_suffix = f"chr{chromosome_number}/"
    chr_model_folder = model_folder + chr_suffix
    chr_csv_folder = csv_folder + chr_suffix

    # Read this chromosome's merged table
    file_name = data_directory + f"23AndMe_PRS313_merged_chr{chromosome_number}_matching_combined.parquet"
    data = pd.read_parquet(file_name)

    # Targets are the columns whose name contains 'PRS313_'; every other
    # column is an input feature.
    y = torch.tensor(data.filter(regex='PRS313_').values, dtype=torch.float32)
    X = torch.tensor(data.filter(regex='^(?!.*PRS313_)').values, dtype=torch.float32)

    print("Total SNPs: ", data.shape[1])
    print("PRS313 SNPs: ", y.shape[1])
    print("Total SNPs used for Training: ", X.shape[1])

    # Same split parameters as the training cell (20% test, seed 42)
    X_train_val, X_test, y_train_val, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Multi-output linear regression with an optional L1 weight penalty
    class LinearRegression(nn.Module):
        """A single affine layer mapping ``input_dim`` features to ``output_dim`` targets."""

        def __init__(self, input_dim, output_dim, l1_coef=0.0):
            super().__init__()
            self.l1_coef = l1_coef
            self.linear = nn.Linear(input_dim, output_dim)

        def forward(self, x):
            """Return the linear predictions for ``x``."""
            return self.linear(x)

        def l1_loss(self):
            """Return ``l1_coef`` times the L1 norm of the weight matrix (bias excluded)."""
            weights = self.linear.weight
            return self.l1_coef * weights.norm(p=1)

    # Set the device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Load the final model saved by the training cell
    model = LinearRegression(X_train_val.shape[1], y_train_val.shape[1])
    model_save_path = chr_model_folder + \
        f'final_model_chr{chromosome_number}.pth'
    model.load_state_dict(torch.load(model_save_path, map_location=device))
    model.to(device)
    model.eval()

    # The CSV folder is normally created by the training cell; make sure it
    # exists so the report writes below cannot fail on a missing directory.
    os.makedirs(chr_csv_folder, exist_ok=True)

    # Evaluate the final model on the test set
    with torch.no_grad():
        # Move predictions to the CPU once: y_test lives on the CPU, and
        # mixing it with CUDA tensors (or calling .numpy() on one) fails.
        test_outputs = model(X_test.to(device)).cpu()

    y_test_cpu = y_test.cpu()
    y_test_np = y_test_cpu.numpy()
    test_outputs_np = test_outputs.numpy()

    test_r2 = sklearn_r2_score(y_test_np, test_outputs_np)
    test_iqs = calculate_iqs_unphased(y_test_np, test_outputs_np)
    # Fraction of genotype calls where the rounded prediction equals the truth
    test_accuracy = float((y_test_np == np.round(test_outputs_np)).mean())

    # Calculate the AUC for each SNP and genotype class (one-vs-rest), then
    # average them. `auc_metric` is read as a global by safe_auc_metric.
    # NOTE(review): the scores given to AUROC are hard 0/1 indicators of the
    # rounded prediction, not continuous scores -- confirm this is intended.
    auc_metric = BinaryAUROC(thresholds=None)
    rounded_outputs = test_outputs.round()

    individual_aucs = [
        [
            safe_auc_metric(
                (rounded_outputs[:, j] == i).float(), (y_test_cpu[:, j] == i).float())
            for i in range(3)
        ]
        for j in range(y_test_np.shape[1])
    ]
    # Flatten to plain floats so tensors and NaN placeholders mix cleanly
    flattened_aucs = [float(auc) for sublist in individual_aucs for auc in sublist]

    # Calculate the mean while ignoring NaN values
    test_auc = np.nanmean(flattened_aucs)
    auc_scores.append(test_auc)

    # Append performance metrics to the lists
    r2_scores.append(test_r2)
    iqs_scores.append(test_iqs)
    accuracy_scores.append(test_accuracy)

    # Calculate individual R^2 scores for each SNP
    individual_r2_scores = sklearn_r2_score(
        y_test_np, test_outputs_np, multioutput='raw_values')

    # Calculate individual IQS scores for each SNP (one column at a time)
    individual_iqs_scores = np.array([
        calculate_iqs_unphased(y_test_np[:, i].reshape(-1, 1), test_outputs_np[:, i].reshape(-1, 1))
        for i in range(y_test_np.shape[1])
    ])

    # Use the same column filter as the target tensor so names and scores
    # stay aligned even if a feature column happens to contain 'PRS'.
    snp_names = data.filter(regex='PRS313_').columns

    # Save individual R^2 scores to a CSV file
    csv_file = chr_csv_folder + \
        f'individual_r2_scores_chr{chromosome_number}.csv'

    with open(csv_file, 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(['SNP', 'R2 Score'])
        writer.writerows((snp, float(score)) for snp, score in zip(snp_names, individual_r2_scores))

    print(f"Individual R^2 scores saved at: {csv_file}")

    # Save individual IQS scores to a CSV file
    iqs_csv_file = chr_csv_folder + \
        f'individual_iqs_scores_chr{chromosome_number}.csv'

    with open(iqs_csv_file, 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(['SNP', 'IQS Score'])
        writer.writerows((snp, float(score)) for snp, score in zip(snp_names, individual_iqs_scores))

    print(f"Individual IQS scores saved at: {iqs_csv_file}")

# One row of test metrics per evaluated chromosome (start .. 22)
performance_columns = {
    'Chromosome': list(range(start, 23)),
    'R2 Score': r2_scores,
    'IQS Score': iqs_scores,
    'Accuracy Score': accuracy_scores,
    'AUC Score': auc_scores,
}
performance_df = pd.DataFrame(performance_columns)

# Write the summary table next to the per-chromosome CSV folders
performance_csv_file = csv_folder + 'performance_metrics.csv'
performance_df.to_csv(performance_csv_file, index=False)
print(f"Performance metrics saved at: {performance_csv_file}")

ValueError: multiclass-multioutput is not supported

# Multinomial Logistic Regression Model

In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import r2_score as sklearn_r2_score, accuracy_score
import optuna
from matplotlib import pyplot as plt
import os
import csv

# Location of the per-chromosome parquet files and the first chromosome to
# process (the loop below runs from `start` through chromosome 22).
data_directory = '../../../Data/Filtered_unphased_training_data_union_final/'
start = 1

# Test-set metrics, one entry appended per processed chromosome
r2_scores, iqs_scores, accuracy_scores = [], [], []

# Output tree: <output_folder>/models_unphased/ and <output_folder>/csv_files/
output_folder = "../../../Data/model_results_unphased_all_PRS/logistic_regression/"
csv_folder = output_folder + "csv_files/"
model_folder = output_folder + "models_unphased/"

# Make sure both output folders exist before the loop writes into them
os.makedirs(model_folder, exist_ok=True)
os.makedirs(csv_folder, exist_ok=True)

for chromosome_number in range(start, 23):
    # Per-chromosome output folders for the saved model and the metric CSVs
    chr_suffix = f"chr{chromosome_number}/"
    chr_model_folder = model_folder + chr_suffix
    chr_csv_folder = csv_folder + chr_suffix
    os.makedirs(chr_model_folder, exist_ok=True)
    os.makedirs(chr_csv_folder, exist_ok=True)

    # Read this chromosome's merged table
    file_name = data_directory + f"23AndMe_PRS313_merged_chr{chromosome_number}_matching_combined.parquet"
    data = pd.read_parquet(file_name)

    # Targets are the columns whose name contains 'PRS313_' and are stored as
    # integer class labels; every other column is an input feature.
    y = torch.tensor(data.filter(regex='PRS313_').values, dtype=torch.long)
    X = torch.tensor(data.filter(regex='^(?!.*PRS313_)').values, dtype=torch.float32)

    print("Total SNPs: ", data.shape[1])
    print("PRS313 SNPs: ", y.shape[1])
    print("Total SNPs used for Training: ", X.shape[1])

    # Hold out 20% of the rows as the test set (fixed seed)
    X_train_val, X_test, y_train_val, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Multinomial logistic regression: one linear layer per class, each
    # producing a logit for every target SNP
    class MulticlassLogisticRegression(nn.Module):
        """Produce class logits of shape ``(batch, output_dim, num_classes)``."""

        def __init__(self, input_dim, output_dim, num_classes=3, l1_coef=0.0):
            super().__init__()
            self.num_classes = num_classes
            self.linear = nn.ModuleList(
                nn.Linear(input_dim, output_dim) for _ in range(num_classes)
            )
            self.l1_coef = l1_coef

        def forward(self, x):
            """Stack the per-class layer outputs along a new last axis."""
            per_class_logits = [head(x) for head in self.linear]
            return torch.stack(per_class_logits, dim=-1)

        def l1_loss(self):
            """Return ``l1_coef`` times the summed L1 norms of all weight matrices."""
            penalty = 0
            for head in self.linear:
                penalty = penalty + torch.norm(head.weight, p=1)
            return self.l1_coef * penalty

    # Set the device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Problem dimensions, taken from the train/validation split
    input_dim = X_train_val.shape[1]
    output_dim = y_train_val.shape[1]
    # NOTE(review): not read below -- `objective` samples its own epoch budget
    # and the final training uses the tuned value. Kept in case other cells use it.
    num_epochs = 500

    # Define the objective function for Optuna with cross-validation and early stopping
    def objective(trial):
        """Return the mean best validation loss over a 10-fold cross-validation.

        Hyperparameters are sampled from ``trial``. Every fold trains a
        freshly initialised model, so no fold is validated with weights that
        were already fitted on its validation rows. Training of a fold stops
        early once the validation loss has failed to improve for ``patience``
        consecutive epochs. The loss is cross-entropy plus the model's L1
        penalty.
        """
        learning_rate = trial.suggest_float('learning_rate', 1e-6, 1e-1, log=True)
        l1_coef = trial.suggest_float('l1_coef', 1e-10, 1e-1, log=True)
        patience = trial.suggest_int('patience', 5, 20)
        batch_size = trial.suggest_categorical('batch_size', [32, 64, 128, 256])
        lr_factor = trial.suggest_float('lr_factor', 0.1, 0.9)
        num_epochs = trial.suggest_int('num_epochs', 100, 500)

        criterion = nn.CrossEntropyLoss()
        kf = KFold(n_splits=10, shuffle=True, random_state=42)
        fold_losses = []

        for train_idx, val_idx in kf.split(X_train_val):
            # A new model, optimizer and scheduler per fold: sharing them
            # across folds leaks training information into later folds.
            model = MulticlassLogisticRegression(input_dim, output_dim, l1_coef=l1_coef).to(device)
            optimizer = optim.Adam(model.parameters(), lr=learning_rate)
            # `verbose` is left at its default (False).
            scheduler = optim.lr_scheduler.ReduceLROnPlateau(
                optimizer, mode='min', factor=lr_factor, patience=patience)

            # Both loaders are built once per fold instead of once per epoch.
            train_loader = DataLoader(
                TensorDataset(X_train_val[train_idx], y_train_val[train_idx]),
                batch_size=batch_size, shuffle=True)
            val_loader = DataLoader(
                TensorDataset(X_train_val[val_idx], y_train_val[val_idx]),
                batch_size=batch_size)

            best_val_loss = float('inf')
            counter = 0

            for epoch in range(num_epochs):
                model.train()
                for batch_X, batch_y in train_loader:
                    batch_X, batch_y = batch_X.to(device), batch_y.to(device)

                    outputs = model(batch_X)
                    # CrossEntropyLoss expects the class axis at dim 1:
                    # (batch, SNP, class) -> (batch, class, SNP)
                    loss = criterion(outputs.transpose(1, 2), batch_y) + model.l1_loss()

                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()

                model.eval()
                val_loss = 0.0
                with torch.no_grad():
                    for batch_X, batch_y in val_loader:
                        batch_X, batch_y = batch_X.to(device), batch_y.to(device)
                        outputs = model(batch_X)
                        loss = criterion(outputs.transpose(1, 2), batch_y) + model.l1_loss()
                        val_loss += loss.item()

                val_loss /= len(val_loader)
                scheduler.step(val_loss)

                if val_loss < best_val_loss:
                    best_val_loss = val_loss
                    counter = 0
                else:
                    counter += 1

                if counter >= patience:
                    break

            fold_losses.append(best_val_loss)

        return np.mean(fold_losses)
    
    # Create the "optuna_studies_logistic" folder if it doesn't exist
    os.makedirs("optuna_studies_logistic", exist_ok=True)

    # One SQLite-backed study per chromosome
    study_name = f"unphased_full_23andMe_chr{chromosome_number}_study_logistic_regression"
    storage_name = f"sqlite:///optuna_studies_logistic/{study_name}.db"

    # load_if_exists=True returns the stored study when it already exists and
    # creates it otherwise. The previous manual check looked for the database
    # under "optuna_studies/", which is not where this storage lives, so it
    # never found it.
    study = optuna.create_study(
        direction='minimize',
        study_name=study_name,
        storage=storage_name,
        load_if_exists=True,
    )

    # Each run adds 25 more trials to the (possibly resumed) study.
    # NOTE(review): n_jobs=-1 runs trials in parallel threads against one
    # SQLite file and one device -- confirm this is intended.
    study.optimize(objective, n_trials=25, n_jobs=-1)

    # Report the outcome of the hyperparameter search
    print(f"Chr {chromosome_number} - Best hyperparameters: {study.best_params}")
    print(f"Chr {chromosome_number} - Best value: {study.best_value:.4f}")

    # Unpack the winning trial's settings
    best_params = study.best_params
    best_learning_rate = best_params['learning_rate']
    best_l1_coef = best_params['l1_coef']
    best_patience = best_params['patience']
    best_batch_size = best_params['batch_size']
    best_lr_factor = best_params['lr_factor']
    best_num_epochs = best_params['num_epochs']

    # Final model, fitted on the whole train+validation split
    model = MulticlassLogisticRegression(input_dim, output_dim, l1_coef=best_l1_coef).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=best_learning_rate)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer,
        mode='min',
        factor=best_lr_factor,
        patience=best_patience,
        verbose=False,
    )

    train_dataset = TensorDataset(X_train_val, y_train_val)
    train_loader = DataLoader(train_dataset, batch_size=best_batch_size, shuffle=True)

    # Early stopping here monitors the *training* loss (no validation split
    # is held out at this stage).
    best_train_loss, counter = float('inf'), 0

    for epoch in range(best_num_epochs):
        batch_losses = []
        for batch_X, batch_y in train_loader:
            batch_X = batch_X.to(device)
            batch_y = batch_y.to(device)

            # Class axis moved to dim 1, as CrossEntropyLoss expects
            logits = model(batch_X).transpose(1, 2)
            loss = criterion(logits, batch_y) + model.l1_loss()

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            batch_losses.append(loss.item())

        # Mean loss over the batches of this epoch
        train_loss = sum(batch_losses) / len(train_loader)

        if train_loss < best_train_loss:
            best_train_loss, counter = train_loss, 0
        else:
            counter += 1

        # Stop before stepping the scheduler once patience is exhausted
        if counter >= best_patience:
            break

        scheduler.step(train_loss)

    # Save the final model
    model_save_path = chr_model_folder + f'final_model_chr{chromosome_number}.pth'
    torch.save(model.state_dict(), model_save_path)
    print(f"Final model saved at: {model_save_path}")

    # Evaluate the final model on the test set
    model.eval()
    with torch.no_grad():
        # Predicted class = argmax over the class axis. Softmax is monotonic,
        # so it is not needed before argmax. The result is moved to the CPU
        # because y_test lives there and mixing devices fails.
        test_outputs = model(X_test.to(device)).argmax(dim=-1).cpu()

    # Convert once and reuse the arrays for every metric
    y_test_np = y_test.cpu().numpy()
    test_outputs_np = test_outputs.numpy()

    test_r2 = sklearn_r2_score(y_test_np, test_outputs_np)
    test_iqs = calculate_iqs_unphased(y_test_np, test_outputs_np)
    # Fraction of genotype calls where the predicted class equals the truth
    test_accuracy = float((y_test_np == test_outputs_np).mean())

    # Append performance metrics to the lists
    r2_scores.append(test_r2)
    iqs_scores.append(test_iqs)
    accuracy_scores.append(test_accuracy)

    # Calculate individual R^2 scores for each SNP
    individual_r2_scores = sklearn_r2_score(y_test_np, test_outputs_np, multioutput='raw_values')

    # Calculate individual IQS scores for each SNP (one column at a time)
    individual_iqs_scores = np.array([
        calculate_iqs_unphased(y_test_np[:, i].reshape(-1, 1), test_outputs_np[:, i].reshape(-1, 1))
        for i in range(y_test_np.shape[1])
    ])

    # Use the same column filter as the target tensor so names and scores
    # stay aligned even if a feature column happens to contain 'PRS'.
    snp_names = data.filter(regex='PRS313_').columns

    # Save individual R^2 scores to a CSV file
    csv_file = chr_csv_folder + f'individual_r2_scores_chr{chromosome_number}.csv'

    with open(csv_file, 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(['SNP', 'R2 Score'])
        writer.writerows((snp, float(score)) for snp, score in zip(snp_names, individual_r2_scores))

    print(f"Individual R^2 scores saved at: {csv_file}")

    # Save individual IQS scores to a CSV file
    iqs_csv_file = chr_csv_folder + f'individual_iqs_scores_chr{chromosome_number}.csv'

    with open(iqs_csv_file, 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(['SNP', 'IQS Score'])
        writer.writerows((snp, float(score)) for snp, score in zip(snp_names, individual_iqs_scores))

    print(f"Individual IQS scores saved at: {iqs_csv_file}")

    # Create a DataFrame to store the performance metrics for each chromosome
    # processed so far; it is rewritten after every chromosome so partial
    # results survive an interrupted run.
    performance_df = pd.DataFrame({
        'Chromosome': list(range(start, chromosome_number + 1)),
        'R2 Score': r2_scores,
        'IQS Score': iqs_scores,
        'Accuracy Score': accuracy_scores
    })

    # Save the performance metrics to a CSV file
    performance_csv_file = csv_folder + 'performance_metrics.csv'
    performance_df.to_csv(performance_csv_file, index=False)
    print(f"Performance metrics saved at: {performance_csv_file}")

  from .autonotebook import tqdm as notebook_tqdm


Total SNPs:  2028
PRS313 SNPs:  30
Total SNPs used for Training:  1998


[I 2024-06-04 13:20:51,231] Using an existing study with name 'unphased_full_23andMe_chr1_study_logistic_regression' instead of creating a new one.
[I 2024-06-04 13:21:36,581] Trial 35 finished with value: 0.1114211019128561 and parameters: {'learning_rate': 0.007940241337314375, 'l1_coef': 9.327724150439032e-10, 'patience': 8, 'batch_size': 256, 'lr_factor': 0.39667673034397594, 'num_epochs': 279}. Best is trial 9 with value: 0.06694057776476256.
[I 2024-06-04 13:21:36,868] Trial 40 finished with value: 0.1112546781077981 and parameters: {'learning_rate': 0.007208288524397599, 'l1_coef': 4.221374809935948e-07, 'patience': 8, 'batch_size': 256, 'lr_factor': 0.3820047334456536, 'num_epochs': 261}. Best is trial 9 with value: 0.06694057776476256.
[I 2024-06-04 13:21:37,304] Trial 41 finished with value: 0.11770565640181303 and parameters: {'learning_rate': 0.0060917892966917235, 'l1_coef': 2.533757727491704e-07, 'patience': 8, 'batch_size': 256, 'lr_factor': 0.38336430185963255, 'num_epo

KeyboardInterrupt: 

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score as sklearn_r2_score, accuracy_score
import os
import csv

# Input location of the filtered unphased data and the first chromosome to evaluate
data_directory = '../../Data/Filtered_unphased_training_data_union_final/'
start = 1

# Output locations: trained models and CSV reports live under one results folder
output_folder = "../../Data/model_results_unphased_all_PRS/logistic_regression/"
model_folder, csv_folder = (
    output_folder + subfolder for subfolder in ("models_unphased/", "csv_files/")
)

# Metric accumulators; the evaluation loop appends one entry per chromosome
r2_scores, iqs_scores, accuracy_scores = [], [], []

def calculate_iqs_unphased(y_true, y_pred):
    """Compute the Imputation Quality Score (IQS) for unphased genotypes.

    IQS is a chance-corrected agreement statistic between true and predicted
    genotype classes (0, 1, 2):

        IQS = (Po - Pc) / (1 - Pc)

    where Po is the observed proportion of agreement and Pc is the agreement
    expected by chance from the marginal genotype frequencies.

    Parameters
    ----------
    y_true : array-like
        True genotype classes; every value must round to 0, 1 or 2.
    y_pred : array-like
        Predicted genotypes or dosages with the same number of elements as
        ``y_true``. Values are rounded to the nearest integer and clipped
        to the range [0, 2].

    Returns
    -------
    float
        The IQS. Returns 0.0 for empty input and when chance agreement
        equals 1 (both inputs contain a single, identical class), where the
        statistic is otherwise undefined.

    Raises
    ------
    ValueError
        If the inputs differ in size or ``y_true`` has values outside 0..2.
    """
    true_calls = np.asarray(y_true).ravel()
    pred_calls = np.asarray(y_pred).ravel()

    if true_calls.size != pred_calls.size:
        raise ValueError(
            f"y_true and y_pred must have the same number of elements "
            f"(got {true_calls.size} and {pred_calls.size})"
        )
    if true_calls.size == 0:
        # Nothing to compare; avoid a 0/0 division below
        return 0.0

    true_calls = np.rint(true_calls).astype(int)
    pred_calls = np.clip(np.rint(pred_calls).astype(int), 0, 2)

    if true_calls.min() < 0 or true_calls.max() > 2:
        raise ValueError("y_true must only contain genotype classes 0, 1 or 2")

    # 3x3 contingency table: rows are true classes, columns are predicted classes
    contingency_table = np.bincount(true_calls * 3 + pred_calls, minlength=9).reshape(3, 3)
    total_genotypes = contingency_table.sum()

    # Observed agreement (Po): fraction of calls on the diagonal
    observed_agreement = np.trace(contingency_table) / total_genotypes

    # Chance agreement (Pc): product of the marginal frequencies, summed over classes
    row_marginals = contingency_table.sum(axis=1)
    col_marginals = contingency_table.sum(axis=0)
    chance_agreement = np.sum((row_marginals * col_marginals) / (total_genotypes ** 2))

    if chance_agreement == 1:
        # Degenerate case: a single class on both sides, denominator would be zero
        return 0.0

    return float((observed_agreement - chance_agreement) / (1 - chance_agreement))

# Multinomial logistic regression with L1 regularization.
# Defined once, before the loop, instead of being re-created for every chromosome.
class MulticlassLogisticRegression(nn.Module):
    """One linear layer per genotype class, combined with a softmax over classes."""

    def __init__(self, input_dim, output_dim, num_classes=3, l1_coef=0.0):
        super().__init__()
        self.num_classes = num_classes
        self.linear = nn.ModuleList([nn.Linear(input_dim, output_dim) for _ in range(num_classes)])
        self.l1_coef = l1_coef

    def forward(self, x):
        # Stack the per-class outputs on a new last axis and normalise over that axis
        out = torch.stack([linear(x) for linear in self.linear], dim=-1)
        return nn.functional.softmax(out, dim=-1)

    def l1_loss(self):
        # L1 penalty over the weights of every per-class linear layer
        return self.l1_coef * sum(torch.norm(linear.weight, p=1) for linear in self.linear)


def _write_snp_scores(csv_path, score_label, snp_names, scores):
    """Write one ``SNP,<score_label>`` row per SNP to ``csv_path``."""
    with open(csv_path, 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(['SNP', score_label])
        writer.writerows(zip(snp_names, scores))


# Set the device (does not depend on the chromosome, so it is resolved once)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

for chromosome_number in range(start, 23):
    # Paths for the current chromosome
    chr_model_folder = model_folder + f"chr{chromosome_number}/"
    chr_csv_folder = csv_folder + f"chr{chromosome_number}/"
    # Make sure the report folder exists before any CSV is written into it
    os.makedirs(chr_csv_folder, exist_ok=True)

    file_name = data_directory + f"23AndMe_PRS313_merged_chr{chromosome_number}_matching_combined.parquet"
    data = pd.read_parquet(file_name)

    # Split the data into features and target
    target_frame = data.filter(regex='PRS313_')
    X = torch.tensor(data.filter(regex='^(?!.*PRS313_)').values, dtype=torch.float32)
    y = torch.tensor(target_frame.values, dtype=torch.long)

    # Take the SNP names from the same selection as the targets so that names
    # and per-SNP scores are guaranteed to line up column by column
    snp_names = target_frame.columns

    print("Total SNPs: ", data.shape[1])
    print("PRS313 SNPs: ", y.shape[1])
    print("Total SNPs used for Training: ", X.shape[1])

    # Split the data into train-validation and test sets
    X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Load the final model
    # NOTE: torch.load unpickles the file; only load checkpoints from a trusted source.
    model = MulticlassLogisticRegression(X_train_val.shape[1], y_train_val.shape[1])
    model_save_path = chr_model_folder + f'final_model_chr{chromosome_number}.pth'
    model.load_state_dict(torch.load(model_save_path, map_location=device))
    model.to(device)
    model.eval()

    # Evaluate the final model on the test set
    with torch.no_grad():
        # Bring the predictions back to the CPU: y_test lives on the CPU, and
        # comparing across devices or calling .numpy() on a CUDA tensor fails
        test_outputs = model(X_test.to(device)).argmax(dim=-1).cpu()

    y_true_np = y_test.cpu().numpy()
    y_pred_np = test_outputs.numpy()

    test_r2 = sklearn_r2_score(y_true_np, y_pred_np)
    test_iqs = calculate_iqs_unphased(y_true_np, y_pred_np)
    # Fraction of (sample, SNP) cells predicted correctly, as a plain float
    test_accuracy = float((y_true_np == y_pred_np).mean())

    # Append performance metrics to the lists
    r2_scores.append(test_r2)
    iqs_scores.append(test_iqs)
    accuracy_scores.append(test_accuracy)

    # Calculate individual R^2 scores for each SNP
    individual_r2_scores = sklearn_r2_score(y_true_np, y_pred_np, multioutput='raw_values')

    # Calculate individual IQS scores for each SNP (one column at a time)
    individual_iqs_scores = np.array([
        calculate_iqs_unphased(y_true_np[:, i].reshape(-1, 1), y_pred_np[:, i].reshape(-1, 1))
        for i in range(y_true_np.shape[1])
    ])

    # Save individual R^2 scores to a CSV file
    csv_file = chr_csv_folder + f'individual_r2_scores_chr{chromosome_number}.csv'
    _write_snp_scores(csv_file, 'R2 Score', snp_names, individual_r2_scores)
    print(f"Individual R^2 scores saved at: {csv_file}")

    # Save individual IQS scores to a CSV file
    iqs_csv_file = chr_csv_folder + f'individual_iqs_scores_chr{chromosome_number}.csv'
    _write_snp_scores(iqs_csv_file, 'IQS Score', snp_names, individual_iqs_scores)
    print(f"Individual IQS scores saved at: {iqs_csv_file}")

# Gather the per-chromosome metrics into one summary table (one row per chromosome)
performance_df = pd.DataFrame(dict([
    ('Chromosome', list(range(start, 23))),
    ('R2 Score', r2_scores),
    ('IQS Score', iqs_scores),
    ('Accuracy Score', accuracy_scores),
]))

# Write the summary next to the per-chromosome CSV folders
performance_csv_file = csv_folder + 'performance_metrics.csv'
performance_df.to_csv(path_or_buf=performance_csv_file, index=False)
print(f"Performance metrics saved at: {performance_csv_file}")


Total SNPs:  929
PRS313 SNPs:  30
Total SNPs used for Training:  899
Individual R^2 scores saved at: ../../Data/model_results_unphased_all_PRS/logistic_regression/csv_files/chr1/individual_r2_scores_chr1.csv
Individual IQS scores saved at: ../../Data/model_results_unphased_all_PRS/logistic_regression/csv_files/chr1/individual_iqs_scores_chr1.csv
Total SNPs:  629
PRS313 SNPs:  21
Total SNPs used for Training:  608
Individual R^2 scores saved at: ../../Data/model_results_unphased_all_PRS/logistic_regression/csv_files/chr2/individual_r2_scores_chr2.csv
Individual IQS scores saved at: ../../Data/model_results_unphased_all_PRS/logistic_regression/csv_files/chr2/individual_iqs_scores_chr2.csv
Total SNPs:  963
PRS313 SNPs:  16
Total SNPs used for Training:  947
Individual R^2 scores saved at: ../../Data/model_results_unphased_all_PRS/logistic_regression/csv_files/chr3/individual_r2_scores_chr3.csv
Individual IQS scores saved at: ../../Data/model_results_unphased_all_PRS/logistic_regression/cs

In [None]:
# Loop through all the training datasets and document the PRS313 SNPs in each dataset. Save this to a CSV file.

import pandas as pd
import os

data_directory = '../../Data/Filtered_unphased_training_data_union_final/'
output_folder = "../../Data/model_results_unphased_all_PRS/logistic_regression/csv_files/"

# Chromosomes to scan; defined once so the loop and the table cannot drift apart
chromosomes = list(range(1, 23))

# Create the output folder if it doesn't exist
os.makedirs(output_folder, exist_ok=True)

# Collect the PRS313 SNP column names found in each chromosome's dataset
prs313_snps = []

for chromosome_number in chromosomes:
    file_name = data_directory + \
        f"23AndMe_PRS313_merged_chr{chromosome_number}_matching_combined.parquet"
    data = pd.read_parquet(file_name)

    # Store a plain list rather than a pandas Index: to_csv would otherwise
    # write the Index repr ("Index([...], dtype='object')") into the CSV cell
    prs313_snps.append(data.filter(regex='PRS313_').columns.tolist())

# Create a DataFrame to store the PRS313 SNPs in each dataset
prs313_df = pd.DataFrame({
    'Chromosome': chromosomes,
    'PRS313 SNPs': prs313_snps,
    "Number of PRS313 SNPs": [len(snps) for snps in prs313_snps]
})

# Save the PRS313 SNPs to a CSV file
prs313_csv_file = output_folder + 'prs313_snps.csv'
prs313_df.to_csv(prs313_csv_file, index=False)
print(f"PRS313 SNPs saved at: {prs313_csv_file}")

# Print the total number of PRS313 SNPs in all datasets
total_prs313_snps = int(prs313_df["Number of PRS313 SNPs"].sum())
print(f"Total number of PRS313 SNPs: {total_prs313_snps}")



PRS313 SNPs saved at: ../../Data/model_results_unphased_all_PRS/logistic_regression/csv_files/prs313_snps.csv
Total number of PRS313 SNPs: 313
