# In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import KFold
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score
from sklearn.metrics import r2_score as sklearn_r2_score


# Input location for the per-chromosome parquet files.
data_directory = '../../Data/Filtered_split_training_data/'
# Initial value only; the per-chromosome loop below rebinds this name.
chromosome_number = 1

# Widths of the first and second hidden layers of the network.
hidden_size1 = hidden_size2 = 150

# Process chromosomes 1 through 22, one parquet file per chromosome.
for chromosome_number in range(1, 23):
    file_name = data_directory + f"23AndMe_PRS313_merged_chr{chromosome_number}_matching_split.parquet"
    data = pd.read_parquet(file_name)

    # Count columns by name instead of slicing out sub-DataFrames: selecting
    # with a list of labels builds a new frame just to read its width.
    unknown_count = sum("PRS313_Unknown" in col for col in data.columns)
    known_count = sum("PRS313_Known" in col for col in data.columns)
    other_count = sum("PRS313_" not in col for col in data.columns)
    print("Unknown PRS313 SNPs: ", unknown_count)
    print("Known PRS313 SNPs: ", known_count)
    print("23AndMe SNPs with LD to Unknown PRS313 SNPs: ", other_count)

    # Split the data into features and target:
    #   X = every column whose name does not contain "Unknown"
    #   y = every column whose name contains "Unknown"
    X = torch.tensor(data.filter(regex='^(?!.*Unknown)').values, dtype=torch.float32)
    y = torch.tensor(data.filter(regex='Unknown').values, dtype=torch.float32)

        
    # Define the feed-forward neural network architecture
    class SimpleFFNN(nn.Module):
        """Feed-forward network with two ReLU hidden layers and a sigmoid output.

        Args:
            input_size: number of input features per sample.
            hidden_size1: width of the first hidden layer.
            hidden_size2: width of the second hidden layer.
            output_size: number of values produced per sample.
        """

        def __init__(self, input_size, hidden_size1, hidden_size2, output_size):
            super().__init__()
            # Fully connected layers, declared in input-to-output order.
            self.hidden1 = nn.Linear(input_size, hidden_size1)
            self.hidden2 = nn.Linear(hidden_size1, hidden_size2)
            self.output = nn.Linear(hidden_size2, output_size)

        def forward(self, x):
            """Run a batch through both hidden layers and return sigmoid outputs."""
            first_activation = torch.relu(self.hidden1(x))
            second_activation = torch.relu(self.hidden2(first_activation))
            # Sigmoid squashes every output to a value between 0 and 1.
            return torch.sigmoid(self.output(second_activation))

    # Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

    # Network input/output widths follow the column counts of X and y.
    input_dim, output_dim = X.shape[1], y.shape[1]

    # Optimisation and cross-validation settings.
    num_folds = 5
    num_epochs = 400
    batch_size = 128
    learning_rate = 0.001

    # Binary cross-entropy loss (nn.BCELoss expects inputs already in [0, 1]).
    criterion = nn.BCELoss()

    # Shuffled K-fold split with a fixed seed so fold assignment is reproducible.
    kfold = KFold(n_splits=num_folds, shuffle=True, random_state=42)

    # Result accumulators for the current chromosome.
    fold_accuracies, fold_precisions, fold_recalls = [], [], []
    fold_f1_scores, fold_roc_auc_scores, fold_r2_scores = [], [], []
    fold_train_losses, fold_val_losses = [], []

    # Train a fresh model on each training split and score it on the held-out split.
    for fold, (train_idx, val_idx) in enumerate(kfold.split(X)):
        print(f"Fold {fold + 1}/{num_folds}")

        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]

        train_dataset = TensorDataset(X_train, y_train)
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

        # New model and optimizer for every fold so folds are evaluated independently.
        model = SimpleFFNN(input_dim, hidden_size1, hidden_size2, output_dim).to(device)
        optimizer = optim.Adam(model.parameters(), lr=learning_rate)

        model.train()
        for epoch in range(num_epochs):
            train_loss = 0.0
            for batch_X, batch_y in train_loader:
                batch_X, batch_y = batch_X.to(device), batch_y.to(device)

                outputs = model(batch_X)
                loss = criterion(outputs, batch_y)

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                train_loss += loss.item()

            # Mean batch loss for this epoch.
            train_loss /= len(train_loader)

        # Record exactly one training loss per fold (the final epoch's mean batch
        # loss). Appending inside the epoch loop would make this list
        # num_epochs times longer than the other per-fold lists, so indexing it
        # by fold would return early-epoch losses of the first fold.
        fold_train_losses.append(train_loss)

        model.eval()
        with torch.no_grad():
            # Targets must live on the same device as the predictions for the
            # loss and accuracy tensor operations below.
            y_val_device = y_val.to(device)
            val_outputs = model(X_val.to(device))
            val_loss = criterion(val_outputs, y_val_device)
            fold_val_losses.append(val_loss.item())

            # Threshold the model outputs at 0.5 to obtain hard 0/1 predictions.
            val_preds = (val_outputs > 0.5).float()
            val_accuracy = float((val_preds == y_val_device).float().mean())

            # scikit-learn metrics take CPU NumPy arrays; convert each tensor once.
            y_true = y_val.cpu().numpy()
            y_pred = val_preds.cpu().numpy()
            y_score = val_outputs.cpu().numpy()

            val_precision = precision_score(y_true, y_pred, average='micro')
            val_recall = recall_score(y_true, y_pred, average='micro')
            val_f1 = f1_score(y_true, y_pred, average='micro')
            val_roc_auc = roc_auc_score(y_true, y_score, average='micro')
            val_r2 = sklearn_r2_score(y_true, y_score)
            # TODO: IQS is not computed. No IQS function is available in this
            # script, and the previous summary line read an undefined
            # fold_iqs_scores list, raising NameError before results were exported.

            fold_accuracies.append(val_accuracy)
            fold_precisions.append(val_precision)
            fold_recalls.append(val_recall)
            fold_f1_scores.append(val_f1)
            fold_roc_auc_scores.append(val_roc_auc)
            fold_r2_scores.append(val_r2)

            print(
                f"Fold {fold + 1}/{num_folds}, Train Loss: {train_loss:.4f}, "
                f"Val Loss: {val_loss.item():.4f}, Val Accuracy: {val_accuracy:.4f}, "
                f"Val Precision: {val_precision:.4f}, Val Recall: {val_recall:.4f}, "
                f"Val F1: {val_f1:.4f}, Val ROC AUC: {val_roc_auc:.4f}, Val R2: {val_r2:.4f}"
            )

    # Mean and standard deviation of every validation metric across the folds.
    for metric_label, metric_values in (
        ("Accuracy", fold_accuracies),
        ("Precision", fold_precisions),
        ("Recall", fold_recalls),
        ("F1 Score", fold_f1_scores),
        ("ROC AUC", fold_roc_auc_scores),
        ("R2 Score", fold_r2_scores),
    ):
        print(f"Average {metric_label}: {np.mean(metric_values):.4f} +/- {np.std(metric_values):.4f}")

    import csv
    import os

    # Export this chromosome's cross-validation results to CSV.

    # NOTE(review): the folder is named "logistic_regression" although this
    # script trains SimpleFFNN; confirm this is the intended destination.
    output_folder = "../../Data/model_results/logistic_regression/"
    # Create the destination up front so a missing folder cannot fail the
    # export after training has already finished.
    os.makedirs(output_folder, exist_ok=True)

    csv_file = output_folder + f'cross_validation_results_chr{chromosome_number}.csv'

    # Column name -> per-fold values, listed in the order the columns are written.
    metric_columns = {
        'Train Loss': fold_train_losses,
        'Val Loss': fold_val_losses,
        'Val Accuracy': fold_accuracies,
        'Val Precision': fold_precisions,
        'Val Recall': fold_recalls,
        'Val F1': fold_f1_scores,
        'Val ROC AUC': fold_roc_auc_scores,
        'Val R2': fold_r2_scores,
    }
    fieldnames = ['Fold', *metric_columns]

    with open(csv_file, 'w', newline='') as file:
        writer = csv.DictWriter(file, fieldnames=fieldnames)
        writer.writeheader()

        # One row per fold, numbered from 1.
        for fold in range(num_folds):
            row = {'Fold': fold + 1}
            row.update((name, values[fold]) for name, values in metric_columns.items())
            writer.writerow(row)

        writer.writerow({})  # Empty row for separation

        # Summary rows: mean and standard deviation of every column.
        for summary_label, reducer in (('Average', np.mean), ('Std Dev', np.std)):
            summary_row = {'Fold': summary_label}
            summary_row.update((name, reducer(values)) for name, values in metric_columns.items())
            writer.writerow(summary_row)

    print(f"Results exported to {csv_file}")