# Random Forest

In [43]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
from torch.utils.data import DataLoader, TensorDataset
from torch.utils.tensorboard import SummaryWriter
import os

# Define the updated neural network architecture
class SimpleFFNN(nn.Module):
    """Feed-forward network: two ReLU hidden layers followed by a sigmoid output.

    Args:
        input_size: Number of input features.
        hidden_size1: Width of the first hidden layer.
        hidden_size2: Width of the second hidden layer.
        output_size: Number of output units; each is squashed into (0, 1).
    """

    def __init__(self, input_size, hidden_size1, hidden_size2, output_size):
        super().__init__()
        # The attribute names are also the state_dict keys stored in
        # checkpoints, so they must stay exactly as they are.
        self.hidden1 = nn.Linear(input_size, hidden_size1)
        self.hidden2 = nn.Linear(hidden_size1, hidden_size2)
        self.output = nn.Linear(hidden_size2, output_size)

    def forward(self, x):
        """Return the per-output sigmoid activations for the batch ``x``."""
        activations = x
        for hidden_layer in (self.hidden1, self.hidden2):
            activations = hidden_layer(activations).relu()
        return self.output(activations).sigmoid()

def train_and_evaluate_model(model, criterion, optimizer, scheduler, train_loader, val_loader, num_epochs, device, writer, checkpoint_path):
    """Train ``model`` and checkpoint the epoch with the lowest validation loss.

    If ``checkpoint_path`` already exists, the model and optimizer state stored
    there are restored and training continues with the epoch *after* the one
    recorded in the checkpoint.

    Args:
        model: Module whose output is passed straight to ``criterion``.
        criterion: Loss callable taking ``(outputs, targets)``.
        optimizer: Optimizer stepped once per training batch.
        scheduler: Scheduler whose ``step`` receives the epoch's validation loss.
        train_loader: Iterable of ``(inputs, targets)`` training batches.
        val_loader: Sized iterable of ``(inputs, targets)`` validation batches.
        num_epochs: Total number of epochs; epoch indices run to ``num_epochs - 1``.
        device: Device that batches and a restored checkpoint are mapped to.
        writer: Object exposing ``add_scalar(tag, value, step)``.
        checkpoint_path: File the best state is saved to and resumed from.

    Returns:
        The lowest validation loss seen, which may be the value restored from
        an existing checkpoint if no later epoch improved on it.
    """
    best_val_loss = float('inf')
    start_epoch = 0

    # Load checkpoint if it exists
    if os.path.exists(checkpoint_path):
        # map_location allows a checkpoint written on one device (e.g. CUDA)
        # to be resumed on another (e.g. CPU).
        checkpoint = torch.load(checkpoint_path, map_location=device)
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        # The stored value is the index of an epoch that already finished,
        # so training resumes with the next one instead of repeating it.
        start_epoch = checkpoint['epoch'] + 1
        best_val_loss = checkpoint['best_val_loss']
        print(f"Loaded checkpoint from {checkpoint_path}. Starting from epoch {start_epoch + 1}")

    for epoch in range(start_epoch, num_epochs):
        model.train()
        for batch_X, batch_y in train_loader:
            batch_X, batch_y = batch_X.to(device), batch_y.to(device)
            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()

        model.eval()
        with torch.no_grad():
            val_loss = 0
            val_outputs = []
            val_labels = []
            for batch_X, batch_y in val_loader:
                batch_X, batch_y = batch_X.to(device), batch_y.to(device)
                outputs = model(batch_X)
                val_loss += criterion(outputs, batch_y).item()
                val_outputs.extend(outputs.cpu().numpy())
                val_labels.extend(batch_y.cpu().numpy())

            # Mean of the per-batch losses.
            val_loss /= len(val_loader)
            scheduler.step(val_loss)  # Update the learning rate based on validation loss
            val_outputs = np.array(val_outputs)
            val_labels = np.array(val_labels)
            # Element-wise accuracy after thresholding the outputs at 0.5.
            val_accuracy = ((val_outputs > 0.5) == val_labels).mean()

            writer.add_scalar('Val_Loss', val_loss, epoch)
            writer.add_scalar('Val_Accuracy', val_accuracy, epoch)

            if val_loss < best_val_loss:
                best_val_loss = val_loss
                torch.save({
                    'epoch': epoch,
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'best_val_loss': best_val_loss
                }, checkpoint_path)
                print("Validation Accuracy is ", val_accuracy)
                print(f"Epoch {epoch+1}: Val Loss improved to {val_loss:.4f}, saved checkpoint")
            else:
                print(f"Epoch {epoch+1}: Val Loss did not improve from {best_val_loss:.4f}")

    return best_val_loss

# Load and preprocess the data
data_directory = '../../Data/Filtered_split_training_data/'
chromosome_number = 8
file_name = data_directory + f"23AndMe_PRS313_merged_chr{chromosome_number}_matching_split.parquet"
data = pd.read_parquet(file_name)

print("Unknown PRS313 SNPs: ", data[[col for col in data.columns if "PRS313_Unknown" in col]].shape[1])
print("Known PRS313 SNPs: ", data[[col for col in data.columns if "PRS313_Known" in col]].shape[1])
print("23AndMe SNPs with LD to Unknown PRS313 SNPs: ", data[[col for col in data.columns if "PRS313_" not in col]].shape[1])

# Split the data into train and validation sets.
# Inputs are the columns whose name does not contain "Unknown"; targets are
# the columns whose name does.
X = torch.tensor(data.filter(regex='^(?!.*Unknown)').values, dtype=torch.float32)
y = torch.tensor(data.filter(regex='Unknown').values, dtype=torch.float32)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

train_dataset = TensorDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)

val_dataset = TensorDataset(X_val, y_val)
val_loader = DataLoader(val_dataset, batch_size=128)

# Set up the model, loss function, optimizer, and learning rate scheduler
input_size = X_train.shape[1]
hidden_size1 = 20
hidden_size2 = 20
output_size = y_train.shape[1]

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = SimpleFFNN(input_size, hidden_size1, hidden_size2, output_size).to(device)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.01, patience=5, verbose=True)

# Train the model
num_epochs = 100
checkpoint_path = f'checkpoint_chr{chromosome_number}.pth'
writer = SummaryWriter()
best_val_loss = train_and_evaluate_model(model, criterion, optimizer, scheduler, train_loader, val_loader, num_epochs, device, writer, checkpoint_path)
writer.close()

# Load the best model and evaluate on the validation set
best_model = SimpleFFNN(input_size, hidden_size1, hidden_size2, output_size).to(device)
# map_location keeps the reload working when the checkpoint was written on a
# different device than the one currently available.
checkpoint = torch.load(checkpoint_path, map_location=device)
best_model.load_state_dict(checkpoint['model_state_dict'])
best_model.eval()

with torch.no_grad():
    # Count matches over every element so the reported accuracy covers the
    # whole validation set; a smaller final batch must not be over-weighted
    # and the figure must not come from the last batch alone.
    correct = 0
    total = 0

    for batch_X, batch_y in val_loader:
        batch_X, batch_y = batch_X.to(device), batch_y.to(device)
        outputs = best_model(batch_X)
        matches = (outputs > 0.5) == batch_y
        correct += matches.sum().item()
        total += matches.numel()

    val_accuracy = correct / total if total else float('nan')

    print(f"Best Model - Val Accuracy: {val_accuracy:.4f}")

Unknown PRS313 SNPs:  28
Known PRS313 SNPs:  14
23AndMe SNPs with LD to Unknown PRS313 SNPs:  1262
Validation Accuracy is  0.6070715711434275
Epoch 1: Val Loss improved to 0.6113, saved checkpoint
Validation Accuracy is  0.7159965782720273
Epoch 2: Val Loss improved to 0.5567, saved checkpoint
Validation Accuracy is  0.7318220701454234
Epoch 3: Val Loss improved to 0.5314, saved checkpoint
Validation Accuracy is  0.7322497861420018
Epoch 4: Val Loss improved to 0.5212, saved checkpoint
Validation Accuracy is  0.7391645280866838
Epoch 5: Val Loss improved to 0.5160, saved checkpoint
Validation Accuracy is  0.7403763900769889
Epoch 6: Val Loss improved to 0.5123, saved checkpoint
Validation Accuracy is  0.7432278300541774
Epoch 7: Val Loss improved to 0.5087, saved checkpoint
Validation Accuracy is  0.7517821499857428
Epoch 8: Val Loss improved to 0.5051, saved checkpoint
Validation Accuracy is  0.7542771599657827
Epoch 9: Val Loss improved to 0.5018, saved checkpoint
Validation Accuracy

In [51]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Load and preprocess the data
import csv
import os

data_directory = '../../Data/Filtered_split_training_data/'
output_folder = "../../Data/model_results/logistic_regression/"


# Define the logistic regression model (once, not once per chromosome)
class LogisticRegression(nn.Module):
    """Single linear layer followed by an element-wise sigmoid."""

    def __init__(self, input_dim, output_dim):
        super(LogisticRegression, self).__init__()
        self.linear = nn.Linear(input_dim, output_dim)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        out = self.linear(x)
        out = self.sigmoid(out)
        return out


# Set the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Set the hyperparameters that do not depend on the chromosome
learning_rate = 0.001
num_epochs = 400
batch_size = 128
num_folds = 5

# Define the loss function
criterion = nn.BCELoss()

# The CSV export below fails if the destination folder is missing.
os.makedirs(output_folder, exist_ok=True)

# NOTE(review): the loop starts at chromosome 5 — presumably resuming an
# interrupted run; confirm whether chromosomes 1-4 should be included.
for chromosome_number in range(5, 23):
    file_name = data_directory + f"23AndMe_PRS313_merged_chr{chromosome_number}_matching_split.parquet"
    data = pd.read_parquet(file_name)
    print("Unknown PRS313 SNPs: ", data[[col for col in data.columns if "PRS313_Unknown" in col]].shape[1])
    print("Known PRS313 SNPs: ", data[[col for col in data.columns if "PRS313_Known" in col]].shape[1])
    print("23AndMe SNPs with LD to Unknown PRS313 SNPs: ", data[[col for col in data.columns if "PRS313_" not in col]].shape[1])

    # Split the data into features and target: columns without "Unknown" in
    # their name are inputs, columns with it are targets.
    X = torch.tensor(data.filter(regex='^(?!.*Unknown)').values, dtype=torch.float32)
    y = torch.tensor(data.filter(regex='Unknown').values, dtype=torch.float32)

    input_dim = X.shape[1]
    output_dim = y.shape[1]

    # K-fold cross-validation; every list below holds one entry per fold
    kfold = KFold(n_splits=num_folds, shuffle=True, random_state=42)
    fold_accuracies = []
    fold_precisions = []
    fold_recalls = []
    fold_f1_scores = []
    fold_roc_auc_scores = []
    fold_train_losses = []
    fold_val_losses = []

    for fold, (train_idx, val_idx) in enumerate(kfold.split(X)):
        print(f"Fold {fold + 1}/{num_folds}")

        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]

        train_dataset = TensorDataset(X_train, y_train)
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

        val_dataset = TensorDataset(X_val, y_val)
        val_loader = DataLoader(val_dataset, batch_size=batch_size)

        model = LogisticRegression(input_dim, output_dim).to(device)
        optimizer = optim.Adam(model.parameters(), lr=learning_rate)

        model.train()
        train_loss = 0.0
        for epoch in range(num_epochs):
            train_loss = 0.0
            for batch_X, batch_y in train_loader:
                batch_X, batch_y = batch_X.to(device), batch_y.to(device)

                outputs = model(batch_X)
                loss = criterion(outputs, batch_y)

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                train_loss += loss.item()

            train_loss /= len(train_loader)

        # Record one value per fold (the final epoch's mean batch loss) so
        # that fold_train_losses[fold] in the CSV export refers to this fold.
        fold_train_losses.append(train_loss)

        model.eval()
        with torch.no_grad():
            val_outputs = model(X_val.to(device))
            # Targets must live on the same device as the predictions.
            y_val_device = y_val.to(device)
            val_loss = criterion(val_outputs, y_val_device).item()
            fold_val_losses.append(val_loss)

            val_preds = (val_outputs > 0.5).float()
            # Stored as a plain float so np.mean/np.std and the CSV writer
            # do not receive (possibly CUDA) tensors.
            val_accuracy = (val_preds == y_val_device).float().mean().item()

            y_true_np = y_val.cpu().numpy()
            preds_np = val_preds.cpu().numpy()
            scores_np = val_outputs.cpu().numpy()
            val_precision = precision_score(y_true_np, preds_np, average='micro')
            val_recall = recall_score(y_true_np, preds_np, average='micro')
            val_f1 = f1_score(y_true_np, preds_np, average='micro')
            val_roc_auc = roc_auc_score(y_true_np, scores_np, average='micro')

            fold_accuracies.append(val_accuracy)
            fold_precisions.append(val_precision)
            fold_recalls.append(val_recall)
            fold_f1_scores.append(val_f1)
            fold_roc_auc_scores.append(val_roc_auc)

            print(f"Fold {fold + 1}/{num_folds}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}, Val Precision: {val_precision:.4f}, Val Recall: {val_recall:.4f}, Val F1: {val_f1:.4f}, Val ROC AUC: {val_roc_auc:.4f}")

    print(f"Average Accuracy: {np.mean(fold_accuracies):.4f} +/- {np.std(fold_accuracies):.4f}")
    print(f"Average Precision: {np.mean(fold_precisions):.4f} +/- {np.std(fold_precisions):.4f}")
    print(f"Average Recall: {np.mean(fold_recalls):.4f} +/- {np.std(fold_recalls):.4f}")
    print(f"Average F1 Score: {np.mean(fold_f1_scores):.4f} +/- {np.std(fold_f1_scores):.4f}")
    print(f"Average ROC AUC: {np.mean(fold_roc_auc_scores):.4f} +/- {np.std(fold_roc_auc_scores):.4f}")

    # Export results to CSV
    csv_file = output_folder + f'cross_validation_results_chr{chromosome_number}.csv'
    fieldnames = ['Fold', 'Train Loss', 'Val Loss', 'Val Accuracy', 'Val Precision', 'Val Recall', 'Val F1', 'Val ROC AUC']

    with open(csv_file, 'w', newline='') as file:
        writer = csv.DictWriter(file, fieldnames=fieldnames)
        writer.writeheader()

        for fold in range(num_folds):
            writer.writerow({
                'Fold': fold + 1,
                'Train Loss': fold_train_losses[fold],
                'Val Loss': fold_val_losses[fold],
                'Val Accuracy': fold_accuracies[fold],
                'Val Precision': fold_precisions[fold],
                'Val Recall': fold_recalls[fold],
                'Val F1': fold_f1_scores[fold],
                'Val ROC AUC': fold_roc_auc_scores[fold]
            })

        writer.writerow({})  # Empty row for separation
        writer.writerow({
            'Fold': 'Average',
            'Train Loss': np.mean(fold_train_losses),
            'Val Loss': np.mean(fold_val_losses),
            'Val Accuracy': np.mean(fold_accuracies),
            'Val Precision': np.mean(fold_precisions),
            'Val Recall': np.mean(fold_recalls),
            'Val F1': np.mean(fold_f1_scores),
            'Val ROC AUC': np.mean(fold_roc_auc_scores)
        })
        writer.writerow({
            'Fold': 'Std Dev',
            'Train Loss': np.std(fold_train_losses),
            'Val Loss': np.std(fold_val_losses),
            'Val Accuracy': np.std(fold_accuracies),
            'Val Precision': np.std(fold_precisions),
            'Val Recall': np.std(fold_recalls),
            'Val F1': np.std(fold_f1_scores),
            'Val ROC AUC': np.std(fold_roc_auc_scores)
        })

    print(f"Results exported to {csv_file}")

Unknown PRS313 SNPs:  54
Known PRS313 SNPs:  14
23AndMe SNPs with LD to Unknown PRS313 SNPs:  1850


KeyboardInterrupt: 

In [None]:
import numpy as np
from tqdm import tqdm
from torch.utils.data import TensorDataset, DataLoader
from torch.optim import Adam
from torch.nn import BCEWithLogitsLoss

def train(model, dataloader, optimizer, criterion, device):
    """Run one optimisation pass over ``dataloader`` and return the mean batch loss.

    Each batch is an ``(inputs, targets, mask)`` triple. The model is called
    with the inputs and the mask, and the loss is computed only on the
    positions where the mask is true.
    """
    model.train()
    batch_losses = []
    for features, targets, mask in dataloader:
        features = features.to(device)
        targets = targets.to(device)
        mask = mask.to(device)

        optimizer.zero_grad()
        predictions = model(features, mask)
        # Only the masked positions contribute to the loss.
        batch_loss = criterion(predictions[mask], targets[mask])
        batch_loss.backward()
        optimizer.step()

        batch_losses.append(batch_loss.item())
    return sum(batch_losses) / len(dataloader)

def evaluate(model, dataloader, criterion, device):
    """Return the mean batch loss of ``model`` over ``dataloader`` without updating it.

    Each batch is an ``(inputs, targets, mask)`` triple. The model is put in
    eval mode and gradients are disabled. The loss is computed only on the
    positions where the mask is true.
    """
    model.eval()
    batch_losses = []
    with torch.no_grad():
        for features, targets, mask in dataloader:
            features = features.to(device)
            targets = targets.to(device)
            mask = mask.to(device)

            predictions = model(features, mask)
            batch_losses.append(criterion(predictions[mask], targets[mask]).item())
    return sum(batch_losses) / len(dataloader)

def run_training(X_train, y_train, X_val, y_val, hidden_dim, num_layers, num_heads, dropout, dim_feedforward, batch_size, learning_rate, num_epochs, device, checkpoint_path='best_model.pt', mask_fraction=0.2):
    """Train a ``TransformerImputer`` and save the weights with the lowest validation loss.

    A random boolean mask with the same shape as the inputs is drawn once for
    the training set and once for the validation set; each entry is true with
    probability ``mask_fraction``. The masks are handed to ``train`` and
    ``evaluate`` as the third element of every batch.

    NOTE(review): ``train``/``evaluate`` index both the model output and the
    targets with this mask, which only works if they share the inputs' shape —
    confirm against ``TransformerImputer`` and the targets actually passed in.

    Args:
        X_train, y_train: Training inputs and targets (array-like or tensors).
        X_val, y_val: Validation inputs and targets (array-like or tensors).
        hidden_dim, num_layers, num_heads, dropout, dim_feedforward: Forwarded
            unchanged to the ``TransformerImputer`` constructor.
        batch_size: Batch size for both data loaders.
        learning_rate: Learning rate for Adam.
        num_epochs: Number of train/evaluate rounds.
        device: Device the model is moved to and batches are sent to.
        checkpoint_path: Where the best ``state_dict`` is written.
        mask_fraction: Probability that a given mask entry is true.

    Returns:
        The lowest validation loss observed over all epochs.
    """
    input_dim = X_train.shape[1]

    def _masked_dataset(features, targets):
        # np.random is kept as the mask source so seeding behaves as before.
        mask = torch.tensor(np.random.rand(*features.shape) < mask_fraction)
        # as_tensor avoids the copy-construct warning that torch.tensor()
        # raises when the inputs are already tensors.
        return TensorDataset(torch.as_tensor(features), torch.as_tensor(targets), mask)

    train_dataset = _masked_dataset(X_train, y_train)
    val_dataset = _masked_dataset(X_val, y_val)

    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

    model = TransformerImputer(input_dim, hidden_dim, num_layers, num_heads, dropout, dim_feedforward).to(device)
    optimizer = Adam(model.parameters(), lr=learning_rate)
    criterion = BCEWithLogitsLoss()

    best_val_loss = float('inf')
    for epoch in range(num_epochs):
        train_loss = train(model, train_dataloader, optimizer, criterion, device)
        val_loss = evaluate(model, val_dataloader, criterion, device)

        print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), checkpoint_path)

    return best_val_loss

# Hyperparameter tuning with k-fold cross-validation (exhaustive grid search).
# NOTE(review): this cell relies on the `kfold` splitter and the `X` / `y`
# tensors left behind by the logistic-regression cell. That splitter is built
# with 5 folds, not 10 — confirm the intended fold count.
import itertools

hidden_dims = [64, 128, 256]
num_layers_options = [2, 4, 6]
num_heads_options = [2, 4, 8]
dropout_options = [0.1, 0.2, 0.3]
dim_feedforward_options = [128, 256, 512]  # Add dim_feedforward options
batch_sizes = [32, 64, 128]
learning_rates = [1e-3, 1e-4, 1e-5]
num_epochs = 50
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# run_training optimises BCEWithLogitsLoss, so the held-out score must use the
# same loss rather than whichever `criterion` an earlier cell left in scope.
test_criterion = BCEWithLogitsLoss()

best_hyperparams = None
best_val_loss = float('inf')

# itertools.product iterates in the same order as the equivalent nested loops
# (the right-most option list varies fastest).
search_grid = itertools.product(
    hidden_dims,
    num_layers_options,
    num_heads_options,
    dropout_options,
    dim_feedforward_options,
    batch_sizes,
    learning_rates,
)

for hidden_dim, num_layers, num_heads, dropout, dim_feedforward, batch_size, learning_rate in search_grid:
    print(f"Hidden Dim: {hidden_dim}, Num Layers: {num_layers}, Num Heads: {num_heads}, Dropout: {dropout}, Dim Feedforward: {dim_feedforward}, Batch Size: {batch_size}, Learning Rate: {learning_rate}")

    val_losses = []
    for train_idx, test_idx in kfold.split(X):
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        # First half of the held-out fold selects the checkpoint inside
        # run_training; the second half scores the selected checkpoint.
        X_val, X_test = X_test[:len(X_test)//2], X_test[len(X_test)//2:]
        y_val, y_test = y_test[:len(y_test)//2], y_test[len(y_test)//2:]

        run_training(X_train, y_train, X_val, y_val, hidden_dim, num_layers, num_heads, dropout, dim_feedforward, batch_size, learning_rate, num_epochs, device)

        # Rebuild the model with the input width of the data actually used
        # here, then restore the best weights written by run_training.
        model = TransformerImputer(X.shape[1], hidden_dim, num_layers, num_heads, dropout, dim_feedforward).to(device)
        model.load_state_dict(torch.load('best_model.pt', map_location=device))

        test_mask = torch.tensor(np.random.rand(*X_test.shape) < 0.2)
        test_dataset = TensorDataset(torch.as_tensor(X_test), torch.as_tensor(y_test), test_mask)
        test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

        val_loss = evaluate(model, test_dataloader, test_criterion, device)
        val_losses.append(val_loss)

    mean_val_loss = np.mean(val_losses)
    print(f"Mean Val Loss: {mean_val_loss:.4f}")

    if mean_val_loss < best_val_loss:
        best_val_loss = mean_val_loss
        best_hyperparams = (hidden_dim, num_layers, num_heads, dropout, dim_feedforward, batch_size, learning_rate)

if best_hyperparams is None:
    # Reached only if no configuration had a mean loss below infinity
    # (e.g. every run produced NaN).
    print("No hyperparameter combination produced a finite validation loss.")
else:
    print(f"Best Hyperparameters: Hidden Dim: {best_hyperparams[0]}, Num Layers: {best_hyperparams[1]}, Num Heads: {best_hyperparams[2]}, Dropout: {best_hyperparams[3]}, Dim Feedforward: {best_hyperparams[4]}, Batch Size: {best_hyperparams[5]}, Learning Rate: {best_hyperparams[6]}")

Hidden Dim: 64, Num Layers: 2, Num Heads: 2, Dropout: 0.1, Dim Feedforward: 128, Batch Size: 32, Learning Rate: 0.001




RuntimeError: mat1 and mat2 must have the same dtype, but got Bool and Float