In [33]:
import pandas as pd

df = pd.read_csv('concatenated_snps_processed.csv')

In [34]:
df

Unnamed: 0,chr2:192381934:C:T_maternal,chr2:192381934:C:T_paternal,chr8:129199566:G:A_maternal,chr8:129199566:G:A_paternal,chr8:29509616:A:C_maternal,chr8:29509616:A:C_paternal,chr2:121089731:T:C_maternal,chr2:121089731:T:C_paternal,chr15:75750383:T:C_maternal,chr15:75750383:T:C_paternal,...,"chr5:176134882:T:A,C_maternal","chr5:176134882:T:A,C_paternal","chr5:52679539:C:CA,CAA_maternal","chr5:52679539:C:CA,CAA_paternal","chr7:91459189:A:AT,ATT_maternal","chr7:91459189:A:AT,ATT_paternal","chr10:22861490:A:C,T_maternal","chr10:22861490:A:C,T_paternal","chr22:38583315:AAAAG:AAAAGAAAG,AAAAGAAAGAAAG,A_maternal","chr22:38583315:AAAAG:AAAAGAAAG,AAAAGAAAGAAAG,A_paternal"
HG00096,True,True,False,True,True,True,True,False,True,False,...,False,True,False,False,True,False,True,True,False,False
HG00097,True,True,True,False,True,True,False,False,False,False,...,True,False,False,False,True,False,True,True,False,False
HG00099,True,True,False,False,False,False,False,True,False,False,...,False,False,False,False,True,True,True,True,True,True
HG00100,False,True,False,False,False,True,False,False,False,False,...,True,True,False,False,False,True,False,True,False,False
HG00101,True,True,False,False,True,True,True,False,True,True,...,True,False,False,False,False,False,True,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
NA21137,True,True,False,False,True,True,False,False,False,False,...,True,False,False,False,False,False,True,True,True,False
NA21141,True,True,False,False,False,False,False,False,False,True,...,True,True,False,False,True,True,True,True,False,True
NA21142,True,False,True,False,False,True,False,False,True,False,...,True,True,False,False,False,True,True,True,False,True
NA21143,True,True,False,False,True,True,False,False,True,True,...,False,False,True,False,True,False,False,True,True,False


In [39]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Define the transformer model architecture
class TransformerModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_heads, dropout):
        super(TransformerModel, self).__init__()
        self.embedding = nn.Linear(input_size, hidden_size)
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=hidden_size,
            nhead=num_heads,
            dim_feedforward=hidden_size * 4,
            dropout=dropout
        )
        self.transformer_encoder = nn.TransformerEncoder(
            encoder_layer,
            num_layers=num_layers
        )
        self.output = nn.Linear(hidden_size, input_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.embedding(x)
        x = self.transformer_encoder(x)
        x = self.output(x)
        x = self.sigmoid(x)
        return x

In [42]:
# Set hyperparameters
input_size = 626
hidden_size = 256
num_layers = 3
num_heads = 8
dropout = 0.1
batch_size = 32
num_epochs = 50
learning_rate = 0.001
masking_percentage = 0.2

# Set the random seed for reproducibility
torch.manual_seed(42)


# Prepare the data
def mask_data(data, masking_percentage):
    mask = torch.rand(data.shape) < masking_percentage
    masked_data = data.clone()
    masked_data[mask] = -1  # Set masked values to -1 or any other appropriate value
    return masked_data, mask, data

# Assuming you have the data frame loaded as 'df'
data = torch.tensor(df.values, dtype=torch.float32)
train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)

train_masked_data, train_mask, train_data = mask_data(train_data, masking_percentage)
val_masked_data, val_mask, val_data = mask_data(val_data, masking_percentage)

train_dataset = TensorDataset(train_masked_data, train_mask, train_data)
val_dataset = TensorDataset(val_masked_data, val_mask, val_data)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

# Initialize the model, loss function, and optimizer
model = TransformerModel(input_size, hidden_size, num_layers, num_heads, dropout)
criterion = nn.BCEWithLogitsLoss(reduction='none')  # Use BCEWithLogitsLoss for numerical stability
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0
    train_preds = []
    train_targets = []
    for batch_masked_data, batch_mask, batch_data in train_loader:
        optimizer.zero_grad()
        output = model(batch_masked_data)
        loss = criterion(output[batch_mask], batch_data[batch_mask])
        loss = loss.mean()  # Calculate average loss
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        train_preds.extend((output[batch_mask].sigmoid() > 0.5).cpu().numpy().flatten())
        train_targets.extend(batch_data[batch_mask].cpu().numpy().flatten())
    
    model.eval()
    val_loss = 0.0
    val_preds = []
    val_targets = []
    with torch.no_grad():
        for batch_masked_data, batch_mask, batch_data in val_loader:
            output = model(batch_masked_data)
            loss = criterion(output[batch_mask], batch_data[batch_mask])
            loss = loss.mean()  # Calculate average loss
            val_loss += loss.item()
            val_preds.extend((output[batch_mask].sigmoid() > 0.5).cpu().numpy().flatten())
            val_targets.extend(batch_data[batch_mask].cpu().numpy().flatten())
    
    train_acc = accuracy_score(train_targets, train_preds)
    train_prec = precision_score(train_targets, train_preds)
    train_rec = recall_score(train_targets, train_preds)
    
    val_acc = accuracy_score(val_targets, val_preds)
    val_prec = precision_score(val_targets, val_preds)
    val_rec = recall_score(val_targets, val_preds)
    
    print(f"Epoch [{epoch+1}/{num_epochs}], "
          f"Train Loss: {train_loss / len(train_loader):.4f}, "
          f"Train Acc: {train_acc:.4f}, "
          f"Train Prec: {train_prec:.4f}, "
          f"Train Rec: {train_rec:.4f}, "
          f"Val Loss: {val_loss / len(val_loader):.4f}, "
          f"Val Acc: {val_acc:.4f}, "
          f"Val Prec: {val_prec:.4f}, "
          f"Val Rec: {val_rec:.4f}")

# Save the trained model
torch.save(model.state_dict(), "genomic_imputation_model.pth")



Epoch [1/50], Train Loss: 0.6891, Train Acc: 0.3341, Train Prec: 0.3341, Train Rec: 1.0000, Val Loss: 0.6715, Val Acc: 0.3361, Val Prec: 0.3361, Val Rec: 1.0000
Epoch [2/50], Train Loss: 0.6717, Train Acc: 0.3341, Train Prec: 0.3341, Train Rec: 1.0000, Val Loss: 0.6712, Val Acc: 0.3361, Val Prec: 0.3361, Val Rec: 1.0000
Epoch [3/50], Train Loss: 0.6714, Train Acc: 0.3341, Train Prec: 0.3341, Train Rec: 1.0000, Val Loss: 0.6710, Val Acc: 0.3361, Val Prec: 0.3361, Val Rec: 1.0000
Epoch [4/50], Train Loss: 0.6712, Train Acc: 0.3341, Train Prec: 0.3341, Train Rec: 1.0000, Val Loss: 0.6708, Val Acc: 0.3361, Val Prec: 0.3361, Val Rec: 1.0000
Epoch [5/50], Train Loss: 0.6713, Train Acc: 0.3341, Train Prec: 0.3341, Train Rec: 1.0000, Val Loss: 0.6709, Val Acc: 0.3361, Val Prec: 0.3361, Val Rec: 1.0000
Epoch [6/50], Train Loss: 0.6712, Train Acc: 0.3341, Train Prec: 0.3341, Train Rec: 1.0000, Val Loss: 0.6708, Val Acc: 0.3361, Val Prec: 0.3361, Val Rec: 1.0000
Epoch [7/50], Train Loss: 0.6711, 

KeyboardInterrupt: 

In [41]:
# Load the trained model and keep training it with a learning rate scheduler
# Initialize the model, loss function, and optimizer
model = TransformerModel(input_size, hidden_size, num_layers, num_heads, dropout)
model.load_state_dict(torch.load("genomic_imputation_model.pth"))
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Define the learning rate scheduler
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5, verbose=True)

# Training loop
for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0
    train_preds = []
    train_targets = []
    for batch_masked_data, batch_mask, batch_data in train_loader:
        optimizer.zero_grad()
        output = model(batch_masked_data)
        loss = criterion(output[batch_mask], batch_data[batch_mask])
        loss = loss.mean()  # Calculate average loss
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        train_preds.extend((output[batch_mask].sigmoid() > 0.5).cpu().numpy().flatten())
        train_targets.extend(batch_data[batch_mask].cpu().numpy().flatten())
    
    model.eval()
    val_loss = 0.0
    val_preds = []
    val_targets = []
    with torch.no_grad():
        for batch_masked_data, batch_mask, batch_data in val_loader:
            output = model(batch_masked_data)
            loss = criterion(output[batch_mask], batch_data[batch_mask])
            loss = loss.mean()  # Calculate average loss
            val_loss += loss.item()
            val_preds.extend((output[batch_mask].sigmoid() > 0.5).cpu().numpy().flatten())
            val_targets.extend(batch_data[batch_mask].cpu().numpy().flatten())
    
    # Update the learning rate based on the validation loss
    scheduler.step(val_loss)
    
    train_acc = accuracy_score(train_targets, train_preds)
    train_prec = precision_score(train_targets, train_preds)
    train_rec = recall_score(train_targets, train_preds)
    
    val_acc = accuracy_score(val_targets, val_preds)
    val_prec = precision_score(val_targets, val_preds)
    val_rec = recall_score(val_targets, val_preds)
    
    print(f"Epoch [{epoch+1}/{num_epochs}], "
          f"Train Loss: {train_loss / len(train_loader):.4f}, "
          f"Train Acc: {train_acc:.4f}, "
          f"Train Prec: {train_prec:.4f}, "
          f"Train Rec: {train_rec:.4f}, "
          f"Val Loss: {val_loss / len(val_loader):.4f}, "
          f"Val Acc: {val_acc:.4f}, "
          f"Val Prec: {val_prec:.4f}, "
          f"Val Rec: {val_rec:.4f}, "
          f"Learning Rate: {optimizer.param_groups[0]['lr']:.6f}")

# Save the trained model
torch.save(model.state_dict(), "genomic_imputation_model.pth")



Epoch [1/50], Train Loss: 0.6727, Train Acc: 0.6952, Train Prec: 0.5314, Train Rec: 0.7418, Val Loss: 0.6688, Val Acc: 0.7024, Val Prec: 0.5415, Val Rec: 0.7465, Learning Rate: 0.001000
Epoch [2/50], Train Loss: 0.6667, Train Acc: 0.7033, Train Prec: 0.5394, Train Rec: 0.7643, Val Loss: 0.6682, Val Acc: 0.7140, Val Prec: 0.5581, Val Rec: 0.7159, Learning Rate: 0.001000
Epoch [3/50], Train Loss: 0.6622, Train Acc: 0.7054, Train Prec: 0.5421, Train Rec: 0.7615, Val Loss: 0.6698, Val Acc: 0.6893, Val Prec: 0.5259, Val Rec: 0.7654, Learning Rate: 0.001000
Epoch [4/50], Train Loss: 0.6572, Train Acc: 0.7002, Train Prec: 0.5357, Train Rec: 0.7683, Val Loss: 0.6705, Val Acc: 0.6843, Val Prec: 0.5205, Val Rec: 0.7704, Learning Rate: 0.001000
Epoch [5/50], Train Loss: 0.6524, Train Acc: 0.6914, Train Prec: 0.5255, Train Rec: 0.7845, Val Loss: 0.6701, Val Acc: 0.6963, Val Prec: 0.5345, Val Rec: 0.7461, Learning Rate: 0.001000
Epoch [6/50], Train Loss: 0.6480, Train Acc: 0.6878, Train Prec: 0.521

KeyboardInterrupt: 