In [38]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import torch

def generate_synthetic_data(data_size, sequence_length=50, nucleotides=['A', 'T', 'C', 'G']):
    data = []
    
    for _ in range(data_size):        
        sequence = torch.randint(len(nucleotides), (sequence_length,))
        data.append(sequence)
    
    return data


# Toy LSTM model
class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(LSTMModel, self).__init__()
        self.hidden_size = hidden_size
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
    
    def forward(self, x):
        x, (h_n, _) = self.lstm(x)
        out = self.fc(x)
        return out
    

# Define your DNA dataset class
class DNADataset(Dataset):
    def __init__(self, data):
        self.data = data
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        sequence = self.data[index]
        targets = sequence[-1].clone()
        inputs = sequence[:-1].clone()
        return inputs.float(), targets.long()

# Define your DNA model
class DNAModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(DNAModel, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
    
    def forward(self, x):
        x, _ = self.lstm(x)
        out = self.fc(x)
        return out

# Define hyperparameters
input_size = 4  # Number of nucleotide types (A, T, C, G)
hidden_size = 128
output_size = 4  # Number of classes or prediction targets
learning_rate = 0.1
batch_size = 32
num_epochs = 10
test_size = 0.2
random_state = 42
data_size = 1000

# Generate synthetic DNA sequences
data = generate_synthetic_data(data_size)

# Split the data into training and validation sets
train_data, val_data = train_test_split(data, test_size=test_size, random_state=random_state)

# Create data loaders
train_dataset = DNADataset(train_data)
val_dataset = DNADataset(val_data)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

# Initialize the model, loss function, and optimizer
model = LSTMModel(input_size, hidden_size, output_size)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
for epoch in range(num_epochs):
    # Forward pass
    for inputs, targets in train_loader:
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        # print(loss.item(), outputs.shape, targets.shape)
        # # Backward pass and optimization
        # optimizer.zero_grad()
        # loss.backward()
        # optimizer.step()
        
        # print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")

# # Training loop
# for epoch in range(num_epochs):
#     model.train()  # Set the model to training mode
#     train_loss = 0.0
    
#     for inputs, targets in train_loader:
        
#         # Forward pass
#         outputs = model(inputs)


        
#         # Compute loss
#         loss = criterion(outputs, targets)
        
#         # Backward pass and optimization
#         optimizer.zero_grad()
#         loss.backward()
#         optimizer.step()
        
#         train_loss += loss.item()
    
#     # Compute average training loss for the epoch
#     avg_train_loss = train_loss / len(train_loader)
    
#     # Validation loop
#     model.eval()  # Set the model to evaluation mode
#     val_loss = 0.0
#     val_predictions = []
#     val_targets = []
    
#     with torch.no_grad():
#         for inputs,targets in val_loader:
#             outputs = model(inputs)
            
#             # Compute loss
#             loss = criterion(outputs, targets)
#             val_loss += loss.item()
            
#             # Collect predictions and targets for evaluation
#             _, predicted = torch.max(outputs.data, 1)
#             val_predictions.extend(predicted.tolist())
#             val_targets.extend(targets.tolist())
    
#     # Compute average validation loss and accuracy for the epoch
#     avg_val_loss = val_loss / len(val_loader)
#     accuracy = accuracy_score(val_targets, val_predictions)
    
#     # Print training and validation metrics for the epoch
#     print(f"Epoch [{epoch+1}/{num_epochs}]: Train Loss = {avg_train_loss:.4f}, Val Loss = {avg_val_loss:.4f}, Accuracy = {accuracy:.4f}")

# Save the trained model
# torch.save(model.state_dict(), "dna_model.pth")


: 

: 

In [15]:
model.lstm(torch.randint(0,4,size=(32,100)).float())[0]

tensor([[nan, nan, nan,  ..., nan, nan, nan],
        [nan, nan, nan,  ..., nan, nan, nan],
        [nan, nan, nan,  ..., nan, nan, nan],
        ...,
        [nan, nan, nan,  ..., nan, nan, nan],
        [nan, nan, nan,  ..., nan, nan, nan],
        [nan, nan, nan,  ..., nan, nan, nan]], grad_fn=<SqueezeBackward1>)

In [16]:
import torch
import torch.nn as nn

# Toy LSTM model
class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(LSTMModel, self).__init__()
        self.hidden_size = hidden_size
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
    
    def forward(self, x):
        x, (h_n, _) = self.lstm(x)
        out = self.fc(x)
        return out

# Synthetic data
train_data = torch.tensor([[0, 1, 2, 3], [2, 3, 0, 1], [1, 0, 3, 2], [3, 2, 1, 0]]).float()
train_labels = torch.tensor([0, 1, 1, 0]).long()

# Hyperparameters
input_size = 4
hidden_size = 8
output_size = 2
learning_rate = 0.1
num_epochs = 100

# Initialize the model
model = LSTMModel(input_size, hidden_size, output_size)

# Loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
for epoch in range(num_epochs):
    # Forward pass
    outputs = model(train_data)
    loss = criterion(outputs, train_labels)
    
    # Backward pass and optimization
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    # Print training loss
    if (epoch+1) % 10 == 0:
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")

# Testing
test_data = torch.tensor([[0, 1, 2, 3], [2, 3, 0, 1]]).float()
with torch.no_grad():
    model.eval()
    test_outputs = model(test_data)
    _, predicted = torch.max(test_outputs.data, 1)

print("Predictions:", predicted.tolist())


Epoch [10/100], Loss: 0.0573
Epoch [20/100], Loss: 0.0011
Epoch [30/100], Loss: 0.0002
Epoch [40/100], Loss: 0.0001
Epoch [50/100], Loss: 0.0001
Epoch [60/100], Loss: 0.0001
Epoch [70/100], Loss: 0.0001
Epoch [80/100], Loss: 0.0001
Epoch [90/100], Loss: 0.0001
Epoch [100/100], Loss: 0.0001
Predictions: [0, 1]


In [14]:
import torch
import torch.nn as nn
import torch.optim as optim

# Define the LSTM model
class DNAPredictor(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(DNAPredictor, self).__init__()
        self.hidden_size = hidden_size
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
    
    def forward(self, input_seq):
        lstm_out, _ = self.lstm(input_seq)
        output = self.fc(lstm_out)
        return output

batch_size = 32
L = 100
all_nucleotides = ['A', 'C', 'G', 'T']
num_classes = len(all_nucleotides)
sequences = torch.randint(0,num_classes,size=(batch_size,L))
sequence_tensors = torch.functional.F.one_hot(sequences, num_classes=num_classes).float()
print(sequence_tensors.shape)
inputs = sequence_tensors[:, :-1, :]
targets = sequences[:, 1:]

print(inputs.shape, targets.shape)


# Define hyperparameters

hidden_size = 128
learning_rate = 0.01
num_epochs = 100
input_size = len(all_nucleotides)
output_size = len(all_nucleotides)

# Initialize the model
model = DNAPredictor(input_size, hidden_size, output_size)

# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
for epoch in range(num_epochs):
    model.train()
    optimizer.zero_grad()
    
    # Forward pass
    output = model(inputs)
    
    # Compute the loss
    # print(output.shape, targets.shape)
    loss = criterion(output.view(-1,output.shape[-1]), targets.flatten())
    
    # Backward pass and optimization
    loss.backward()
    optimizer.step()
    
    # Print the loss at every 10th epoch
    if (epoch+1) % 10 == 0:
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}")

# # Generate a DNA sequence by predicting the next nucleotide
# model.eval()

# # Assume you have a test DNA sequence
# test_sequence = "CGTAGTCG"

# # Convert the test sequence to a tensor
# test_tensor = torch.zeros(len(test_sequence), len(all_nucleotides))
# for i, nucleotide in enumerate(test_sequence):
#     test_tensor[i, nucleotide_to_index[nucleotide]] = 1

# # Forward pass and prediction
# predicted_output = model(test_tensor.unsqueeze(0))
# _, predicted_index = torch.max(predicted_output, dim=1)
# predicted_nucleotide = all_nucleotides[predicted_index.item()]

# print(f"Test sequence: {test_sequence}")
# print(f"Predicted next nucleotide: {predicted_nucleotide}")


torch.Size([32, 100, 4])
torch.Size([32, 99, 4]) torch.Size([32, 99])
Epoch 10/100, Loss: 1.383506178855896
Epoch 20/100, Loss: 1.3804831504821777
Epoch 30/100, Loss: 1.373329758644104
Epoch 40/100, Loss: 1.366819143295288
Epoch 50/100, Loss: 1.343001365661621
Epoch 60/100, Loss: 1.307288408279419
Epoch 70/100, Loss: 1.2602723836898804
Epoch 80/100, Loss: 1.1968345642089844
Epoch 90/100, Loss: 1.096571683883667
Epoch 100/100, Loss: 0.9589883685112


In [36]:
output, _ = model.lstm(input_tensor)
input_tensor.shape, target_tensor.shape, output.shape

(torch.Size([3, 7, 4]), torch.Size([3]), torch.Size([3, 7, 128]))