In [None]:
import pandas as pd

# Location of the UniProtKB TSV export that this notebook reads.
DATA_PATH = "/content/uniprotkb_length_TO_5000_AND_reviewed_t_2024_09_20 (1).tsv"

# The file is tab-separated, so let pandas split the columns instead of reading
# every line as one comma-separated field and splitting it by hand afterwards.
# dtype=str / keep_default_na=False keep each field as its literal text, which
# matches what the manual str(...).split('\t') produced.
df = pd.read_csv(DATA_PATH, sep="\t", dtype=str, keep_default_na=False)

# The sequence is the second tab-separated column of each row.
# NOTE(review): the first data row is skipped, exactly as the previous
# `df.values[1:]` did. read_csv already consumes the header line, so this looks
# like it drops one real record -- confirm whether the skip is intentional.
seqs = df.iloc[1:, 1].tolist()

# Compute the lengths once and reuse them for both summary statistics.
seq_lengths = [len(s) for s in seqs]

print(len(seqs), "#seqs")
print(sum(seq_lengths), "tokens")
print(max(seq_lengths))

20398 #seqs
11151556 tokens
4981


In [None]:
import numpy as np

AMINO_ACIDS = "ACDEFGHIKLMNPQRSTVWY"
AA_TO_INDEX = {aa: idx for idx, aa in enumerate(AMINO_ACIDS)}

# Number of rows in every encoded matrix; shorter sequences are zero-padded.
MAX_SEQ_LEN = 5000
# Symbol emitted by one_hot_decode for a position that carries no residue.
UNKNOWN_AA = "X"


def one_hot_encode(sequence, max_len=MAX_SEQ_LEN):
    """One-hot encode a protein sequence into a fixed-size matrix.

    Args:
        sequence: String of one-letter residue codes.
        max_len: Number of rows in the result. Defaults to MAX_SEQ_LEN.

    Returns:
        A float32 array of shape (max_len, len(AMINO_ACIDS)). Row i has a
        single 1 in the column of residue i. Residues that are not in
        AMINO_ACIDS, and the padding rows after the end of the sequence, are
        left all-zero.

    Raises:
        ValueError: If the sequence has more than max_len residues.
    """
    if len(sequence) > max_len:
        raise ValueError(
            f"sequence length {len(sequence)} exceeds max_len {max_len}"
        )

    # float32 halves the memory of the default float64; the matrices only ever
    # hold 0 and 1, so no precision is lost.
    one_hot = np.zeros((max_len, len(AMINO_ACIDS)), dtype=np.float32)

    known = [
        (pos, AA_TO_INDEX[aa])
        for pos, aa in enumerate(sequence)
        if aa in AA_TO_INDEX
    ]
    if known:
        rows, cols = zip(*known)
        one_hot[list(rows), list(cols)] = 1
    return one_hot


def one_hot_decode(one_hot_seq, unknown=UNKNOWN_AA):
    """Decode a one-hot (or score) matrix back into a residue string.

    Each row is decoded to the residue with the highest value. Rows that are
    entirely zero carry no residue: trailing all-zero rows are treated as
    padding and dropped, and all-zero rows inside the sequence are rendered as
    `unknown`. A trailing unknown residue cannot be told apart from padding,
    so it is dropped as well.

    Args:
        one_hot_seq: 2-D array-like of shape (length, len(AMINO_ACIDS)).
            Objects exposing `detach()` (e.g. torch tensors) are converted to
            numpy first.
        unknown: Symbol used for all-zero rows inside the sequence.

    Returns:
        The decoded sequence as a string ("" when no row carries a residue).
    """
    if hasattr(one_hot_seq, "detach"):
        one_hot_seq = one_hot_seq.detach().cpu().numpy()
    scores = np.asarray(one_hot_seq)
    if scores.size == 0:
        return ""

    blank = ~scores.any(axis=1)
    occupied = np.flatnonzero(~blank)
    if occupied.size == 0:
        return ""

    # Everything after the last occupied row is padding.
    end = int(occupied[-1]) + 1
    indices = np.argmax(scores[:end], axis=1)
    return "".join(
        unknown if blank[pos] else AMINO_ACIDS[idx]
        for pos, idx in enumerate(indices)
    )

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader, random_split
import numpy as np

# Define the Dataset class for one-hot encoded protein sequences
class ProteinSequenceDataset(Dataset):
    """Dataset wrapper around a list of already-encoded protein sequences.

    Args:
        sequences: Indexable collection of encoded sequences (array-likes).
            Every item is returned with its own shape unchanged.
    """

    def __init__(self, sequences):
        self.sequences = sequences  # A list of one-hot encoded sequences

    def __len__(self):
        """Return the number of stored sequences."""
        return len(self.sequences)

    def __getitem__(self, idx):
        """Return the encoded sequence at `idx` as a float32 tensor.

        The tensor keeps the shape of the stored item; nothing is flattened.
        """
        # torch.as_tensor replaces the legacy torch.Tensor(...) constructor:
        # the dtype is explicit and float32 arrays are not copied again.
        return torch.as_tensor(self.sequences[idx], dtype=torch.float32)

# Convert protein sequences to one-hot encoded sequences.
# The list is deliberately NOT called `Dataset`: that name is the base class
# imported from torch.utils.data, and overwriting it makes the
# ProteinSequenceDataset class definition fail when this cell is run again.
# NOTE: every encoded matrix is held in memory at once.
encoded_seqs = [one_hot_encode(s) for s in seqs]

# Create a Dataset object
protein_dataset = ProteinSequenceDataset(encoded_seqs)

# Split the dataset into training and testing sets (80% train, 20% test).
# A seeded generator makes the split identical on every run.
train_size = int(0.8 * len(protein_dataset))
test_size = len(protein_dataset) - train_size
split_generator = torch.Generator().manual_seed(0)
train_dataset, test_dataset = random_split(
    protein_dataset, [train_size, test_size], generator=split_generator
)

# Create DataLoaders for training and testing
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

# Define the Autoencoder model
class SequenceAutoencoder(nn.Module):
    """Autoencoder built from two mirrored two-layer perceptrons.

    Args:
        input_size: Size of the last dimension of the input (and the output).
        hidden_size: Width of the hidden layer in both encoder and decoder.
        latent_size: Size of the encoder output / decoder input.
    """

    def __init__(self, input_size, hidden_size, latent_size):
        super().__init__()
        # The encoder is created before the decoder so that parameter
        # initialisation consumes the random stream in the same order.
        self.encoder = self._two_layer_mlp(input_size, hidden_size, latent_size)
        self.decoder = self._two_layer_mlp(latent_size, hidden_size, input_size)

    @staticmethod
    def _two_layer_mlp(n_in, n_hidden, n_out):
        """Build Linear -> ReLU -> Linear mapping n_in features to n_out."""
        return nn.Sequential(
            nn.Linear(n_in, n_hidden),
            nn.ReLU(),
            nn.Linear(n_hidden, n_out),
        )

    def forward(self, x):
        """Encode `x` to the latent space and decode it back."""
        return self.decoder(self.encoder(x))

# Create the model, loss function and optimizer
# The Linear layers act on the last dimension of each batch, so input_size is
# the width of one residue's one-hot vector, not the sequence length.
input_size = 20
hidden_size = 1000
# NOTE(review): latent_size is larger than input_size, so the latent vector is
# wider than its input rather than compressed -- confirm this is intended.
latent_size = 2000

model = SequenceAutoencoder(input_size, hidden_size, latent_size)

criterion = nn.MSELoss()  # Mean squared error loss
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 1000
for epoch in range(num_epochs):
    epoch_loss = 0.0
    for data in train_loader:
        # Forward pass: compute the reconstructed sequences
        reconstructed = model(data)

        # Compute the loss for this batch
        loss = criterion(reconstructed, data)

        # Backpropagation and optimization, once per batch. Summing the loss
        # tensors and calling backward() once per epoch keeps the autograd
        # graph of every batch alive and yields a single update per epoch.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # .item() detaches the value so no graph is retained for logging.
        epoch_loss += loss.item()

    # Print the mean batch loss
    if (epoch+1) % 100 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss/len(train_loader):.4f}')

print("Training completed!")

# # Example of encoding and decoding a sequence
# with torch.no_grad():
#     example_sequence = torch.randn(1, input_size)  # Generate a random input sequence
#     encoded_sequence = model.encoder(example_sequence)
#     decoded_sequence = model.decoder(encoded_sequence)

#     print("Original Sequence:", example_sequence)
#     print("Encoded Vector:", encoded_sequence)
#     print("Reconstructed Sequence:", decoded_sequence)


torch.Size([2, 5000, 20])
torch.Size([2, 5000, 20])
torch.Size([2, 5000, 20])
torch.Size([2, 5000, 20])
torch.Size([2, 5000, 20])
torch.Size([2, 5000, 20])
torch.Size([2, 5000, 20])
torch.Size([2, 5000, 20])
torch.Size([2, 5000, 20])
torch.Size([2, 5000, 20])
torch.Size([2, 5000, 20])
torch.Size([2, 5000, 20])
torch.Size([2, 5000, 20])
