In [74]:
import numpy
import torch
import torch.nn as nn
import torch.nn.functional as F

from torch.utils.data import DataLoader, Dataset, TensorDataset
from torch.utils.data import random_split

## Part 1: Coding Questions (Python + PyTorch)

Write a Python function to create a PyTorch tensor of shape (5, 3) filled with random floats between 0 and 1.
Demonstrate how to perform element-wise addition, multiplication, and matrix multiplication between two tensors.


#### 1. Building and Training a Simple Model

Implement a simple feedforward neural network in PyTorch that could, in theory, be used for sequence classification (e.g., classifying protein sequences into functional categories).
Include a training loop with dummy data and explain each step of the process, including loss calculation and backpropagation.

In [55]:

class SimpleFeedForward(nn.Module):
    def __init__(self, vocab_size, embed_dim, n_hidden, num_classes):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, embed_dim)
        self.f1 = nn.Linear(embed_dim, n_hidden)
        self.f2 = nn.Linear(n_hidden, num_classes)
    def forward(self, x):
        x=self.emb(x)
        x = self.f1(x)
        x = F.relu(x) #we choose ReLu because
        x = self.f2(x)
        return x

#How do you choose the correct number of hidden units?

In [73]:
#generate dummy data
vocab_size = 50000
embed_dim = 512
num_examples = 256
sequence_length = 6
n_hidden = 256
num_classes = 15

sff = SimpleFeedForward(vocab_size, embed_dim, n_hidden, num_classes)
print(sff)

torch.manual_seed(89)
features = torch.randint(0, vocab_size+1, (num_examples, sequence_length))

labels = torch.randint(0, num_classes, (num_examples,))

print(f'Features: {features.shape}')
print(f'Labels: {labels.shape}')

dataset = TensorDataset(features, labels)
train_loader = DataLoader(dataset, batch_size = 32, shuffle=True, num_workers = 8)

SimpleFeedForward(
  (emb): Embedding(50000, 512)
  (f1): Linear(in_features=512, out_features=256, bias=True)
  (f2): Linear(in_features=256, out_features=15, bias=True)
)
Features: torch.Size([256, 6])
Labels: torch.Size([256])


In [69]:
optimizer = torch.optim.SGD(sff.parameters(), lr= 0.001)
criterion = nn.CrossEntropyLoss()
training_loss=0
for i, batch in enumerate(train_loader):
    inputs, labels = batch
    optimizer.zero_grad()
    outputs = sff(inputs)
    loss = criterion(outputs, labels)
    loss.backward()
    optimizer.step()
    training_loss+=loss.item()
    print(training_loss)




AttributeError: 'Tensor' object has no attribute 'logits'

Working with Sequential Data

Given a sequence of amino acids (represented as a one-hot encoded tensor), write a PyTorch module that uses an LSTM layer to process the sequence and output a classification.
Explain your choice of loss function and the significance of the LSTM's hidden state.
Implementing a Custom Loss Function

In the context of protein sequence generation, design a custom loss function in PyTorch that penalizes deviations from a target protein structure while encouraging novel therapeutics properties. Provide a high-level explanation of how you would implement and integrate this loss function into a training loop.
Diffusion Models for Protein Generation

Provide a brief overview of how you would implement a basic diffusion model in PyTorch for generating protein sequences. Discuss the key components of the model and how they work together to generate new sequences.

#### 2. Generate Dummy Data for the Next Token Prediction Task

In [102]:
vocab_size = 50000
batch_size = 32
num_examples = 1000
sequence_length = 20

data = torch.randint(0, vocab_size+1, (num_examples, sequence_length))
features = data[:, :-1]
targets = data[:, 1:]

print(f'Features: {features.shape}')
print(f'Targets: {targets.shape}')

dataset = TensorDataset(features, targets)
train, test = random_split(dataset, lengths = [0.7, 0.3])

print(f"Training Set: {len(train)}")
print(f"Testing Set: {len(test)}")

Features: torch.Size([1000, 19])
Targets: torch.Size([1000, 19])
Training Set: 700
Testing Set: 300


#### 3. Generate a Simple Feed Forward Neural Network for Binary Classification

In [117]:
class SFFBC(nn.Module):
    def __init__(self, input_dim, hidden_dim):
        super().__init__()
        self.f1 = nn.Linear(input_dim, hidden_dim)
        self.f2 = nn.Linear(hidden_dim, 2)

    def forward(self, x):
        x = self.f1(x)
        x = F.relu(x)
        x = self.f2(x) #no softmax because assuming Cross-Entropy Loss Later On
        return x

In [118]:
input_dim = 512
sffbc = SFFBC(input_dim, 256)
print(sffbc)

SFFBC(
  (f1): Linear(in_features=512, out_features=256, bias=True)
  (f2): Linear(in_features=256, out_features=2, bias=True)
)


4. #### Generate dummy data and train the simple net

In [113]:
num_examples = 1000
features = torch.randn(1000, 512)
labels = torch.randint(0, 2, (num_examples,))

print(f'Features: {features.shape}')
print(f'Labels: {labels.shape}')

Features: torch.Size([1000, 512])
Labels: torch.Size([1000])


In [115]:
train, test = random_split(TensorDataset(features, labels), lengths=[0.7,0.3])

train_loader = DataLoader(train, batch_size=32, shuffle=True, num_workers = 8)

In [127]:
from torch.optim import Adam
optimizer = Adam(sffbc.parameters(), lr = 0.01)
criterion = nn.CrossEntropyLoss()
training_loss = 0
epochs = 2
for epoch in range(epochs):
    sffbc.train()
    for i, batch in enumerate(train_loader):
        inputs, labels = batch
        outputs = sffbc(inputs)
        optimizer.zero_grad()
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        training_loss += loss.item()
        if (i % 5 == 0):
            print(f'Epoch: {epoch} Batch {i} Training Loss: {training_loss/5}')


Epoch: 0 Batch 0 Training Loss: 0.007162179797887802
Epoch: 0 Batch 5 Training Loss: 0.12291115000844002
Epoch: 0 Batch 10 Training Loss: 0.3158949481556192
Epoch: 0 Batch 15 Training Loss: 0.5706950673134997
Epoch: 0 Batch 20 Training Loss: 0.7860884377034381
Epoch: 1 Batch 0 Training Loss: 0.8025870588840917
Epoch: 1 Batch 5 Training Loss: 0.8381681521233986
Epoch: 1 Batch 10 Training Loss: 1.054530896693177
Epoch: 1 Batch 15 Training Loss: 1.5023427549560437
Epoch: 1 Batch 20 Training Loss: 1.8411152269502054
