In [2]:
import numpy
import torch
import torch.nn as nn
import torch.nn.functional as F

from torch.utils.data import DataLoader, Dataset, TensorDataset
from torch.utils.data import random_split

## Part 1: Coding Questions (Python + PyTorch)

Write a Python function to create a PyTorch tensor of shape (5, 3) filled with random floats between 0 and 1.
Demonstrate how to perform element-wise addition, multiplication, and matrix multiplication between two tensors.


#### 1. Building and Training a Simple Model

Implement a simple feedforward neural network in PyTorch that could, in theory, be used for sequence classification (e.g., classifying protein sequences into functional categories).
Include a training loop with dummy data and explain each step of the process, including loss calculation and backpropagation.

In [55]:

class SimpleFeedForward(nn.Module):
    def __init__(self, vocab_size, embed_dim, n_hidden, num_classes):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, embed_dim)
        self.f1 = nn.Linear(embed_dim, n_hidden)
        self.f2 = nn.Linear(n_hidden, num_classes)
    def forward(self, x):
        x=self.emb(x)
        x = self.f1(x)
        x = F.relu(x) #we choose ReLu because
        x = self.f2(x)
        return x

#How do you choose the correct number of hidden units?

In [73]:
#generate dummy data
vocab_size = 50000
embed_dim = 512
num_examples = 256
sequence_length = 6
n_hidden = 256
num_classes = 15

sff = SimpleFeedForward(vocab_size, embed_dim, n_hidden, num_classes)
print(sff)

torch.manual_seed(89)
features = torch.randint(0, vocab_size+1, (num_examples, sequence_length))

labels = torch.randint(0, num_classes, (num_examples,))

print(f'Features: {features.shape}')
print(f'Labels: {labels.shape}')

dataset = TensorDataset(features, labels)
train_loader = DataLoader(dataset, batch_size = 32, shuffle=True, num_workers = 8)

SimpleFeedForward(
  (emb): Embedding(50000, 512)
  (f1): Linear(in_features=512, out_features=256, bias=True)
  (f2): Linear(in_features=256, out_features=15, bias=True)
)
Features: torch.Size([256, 6])
Labels: torch.Size([256])


In [69]:
optimizer = torch.optim.SGD(sff.parameters(), lr= 0.001)
criterion = nn.CrossEntropyLoss()
training_loss=0
for i, batch in enumerate(train_loader):
    inputs, labels = batch
    optimizer.zero_grad()
    outputs = sff(inputs)
    loss = criterion(outputs, labels)
    loss.backward()
    optimizer.step()
    training_loss+=loss.item()
    print(training_loss)




AttributeError: 'Tensor' object has no attribute 'logits'

Working with Sequential Data

Given a sequence of amino acids (represented as a one-hot encoded tensor), write a PyTorch module that uses an LSTM layer to process the sequence and output a classification.
Explain your choice of loss function and the significance of the LSTM's hidden state.
Implementing a Custom Loss Function

In the context of protein sequence generation, design a custom loss function in PyTorch that penalizes deviations from a target protein structure while encouraging novel therapeutics properties. Provide a high-level explanation of how you would implement and integrate this loss function into a training loop.
Diffusion Models for Protein Generation

Provide a brief overview of how you would implement a basic diffusion model in PyTorch for generating protein sequences. Discuss the key components of the model and how they work together to generate new sequences.

#### 2. Generate Dummy Data for the Next Token Prediction Task

In [102]:
vocab_size = 50000
batch_size = 32
num_examples = 1000
sequence_length = 20

data = torch.randint(0, vocab_size+1, (num_examples, sequence_length))
features = data[:, :-1]
targets = data[:, 1:]

print(f'Features: {features.shape}')
print(f'Targets: {targets.shape}')

dataset = TensorDataset(features, targets)
train, test = random_split(dataset, lengths = [0.7, 0.3])

print(f"Training Set: {len(train)}")
print(f"Testing Set: {len(test)}")

Features: torch.Size([1000, 19])
Targets: torch.Size([1000, 19])
Training Set: 700
Testing Set: 300


#### 3. Generate a Simple Feed Forward Neural Network for Binary Classification

In [117]:
class SFFBC(nn.Module):
    def __init__(self, input_dim, hidden_dim):
        super().__init__()
        self.f1 = nn.Linear(input_dim, hidden_dim)
        self.f2 = nn.Linear(hidden_dim, 2)

    def forward(self, x):
        x = self.f1(x)
        x = F.relu(x)
        x = self.f2(x) #no softmax because assuming Cross-Entropy Loss Later On
        return x

In [118]:
input_dim = 512
sffbc = SFFBC(input_dim, 256)
print(sffbc)

SFFBC(
  (f1): Linear(in_features=512, out_features=256, bias=True)
  (f2): Linear(in_features=256, out_features=2, bias=True)
)


4. #### Generate dummy data and train the simple net

In [113]:
num_examples = 1000
features = torch.randn(1000, 512)
labels = torch.randint(0, 2, (num_examples,))

print(f'Features: {features.shape}')
print(f'Labels: {labels.shape}')

Features: torch.Size([1000, 512])
Labels: torch.Size([1000])


In [115]:
train, test = random_split(TensorDataset(features, labels), lengths=[0.7,0.3])

train_loader = DataLoader(train, batch_size=32, shuffle=True, num_workers = 8)

In [127]:
from torch.optim import Adam
optimizer = Adam(sffbc.parameters(), lr = 0.01)
criterion = nn.CrossEntropyLoss()
training_loss = 0
epochs = 2
for epoch in range(epochs):
    sffbc.train()
    for i, batch in enumerate(train_loader):
        inputs, labels = batch
        outputs = sffbc(inputs)
        optimizer.zero_grad()
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        training_loss += loss.item()
        if (i % 5 == 0):
            print(f'Epoch: {epoch} Batch {i} Training Loss: {training_loss/5}')


Epoch: 0 Batch 0 Training Loss: 0.007162179797887802
Epoch: 0 Batch 5 Training Loss: 0.12291115000844002
Epoch: 0 Batch 10 Training Loss: 0.3158949481556192
Epoch: 0 Batch 15 Training Loss: 0.5706950673134997
Epoch: 0 Batch 20 Training Loss: 0.7860884377034381
Epoch: 1 Batch 0 Training Loss: 0.8025870588840917
Epoch: 1 Batch 5 Training Loss: 0.8381681521233986
Epoch: 1 Batch 10 Training Loss: 1.054530896693177
Epoch: 1 Batch 15 Training Loss: 1.5023427549560437
Epoch: 1 Batch 20 Training Loss: 1.8411152269502054


#### 3. Write the Equations for and Implement the Self Attention Module

In [168]:
class SelfAttention(nn.Module):
    def __init__(self, d_model, d_out):
        super().__init__()
        self.W_v = nn.Linear(d_model, d_out)
        self.W_q = nn.Linear(d_model, d_out)
        self.W_k = nn.Linear(d_model, d_out)

    def forward(self, x):
        queries = self.W_q(x)
        keys = self.W_k(x)
        values = self.W_v(x)
                               
        attention_scores = queries @ keys.transpose(-2,-1)
        attention_scores = attention_scores/self.d_model**0.5
        attention_weights = F.softmax(attention_scores, dim=-1)
        context_vector = attention_weights @ values

        return context_vector


#### 4. Create an Embedding Layer and LayerNorm Module and Apply the LayerNorm to the Embedding

In [129]:
batch_size = 16
emb_dim = 512
vocab_size = 50000
seq_len = 20
data = torch.randint(0, vocab_size, (batch_size, seq_len))

class Embeddings(nn.Module):
    def __init__(self,vocab_size, num_dim):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, num_dim)
        self.num_dim = num_dim

    def forward(self, x):
        x = self.emb(x)**self.num_dim**0.5

        return x


e = Embeddings(vocab_size, emb_dim)
es = e(data)
print(f"Embeddings Shape: {es.shape}")

Embeddings Shape: torch.Size([16, 20, 512])


In [134]:
class LayerNorm(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, x):
        # standardizes across features for each data point
        #standarization operation (mean/stdev) batched with vectorization
        #[batch size, seq_length, d_model]
        #compute over dimension last dimension which is of size d_model
        mean = torch.mean(x, dim=-1, keepdim = True)
        stdev = torch.std(x, dim=-1, keepdim=True)
        x = (x-mean) / stdev
        return x

ln = LayerNorm()
esln = ln(es)

print(esln.shape)

torch.Size([16, 20, 512])


#### 5. Generate toy image data and tr

#### 6. Hyperparameter Tuning of a Small 

#### 7. Small Network Training on Next Token Prediction

In [11]:
vocab_size = 50000
num_examples = 1000
seq_len = 20
data = torch.randint(0, vocab_size, (num_examples, 20))

features = data[:, :-1]
targets = data[:, 1:]

print(f'Features: {features.shape}')
print(f'Features Example: {features[0]}')
print(f'Targets: {targets.shape}')

dataset = TensorDataset(features, targets)

train, test = random_split(dataset, lengths = [0.7, 0.3])

print(f'Train: {len(train)} Examples')
print(f'Test: {len(test)} Examples')

train_loader = DataLoader(train, batch_size = 32, shuffle=True, num_workers=8)


Features: torch.Size([1000, 19])
Features Example: tensor([43180,  5924, 21651, 43102, 18571,  5080, 46038, 36541, 20136, 40897,
        26704,  9526, 35570,  3553,   528, 27134, 22382, 37898,    83])
Targets: torch.Size([1000, 19])
Train: 700 Examples
Test: 300 Examples


In [12]:
class SelfAttention(nn.Module):
    def __init__(self,emb_dim):
        super().__init__()
        self.emb_dim = emb_dim
        self.W_q = nn.Linear(emb_dim, emb_dim)
        self.W_k = nn.Linear(emb_dim, emb_dim)
        self.W_v = nn.Linear(emb_dim, emb_dim)
    def forward(self, x):

        queries = self.W_q(x)
        keys = self.W_k(x)
        values = self.W_v(x)

        attention_scores = queries @ keys.transpose(-2, -1)
        attention_weights = F.softmax(attention_scores/self.emb_dim**0.5, dim=-1)

        context_vectors = attention_weights @ values

        return context_vectors

In [26]:
class SequenceNN(nn.Module):
    def __init__(self, vocab_size, emb_dim):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, emb_dim)
        self.attn = SelfAttention(emb_dim)
        self.ff = nn.Linear(emb_dim, vocab_size)


    def forward(self, x):
        x = self.emb(x)
        # print(x.shape)
        x = self.attn(x)
        # print(x.shape)
        x = self.ff(x)
        # print(x.shape)
        x = x.permute(0,2,1)
        # print(x.shape)
        #x = F.softmax(x, dim=-1)
        return x
    

In [27]:
emb_dim = 512
s = SequenceNN(vocab_size, emb_dim)

In [28]:
s(features)

tensor([[[-0.0882, -0.0053, -0.0194,  ...,  0.0202,  0.0127,  0.0017],
         [ 0.0556,  0.0846,  0.0866,  ...,  0.1088,  0.1312,  0.0834],
         [ 0.1488,  0.1766,  0.1257,  ...,  0.1424,  0.1886,  0.1634],
         ...,
         [-0.0023,  0.0177,  0.0160,  ...,  0.0238,  0.0289,  0.0237],
         [ 0.0548,  0.0844,  0.1051,  ...,  0.0895,  0.1299,  0.1313],
         [ 0.0423,  0.0093,  0.0357,  ...,  0.0288,  0.0231,  0.0302]],

        [[-0.0936, -0.0937, -0.0843,  ..., -0.1216, -0.0954, -0.0989],
         [-0.0511, -0.0493, -0.0370,  ..., -0.0549, -0.0090, -0.0567],
         [ 0.0091,  0.0444,  0.0353,  ...,  0.0014, -0.0042, -0.0239],
         ...,
         [ 0.1320,  0.1196,  0.1059,  ...,  0.1180,  0.0875,  0.1164],
         [ 0.0028,  0.0112, -0.0011,  ..., -0.0261,  0.0041, -0.0033],
         [-0.0055, -0.0247,  0.0156,  ..., -0.0053,  0.0030, -0.0215]],

        [[-0.1127, -0.1297, -0.1166,  ..., -0.1081, -0.1183, -0.1325],
         [-0.0587, -0.0460, -0.0248,  ..., -0

In [32]:
cel = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(s.parameters(), lr = 0.0001)
epochs = 4
for epoch in range(epochs):
    training_loss = 0
    for i, batch in enumerate(train_loader):
        inputs, labels = batch
        outputs = s(inputs)
        loss = cel(outputs, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        training_loss+=loss.item()
        
        # Report loss every 5 batches
        if (i + 1) % 5 == 0:
            print(f"Epoch {epoch + 1} Batch {i + 1} Batch Loss: {loss.item()}")

    average_loss = training_loss / len(train_loader)
    print(f"Epoch {epoch + 1}/{epochs} Average Loss: {average_loss}")


Epoch 1 Batch 5 Batch Loss: 10.825770378112793
Epoch 1 Batch 10 Batch Loss: 10.82532787322998
Epoch 1 Batch 15 Batch Loss: 10.81826114654541
Epoch 1 Batch 20 Batch Loss: 10.818831443786621
Epoch 1/4 Average Loss: 10.823479088869961
Epoch 2 Batch 5 Batch Loss: 10.821355819702148
Epoch 2 Batch 10 Batch Loss: 10.82321548461914
Epoch 2 Batch 15 Batch Loss: 10.823677062988281
Epoch 2 Batch 20 Batch Loss: 10.826794624328613
Epoch 2/4 Average Loss: 10.823428240689365
Epoch 3 Batch 5 Batch Loss: 10.824016571044922
Epoch 3 Batch 10 Batch Loss: 10.822827339172363
Epoch 3 Batch 15 Batch Loss: 10.820866584777832
Epoch 3 Batch 20 Batch Loss: 10.828808784484863
Epoch 3/4 Average Loss: 10.823449264873158
Epoch 4 Batch 5 Batch Loss: 10.824984550476074
Epoch 4 Batch 10 Batch Loss: 10.81948184967041
Epoch 4 Batch 15 Batch Loss: 10.830227851867676
Epoch 4 Batch 20 Batch Loss: 10.824949264526367
Epoch 4/4 Average Loss: 10.82343647696755
