## Introduction

Deep learning models have become essential tools for learning complex patterns in biological sequences. Unlike linear or traditional nonlinear models, deep architectures can directly learn hierarchical features from raw sequences, capturing both local motifs and long-range dependencies.
In this notebook, we explore two major deep learning families widely used in computational biology: convolutional neural networks (CNNs) and Transformers.

In [1]:
import sklearn
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the three fitness datasets, one CSV each, from the local data_fitness/ directory.
# NOTE(review): PHOT and POLG are not referenced in the cells visible here — confirm they are used later.
CAPSD = pd.read_csv("data_fitness/CAPSD_AAV2S_Sinai_2021.csv")
PHOT = pd.read_csv("data_fitness/PHOT_CHLRE_Chen_2023.csv")
POLG = pd.read_csv("data_fitness/POLG_DEN26_Suphatrakul_2023.csv")

We first process the data from the CAPSD dataset: we one-hot encode the amino acid sequences and then perform the train–test split. We keep only the first 5000 entries to keep the runtime manageable.

`mutated_sequence` contains the amino acid sequences.

`DMS_score` is the experimentally measured fitness.

In [3]:
# Keep only the first 5000 rows; both arrays are sliced identically so that
# sequences[i] and scores[i] still refer to the same row.
sequences = CAPSD["mutated_sequence"].values[:5000]
scores = CAPSD["DMS_score"].values[:5000]

In [4]:
# One-letter codes of the 20 amino acids used as the encoding alphabet.
amino_acids = list("ACDEFGHIKLMNPQRSTVWY")
# One categorical feature per sequence position, each with the same fixed alphabet.
# The number of positions is taken from the first sequence.
# NOTE(review): assumes every sequence has the same length and contains only these
# 20 letters — confirm against the dataset.
encoder = OneHotEncoder(categories=[amino_acids] * len(sequences[0]))
# Split every sequence string into a list of single characters (one column per position).
seq_list = [list(seq) for seq in sequences]
X = encoder.fit_transform(seq_list)
X.shape

(5000, 14700)

We use an 80/20 train–test split. Since fitness values vary in scale, we standardize the labels with StandardScaler, which is fitted on the training labels only and then applied to the test labels.

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Hold out 20% of the samples; the fixed random_state makes the split repeatable.
X_train, X_test, y_train_noscale, y_test_noscale = train_test_split(
    X,
    scores,
    test_size=0.2,
    random_state=42,
)

# Learn the label mean/variance from the training labels only, then apply the
# same transform to both splits. StandardScaler works on 2-D input, hence the
# reshape to a single column and the ravel back to 1-D afterwards.
y_scaler = StandardScaler().fit(y_train_noscale.reshape(-1, 1))
y_train = y_scaler.transform(y_train_noscale.reshape(-1, 1)).ravel()
y_test = y_scaler.transform(y_test_noscale.reshape(-1, 1)).ravel()
X_train.shape, X_test.shape

((4000, 14700), (1000, 14700))

We then define a helper function that prints several metrics for evaluating machine learning models on the held-out test set:

- R2 score (coefficient of determination)
- Mean Squared Error (MSE)
- Mean Absolute Error (MAE)

A higher R2 score and lower MSE and MAE indicate a better model.

In [None]:
def evaluate_model(name, y_true, y_pred):
    """Print R2, MSE and MAE of ``y_pred`` against ``y_true`` under a header.

    Args:
        name: Label shown in the ``--- name ---`` header line.
        y_true: Ground-truth target values.
        y_pred: Predicted values, aligned with ``y_true``.
    """
    report = (
        ("R2:", r2_score),
        ("MSE:", mean_squared_error),
        ("MAE:", mean_absolute_error),
    )
    print(f"--- {name} ---")
    for label, metric in report:
        # Label and value are passed as separate print() arguments so the
        # score is rendered with str(), with a single space after the label.
        print(label, metric(y_true, y_pred))
    print()

Deep learning models can be built with the PyTorch package:

In [7]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
# The models and every batch are moved to this device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device


device(type='cuda')

In [8]:
# Make sure both feature matrices are dense numpy arrays; anything else is
# converted via .toarray().
# NOTE(review): presumably these are the sparse matrices produced by
# OneHotEncoder — confirm. The dense copies can be large.
if not isinstance(X_train, np.ndarray):
    X_train = X_train.toarray()

if not isinstance(X_test, np.ndarray):
    X_test = X_test.toarray()

In [None]:
# Each sequence position is encoded with feature_dim one-hot columns, so the
# number of positions is the flat feature width divided by feature_dim.
feature_dim = 20
seq_len = X_train.shape[1] // feature_dim   


# Guard against a flat width that is not an exact multiple of feature_dim,
# which would make the later reshape to (N, seq_len, feature_dim) invalid.
assert X_train.shape[1] == seq_len * feature_dim, \
    "Input dimension is not divisible by 20. Check one-hot encoding!"



class SequenceDataset(Dataset):
    """Map-style dataset pairing encoded sequences with scalar targets."""

    def __init__(self, X, y, seq_len, feature_dim=20):
        """Store the features as a 3-D array and the targets as floats.

        Args:
            X: numpy array of shape (N, seq_len * feature_dim).
            y: numpy array of shape (N,).
            seq_len: Number of positions per sequence.
            feature_dim: Width of the encoding at each position.
        """
        per_sample_shape = (seq_len, feature_dim)
        # -1 lets numpy infer N from the total number of elements.
        self.X = X.reshape((-1, *per_sample_shape))
        self.y = y.astype(float)

    def __len__(self):
        """Return the number of samples."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return ``(features, target)`` for sample ``idx`` as float32 tensors."""
        features = torch.tensor(self.X[idx], dtype=torch.float32)
        target = torch.tensor(self.y[idx], dtype=torch.float32)
        return features, target


# Wrap both splits; feature_dim is left at the SequenceDataset default of 20.
train_ds = SequenceDataset(X_train, y_train, seq_len)
test_ds  = SequenceDataset(X_test,  y_test,  seq_len)

# Shuffle only the training batches; the test loader keeps the sample order.
train_loader = DataLoader(train_ds, batch_size=64, shuffle=True)
test_loader  = DataLoader(test_ds, batch_size=64, shuffle=False)


In [10]:
import numpy as np

def evaluate_model_torch(name, model, loader):
    """Run ``model`` over ``loader`` and print regression metrics.

    The model is switched to eval mode (and left there), predictions are
    collected without gradient tracking, and the concatenated targets and
    predictions are handed to ``evaluate_model``. Metrics are computed on
    whatever scale the loader's targets are in.

    Args:
        name: Label passed through to ``evaluate_model`` for the report header.
        model: Module mapping a feature batch to one prediction per sample.
        loader: Iterable yielding ``(features, targets)`` tensor batches.

    Raises:
        ValueError: If ``loader`` yields no batches.
    """
    model.eval()
    preds = []
    trues = []

    with torch.no_grad():
        for Xb, yb in loader:
            out = model(Xb.to(device))
            # Flatten every batch to 1-D. A model that squeezes its output
            # returns a 0-d tensor for a batch of one, and np.concatenate
            # rejects zero-dimensional arrays.
            preds.append(out.cpu().numpy().reshape(-1))
            # Targets are only needed on the host, so they are not moved to
            # the compute device first.
            trues.append(yb.cpu().numpy().reshape(-1))

    if not preds:
        raise ValueError("evaluate_model_torch: loader produced no batches")

    preds = np.concatenate(preds)
    trues = np.concatenate(trues)
    evaluate_model(name, trues, preds)


### 1. CNN model

In [11]:
class CNNModel(nn.Module):
    """Two-layer 1-D CNN regressor over per-position sequence features.

    Takes input of shape ``(batch, seq_len, feature_dim)`` and returns one
    scalar prediction per sample, shape ``(batch,)``.

    Args:
        seq_len: Number of positions in every input sequence. The size of the
            first fully connected layer is derived from it, so the model only
            accepts sequences of exactly this length.
        feature_dim: Number of features (input channels) per position.
    """

    def __init__(self, seq_len, feature_dim=20):
        super().__init__()

        self.conv = nn.Sequential(
            nn.Conv1d(feature_dim, 64, kernel_size=5, padding=2),
            nn.ReLU(),
            nn.MaxPool1d(2),   # seq_len → seq_len // 2
            nn.Conv1d(64, 128, kernel_size=5, padding=2),
            nn.ReLU(),
            nn.MaxPool1d(2),   # seq_len // 2 → seq_len // 4
        )

        # Push one all-zero sample through the conv stack to measure the
        # flattened feature size instead of hard-coding the pooling arithmetic.
        with torch.no_grad():
            dummy = torch.zeros(1, feature_dim, seq_len)  # (batch, channels, length)
            conv_out = self.conv(dummy)
            self.flat_dim = conv_out.numel()

        self.fc = nn.Sequential(
            nn.Linear(self.flat_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 1)
        )

    def forward(self, x):
        """Return predictions of shape ``(batch,)`` for ``x`` of shape ``(batch, seq, features)``."""
        # x: (batch, seq, features) → Conv1d needs (batch, channels, seq)
        x = x.permute(0, 2, 1)
        x = self.conv(x)
        x = x.view(x.size(0), -1)
        x = self.fc(x)
        # Drop only the trailing size-1 output dimension. A bare squeeze()
        # would also remove the batch axis for a batch of one, yielding a
        # 0-d tensor that breaks batch concatenation downstream.
        return x.squeeze(-1)

# Build the CNN for the dataset's sequence length and move its parameters to the selected device.
cnn = CNNModel(seq_len).to(device)

In [12]:
def train_model(model, loader, lr=1e-3, epochs=20):
    """Fit ``model`` on ``loader`` with Adam and mean-squared-error loss.

    After every epoch the sum of the per-batch losses is printed (a sum over
    batches, not an average).

    Args:
        model: Module mapping a feature batch to predictions shaped like the targets.
        loader: Iterable yielding ``(features, targets)`` tensor batches.
        lr: Adam learning rate.
        epochs: Number of full passes over ``loader``.
    """
    model.train()
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    def run_epoch():
        """Do one optimisation pass over ``loader`` and return the summed batch loss."""
        running = 0
        for features, targets in loader:
            features, targets = features.to(device), targets.to(device)

            optimizer.zero_grad()
            batch_loss = criterion(model(features), targets)
            batch_loss.backward()
            optimizer.step()

            running += batch_loss.item()
        return running

    for epoch in range(epochs):
        total_loss = run_epoch()
        print(f"Epoch {epoch+1} | Loss: {total_loss:.4f}")


In [13]:
# Train for 20 epochs with train_model's default learning rate, then report
# R2 / MSE / MAE on the held-out test loader.
train_model(cnn, train_loader, epochs=20)
evaluate_model_torch("CNN", cnn, test_loader)


Epoch 1 | Loss: 65.9356
Epoch 2 | Loss: 58.6360
Epoch 3 | Loss: 42.7081
Epoch 4 | Loss: 29.8048
Epoch 5 | Loss: 22.5924
Epoch 6 | Loss: 18.6759
Epoch 7 | Loss: 16.5281
Epoch 8 | Loss: 15.1806
Epoch 9 | Loss: 13.8016
Epoch 10 | Loss: 13.0996
Epoch 11 | Loss: 12.5772
Epoch 12 | Loss: 11.4877
Epoch 13 | Loss: 11.0906
Epoch 14 | Loss: 10.8290
Epoch 15 | Loss: 9.8271
Epoch 16 | Loss: 9.8749
Epoch 17 | Loss: 9.1654
Epoch 18 | Loss: 9.2720
Epoch 19 | Loss: 8.8103
Epoch 20 | Loss: 8.7256
--- CNN ---
R2: 0.8094356262246418
MSE: 0.19345362
MAE: 0.3309806



### 2. Transformer

In [None]:
import torch
import torch.nn as nn
import math


class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a batch-first sequence.

    Even feature columns hold sines and odd columns hold cosines of
    ``position * frequency``. The table is precomputed once for ``max_len``
    positions and stored as a buffer, so it follows the module across devices
    but is not trained.

    Args:
        d_model: Feature width of the input; may be even or odd.
        max_len: Number of positions precomputed. Inputs must have
            ``x.size(1) <= max_len``.
    """

    def __init__(self, d_model, max_len=2048):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        pos = torch.arange(0, max_len).unsqueeze(1)
        div = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        angles = pos * div
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer odd (cosine) column than even
        # (sine) column, so trim the angles to fit. Assigning the untrimmed
        # tensor raises a shape-mismatch error.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        self.register_buffer("pe", pe.unsqueeze(0))

    def forward(self, x):
        """Return ``x`` of shape ``(batch, seq, d_model)`` plus the encodings of its first ``seq`` positions."""
        return x + self.pe[:, :x.size(1)]


class SmallDataConvTransformer(nn.Module):
    """Conv1d front-end followed by a Transformer encoder, read out from a CLS token.

    Input has shape ``(batch, seq_len, feature_dim)``; output is one scalar
    per sample with shape ``(batch,)``.

    Args:
        feature_dim: Number of input features per position.
        d_model: Token width inside the Transformer.
        conv_channels: Output channels of the convolutional front-end.
        kernel_size: Width of the convolution kernel.
        nhead: Number of attention heads per encoder layer.
        num_layers: Number of stacked encoder layers.
        dropout: Dropout probability used in the encoder and the output head.
    """

    def __init__(
        self,
        feature_dim=20,
        d_model=64,
        conv_channels=32,
        kernel_size=5,
        nhead=4,
        num_layers=2,
        dropout=0.2
    ):
        super().__init__()

        # Submodules are created in a fixed order (conv, proj, pos, encoder,
        # cls, fc): the order decides which random numbers each layer draws
        # for its initial weights, so it matters for seeded runs.

        # Padding of kernel_size // 2 keeps the sequence length unchanged for
        # odd kernel sizes.
        same_padding = kernel_size // 2
        self.conv = nn.Conv1d(
            feature_dim,
            conv_channels,
            kernel_size,
            padding=same_padding,
        )
        self.conv_act = nn.ReLU()

        # Per-position linear map from conv channels to the Transformer width.
        self.proj = nn.Linear(conv_channels, d_model)

        self.pos = PositionalEncoding(d_model)

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=2 * d_model,
            dropout=dropout,
            batch_first=True,
            norm_first=True,
        )
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

        # Learned summary token that is prepended to every sequence.
        self.cls = nn.Parameter(torch.randn(1, 1, d_model))

        hidden = d_model // 2
        self.fc = nn.Sequential(
            nn.LayerNorm(d_model),
            nn.Dropout(dropout),
            nn.Linear(d_model, hidden),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden, 1),
        )

    def forward(self, x):
        """Return predictions of shape ``(batch,)`` for ``x`` of shape ``(batch, seq, features)``."""
        batch_size, _seq_len, _ = x.shape

        # Conv1d works channels-first, so swap to (batch, features, seq) for
        # the convolution and back to (batch, seq, channels) afterwards.
        conv_in = x.transpose(1, 2)
        conv_out = self.conv_act(self.conv(conv_in))
        tokens = self.proj(conv_out.transpose(1, 2))

        # Prepend the CLS token, so it occupies position 0 when the
        # positional encoding is added.
        cls_token = self.cls.expand(batch_size, 1, -1)
        tokens = torch.cat([cls_token, tokens], dim=1)

        encoded = self.encoder(self.pos(tokens))

        # Regress from the encoded CLS token only.
        return self.fc(encoded[:, 0]).squeeze(-1)
# Build the model with all default hyperparameters and move it to the selected device.
transformer = SmallDataConvTransformer().to(device)


In [19]:
def train_model(model, loader, lr=5e-5, epochs=20):
    """Fit ``model`` on ``loader`` with Adam and mean-squared-error loss.

    This redefinition shadows the earlier ``train_model``; only the default
    learning rate differs (5e-5 here). After every epoch the sum of the
    per-batch losses is printed (a sum over batches, not an average).

    Args:
        model: Module mapping a feature batch to predictions shaped like the targets.
        loader: Iterable yielding ``(features, targets)`` tensor batches.
        lr: Adam learning rate.
        epochs: Number of full passes over ``loader``.
    """
    model.train()
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    def run_epoch():
        """Do one optimisation pass over ``loader`` and return the summed batch loss."""
        running = 0
        for features, targets in loader:
            features, targets = features.to(device), targets.to(device)

            optimizer.zero_grad()
            batch_loss = criterion(model(features), targets)
            batch_loss.backward()
            optimizer.step()

            running += batch_loss.item()
        return running

    for epoch in range(epochs):
        total_loss = run_epoch()
        print(f"Epoch {epoch+1} | Loss: {total_loss:.4f}")

In [20]:
# Uses the most recently defined train_model (default lr=5e-5 when the cells
# are run in order), then reports R2 / MSE / MAE on the held-out test loader.
train_model(transformer, train_loader, epochs=20)
evaluate_model_torch("Transformer", transformer, test_loader)

Epoch 1 | Loss: 62.9447
Epoch 2 | Loss: 63.0223
Epoch 3 | Loss: 62.9669
Epoch 4 | Loss: 62.9909
Epoch 5 | Loss: 63.0072
Epoch 6 | Loss: 62.9863
Epoch 7 | Loss: 63.0783
Epoch 8 | Loss: 63.1228
Epoch 9 | Loss: 62.9415
Epoch 10 | Loss: 63.0506
Epoch 11 | Loss: 62.9417
Epoch 12 | Loss: 62.9909
Epoch 13 | Loss: 63.0131
Epoch 14 | Loss: 63.1710
Epoch 15 | Loss: 62.9182
Epoch 16 | Loss: 62.8425
Epoch 17 | Loss: 63.0042
Epoch 18 | Loss: 63.0392
Epoch 19 | Loss: 62.8961
Epoch 20 | Loss: 62.9396
--- Transformer ---
R2: -0.004772354686790381
MSE: 1.0200063
MAE: 0.8963697



We can see that the CNN performs well, likely because its convolutional structure can capture motifs in the amino acid sequences. The Transformer, in contrast, does not learn: its training loss stays flat and its test R2 is about 0, i.e. it does no better than predicting the mean label, probably because of the small size of the data. Deep learning models are powerful, but they are not always useful; this depends on the structure of the biological data.