# Autoencoder for CDR3

In [1]:
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
import torch.optim as optim
import torch.nn as nn
import torch
from autoencoder_residual import ConvAutoEncoderRes
import numpy as np
import os
import sys
from IPython import get_ipython
from pathlib import Path

# Make the repository root importable. The root is taken to be three levels
# above the notebook's working directory — TODO confirm if the notebook moves.
notebook_path = get_ipython().run_line_magic("pwd", "")
project_root = Path(notebook_path).parent.parent.parent
if str(project_root) not in sys.path:  # keep the cell idempotent on re-runs
    sys.path.append(str(project_root))
from modules.modules_kidera.gpu import GPU

# Device selection is delegated to the project's GPU helper.
use_gpu = True
device = GPU(use_gpu)

# Seed torch's RNGs so weight initialisation and batch shuffling repeat across
# runs (some CUDA kernels may still be non-deterministic).
SEED = 42
torch.manual_seed(SEED)

# Training hyper-parameters
NUM_EPOCHS = 100
BATCH_SIZE = 64
LINEAR = 19  # forwarded to ConvAutoEncoderRes(linear=...)
LATENT_DIM = 64  # forwarded to ConvAutoEncoderRes(latent_dim=...)

### Train Autoencoder

In [2]:
# Load the pre-computed CDR3 encodings used to fit the autoencoder.
# weights_only=True restricts unpickling to tensors and plain containers, which
# avoids executing arbitrary pickled code and silences torch.load's FutureWarning.
data_train_cdr3 = torch.load(
    '../../../dataset/datasets_kidera/autoencoder_vdjdb_train/data_train_test_cdr3.pt',
    weights_only=True,
)

  data_train_cdr3=torch.load('../../../datasets/datasets_kidera/autoencoder_vdjdb_train/data_train_test_cdr3.pt')


In [5]:

# Hold out 30% of the samples for validation (fixed seed -> repeatable split)
X_train_cdr3, X_val_cdr3 = train_test_split(
    data_train_cdr3, test_size=0.3, random_state=42
)

# Batch iterators; only the training loader shuffles
train_loader = DataLoader(
    TensorDataset(X_train_cdr3), batch_size=BATCH_SIZE, shuffle=True
)
val_loader = DataLoader(TensorDataset(X_val_cdr3), batch_size=BATCH_SIZE)

# Network, optimiser and reconstruction objective
model = ConvAutoEncoderRes(linear=LINEAR, latent_dim=LATENT_DIM).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.0005)
criterion = nn.MSELoss()

# Per-epoch loss history
train_losses = []
val_losses = []

for epoch in range(NUM_EPOCHS):
    # ---- optimisation pass ----
    model.train()
    train_loss_sum = 0.0
    for (inputs,) in train_loader:
        inputs = inputs.to(device)

        optimizer.zero_grad()
        loss = criterion(model(inputs), inputs)  # target is the input itself
        loss.backward()
        optimizer.step()

        # The criterion yields a mean; weight it by the batch size so the
        # epoch figure is a per-sample average even with a short last batch.
        train_loss_sum += loss.item() * inputs.size(0)

    train_loss = train_loss_sum / len(train_loader.dataset)
    train_losses.append(train_loss)

    # ---- validation pass (no gradients) ----
    model.eval()
    val_loss_sum = 0.0
    with torch.no_grad():
        for (inputs,) in val_loader:
            inputs = inputs.to(device)
            loss = criterion(model(inputs), inputs)
            val_loss_sum += loss.item() * inputs.size(0)

    val_loss = val_loss_sum / len(val_loader.dataset)
    val_losses.append(val_loss)

    # Report every 10th epoch
    if (epoch + 1) % 10 == 0:
        print(f"[{epoch + 1}/{NUM_EPOCHS}] Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")


  data_train_cdr3=torch.load('../../datasets/autoencoder_vdjdb_train/data_train_test_cdr3.pt')


[10/100] Train Loss: 0.4781 | Val Loss: 0.4793
[20/100] Train Loss: 0.4725 | Val Loss: 0.4726
[30/100] Train Loss: 0.4716 | Val Loss: 0.4717
[40/100] Train Loss: 0.4715 | Val Loss: 0.4714
[50/100] Train Loss: 0.4707 | Val Loss: 0.4709
[60/100] Train Loss: 0.4710 | Val Loss: 0.4707
[70/100] Train Loss: 0.4705 | Val Loss: 0.4706
[80/100] Train Loss: 0.4704 | Val Loss: 0.4705
[90/100] Train Loss: 0.4705 | Val Loss: 0.4705
[100/100] Train Loss: 0.4705 | Val Loss: 0.4704


### Save model

In [6]:
# Persist only the learned parameters (the state_dict), not the module object.
trained_weights = model.state_dict()
torch.save(
    trained_weights,
    '/projects/tcr_nlp/conv_autoencoder/conv_res_block/cdr3_res.pth',
)

### Test Autoencoder

In [5]:
# Rebuild the architecture, then restore the trained weights into it.
model = ConvAutoEncoderRes(linear=LINEAR, latent_dim=LATENT_DIM).to(device)
# map_location='cpu' lets the checkpoint open on hosts without the GPU it was
# saved from (load_state_dict copies the values into the model's own
# parameters, wherever they live). weights_only=True restricts unpickling to
# tensors and plain containers.
state_dict = torch.load(
    '/projects/tcr_nlp/conv_autoencoder/conv_res_block/cdr3_res.pth',
    map_location='cpu',
    weights_only=True,
)
model.load_state_dict(state_dict)
def get_encoded_cdr(cdr_enc, model, batch_size, device='cuda'):
    """
    Run CDR3 encodings through a trained autoencoder in mini-batches.

    For each batch the latent code is taken from
    ``model.linear_encode(model.encoder(x))`` and the reconstruction from a
    full forward pass ``model(x)``. As a side effect the model is switched to
    eval mode and moved to ``device``. Gradients are disabled throughout.

    Args:
        cdr_enc (Tensor): Encoded CDR3 inputs; the first dimension indexes samples.
        model (nn.Module): Autoencoder exposing ``encoder`` and ``linear_encode``.
        batch_size (int): Number of samples per forward pass.
        device (str): Device for computation ('cuda' or 'cpu').

    Returns:
        Tuple[Tensor, Tensor]: CPU tensors concatenated over all batches, in input order:
            - latent representations
            - reconstructions
    """
    model.eval()
    model.to(device)

    latents = []
    reconstructions = []
    loader = DataLoader(TensorDataset(cdr_enc), batch_size=batch_size)

    with torch.no_grad():
        for (inputs,) in loader:
            inputs = inputs.to(device)
            # Latent code: encoder followed by the linear bottleneck
            latents.append(model.linear_encode(model.encoder(inputs)).cpu())
            # Reconstruction: complete forward pass
            reconstructions.append(model(inputs).cpu())

    encoded_all = torch.cat(latents)
    decoded_all = torch.cat(reconstructions)
    return encoded_all, decoded_all

# Encode/decode the full CDR3 set on the device chosen in the config cell.
# Without the explicit argument the helper falls back to its 'cuda' default.
encoded_cdr3, decoded_cdr3 = get_encoded_cdr(
    data_train_cdr3, model, BATCH_SIZE, device=device
)

  model.load_state_dict(torch.load('/projects/tcr_nlp/conv_autoencoder/conv_res_block/cdr3_res.pth'))


In [6]:
# Export the latent codes and the reconstructions as NumPy arrays.
for npy_path, cdr3_tensor in (
    ('../../../dataset/datasets_kidera/autoencoder_vdjdb_train/encoded_cdr3_res.npy', encoded_cdr3),
    ('../../../dataset/datasets_kidera/check_quality/decoded_cdr3_res.npy', decoded_cdr3),
):
    np.save(npy_path, cdr3_tensor.numpy())