# Autoencoder for cdr3

In [6]:
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
import torch.optim as optim
import torch.nn as nn
import torch
from autoencoder import ConvAutoEncoder
import numpy as np
import os
import sys
from IPython import get_ipython
from pathlib import Path


notebook_path = get_ipython().run_line_magic("pwd", "")
project_root = Path(notebook_path).parent.parent.parent
sys.path.append(str(project_root)) 
use_gpu=True
if use_gpu and torch.cuda.is_available():
    device = torch.device("cuda:0")
elif use_gpu and torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
num_epochs = 100

batch_size = 64
linear = 19  
latent_dim = 64
num_epochs = 100

In [7]:
data_train_cdr3=torch.load('../../../dataset/datasets_kidera/autoencoder_vdjdb_train/data_train_test_cdr3.pt')

  data_train_cdr3=torch.load('../../../datasets/datasets_kidera/autoencoder_vdjdb_train/data_train_test_cdr3.pt')


### Train Autoencoder

In [None]:
# Division of data
X_train_cdr3, X_val_cdr3 = train_test_split(data_train_cdr3, test_size=0.3, random_state=42)

# DataLoaders

train_loader = DataLoader(TensorDataset(X_train_cdr3), batch_size=batch_size, shuffle=True)
val_loader = DataLoader(TensorDataset(X_val_cdr3), batch_size=batch_size)

# Model
model = ConvAutoEncoder(linear=linear, latent_dim=latent_dim).to(device)

# Loss and Optimizer
optimizer = optim.Adam(model.parameters(), lr=0.0005)
criterion = nn.MSELoss()

# Training
train_losses, val_losses = [], []

for epoch in range(num_epochs):
    model.train()
    epoch_train_loss = 0.0

    for batch in train_loader:
        batch_x = batch[0].to(device)

        optimizer.zero_grad()
        outputs = model(batch_x)
        loss = criterion(outputs, batch_x)
        loss.backward()
        optimizer.step()

        epoch_train_loss += loss.item() * batch_x.size(0)

    train_loss = epoch_train_loss / len(train_loader.dataset)
    train_losses.append(train_loss)

    # Validation
    model.eval()
    epoch_val_loss = 0.0
    with torch.no_grad():
        for batch in val_loader:
            val_x = batch[0].to(device)
            val_outputs = model(val_x)
            loss = criterion(val_outputs, val_x)
            epoch_val_loss += loss.item() * val_x.size(0)

    val_loss = epoch_val_loss / len(val_loader.dataset)
    val_losses.append(val_loss)
    if (epoch + 1) % 10 == 0:
        print(f"[{epoch + 1}/{num_epochs}] Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")


### Save model

In [None]:
torch.save(model.state_dict(), '/projects/tcr_nlp/conv_autoencoder/conv/cdr3.pth')

### Test Autoencoder

In [4]:
model = ConvAutoEncoder(linear=19,latent_dim=64).to(device)
model.load_state_dict(torch.load('/projects/tcr_nlp/conv_autoencoder/conv/cdr3.pth'))
def get_encoded_cdr(cdr_enc, model, batch_size, device='cuda'):      
    """
    Pass cdr3 encodings through an autoencoder and return both encoded (latent) and decoded outputs.

    Args:
        epitope_enc (Tensor): Input tensor of encoded epitopes (e.g., physicochemical features).
        model (nn.Module): Trained autoencoder model.
        batch_size (int): Batch size for processing.
        device (str): Device for computation ('cuda' or 'cpu').

    Returns:
        Tuple[Tensor, Tensor]: 
            - Encoded latent representations (shape: [N, latent_dim])
            - Reconstructed epitopes (same shape as input)
    """
    model.eval()
    model.to(device)
    test_loader = DataLoader(TensorDataset(cdr_enc), batch_size=batch_size)
    encoded_cdr3, decoded_cdr3 = [], []

    with torch.no_grad():
        for batch in test_loader:
            x = batch[0].to(device)
            latent = model.linear_encode(model.encoder(x))
            decoded = model(x)
            encoded_cdr3.append(latent.cpu())
            decoded_cdr3.append(decoded.cpu())
    return torch.cat(encoded_cdr3), torch.cat(decoded_cdr3)

encoded_cdr3,decoded_cdr3 = get_encoded_cdr(data_train_cdr3,model,batch_size)


  model.load_state_dict(torch.load('/projects/tcr_nlp/conv_autoencoder/conv/cdr3.pth'))


In [5]:
np.save('../../../dataset/datasets_kidera/autoencoder_vdjdb_train/encoded_cdr3.npy', encoded_cdr3.numpy())
np.save('../../../dataset/datasets_kidera/check_quality/decoded_cdr3.npy', decoded_cdr3.numpy())