In [1]:
import random
import tqdm
import logging
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from torch.utils.data.sampler import SubsetRandomSampler
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split, RepeatedKFold, KFold, StratifiedKFold

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def encode_seqCDR(seqCDR):
    """Encode a CDR sequence as one flat row of per-residue factor values.

    Each character of ``seqCDR`` is looked up in the module-level ``af``
    table and contributes five values; the padding character ``"*"``
    contributes five zeros instead of a table lookup.

    Returns:
        A NumPy array of shape ``(1, 5 * len(seqCDR))``.
    """
    per_residue = [
        np.zeros(5).reshape(1, 5) if residue == "*"
        else af.loc[residue].values.reshape(1, 5)
        for residue in seqCDR
    ]
    # Stack to (n, 1, 5) first so that an empty sequence still flattens to (1, 0).
    return np.array(per_residue).reshape(1, -1)

# Factor table used by encode_seqCDR: rows are looked up by the "Amino acid" label,
# and the remaining columns are the numeric factors for that residue.
af = pd.read_csv("~/data/project/pMHC-TCR/library/Atchley_factors.csv").set_index("Amino acid")

In [None]:
class TCRDataset(Dataset):
    """TCR CDR3 pairs with synthetic negatives, encoded as a feature matrix.

    The CSV at ``file_path`` must contain the columns ``HLA``, ``Neo``,
    ``AseqCDR_3``, ``BseqCDR_3`` and ``Class``.

    Construction steps:
      1. Negatives: every row whose ``HLA`` is not ``"-"`` is copied, labelled
         ``"negative"``, and its alpha/beta CDR3 are each replaced by a randomly
         drawn CDR3 from the same column that differs from the row's own one.
      2. Positives: rows whose ``Class`` is ``"positive"``.
      3. Both CDR3 columns are right-padded with ``"*"`` to the longest
         sequence in that column and encoded with ``encode_seqCDR``.

    Attributes:
        df: the combined positive/negative frame (padded sequences).
        X_feature: float32 tensor, one row per sample; alpha-chain encoding
            followed by beta-chain encoding.
        y: float32 tensor, 1.0 for ``"positive"`` rows and 0.0 otherwise.

    Args:
        file_path: path of the CSV file to load.
        seed: optional seed for the negative sampling. When ``None`` the global
            ``random`` module state is used.

    Raises:
        ValueError: if no rows remain after building positives/negatives, or
            if a CDR3 column has fewer than two distinct values so that no
            mismatching partner can be drawn.
    """

    SEQ_COLUMNS = ("AseqCDR_3", "BseqCDR_3")
    KEPT_COLUMNS = ("HLA", "Neo", "AseqCDR_3", "BseqCDR_3", "Class")
    PAD_CHAR = "*"

    def __init__(self, file_path, seed=None):
        raw = pd.read_csv(file_path)
        rng = random.Random(seed) if seed is not None else random

        # --- synthetic negatives -------------------------------------------------
        # .copy() so the column assignments below act on an independent frame.
        negatives = raw[raw["HLA"] != "-"].copy()
        negatives["Class"] = "negative"
        for column in self.SEQ_COLUMNS:
            # Candidate pool is computed once per column instead of once per row.
            pool = raw[column].unique().tolist()
            if len(negatives) and len(pool) < 2:
                raise ValueError(
                    f"column {column!r} needs at least two distinct sequences "
                    "to draw mismatched negatives"
                )
            negatives[column] = [
                self._draw_other(pool, own, rng) for own in negatives[column]
            ]

        # --- combine -------------------------------------------------------------
        positives = raw[raw["Class"] == "positive"]
        # ignore_index gives a clean 0..n-1 index (concat would keep duplicates).
        df = pd.concat([positives, negatives], axis=0, ignore_index=True)
        # A list of labels selects columns; a bare tuple would be one (missing) key.
        df = df[list(self.KEPT_COLUMNS)].copy()
        if df.empty:
            raise ValueError(f"no usable rows found in {file_path!r}")

        # --- pad and encode ------------------------------------------------------
        encoded_blocks = []
        for column in self.SEQ_COLUMNS:
            width = int(df[column].str.len().max())
            df[column] = df[column].str.ljust(width, self.PAD_CHAR)
            # encode_seqCDR returns a (1, 5 * width) row; stack all rows in one call.
            encoded_blocks.append(
                np.vstack([encode_seqCDR(seq) for seq in df[column]])
            )
        features = np.hstack(encoded_blocks)

        self.df = df
        self.X_feature = torch.as_tensor(features, dtype=torch.float32)
        self.y = torch.as_tensor(
            (df["Class"] == "positive").to_numpy(), dtype=torch.float32
        )

    @staticmethod
    def _draw_other(pool, own, rng):
        """Return a random element of ``pool`` that is different from ``own``.

        ``pool`` must hold at least two distinct values, otherwise this loops forever.
        """
        while True:
            pick = rng.choice(pool)
            if pick != own:
                return pick

    def __len__(self):
        """Number of samples (positives plus synthetic negatives)."""
        return len(self.y)

    def __getitem__(self, index):
        """Return ``(features, label)`` for the sample at ``index``."""
        return self.X_feature[index], self.y[index]
    


In [None]:
# Not used.
# We may need to find a proper way to encode the HLA amino-acid sequence, because there are
# 10 to 20 different HLA types and some of them have variants that differ by a single amino acid.
# NOTE(review): `df` and `X_feature` must already exist in the kernel when this cell runs — confirm.
hla_list = sorted(set(df["HLA"]))
# Integer code per HLA type = its position in the sorted list of distinct types.
hla_dict = {hla_name: code for code, hla_name in enumerate(hla_list)}
# The encoding could instead apply an autoencoder to the HLA sequence.
df["HLA_encode"] = df["HLA"].map(hla_dict)
X_feature = np.hstack((X_feature, df["HLA_encode"].values.reshape(-1, 1)))

In [3]:
class HLAAutoEncoder_twoLayer(nn.Module):
    """Symmetric fully connected autoencoder.

    Encoder widths: ``input_dim -> 4*hidden_dim -> 2*hidden_dim -> hidden_dim``.
    Decoder widths: ``hidden_dim -> 2*hidden_dim -> 4*hidden_dim -> input_dim``.
    An in-place ReLU sits between consecutive linear layers; the last linear
    layer of each half has no activation.
    """

    @staticmethod
    def _mlp(widths):
        """Chain ``nn.Linear`` layers over consecutive ``widths``, separated by in-place ReLU."""
        modules = []
        for fan_in, fan_out in zip(widths, widths[1:]):
            if modules:
                modules.append(nn.ReLU(True))
            modules.append(nn.Linear(fan_in, fan_out))
        return nn.Sequential(*modules)

    def __init__(self, input_dim, hidden_dim):
        super().__init__()
        funnel = [input_dim, hidden_dim * 4, hidden_dim * 2, hidden_dim]
        self.encoder = self._mlp(funnel)
        # The decoder mirrors the encoder widths in reverse.
        self.decoder = self._mlp(funnel[::-1])

    def forward(self, x):
        """Encode ``x`` to the bottleneck and reconstruct it; output has the input's width."""
        latent = self.encoder(x)
        return self.decoder(latent)

In [None]:
# Input width = 5 values per residue (presumably the five factors used for the CDR
# encoding — confirm) times the length of the longest HLA sequence.
# The length is measured with .str.len(): `.unique().max()` returns the
# lexicographically largest sequence, which is not necessarily the longest one.
# NOTE(review): `df` must be a frame with an "aaSeqHLA" column defined in an earlier cell — confirm.
max_hla_len = int(df["aaSeqHLA"].str.len().max())
model = HLAAutoEncoder_twoLayer(input_dim=5 * max_hla_len, hidden_dim=5)
loss_func = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-2, weight_decay=1e-8)

In [None]:
# Train the autoencoder to reconstruct its own input.
# NOTE(review): `train_loader`, `model`, `loss_func` and `optimizer` must exist in the
# kernel; `train_loader` is assumed to yield plain feature tensors (no labels) — confirm.
epochs = 20
output = []  # kept for later cells; nothing is stored in it yet
losses = []  # reconstruction loss of every batch, in training order
for epoch in range(epochs):
    for batch in train_loader:
        # Tensors carry autograd state themselves; the Variable wrapper is not needed.
        batch = batch.float()
        optimizer.zero_grad()
        reconstruction = model(batch)
        loss = loss_func(reconstruction, batch)
        loss.backward()
        optimizer.step()
        losses.append(loss.item())

In [None]:
# Put sequences of similar length (differing by fewer than 5 aa) into the same batch, and then use the autoencoder to encode the HLA sequences.

class LenMatchBatchSampler(torch.utils.data.BatchSampler):
    """Batch sampler that groups samples with the same amount of zero padding.

    For every index drawn from the wrapped sampler, the sample is read from
    ``self.sampler.data_source`` and its zero entries are counted. The count
    divided by ``values_per_residue`` (integer division) is the bucket key.
    A bucket is emitted as a batch as soon as it holds ``batch_size`` indices.
    Once the sampler is exhausted, the remaining indices are merged in
    ascending bucket order into batches of ``batch_size``; a final short batch
    is emitted only when ``drop_last`` is false.

    The wrapped sampler must expose ``data_source`` (as ``SequentialSampler``
    and ``RandomSampler`` do), and indexing it must return something
    ``torch.as_tensor`` accepts.
    """

    # Encoded values per sequence position; a padded position contributes this many zeros.
    values_per_residue = 5

    def __iter__(self):
        # Dict keyed by padded-position count, so no upper bound on padding is assumed.
        buckets = {}

        for idx in self.sampler:
            sample = torch.as_tensor(self.sampler.data_source[idx])
            padded_positions = int(torch.sum(sample == 0)) // self.values_per_residue
            bucket = buckets.setdefault(padded_positions, [])
            bucket.append(idx)

            if len(bucket) == self.batch_size:
                yield bucket
                # Rebind instead of clearing: the yielded list now belongs to the consumer.
                buckets[padded_positions] = []

        # Merge the partially filled buckets, smallest padding first.
        batch = []
        for padded_positions in sorted(buckets):
            for idx in buckets[padded_positions]:
                batch.append(idx)
                if len(batch) == self.batch_size:
                    yield batch
                    batch = []

        if batch and not self.drop_last:
            yield batch
            