In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from torch.utils.data.sampler import SubsetRandomSampler
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split, RepeatedKFold, KFold, StratifiedKFold

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def encode_seqCDR(seqCDR):
    encoding_list = []
    for i in range(len(seqCDR)):
        if seqCDR[i] == "*":
            encoding_list.append(np.zeros(5).reshape(1,5))
        else:
            encoding_list.append(af.loc[seqCDR[i]].values.reshape(1,5))
    return np.array(encoding_list).reshape(1,-1)

af = pd.read_csv("~/data/project/pMHC-TCR/library/Atchley_factors.csv")
af.index = af["Amino acid"]
af.drop(columns=["Amino acid"], inplace=True)
af

# torch dataset
class TCRDataset(Dataset):
    '''
    Use each two rows of data as a sample (one for alpha chain, one for beta chain), each sample has the common cellname
    Different from the chain, the HLA class is the same for each sample
    The aaSeqCDR1, aaSeqCDR2, aaSeqCDR3 are the CDR1, CDR2, CDR3 of the alpha chain and beta chain, respectively.
    '''
    def __init__(self, file_path):
        df = pd.read_csv(file_path)
        df["Neo"] = df["NeoAA"].str.slice(0,3) + "_" + df["NeoAA"].str.slice(-4,-1)
        df.drop(columns=["NeoAA"], inplace=True)
        for chain in ["AseqCDR", "BseqCDR"]:
            df[chain+"_1"] = df[chain].str.split("_").str[0]
            df[chain+"_2"] = df[chain].str.split("_").str[1]
            df[chain+"_3"] = df[chain].str.split("_").str[2]
            df.drop(columns=[chain], inplace=True)
        seq_list = ["AseqCDR_1", "AseqCDR_2", "AseqCDR_3", "BseqCDR_1", "BseqCDR_2", "BseqCDR_3"]
        # Find the longest sequence in each CDR
        len_map = df[seq_list].applymap(len).max()
        for column in seq_list:
            df[column] = df[column].str.ljust(len_map[column], "*")
        dataset = df[['AseqCDR_1', 'AseqCDR_2', 'AseqCDR_3', 'BseqCDR_1', 'BseqCDR_2', 'BseqCDR_3', 'Neo', 'HLA']]
        for seq in seq_list:
            encode_seq_result = list()
            for i in dataset[seq]:
                encode_seq_result.append(encode_seqCDR(i))
            col_name = seq + "_encode"
            dataset[col_name] = encode_seq_result
        X_feature = np.zeros((dataset.shape[0], 0))
        for seq in seq_list:
            col_name = seq + "_encode"
            col_feature = np.zeros((0, len_map[seq]*5))
            for i in range(dataset.shape[0]):
                col_feature = np.vstack((col_feature, dataset.loc[i, col_name].reshape(1,-1)))
            X_feature = np.hstack((X_feature, col_feature))

        # one-hot encode Neo
        X_neo = dataset["Neo"].values.reshape(-1,1)
        onehotEncoder = OneHotEncoder()
        X_neo = onehotEncoder.fit_transform(X_neo).toarray()

        # one-hot encode HLA
        labelencoder = LabelEncoder()
        y = labelencoder.fit_transform(dataset["HLA"].values.reshape(-1,1))

        # get the final feature matrix
        self.features = np.hstack((X_neo, X_feature))
        self.labels = y

    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):        
        return self.features[idx], self.labels[idx]

In [7]:
file_path = '/home/wuxinchao/data/project/data/seqData/230215.csv'
TCRData = TCRDataset(file_path)

  y = column_or_1d(y, warn=True)


In [3]:
class pMHC_TCR_model(nn.Module):
    def __init__(self, input_size, batch_size=32 ,hidden_size=5, num_layers=2, device="cpu") -> None:
        super().__init__()
        self.input_size = input_size
        self.batch_size = batch_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.device = device
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers)
        self.label = nn.Linear(hidden_size, 1)

    def forward(self, input):
        # input: L, input_size
        if self.batch_size==1:
            x = input.float()
            h_0 = Variable(torch.zeros(self.num_layers, self.hidden_size).to(self.device))
            c_0 = Variable(torch.zeros(self.num_layers, self.hidden_size).to(self.device))
            out, (hn, cn) = self.lstm(x, (h_0, c_0))
            pred = self.label(out[-1, :])
        else:
            x = x.view(-1, self.batch_size, self.seq_length).float()
            # h_0: (num_layers * num_directions, batch, hidden_size)
            h_0 = Variable(torch.zeros(self.num_layers * 1, self.batch_size, self.hidden_size).to(self.device))
            c_0 = Variable(torch.zeros(self.num_layers * 1, self.batch_size, self.hidden_size).to(self.device))
            out, (hn, cn) = self.lstm(x, (h_0, c_0))
            pred = self.label(out[-1, :, :])
        return pred

In [35]:
def train(fold, model, device, train_loader, optimizer, epoch):
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device).float()
        # data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        output = output.view(-1)
        # print(output, target)
        loss = nn.CrossEntropyLoss()(output, target)
        # loss = F.nll_loss(output, target)
        # loss = F.cross_entropy(output, target)
        loss.backward()
        optimizer.step()
        if batch_idx % 100 == 0:
            print('Train Fold/Epoch: {}/{} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                fold, epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))
            
def test(fold, model, device, test_loader):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device).float()
            # data, target = data.to(device), target.to(device)
            output = model(data)
            output = output.view(-1)
            # using loss function to calculate the loss
            
            test_loss += nn.CrossEntropyLoss()(output, target).item()  # sum up batch loss
            # test_loss += F.nll_loss(output, target, reduction='sum').item()  # sum up batch loss
            # print(output, target)
            # pred = output.argmax(dim=1, keepdim=True)
            # test_loss += F.cross_entropy(output, target)
            pred = output.sigmoid().round()  # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)

    print('Test set for fold {}: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\
    '.format(fold, test_loss, correct, len(test_loader.dataset),  100. * correct / len(test_loader.dataset)))

In [36]:
batch_size = 1
seq_length = 2
folds = 5
repeats = 12
epochs = 100
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device = torch.device("cpu")
print(f"Using {device} for training the network")

def reset_weights(m):
    if isinstance(m, nn.Conv2d) or isinstance(m, nn.Linear):
        m.reset_parameters()

model = pMHC_TCR_model(input_size=356, batch_size=batch_size, device=device).to(device)

# kf = RepeatedKFold(n_splits=10, n_repeats=12, random_state=1234)
# kf = KFold(n_splits=10, random_state=1234, shuffle=True)
kf = StratifiedKFold(n_splits=folds, random_state=1234, shuffle=True)
weights = torch.FloatTensor([1, 9])
optimizer = optim.Adadelta(model.parameters())

for fold, (train_idx, test_idx) in enumerate(kf.split(TCRData.features, TCRData.labels)):
    print(f"-------------------Fold {fold}-------------------")
        if batch_size == 1:
        # using the subsampler to get the data
        train_loader = torch.utils.data.DataLoader(TCRData, batch_size=len(), sampler=train_subsampler)

    train_subsampler = torch.utils.data.SubsetRandomSampler(train_idx)
    test_subsampler = torch.utils.data.SubsetRandomSampler(test_idx)
    # train_subsampler = WeightedRandomSampler(weights, len(train_idx), replacement=True)
    # test_subsampler = WeightedRandomSampler(weights, len(test_idx), replacement=True)
    train_loader = torch.utils.data.DataLoader(TCRData, batch_size=batch_size, sampler=train_subsampler)
    test_loader = torch.utils.data.DataLoader(TCRData, batch_size=batch_size, sampler=test_subsampler)
    model.apply(reset_weights)

    for epoch in range(1, epochs + 1):
        train(fold, model, device, train_loader, optimizer, epoch)
        test(fold, model, device, test_loader)

Using cpu for training the network
-------------------Fold 0-------------------
Test set for fold 0: Average loss: 0.0000, Accuracy: 162/886 (18%)    
Test set for fold 0: Average loss: 0.0000, Accuracy: 162/886 (18%)    
Test set for fold 0: Average loss: 0.0000, Accuracy: 162/886 (18%)    
Test set for fold 0: Average loss: 0.0000, Accuracy: 162/886 (18%)    
Test set for fold 0: Average loss: 0.0000, Accuracy: 162/886 (18%)    
Test set for fold 0: Average loss: 0.0000, Accuracy: 162/886 (18%)    
Test set for fold 0: Average loss: 0.0000, Accuracy: 162/886 (18%)    
Test set for fold 0: Average loss: 0.0000, Accuracy: 162/886 (18%)    
Test set for fold 0: Average loss: 0.0000, Accuracy: 162/886 (18%)    
Test set for fold 0: Average loss: 0.0000, Accuracy: 162/886 (18%)    
Test set for fold 0: Average loss: 0.0000, Accuracy: 162/886 (18%)    
Test set for fold 0: Average loss: 0.0000, Accuracy: 162/886 (18%)    
Test set for fold 0: Average loss: 0.0000, Accuracy: 162/886 (18%)  

KeyboardInterrupt: 

In [45]:
# F.nll_loss(torch.tensor([-0.1]), torch.tensor([1]))
# torch.tensor([-22]).sigmoid()
nn.CrossEntropyLoss()(torch.tensor([-1.]), torch.tensor([1.]))

tensor(-0.)

In [None]:
# write a function to get the highest probability of the two classes

def find_max_prob(output):
    # output: batch_size, 2
    output = output.view(-1)
    output = output.sigmoid()
    return output.max()