In [97]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import random
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, filename) for filename in filenames)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Preliminaries

In [98]:
# Seed the standard-library and NumPy generators with the same fixed value.
# NOTE(review): the torch generator is not seeded in this cell.
SEED = 0
random.seed(SEED)
np.random.seed(SEED)

In [99]:
import torch
from torch import nn, Tensor
from torch import optim
from torch.utils import data
import wandb
#wandb.init(project="HIV_kaggle")

Parse the training CSV and look at its first 5 rows

In [100]:
# Read the training table from the Kaggle input mount and preview it.
train_csv_path = '/kaggle/input/hivprogression/training_data.csv'
train_data = pd.read_csv(train_csv_path)
train_data.head()

In [101]:
# "Resp" column as a float32 tensor, one entry per training row.
resp_values = train_data["Resp"].values
labels = torch.tensor(resp_values, dtype=torch.float32)
n_labels = len(labels)
n_labels

In [102]:
# Number of rows in the training table.
n_train = len(train_data)
n_train

We have to remove the first two columns

In [103]:

# Keep only the feature columns (everything after the first two columns).
# .copy() makes an independent frame, so the column assignments below do not
# write into a slice of train_data (avoids SettingWithCopyWarning).
all_features = train_data.iloc[:, 2:].copy()
# one can assume if Seqs are not present it is a bad sign for survival
#all_features["PR SeqNan"] = all_features["PR Seq"].apply(lambda x: pd.isna(x)).astype(bool)
#all_features["RT SeqNan"] = all_features["RT Seq"].apply(lambda x: pd.isna(x)).astype(bool)
numeric_features = all_features.dtypes[(all_features.dtypes != 'object') & (all_features.dtypes != 'bool')].index
# Per-column statistics of the training set, kept as globals so that other
# data can be standardized on the same scale.
mean_numerical_features = all_features[numeric_features].mean()
std_numerical_features = all_features[numeric_features].std()
# Standardize every numeric column with the statistics above.
# NOTE(review): the 1e-4 is added to the standardized value, not to the
# denominator, so it does not protect against a zero std — confirm intent.
all_features[numeric_features] = (
    (all_features[numeric_features] - mean_numerical_features) / std_numerical_features + 1e-4
)
# Missing measurements are imputed with the (already standardized) column mean.
vt_mean = all_features["VL-t0"].mean()
cd4_mean = all_features["CD4-t0"].mean()
all_features["VL-t0"] = all_features["VL-t0"].fillna(vt_mean)
all_features["CD4-t0"] = all_features["CD4-t0"].fillna(cd4_mean)
all_features.head()

In [104]:
def f_comma(my_str, group=3, char=','):
    """Split a value's string form into fixed-size chunks joined by `char`.

    Args:
        my_str: value to split; it is converted with ``str`` first.
        group: number of characters per chunk.
        char: separator inserted between chunks.

    Returns:
        The chunked string, or '' when `my_str` is missing (``pd.isna``).
    """
    if pd.isna(my_str):
        return ''
    text = str(my_str)
    chunks = [text[start:start + group] for start in range(0, len(text), group)]
    return char.join(chunks)

# Rewrite each sequence as comma-separated 3-character groups (missing -> '').
# One element-wise pass per column; a per-row Series.replace would rescan the
# whole column for every row (quadratic in the number of rows).
all_features['PR Seq'] = all_features['PR Seq'].apply(f_comma)
all_features['RT Seq'] = all_features['RT Seq'].apply(f_comma)

# Tokenize and Vocab

In [105]:
import collections

def tokenize(seqs):
    """Tokenize every sequence in `seqs`; returns one token list per input."""
    return list(map(tokenize_line, seqs))

def tokenize_line(seq):
    """Split one comma-separated sequence string into its tokens.

    Returns an empty list for a missing value or an empty string.
    """
    if pd.isna(seq):
        return []
    if len(seq) == 0:
        return []
    if pd.isna(seq[0]):
        return []
    return seq.split(',')

class Vocab:
    """Two-way mapping between tokens and integer ids.

    Ids 0 and 1 are reserved for '<unk>' and '<mask>'; every other token is
    numbered in order of decreasing corpus frequency.
    """

    def __init__(self, tokens):
        """Build the vocabulary from `tokens` (counted with ``count_corpus``)."""
        frequencies = count_corpus(tokens)
        self._token_freqs = sorted(frequencies.items(),
                                   key=lambda pair: pair[1],
                                   reverse=True)
        self.idx_to_token = ['<unk>', '<mask>']
        self.token_to_idx = {}
        for position, special in enumerate(self.idx_to_token):
            self.token_to_idx[special] = position
        for token, _ in self._token_freqs:
            if token in self.token_to_idx:
                continue
            # The next free id is the current length of the id -> token list.
            self.token_to_idx[token] = len(self.idx_to_token)
            self.idx_to_token.append(token)

    def __len__(self):
        """Number of known tokens, including the two reserved ones."""
        return len(self.idx_to_token)

    def __getitem__(self, tokens):
        """Map a token (or list/tuple of tokens) to id(s); unknown -> `unk`."""
        if isinstance(tokens, (list, tuple)):
            return [self.__getitem__(token) for token in tokens]
        return self.token_to_idx.get(tokens, self.unk)

    def to_tokens(self, indices):
        """Map an id (or list/tuple of ids) back to token(s)."""
        if isinstance(indices, (list, tuple)):
            return [self.idx_to_token[index] for index in indices]
        return self.idx_to_token[indices]

    @property
    def unk(self):
        """Id used for tokens that are not in the vocabulary."""
        return 0

    @property
    def token_freqs(self):
        """(token, count) pairs sorted by decreasing count."""
        return self._token_freqs

def count_corpus(tokens):
    """Count token occurrences in a flat token list or a list of token lists.

    An empty input, or one whose first element is a list, is flattened one
    level before counting.
    """
    needs_flattening = len(tokens) == 0 or isinstance(tokens[0], list)
    if needs_flattening:
        flat = []
        for line in tokens:
            flat.extend(line)
        tokens = flat
    return collections.Counter(tokens)

# Build the PR-sequence vocabulary and show its token -> id pairs.
pr_sequences = all_features["PR Seq"].values
tokens_pr = tokenize(pr_sequences)
vocab_pr = Vocab(tokens_pr)
list(vocab_pr.token_to_idx.items())

In [106]:
def _encode_pr(seq):
    """Turn one comma-separated PR sequence into its list of vocabulary ids."""
    return vocab_pr[tokenize_line(seq)]

# The column is overwritten in place: strings become lists of token ids.
all_features["PR Seq"] = all_features["PR Seq"].apply(_encode_pr)
all_features["PR Seq"]

In [107]:
# Build the RT-sequence vocabulary (same procedure as for PR).
rt_sequences = all_features["RT Seq"].values
tokens_rt = tokenize(rt_sequences)
vocab_rt = Vocab(tokens_rt)

In [108]:
def _encode_rt(seq):
    """Turn one comma-separated RT sequence into its list of vocabulary ids."""
    return vocab_rt[tokenize_line(seq)]

# The column is overwritten in place: strings become lists of token ids.
all_features["RT Seq"] = all_features["RT Seq"].apply(_encode_rt)
all_features["RT Seq"]

# Network

In [109]:
import math
from torch.nn import TransformerEncoder, TransformerEncoderLayer

class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position signals to the input, then apply dropout.

    The table `pe` has shape (max_len, 1, d_model) and is sliced with
    ``x.size(0)``, so dim 0 of the input is treated as the position axis and
    the signal is broadcast over dim 1.

    NOTE(review): tensors that come from a DataLoader over (n_samples, length)
    data have the batch index on dim 0 — confirm callers pass the layout this
    module assumes.
    """

    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 1000):
        """
        Args:
            d_model: size of the last (feature) dimension; odd values are supported.
            dropout: dropout probability applied after adding the encoding.
            max_len: number of positions precomputed in the table.
        """
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        angles = position * div_term
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer odd-indexed slot than
        # even-indexed ones, so drop the surplus cosine column.
        pe[:, 0, 1::2] = torch.cos(angles[:, : d_model // 2])
        # Buffer: moves with the module across devices but is not trained.
        self.register_buffer('pe', pe)

    def forward(self, x: Tensor) -> Tensor:
        """Return ``dropout(x + pe[:x.size(0)])``; requires x.size(0) <= max_len."""
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

class TransformerModel(nn.Module):
    """Token embedding, positional encoding and a stack of encoder layers.

    `forward` returns the raw encoder output; no decoding head is applied.
    """

    def __init__(self, ntoken: int, d_model: int, nhead: int, d_hid: int,
                 nlayers: int, dropout: float = 0.3):
        """
        Args:
            ntoken: vocabulary size of the embedding table.
            d_model: embedding / model width.
            nhead: attention heads per encoder layer.
            d_hid: feed-forward width inside each encoder layer.
            nlayers: number of stacked encoder layers.
            dropout: dropout used by the positional encoding and the layers.
        """
        super().__init__()
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        single_layer = TransformerEncoderLayer(d_model, nhead, d_hid, dropout)
        self.transformer_encoder = TransformerEncoder(single_layer, nlayers)
        self.encoder = nn.Embedding(ntoken, d_model)
        self.d_model = d_model
        self.init_weights()

    def init_weights(self) -> None:
        """Re-draw the embedding weights uniformly from [-0.1, 0.1]."""
        bound = 0.1
        nn.init.uniform_(self.encoder.weight, -bound, bound)

    def forward(self, src: Tensor, src_mask: Tensor) -> Tensor:
        """Embed `src`, scale by sqrt(d_model), add positions and encode.

        `src_mask` is forwarded unchanged to the TransformerEncoder.
        """
        scale = math.sqrt(self.d_model)
        embedded = self.encoder(src) * scale
        with_positions = self.pos_encoder(embedded)
        return self.transformer_encoder(with_positions, src_mask)
    
def generate_square_subsequent_mask(sz: int) -> Tensor:
    """Return an (sz, sz) float mask: -inf strictly above the diagonal, 0 elsewhere."""
    all_blocked = torch.full((sz, sz), float('-inf'))
    return all_blocked.triu(diagonal=1)

# Performance optimization => batching

Padding text inputs

In [110]:
def padding_input(seqs):
    """Right-pad variable-length id sequences with 0 into a single tensor.

    Args:
        seqs: a sequence (list, tuple or 1-D object array) whose items are
            lists of integer ids; items may be empty.

    Returns:
        (padded, max_length): `padded` is an int64 tensor of shape
        (len(seqs), max_length); `max_length` is the longest item length
        (0 when `seqs` is empty or every item is empty).
    """
    n_rows = len(seqs)
    max_length = max((len(seq) for seq in seqs), default=0)
    result = torch.zeros((n_rows, max_length), dtype=torch.long)
    for row, seq in enumerate(seqs):
        if len(seq) > 0:
            # One slice assignment per row instead of one write per element.
            result[row, :len(seq)] = torch.as_tensor(seq, dtype=torch.long)
    return result, max_length

In [111]:
# Pad the token-id lists of each sequence column to that column's longest entry.
pr_id_lists = all_features["PR Seq"].values
rt_id_lists = all_features["RT Seq"].values
pr_data, pr_length = padding_input(pr_id_lists)
rt_data, rt_length = padding_input(rt_id_lists)

Dataset and DataLoaders

In [112]:
# 70% of the rows are drawn as the training subset.
training_size = int(0.7 * n_train)
# replace=False: np.random.choice samples WITH replacement by default, which
# would put duplicate row indices into the split.
train_indexes = np.random.choice(n_train, training_size, replace=False)
# Columns after the two sequence columns are the numeric features.
numerical_features = torch.tensor(all_features.iloc[:, 2:].astype('float').values, dtype=torch.float32)
dataset_features = torch.utils.data.TensorDataset(numerical_features, labels)

# TODO: split the dataset into training and validation subsets

In [113]:
# Three loaders over the same rows; shuffle=False keeps them aligned
# batch-for-batch when they are zipped together in the training loops.
loader_features_train = torch.utils.data.DataLoader(dataset_features, batch_size=32, shuffle=False)
loader_pr_train = torch.utils.data.DataLoader(pr_data, batch_size=32, shuffle=False)
loader_rt_train = torch.utils.data.DataLoader(rt_data, batch_size=32, shuffle=False)
# Boolean mask over all rows: False for rows in train_indexes, True otherwise.
# (An earlier, inverted mask was built here and overwritten before any use;
# that dead store is removed.)
# NOTE(review): this mask is not applied to any loader above — they iterate
# over every row.
selected_index = torch.ones((n_train, ), dtype=bool)
selected_index[train_indexes] = False

# Training

In [114]:
def init_layer(m):
    """Xavier-normal init (tanh gain) for `m.weight`, zeros for `m.bias`; returns `m`."""
    tanh_gain = torch.nn.init.calculate_gain('tanh')
    torch.nn.init.xavier_normal_(m.weight, gain=tanh_gain)
    torch.nn.init.zeros_(m.bias)
    return m

class Network(nn.Module):
    """Classifier head plus two sequence branches (PR and RT).

    `forward` maps a 4-feature vector to a sigmoid output through a small
    tanh MLP. `forward_pr` / `forward_rt` encode a token batch, average over
    dim 1 and project to one scalar per row.
    """

    def __init__(self):
        super().__init__()
        self.dropout = nn.Dropout(p=0.5)
        self.tanh = nn.Tanh()
        # MLP over the 4 inputs of `forward`.
        self.ln1 = init_layer(nn.Linear(4, 8))
        self.ln2 = init_layer(nn.Linear(8, 8))
        self.ln3 = init_layer(nn.Linear(8, 1))
        self.sigmoid = nn.Sigmoid()
        # One encoder per sequence type, each with its own vocabulary size.
        self.pr_model = TransformerModel(len(vocab_pr), 32, 4, 64, 3)
        self.rt_model = TransformerModel(len(vocab_rt), 32, 4, 64, 3)
        self.decoder_pr = init_layer(nn.Linear(32, 1))
        self.decoder_rt = init_layer(nn.Linear(32, 1))

    def forward(self, x):
        """Return the sigmoid output of the MLP for feature batch `x`."""
        hidden = self.dropout(self.tanh(self.ln1(x)))
        hidden = self.dropout(self.tanh(self.ln2(hidden)))
        logits = self.ln3(hidden)
        return self.sigmoid(logits)

    def forward_rt(self, x, mask):
        """Encode RT tokens, mean over dim 1, project to one value per row."""
        encoded = self.rt_model(x, mask)
        pooled = encoded.mean(dim=1)
        return self.decoder_rt(pooled)

    def forward_pr(self, x, mask):
        """Encode PR tokens, mean over dim 1, project to one value per row."""
        encoded = self.pr_model(x, mask)
        pooled = encoded.mean(dim=1)
        return self.decoder_pr(pooled)

In [115]:
!pip install mlm-pytorch

In [116]:
from mlm_pytorch import MLM

def self_init_layer(m):
    """Draw `m.weight` uniformly from [-0.1, 0.1], zero `m.bias`; returns `m`."""
    torch.nn.init.uniform_(m.weight, -0.1, 0.1)
    torch.nn.init.zeros_(m.bias)
    return m

class myTransformer(nn.Module):
    """TransformerModel encoder followed by a linear projection to vocabulary logits."""

    def __init__(self, vocab_len=len(vocab_pr)):
        """
        Args:
            vocab_len: vocabulary size, used for the embedding table and the
                output projection. The default is evaluated once, when the
                class is defined, from the global `vocab_pr`.
        """
        super().__init__()
        self.model = TransformerModel(vocab_len, 32, 4, 64, 3)
        self.decoder = self_init_layer(nn.Linear(32, vocab_len))

    def forward(self, x, mask=None):
        """Return per-token vocabulary logits for the id tensor `x`.

        `mask` is accepted but ignored; a square subsequent mask sized by
        ``x.size(0)`` is built instead.
        """
        # Put the mask on the input's own device rather than assuming that
        # "CUDA is available" means the input lives on the GPU.
        src_mask = generate_square_subsequent_mask(x.size(0)).to(x.device)
        return self.decoder(self.model(x, src_mask))


def pretrain_mlm(model, loader, n_epochs=50, lr=3e-4):
    """Pretrain `model` with masked-language modelling on batches from `loader`.

    Args:
        model: module mapping an id tensor to per-token vocabulary logits.
        loader: iterable yielding id tensors (one batch per item).
        n_epochs: number of passes over `loader`.
        lr: Adam learning rate.

    Returns:
        The same `model`, trained in place.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.train()
    trainer = MLM(
        model,
        mask_token_id = 1,          # the token id reserved for masking ('<mask>')
        pad_token_id = 0,           # the token id for padding (shared with '<unk>')
        mask_prob = 0.2,            # masking probability for masked language modeling
        replace_prob = 0.90,
        mask_ignore_token_ids = [0]
    ).to(device)
    opt = torch.optim.Adam(trainer.parameters(), lr=lr)
    for j in range(n_epochs):
        training_loss = 0
        for token_batch in loader:
            opt.zero_grad()
            loss = trainer(token_batch.to(device))
            loss.backward()
            opt.step()
            training_loss += loss.cpu().item()
        print(f'iter {j} training loss {training_loss}')
    return model


# The RT model is created only after PR pretraining has finished, so its
# random initialisation draws from the RNG at the same point as before.
transformer_pr = myTransformer(len(vocab_pr))
pretrain_mlm(transformer_pr, loader_pr_train)

transformer_rt = myTransformer(len(vocab_rt))
pretrain_mlm(transformer_rt, loader_rt_train)

In [117]:
# Stage 1: train each sequence branch to predict the response on its own.
net = Network()
# Swap in the MLM-pretrained encoders for the freshly initialised ones.
net.pr_model = transformer_pr.model
net.rt_model = transformer_rt.model
net = net.cuda()

# reduction='none' keeps one loss value per sample so they can be re-weighted.
criterion=torch.nn.BCELoss(reduction='none')

nb_iterations = 50

optimizer = torch.optim.Adam(net.parameters(), lr=2e-4)

# Mask at the loader's batch size; a shorter (last) batch uses its top-left slice.
full_mask = generate_square_subsequent_mask(loader_pr_train.batch_size).cuda()

for j in range(nb_iterations):
    training_loss = 0
    # `batch` (not `data`) so the imported torch.utils.data module is not shadowed.
    for batch in zip(loader_pr_train, loader_rt_train, loader_features_train):
        optimizer.zero_grad()
        prs, rts, (num_features, label) = batch
        prs = prs.cuda(non_blocking=True)
        rts = rts.cuda(non_blocking=True)
        label = label.cuda(non_blocking=True)
        batch_len = rts.size(0)
        src_mask = full_mask[:batch_len, :batch_len]
        prs = net.sigmoid(net.forward_pr(prs, src_mask))
        rts = net.sigmoid(net.forward_rt(rts, src_mask))
        # Label smoothing: 0 -> 0.1, 1 -> 0.9.
        smoothed_label = label * 0.8 + 0.1
        loss = criterion(prs.view(-1), smoothed_label) + criterion(rts.view(-1), smoothed_label)
        # Samples with label 1 weigh 4x as much as the others.
        coefficient = torch.ones_like(loss)
        coefficient[label == 1] += 3
        loss = (loss * coefficient).mean()
        loss.backward()
        optimizer.step()
        training_loss += loss.detach().cpu().item()
    print(f'iter {j} training loss {training_loss}')

# Stage 2: train the full model (numeric features + both branch outputs).
nb_iterations = 400

optimizer = torch.optim.AdamW(net.parameters(), lr=3e-4, weight_decay=3e-3)

# One cosine cycle over the whole run; stepped once per epoch below.
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, nb_iterations)

# Mask at the loader's batch size; a shorter (last) batch uses its top-left slice.
full_mask = generate_square_subsequent_mask(loader_pr_train.batch_size).cuda()

net.train()
for j in range(nb_iterations):
    correctly_predicted = 0
    total_prediction = 0
    training_loss = 0
    # `batch` (not `data`) so the imported torch.utils.data module is not shadowed.
    for batch in zip(loader_pr_train, loader_rt_train, loader_features_train):
        optimizer.zero_grad()
        prs, rts, (num_features, label) = batch
        prs = prs.cuda(non_blocking=True)
        rts = rts.cuda(non_blocking=True)
        num_features = num_features.cuda(non_blocking=True)
        label = label.cuda(non_blocking=True)
        batch_len = rts.size(0)
        src_mask = full_mask[:batch_len, :batch_len]
        prs = net.forward_pr(prs, src_mask)
        rts = net.forward_rt(rts, src_mask)
        # Numeric features followed by one scalar from each sequence branch.
        x = torch.hstack((num_features, prs, rts))
        output = net(x)
        # Label smoothing: 0 -> 0.1, 1 -> 0.9.
        smoothed_label = label * 0.8 + 0.1
        loss = criterion(output.view(-1), smoothed_label)
        # Samples with label 1 weigh 4x as much as the others.
        coefficient = torch.ones_like(loss)
        coefficient[label == 1] += 3
        loss = (loss * coefficient).mean()
        loss.backward()
        optimizer.step()
        training_loss += loss.detach().cpu().item()
        predicted = torch.ge(output, 0.5).view(-1)
        # .item() keeps the counter a plain int so the accuracy prints as a number.
        correctly_predicted += torch.sum(label == predicted).item()
        total_prediction += output.shape[0]
    scheduler.step()
    print(f'iter {j} training loss {training_loss} accuracy training {correctly_predicted / total_prediction}')

# Test data

In [118]:
test_data = pd.read_csv('/kaggle/input/modded-test/test_data_mod.csv')
labels = torch.tensor(test_data["Resp"].values, dtype=torch.float)
n_labels = labels.shape[0]
n_train = train_data.shape[0]
# .copy() makes an independent frame, so the column assignments below do not
# write into a slice of test_data (avoids SettingWithCopyWarning).
all_features = test_data.iloc[:, 2:].copy()
# one can assume if Seqs are not present it is a bad sign for survival
#all_features["PR SeqNan"] = all_features["PR Seq"].apply(lambda x: pd.isna(x)).astype(bool)
#all_features["RT SeqNan"] = all_features["RT Seq"].apply(lambda x: pd.isna(x)).astype(bool)
# Standardize the test features with the statistics computed on the training
# set earlier in the notebook (numeric_features, mean_numerical_features,
# std_numerical_features). Recomputing them here would put the test data on
# a different scale from the one the model was fitted on.
# assumes the test table has the same numeric columns as the training table — TODO confirm
all_features[numeric_features] = (
    (all_features[numeric_features] - mean_numerical_features) / std_numerical_features + 1e-4
)
# Missing values get the same fill values as in training (vt_mean / cd4_mean).
all_features["VL-t0"] = all_features["VL-t0"].fillna(vt_mean)
all_features["CD4-t0"] = all_features["CD4-t0"].fillna(cd4_mean)
all_features.head()

In [119]:
def f_comma(my_str, group=3, char=','):
    """Split a value's string form into fixed-size chunks joined by `char`.

    Same definition as the `f_comma` given earlier in this notebook.

    Args:
        my_str: value to split; it is converted with ``str`` first.
        group: number of characters per chunk.
        char: separator inserted between chunks.

    Returns:
        The chunked string, or '' when `my_str` is missing (``pd.isna``).
    """
    if pd.isna(my_str):
        return ''
    text = str(my_str)
    chunks = [text[start:start + group] for start in range(0, len(text), group)]
    return char.join(chunks)

# Rewrite each sequence as comma-separated 3-character groups (missing -> '').
# One element-wise pass per column; a per-row Series.replace would rescan the
# whole column for every row (quadratic in the number of rows).
all_features['PR Seq'] = all_features['PR Seq'].apply(f_comma)
all_features['RT Seq'] = all_features['RT Seq'].apply(f_comma)

# Encode with the vocabularies built on the training data; tokens they do not
# contain map to the '<unk>' id.
all_features["PR Seq"] = all_features["PR Seq"].apply(lambda x: vocab_pr[tokenize_line(x)])
all_features["RT Seq"] = all_features["RT Seq"].apply(lambda x: vocab_rt[tokenize_line(x)])
pr_data, pr_length = padding_input(all_features["PR Seq"].values)
rt_data, rt_length = padding_input(all_features["RT Seq"].values)

numerical_features = torch.tensor(all_features.iloc[:, 2:].astype('float').values, dtype=torch.float32)
dataset_features = torch.utils.data.TensorDataset(numerical_features, labels)
# The "*_train" loader names are reused here, but from this point on they
# iterate over the TEST rows.
loader_features_train = torch.utils.data.DataLoader(dataset_features, batch_size=32, shuffle=False)
loader_pr_train = torch.utils.data.DataLoader(pr_data, batch_size=32, shuffle=False)
loader_rt_train = torch.utils.data.DataLoader(rt_data, batch_size=32, shuffle=False)

In [120]:
# Persist the trained weights, then measure accuracy on the loaders, which at
# this point iterate over the test rows.
torch.save(net.state_dict(), 'model.pt')
net = net.cuda()
net.eval()
correctly_predicted = 0
total_prediction = 0
# Mask at the loader's batch size; a shorter (last) batch uses its top-left slice.
full_mask = generate_square_subsequent_mask(loader_pr_train.batch_size).cuda()
with torch.no_grad():
    # `batch` (not `data`) so the imported torch.utils.data module is not shadowed.
    for batch in zip(loader_pr_train, loader_rt_train, loader_features_train):
        prs, rts, (num_features, label) = batch
        prs = prs.cuda(non_blocking=True)
        rts = rts.cuda(non_blocking=True)
        num_features = num_features.cuda(non_blocking=True)
        label = label.cuda(non_blocking=True)
        batch_len = rts.size(0)
        src_mask = full_mask[:batch_len, :batch_len]
        prs = net.forward_pr(prs, src_mask)
        rts = net.forward_rt(rts, src_mask)
        x = torch.hstack((num_features, prs, rts))
        output = net(x)
        predicted = torch.ge(output, 0.5).view(-1)
        # .item() keeps the counter a plain int so the accuracy prints as a number.
        correctly_predicted += torch.sum(label == predicted).item()
        total_prediction += output.shape[0]
    print(f'Test accuracy {correctly_predicted / total_prediction}')