In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    file_paths = (os.path.join(dirname, filename) for filename in filenames)
    for file_path in file_paths:
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Preliminaries

In [2]:
import torch
from torch import nn
from torch import optim
from torch.utils import data
import wandb
# Start a Weights & Biases run; it is recorded under the "HIV_kaggle" project.
wandb.init(project="HIV_kaggle")

Parse the training data and look at the first 5 rows.

In [3]:
# Read the training CSV from the Kaggle input directory into a DataFrame.
train_data = pd.read_csv('/kaggle/input/hivprogression/training_data.csv')
# Last expression of the cell: display the first rows for a quick look.
train_data.head()

In [4]:
# Number of rows (examples) in the training frame.
n_train = len(train_data)

We drop the first two columns so that only the feature columns remain.

In [5]:
# Keep every column except the first two. `.copy()` makes `all_features` an
# independent frame, so the column assignments below modify this frame only and
# do not trigger pandas' chained-assignment (SettingWithCopy) warning.
all_features = train_data.iloc[:, 2:].copy()

# Missing-sequence indicator flags: one can assume that an absent sequence is
# a bad sign for survival, so the absence itself is kept as a boolean feature.
all_features["PR SeqNan"] = all_features["PR Seq"].isna()
all_features["RT SeqNan"] = all_features["RT Seq"].isna()

# Numeric columns only; the sequence columns and the boolean flags added above
# are not part of the "number" selection.
numeric_features = all_features.select_dtypes(include="number").columns
mean_numerical_features = all_features[numeric_features].mean()
std_numerical_features = all_features[numeric_features].std()

# Standardize each numeric column. The 1e-4 epsilon belongs in the denominator
# (it guards against division by a zero standard deviation); added after the
# division it would only shift every standardized value by a constant.
all_features[numeric_features] = (
    (all_features[numeric_features] - mean_numerical_features)
    / (std_numerical_features + 1e-4)
)

# Impute missing measurements with the column mean of the standardized values
# (mean() skips NaN by default).
vt_mean = all_features["VL-t0"].mean()
cd4_mean = all_features["CD4-t0"].mean()
all_features["VL-t0"] = all_features["VL-t0"].fillna(vt_mean)
all_features["CD4-t0"] = all_features["CD4-t0"].fillna(cd4_mean)
all_features.head()

# Tokenization and Vocabulary

In [6]:
import collections

def tokenize(seqs):
    return [tokenize_line(seq) for seq in seqs]

def tokenize_line(seq):
    """Split one sequence into a list of its elements.

    A missing value (anything `pd.isna` reports as NA, e.g. NaN or None)
    yields an empty list; otherwise the result is `list(seq)`, i.e. one token
    per character when `seq` is a string.
    """
    if pd.isna(seq):
        return []
    return list(seq)

class Vocab:
    """Two-way mapping between tokens and integer indices.

    Index 0 is reserved for the unknown token '<unk>'. All other tokens are
    numbered in order of decreasing frequency in the corpus passed to the
    constructor.
    """

    def __init__(self, tokens):
        """Count `tokens` with `count_corpus` and build both lookup tables."""
        frequencies = count_corpus(tokens)
        # (token, count) pairs, most frequent first.
        self._token_freqs = sorted(frequencies.items(),
                                   key=lambda pair: pair[1],
                                   reverse=True)
        self.idx_to_token = ['<unk>']
        self.token_to_idx = {'<unk>': 0}
        for token, _ in self._token_freqs:
            if token in self.token_to_idx:
                continue
            # The next free index is the current length of the index list.
            self.token_to_idx[token] = len(self.idx_to_token)
            self.idx_to_token.append(token)

    def __len__(self):
        """Number of known tokens, including '<unk>'."""
        return len(self.idx_to_token)

    def __getitem__(self, tokens):
        """Map a token to its index, or a list/tuple of tokens to a list of indices.

        Nested lists/tuples are handled recursively. Tokens that are not in
        the vocabulary map to the unknown index.
        """
        if isinstance(tokens, (list, tuple)):
            return [self[token] for token in tokens]
        return self.token_to_idx.get(tokens, self.unk)

    def to_tokens(self, indices):
        """Map an index to its token, or a list/tuple of indices to a list of tokens."""
        if isinstance(indices, (list, tuple)):
            return [self.idx_to_token[position] for position in indices]
        return self.idx_to_token[indices]

    @property
    def unk(self):
        """Index of the unknown token."""
        return 0

    @property
    def token_freqs(self):
        """(token, count) pairs sorted by decreasing count."""
        return self._token_freqs

def count_corpus(tokens):
    """Count how often each token occurs.

    `tokens` is either a flat sequence of tokens or a sequence of token lists.
    It is treated as nested when it is empty or when its first element is a
    list; in that case the tokens of all lines are counted together.
    Returns a `collections.Counter`.
    """
    is_flat = len(tokens) != 0 and not isinstance(tokens[0], list)
    if is_flat:
        return collections.Counter(tokens)
    counts = collections.Counter()
    for line in tokens:
        for token in line:
            counts[token] += 1
    return counts

# Tokenize every "PR Seq" entry (tokenize_line gives list(entry), or [] for a
# missing value) and build the PR vocabulary from the resulting tokens.
tokens_pr = tokenize(all_features["PR Seq"].values)
vocab_pr = Vocab(tokens_pr)
# Display the token -> index mapping as a list of (token, index) pairs.
list(vocab_pr.token_to_idx.items())

In [7]:
# Replace each "PR Seq" entry with the list of vocab_pr indices of its tokens.
# NOTE(review): this overwrites the raw column in place, so re-running the cell
# would pass these index lists (not the raw values) to tokenize_line.
all_features["PR Seq"] = all_features["PR Seq"].apply(lambda x: vocab_pr[tokenize_line(x)])
all_features["PR Seq"]

In [8]:
# Tokenize every "RT Seq" entry (tokenize_line gives list(entry), or [] for a
# missing value) and build the RT vocabulary from the resulting tokens.
tokens_rt = tokenize(all_features["RT Seq"].values)
vocab_rt = Vocab(tokens_rt)
# Display the token -> index mapping as a list of (token, index) pairs.
list(vocab_rt.token_to_idx.items())

In [9]:
# Replace each "RT Seq" entry with the list of vocab_rt indices of its tokens.
# NOTE(review): this overwrites the raw column in place, so re-running the cell
# would pass these index lists (not the raw values) to tokenize_line.
all_features["RT Seq"] = all_features["RT Seq"].apply(lambda x: vocab_rt[tokenize_line(x)])
all_features["RT Seq"]

In [10]:
# The first 70% of the rows are used for training.
training_size = int(0.7 * n_train)
nb_iterations = 10
# Build the array of training rows once, instead of converting the whole frame
# on every pass and breaking out of the loop at `training_size`.
train_rows = all_features.values[:training_size]
for j in range(nb_iterations):
    # The loop variable is `row`, not `data`: `data` is the name under which
    # `torch.utils.data` was imported, and rebinding it here would hide that
    # module from every later cell.
    for i, row in enumerate(train_rows):
        pass  # TODO