# Preliminaries

In [1]:
import torch
from torch import nn
from torch import optim
from torch.utils import data
import wandb
import pandas as pd
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow import keras
from keras import layers
from sklearn.model_selection import train_test_split
from keras.layers import Embedding, Dense, LSTM, Dropout
from keras.losses import BinaryCrossentropy
from keras.models import Sequential
from keras.optimizers import adam_v2
from keras.preprocessing.sequence import pad_sequences

In [2]:
config = dict()

config['path_workspace'] = 'C:\\Users\\SafontAndreu\\Workspace\\Visual Studio Code\\ps_hiv\\'

config['path_database'] = config.get('path_workspace') + 'data\\'

# Data

In [3]:
#X_name = 'PR Seq'
#X_name = 'RT Seq'
#X_name = 'Seq' # concatenate sequences
#X_name = 'Count' # concatenate counts
#X_name = 'All' # concatenate viral/ct load with sequences

y_name = 'Resp' # binary prognosis

In [4]:
train_data = df = pd.read_csv(config.get('path_database')+'training_data.csv', header=0, index_col=False, encoding='utf-8', low_memory=False)
test_data = df = pd.read_csv(config.get('path_database')+'test_data_mod.csv', header=0, index_col=False, encoding='utf-8', low_memory=False)
sol_data = df = pd.read_csv(config.get('path_database')+'hivprogression_solution.csv', header=0, index_col=False, encoding='utf-8', low_memory=False)
train_data.head()

Unnamed: 0,PatientID,Resp,PR Seq,RT Seq,VL-t0,CD4-t0
0,1,0,CCTCAAATCACTCTTTGGCAACGACCCCTCGTCCCAATAAGGATAG...,CCCATTAGTCCTATTGAAACTGTACCAGTAAAGCTAAAGCCAGGAA...,4.3,145
1,2,0,CCTCAAATCACTCTTTGGCAACGACCCCTCGTCGCAATAAAGATAG...,CCCATTAGTCCTATTGAAACTGTACCAGTAAAATTAAAGCCAGGAA...,3.6,224
2,3,0,CCTCAAATCACTCTTTGGCAACGACCCCTCGTCGCAATAAAGGTAG...,CCCATTAGTCCTATTGAAACTGTACCAGTAAAATTAAAGCCAGGAA...,3.2,1017
3,4,0,CCTCAAATCACTCTTTGGCAACGACCCCTCGTCGCAATAAGGATAG...,CCCATTAGTCCTATTGAAACTGTACCAGTAAAATTAAAGCCAGGAA...,5.7,206
4,5,0,CCTCAAATCACTCTTTGGCAACGACCCCTCGTCGCAGTAAAGATAG...,CCCATTAGTCCTATTGAAACTGTACCAGTAAAATTAAAGCCAGGAA...,3.5,572


## Tokenize and vocab

In [5]:
import collections

def tokenize(seqs):
    return [tokenize_line(seq) for seq in seqs]

def tokenize_line(seq):
    if not pd.isna(seq):
        return list(seq)
    return []

class Vocab:
    def __init__(self, tokens):
        counter = count_corpus(tokens)
        self._token_freqs = sorted(counter.items(), key=lambda x: x[1],
                                   reverse=True)
        self.idx_to_token = ['<unk>']
        self.token_to_idx = {token: idx
                             for idx, token in enumerate(self.idx_to_token)}
        for token, freq in self._token_freqs:
            if token not in self.token_to_idx:
                self.idx_to_token.append(token)
                self.token_to_idx[token] = len(self.idx_to_token) - 1

    def __len__(self):
        return len(self.idx_to_token)

    def __getitem__(self, tokens):
        if not isinstance(tokens, (list, tuple)):
            return self.token_to_idx.get(tokens, self.unk)
        return [self.__getitem__(token) for token in tokens]

    def to_tokens(self, indices):
        if not isinstance(indices, (list, tuple)):
            return self.idx_to_token[indices]
        return [self.idx_to_token[index] for index in indices]

    @property
    def unk(self): 
        return 0

    @property
    def token_freqs(self):
        return self._token_freqs

def count_corpus(tokens):
    if len(tokens) == 0 or isinstance(tokens[0], list):
        tokens = [token for line in tokens for token in line]
    return collections.Counter(tokens)

## Data preprocessing (training)

In [6]:
seq_pr = train_data['PR Seq']

seq_pr_unique = ''
for ele in seq_pr:
    if isinstance(ele, str):
        seq_pr_unique += ''.join(set(ele))
seq_pr_unique = ''.join(set(seq_pr_unique))
seq_pr_unique = ''.join(sorted(seq_pr_unique))
print('Unique nucleotides in PR Sequence = ', seq_pr_unique)



seq_rt = train_data['RT Seq']

seq_rt_unique = ''
for ele in seq_rt:
    if isinstance(ele, str):
        seq_rt_unique += ''.join(set(ele))
seq_rt_unique = ''.join(set(seq_rt_unique))
seq_rt_unique = ''.join(sorted(seq_rt_unique))
print('Unique nucleotides in RT Sequence = ', seq_rt_unique)

Unique nucleotides in PR Sequence =  ABCDGHKMNRSTVWY
Unique nucleotides in RT Sequence =  ABCDGHKMNRSTVWY


In [7]:
all_features = train_data.iloc[:, 2:]
print(all_features.shape)
# one can assume if Seqs are not present it is a bad sign for survival
all_features["PR SeqNan"] = all_features["PR Seq"].apply(lambda x: pd.isna(x)).astype(bool)
all_features["RT SeqNan"] = all_features["RT Seq"].apply(lambda x: pd.isna(x)).astype(bool)
numeric_features = all_features.dtypes[(all_features.dtypes != 'object') & (all_features.dtypes != 'bool')].index
mean_numerical_features = all_features[numeric_features].mean()
std_numerical_features = all_features[numeric_features].std()
all_features[numeric_features] = all_features[numeric_features].apply(lambda x: (x - x.mean()) / x.std() + 1e-4)
vt_mean = all_features["VL-t0"].mean()
cd4_mean = all_features["CD4-t0"].mean()
all_features["VL-t0"] = all_features["VL-t0"].fillna(vt_mean)
all_features["CD4-t0"] = all_features["CD4-t0"].fillna(cd4_mean)
all_features.head()

# Add results
all_features[y_name] = train_data[y_name]
print(all_features.shape)

(1000, 4)
(1000, 7)


### Subsets (training)

#### Select input

In [8]:
X_name = 'PR Seq'
tokens = tokenize(all_features[X_name].values)
vocab = Vocab(tokens)
list(vocab.token_to_idx.items())
all_features[X_name] = all_features[X_name].apply(lambda x: vocab[tokenize_line(x)])
all_features[X_name]

X_name = 'RT Seq'
tokens = tokenize(all_features[X_name].values)
vocab = Vocab(tokens)
list(vocab.token_to_idx.items())
all_features[X_name] = all_features[X_name].apply(lambda x: vocab[tokenize_line(x)])
all_features[X_name]

0      [4, 4, 4, 1, 2, 2, 1, 3, 2, 4, 4, 2, 1, 2, 2, ...
1      [4, 4, 4, 1, 2, 2, 1, 3, 2, 4, 4, 2, 1, 2, 2, ...
2      [4, 4, 4, 1, 2, 2, 1, 3, 2, 4, 4, 2, 1, 2, 2, ...
3      [4, 4, 4, 1, 2, 2, 1, 3, 2, 4, 4, 2, 1, 2, 2, ...
4      [4, 4, 4, 1, 2, 2, 1, 3, 2, 4, 4, 2, 1, 2, 2, ...
                             ...                        
995    [4, 4, 4, 1, 2, 2, 1, 3, 2, 4, 4, 2, 1, 2, 2, ...
996    [4, 4, 4, 1, 2, 6, 1, 3, 2, 4, 4, 2, 1, 2, 2, ...
997    [4, 4, 4, 1, 2, 6, 1, 3, 2, 4, 4, 2, 1, 2, 2, ...
998    [4, 4, 2, 1, 2, 2, 1, 3, 2, 4, 4, 2, 1, 2, 2, ...
999    [4, 4, 2, 1, 2, 2, 1, 3, 2, 4, 4, 2, 1, 2, 2, ...
Name: RT Seq, Length: 1000, dtype: object

## Data preprocessing (test)

In [9]:
all_features_test = test_data.iloc[:, 2:]
print(all_features_test.shape)
# one can assume if Seqs are not present it is a bad sign for survival
all_features_test["PR SeqNan"] = all_features_test["PR Seq"].apply(lambda x: pd.isna(x)).astype(bool)
all_features_test["RT SeqNan"] = all_features_test["RT Seq"].apply(lambda x: pd.isna(x)).astype(bool)
numeric_features_test = all_features_test.dtypes[(all_features_test.dtypes != 'object') & (all_features_test.dtypes != 'bool')].index
mean_numerical_features_test = all_features_test[numeric_features_test].mean()
std_numerical_features_test = all_features_test[numeric_features_test].std()
all_features_test[numeric_features_test] = all_features_test[numeric_features_test].apply(lambda x: (x - x.mean()) / x.std() + 1e-4)
vt_mean_test = all_features_test["VL-t0"].mean()
cd4_mean_test = all_features_test["CD4-t0"].mean()
all_features_test["VL-t0"] = all_features_test["VL-t0"].fillna(vt_mean_test)
all_features_test["CD4-t0"] = all_features_test["CD4-t0"].fillna(cd4_mean_test)
all_features_test.head()

# Add results
all_features_test[y_name] = test_data[y_name]
print(all_features_test.shape)

(692, 4)
(692, 7)


### Subsets (test)

#### Select input

In [10]:
X_name = 'PR Seq'
tokens = tokenize(all_features_test[X_name].values)
vocab = Vocab(tokens)
list(vocab.token_to_idx.items())
all_features_test[X_name] = all_features_test[X_name].apply(lambda x: vocab[tokenize_line(x)])
#all_features_test[X_name]

X_name = 'RT Seq'
tokens = tokenize(all_features_test[X_name].values)
vocab = Vocab(tokens)
list(vocab.token_to_idx.items())
all_features_test[X_name] = all_features_test[X_name].apply(lambda x: vocab[tokenize_line(x)])
#all_features_test[X_name]

## Select input/output


In [11]:
max_sequence_length_pr = max(all_features['PR Seq'].apply(len))
max_sequence_length_rt = max(all_features['RT Seq'].apply(len))

padded_x_pr = pad_sequences(all_features['PR Seq'], maxlen=max_sequence_length_pr, value = 0.0) # 0.0 because it corresponds with <PAD>
padded_x_rt = pad_sequences(all_features['RT Seq'], maxlen=max_sequence_length_rt, value = 0.0) # 0.0 because it corresponds with <PAD>
padded_x_pr_test = pad_sequences(all_features_test['PR Seq'], maxlen=max_sequence_length_pr, value = 0.0) # 0.0 because it corresponds with <PAD>
padded_x_rt_test = pad_sequences(all_features_test['RT Seq'], maxlen=max_sequence_length_rt, value = 0.0) # 0.0 because it corresponds with <PAD>

In [12]:
print(type(padded_x_pr))

<class 'numpy.ndarray'>


In [13]:
# from numpy to keras tensors
import numpy as np
import keras.backend as K

#padded_x_pr = K.constant(padded_x_pr)
#padded_x_rt = K.constant(padded_x_rt)
#padded_x_pr_test = K.constant(padded_x_pr_test)
#padded_x_rt_test = K.constant(padded_x_rt_test)

x_vl_ = all_features['VL-t0'].values
x_cd_ = all_features['CD4-t0'].values
y_ = all_features[y_name].values

x_vl = np.array(x_vl_).reshape(x_vl_.shape + (1,))
x_cd = np.array(x_cd_).reshape(x_cd_.shape + (1,))
y = np.array(y_).reshape(y_.shape + (1,))


#x_vl = K.constant(x_vl)
#x_cd = K.constant(x_cd)

print(x_vl.shape)
print(x_cd.shape)
print(type(x_vl))

(1000, 1)
(1000, 1)
<class 'numpy.ndarray'>


In [14]:
'''x_vl = all_features['VL-t0'].values
arr = np.array(x_vl).reshape(x_vl.shape + (1,))
print(arr.shape)'''

"x_vl = all_features['VL-t0'].values\narr = np.array(x_vl).reshape(x_vl.shape + (1,))\nprint(arr.shape)"

# Training

## Parameters

In [15]:
class_weight = {0: 0.2, # no improvement (80%)
                1: 0.8} # improvement (20%)

additional_metrics = ['accuracy']
batch_size = 32
embedding_output_dims = 8
loss_function = BinaryCrossentropy()
print('Max. seq length for PR = ', max_sequence_length_pr)
print('Max. seq length for RT = ', max_sequence_length_rt)
num_distinct_words = len(vocab)
print('Size of vocabular = ', num_distinct_words)
epochs = 5
lr = 2e-3
optimizer = adam_v2.Adam(learning_rate=lr, decay=lr/epochs)
validation_split = 0.20
verbosity_mode = 1

# Disable eager execution
tf.compat.v1.disable_eager_execution()

Max. seq length for PR =  297
Max. seq length for RT =  1482
Size of vocabular =  16


## Model

In [19]:
import tensorflow as tf
from keras.layers import *
from keras.models import Sequential, Model
import numpy as np

'''padded_x_pr_tf = tf.convert_to_tensor(value=padded_x_pr, dtype='int32')
padded_x_rt_tf = tf.convert_to_tensor(value=padded_x_rt, dtype='int32')
x_vl_tf = tf.convert_to_tensor(value=x_vl, dtype='int32')
x_cd_tf = tf.convert_to_tensor(value=x_cd, dtype='int32')

input_pr = Input(tensor=padded_x_pr_tf)
input_rt = Input(tensor=padded_x_rt_tf)
input_vl = Input(tensor=x_vl_tf)
input_cd = Input(tensor=x_cd_tf)'''

'''(1000, 297)
(1000, 1482)
(1000, 1)
(1000, 1)
(1000, 1)'''
input_pr = Input(shape=(297,))
input_rt = Input(shape=(1482,))
input_vl = Input(shape=(1,))
input_cd = Input(shape=(1,))

#input = Concatenate(axis=1)([input_vl, input_cd])
input = Concatenate(axis=1)([input_pr, input_rt, input_vl, input_cd])

x = Dense(128)(input)
x = Dense(64)(x)
x = Dense(32)(x)
x = Dense(16)(x)
x = Dense(1)(x)

#model = Model(inputs=[input_pr, input_rt, input_vl, input_cd], outputs=x)
model = Model(inputs=[input_pr, input_rt, input_vl, input_cd], outputs=x)

model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_5 (InputLayer)           [(None, 297)]        0           []                               
                                                                                                  
 input_6 (InputLayer)           [(None, 1482)]       0           []                               
                                                                                                  
 input_7 (InputLayer)           [(None, 1)]          0           []                               
                                                                                                  
 input_8 (InputLayer)           [(None, 1)]          0           []                               
                                                                                            

In [20]:
# Disable eager execution
#tf.compat.v1.disable_eager_execution()

# Compile the model
model.compile(optimizer=optimizer, loss=loss_function, metrics=additional_metrics)

# Train the model
history = model.fit([padded_x_pr, padded_x_rt, x_vl, x_cd], y, epochs=epochs, batch_size=10, verbose=verbosity_mode, validation_split=0.2)

Train on 800 samples, validate on 200 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


# Evaluation

In [18]:
padded_inputs_test = pad_sequences(x_test, maxlen=max_sequence_length, value = 0.0) # 0.0 because it corresponds with <PAD>

test_results = model.evaluate(padded_inputs_test, y_test, verbose=False)
print(f'Test results - Loss: {test_results[0]} - Accuracy: {100*test_results[1]}%')

NameError: name 'x_test' is not defined

# Template

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import *
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.optimizers import Adam, RMSprop
import numpy as np

input1 = Input(shape=(336,))
input2 = Input(shape=(336,))
input = Concatenate()([input1, input2])
x = Dense(64, activation = 'relu')(input)
x = Dense(16, activation = 'relu')(x)
x = Dense(1, activation = 'sigmoid')(x)
model = Model(inputs=[input1, input2], outputs=x)
model.summary()

In [None]:
model.compile(
    optimizer = RMSprop(lr=0.02,rho=0.9,epsilon=None,decay=0),
    loss = 'mean_squared_error'
)


x1, x2 = np.random.randn(1000, 336), np.random.randn(1000, 336,)
y = np.random.randn(1000, 1)
print(x1.shape)
print(x2.shape)
print(y.shape)

model.fit([x1, x2], y, epochs = 10)