In [1]:
import torch
from torch import nn
import numpy as np

In [2]:
# Read every line of the CSV (header included), stripping surrounding whitespace.
# The context manager closes the file even if reading raises part-way through.
with open('train.csv') as f:
    lines = [line.strip() for line in f]

In [3]:
# Turn every CSV row after the header into a [text, label] pair.
data = []
for row in lines[1:]:
    fields = row.split(',')
    if len(fields) == 3:
        # Column 1 holds the text, column 2 the integer label; column 0 is ignored.
        data.append([fields[1], int(fields[2])])
    else:
        # Report the first row that does not split into exactly three fields and stop.
        print('Error: ', fields)
        break

In [4]:
len(data)

2097

In [5]:
# import json

# full_text = ''
# for d in data:
#     full_text = full_text + d[0]
    
# # Join all the sentences together and extract the unique characters from the combined sentences
# chars = set(full_text)

# # Creating a dictionary that maps integers to the characters
# int2char = dict(enumerate(chars))

# # Creating another dictionary that maps characters to integers
# char2int = {char: ind for ind, char in int2char.items()}

# with open('char2int.json', 'w') as file:
#      json.dump(char2int, file) # use `json.loads` to do the reverse


In [6]:
import json

# Load the character -> integer vocabulary that was saved as JSON.
char2int = {}
with open('char2int.json') as vocab_file:
    char2int = json.loads(vocab_file.read())


In [7]:
# valid_size = 300
# valid = data[:valid_size]
# train = data[valid_size:]

In [8]:
def writeToFile(fname, data):
    """Write ``data`` to ``fname`` as one ``text<TAB>label`` line per item.

    Args:
        fname: Path of the file to create or overwrite.
        data: Iterable of indexable items; ``d[0]`` must be a string and
            ``d[1]`` is converted with ``str``.
    """
    # The context manager guarantees the handle is flushed and closed even if
    # a malformed item raises in the middle of the loop.
    with open(fname, 'w') as f:
        for d in data:
            f.write(d[0] + '\t' + str(d[1]) + '\n')
def readFile(fname):
    """Read a tab-separated file into a list of ``[text, label, ...]`` rows.

    Each line is stripped, split on tabs, and its second field is converted
    to ``int``.

    Args:
        fname: Path of the file to read.

    Returns:
        A list with one list per line of the file.

    Raises:
        IndexError: If a line has no tab-separated second field.
        ValueError: If the second field is not an integer literal.
    """
    lines = []
    # Use a context manager so the file is closed on return and on error
    # (the handle was previously never closed).
    with open(fname, 'r') as f:
        for line in f:
            line = line.strip().split('\t')
            line[1] = int(line[1])
            lines.append(line)
    return lines

In [9]:
# writeToFile('train.txt', train)
# writeToFile('valid.txt', valid)
valid = readFile('valid.txt')
train = readFile('train.txt')

In [10]:
def getMaxLength(data):
    """Return the length of the longest ``d[0]`` among the items of ``data``.

    Returns 0 when ``data`` is empty.
    """
    return max((len(item[0]) for item in data), default=0)

def strToLong(s, char2int, max_length):
    """Encode ``s`` as an int64 index array, right-padded with 0 to ``max_length``.

    Args:
        s: The string to encode.
        char2int: Mapping from character to integer index.
        max_length: Target length; no padding is added when ``len(s)`` is
            already at least this long (the result is then ``len(s)`` long).

    Returns:
        A 1-D ``numpy`` array of dtype ``int64``.

    Raises:
        KeyError: If ``s`` contains a character missing from ``char2int``.
    """
    out = [char2int[char] for char in s]
    # NOTE(review): the pad value 0 may also be a real character index in
    # char2int — confirm; padding is then only identifiable via the lengths.
    out.extend([0] * (max_length - len(s)))
    # np.long was removed in NumPy 1.24 and is a platform-dependent C long in
    # NumPy 2.x; int64 is explicit and is what torch treats as a Long tensor.
    return np.array(out, dtype=np.int64)

def make_batch(data, char2int):
    """Turn ``[text, label]`` items into padded arrays for one batch.

    Args:
        data: Sequence of items where ``d[0]`` is the text and ``d[1]`` the label.
        char2int: Mapping from character to integer index.

    Returns:
        A tuple ``(X, lengths, Y)``: ``X`` stacks the encoded texts, each padded
        to the longest text in ``data``; ``lengths`` is the list of unpadded
        text lengths; ``Y`` is an array of the labels converted to ``float``.
    """
    max_length = getMaxLength(data)
    texts = [item[0] for item in data]
    labels = [float(item[1]) for item in data]
    encoded = [strToLong(text, char2int, max_length) for text in texts]
    lengths = [len(text) for text in texts]
    return np.stack(encoded, axis=0), lengths, np.array(labels)

In [11]:
make_batch(train[:10], char2int)

(array([[41, 38, 27, ...,  0,  0,  0],
        [38, 13, 41, ...,  0,  0,  0],
        [38, 43,  6, ...,  0,  0,  0],
        ...,
        [38, 38, 14, ...,  0,  0,  0],
        [43, 38, 38, ...,  0,  0,  0],
        [38, 38, 38, ..., 43, 19, 43]]),
 [17, 20, 40, 39, 22, 26, 21, 58, 30, 128],
 array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]))

In [12]:
import torch.nn as nn

class classifier(nn.Module):
    """Sequence classifier: embedding -> (bi)LSTM -> linear -> sigmoid.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each embedding vector.
        hidden_dim: LSTM hidden size per direction.
        output_dim: Number of output units of the final linear layer.
        n_layers: Number of stacked LSTM layers.
        bidirectional: Whether the LSTM runs in both directions.
        dropout: Dropout probability between LSTM layers (ignored by the LSTM
            when there is only one layer).
    """

    #define all the layers used in model
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers,
                 bidirectional, dropout):

        #Constructor
        super().__init__()

        # Remembered so forward() knows which final hidden states to use.
        self.bidirectional = bidirectional
        num_directions = 2 if bidirectional else 1

        #embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        #lstm layer
        # Inter-layer dropout has no effect with a single layer and only
        # triggers a warning, so it is disabled in that case.
        self.lstm = nn.LSTM(embedding_dim,
                            hidden_dim,
                            num_layers=n_layers,
                            bidirectional=bidirectional,
                            dropout=dropout if n_layers > 1 else 0.0,
                            batch_first=True)

        #dense layer
        # Input width follows the number of directions instead of assuming 2.
        self.fc = nn.Linear(hidden_dim * num_directions, output_dim)

        #activation function
        self.act = nn.Sigmoid()

    def forward(self, text, text_lengths):
        """Return sigmoid outputs of shape [batch size, output_dim].

        Args:
            text: Integer tensor of token indices, [batch size, sent_len].
            text_lengths: Unpadded length of every row of ``text``.
        """
        #text = [batch size, sent_len]
        embedded = self.embedding(text)
        #embedded = [batch size, sent_len, emb dim]

        #packed sequence, so the LSTM stops at each row's true length
        packed_embedded = nn.utils.rnn.pack_padded_sequence(
            embedded, text_lengths, batch_first=True, enforce_sorted=False)

        packed_output, (hidden, cell) = self.lstm(packed_embedded)
        #hidden = [num layers * num directions, batch size, hid dim]
        #cell = [num layers * num directions, batch size, hid dim]
        # (batch_first only affects the input/output tensors, not hidden/cell)

        if self.bidirectional:
            #concat the final forward and backward hidden state of the top layer
            hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        else:
            #only one direction: take the top layer's final hidden state
            hidden = hidden[-1, :, :]

        #hidden = [batch size, hid dim * num directions]
        dense_outputs = self.fc(hidden)

        #Final activation function
        outputs = self.act(dense_outputs)

        return outputs


In [13]:
#define hyperparameters
size_of_vocab = len(char2int)
embedding_dim = 200
num_hidden_nodes = 512
num_output_nodes = 1
num_layers = 3
bidirection = True
dropout = 0.2

#instantiate the model
# Pass the `bidirection` setting through instead of a literal True, so that
# editing the hyperparameter above actually changes the model.
model = classifier(size_of_vocab, embedding_dim, num_hidden_nodes, num_output_nodes, num_layers,
                   bidirectional=bidirection, dropout=dropout)

print(model)

#No. of trainable parameters
def count_parameters(model):
    """Return the total element count of the parameters that require gradients."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total
    
print(f'The model has {count_parameters(model):,} trainable parameters')


classifier(
  (embedding): Embedding(44, 200)
  (lstm): LSTM(200, 512, num_layers=3, batch_first=True, dropout=0.2, bidirectional=True)
  (fc): Linear(in_features=1024, out_features=1, bias=True)
  (act): Sigmoid()
)
The model has 15,533,665 trainable parameters


In [14]:
import torch.optim as optim
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import f1_score
from sklearn.metrics import auc

#define metric
def binary_accuracy(preds, y):
    """Return the fraction of entries where ``preds``, rounded, equals ``y``.

    The denominator is the length of the comparison tensor's first dimension.
    """
    # Round to the nearest integer, then compare element-wise with the labels.
    matches = torch.eq(torch.round(preds), y).float()
    return matches.sum() / len(matches)
    
def aucroc(preds, y):
    """Return ``roc_auc_score`` of the scores ``preds`` against the labels ``y``.

    Both tensors are moved to the CPU and detached before conversion to numpy.
    """
    labels, scores = (tensor.cpu().detach().numpy() for tensor in (y, preds))
    return roc_auc_score(labels, scores)

def aucprc(preds, y):
    """Return the area under the precision-recall curve of ``preds`` against ``y``.

    Both tensors are moved to the CPU and detached before conversion to numpy.
    """
    labels, scores = (tensor.cpu().detach().numpy() for tensor in (y, preds))
    curve = precision_recall_curve(labels, scores)
    # curve is (precision, recall, thresholds); auc takes x=recall, y=precision.
    return auc(curve[1], curve[0])
    
# Use GPU 7 only when the machine really has that many devices; otherwise use
# the default CUDA device, and fall back to the CPU when CUDA is unavailable.
# (A hard-coded 'cuda:7' fails on hosts with fewer than eight GPUs.)
if torch.cuda.is_available():
    device = torch.device('cuda:7' if torch.cuda.device_count() > 7 else 'cuda')
else:
    device = torch.device('cpu')

#set batch size
batch_size = 5000


#define optimizer and loss
optimizer = optim.Adam(model.parameters())
criterion = nn.BCELoss()


#push to cuda if available
model = model.to(device)
criterion = criterion.to(device)


In [15]:
model.train()
optimizer.zero_grad()   

In [18]:
import random
from tqdm import tqdm
num_epochs = 2
batch_size = 3000
model.train()
# NOTE: with num_epochs = 2 and validate_every = 10, only epoch 0 is validated.
validate_every = 10
for i in range(num_epochs):
    # Reshuffle in place so every epoch sees the batches in a different order.
    random.shuffle(train)
    model.train()
    total_loss = 0
    for j in range(0, len(train), batch_size):
        optimizer.zero_grad()
        text, text_lengths, y = make_batch(train[j:batch_size+j], char2int)
        text = torch.from_numpy(text).to(device)
        y = torch.tensor(y, dtype=torch.float32).to(device)
        # Squeeze only the output dimension: a bare squeeze() would also drop
        # the batch dimension when the last batch holds a single example.
        predictions = model(text, text_lengths).squeeze(1)
        loss = criterion(predictions, y)
        loss.backward()
        #update the weights
        optimizer.step()
        # Sum of the per-batch mean losses for this epoch.
        total_loss += loss.item()
    #validate
    if i%validate_every == 0:
        model.eval()
        # No gradients are needed for evaluation; skipping the autograd graph
        # saves memory when the whole validation set goes through in one pass.
        with torch.no_grad():
            text, text_lengths, y = make_batch(valid, char2int)
            text = torch.from_numpy(text).to(device)
            y = torch.tensor(y, dtype=torch.float32).to(device)
            predictions = model(text, text_lengths).squeeze(1)
        auc1 = aucroc(predictions, y)
        auc2 = aucprc(predictions, y)
        print(auc1, auc2, total_loss)


0.5929537590437244 0.2956448992352235 1.4693373486807104e-06


In [196]:
len(y)

300

In [116]:
# Build one batch from the head of the training set. `batch_size` is the name
# the training cells define; `BATCH_SIZE` is not defined in any visible cell.
text, text_lengths, y = make_batch(train[:batch_size], char2int)
text = torch.from_numpy(text).to(device)
y = torch.tensor(y, dtype=torch.float32).to(device)

In [117]:
predictions = model(text, text_lengths).squeeze()  

In [198]:

y_true = y.cpu().detach().numpy()
y_scores = predictions.cpu().detach().numpy()

lr_precision, lr_recall, _ = precision_recall_curve(y_true, y_scores)
auc(lr_recall, lr_precision)

In [199]:
auc(lr_recall, lr_precision)

0.5636770035961476

In [119]:
loss = criterion(predictions, y)  
loss

tensor(0.0386, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward>)

In [122]:
acc = binary_accuracy(predictions, y)   
acc

tensor(1., device='cuda:0')

In [121]:
loss.backward()       

#update the weights
optimizer.step()      


As we're going to predict the next character in the sequence at each time step, we'll have to divide each sentence into

- Input data
    - The last input character should be excluded as it does not need to be fed into the model
- Target/Ground Truth Label
    - One time-step ahead of the Input data as this will be the "correct answer" for the model at each time step corresponding to the input data

In [6]:
# Build the input and target sequences for next-character prediction
input_seq = []
target_seq = []

for i, sentence in enumerate(text):
    # Input: the sentence without its last character
    input_seq.append(sentence[:-1])

    # Target: the sentence without its first character (input shifted by one step)
    target_seq.append(sentence[1:])
    print("Input Sequence: {}\nTarget Sequence: {}".format(input_seq[i], target_seq[i]))

Input Sequence: hey how are yo
Target Sequence: ey how are you
Input Sequence: good i am fine
Target Sequence: ood i am fine 
Input Sequence: have a nice da
Target Sequence: ave a nice day


Now we can convert our input and target sequences to sequences of integers instead of characters by mapping them using the dictionaries we created above. This will allow us to one-hot-encode our input sequence subsequently.

In [7]:
# Replace every sequence, in place, by the list of its characters' integer ids.
for i in range(len(text)):
    input_seq[i] = list(map(char2int.__getitem__, input_seq[i]))
    target_seq[i] = list(map(char2int.__getitem__, target_seq[i]))

Before encoding our input sequence into one-hot vectors, we'll define 3 key variables:

- *dict_size*: The number of unique characters that we have in our text
    - This will determine the one-hot vector size as each character will have an assigned index in that vector
- *seq_len*: The length of the sequences that we're feeding into the model
    - As we standardised the length of all our sentences to be equal to the longest sentence, this value will be the max length - 1, since we also removed the last character from each input
- *batch_size*: The number of sentences that we defined and are going to feed into the model as a batch

In [8]:
# Width of each one-hot vector: the number of entries in the vocabulary.
dict_size = len(char2int)
# NOTE(review): `maxlen` is not defined in any visible cell — presumably the
# padded sentence length from an earlier step; confirm it exists before running.
seq_len = maxlen - 1
# All sentences in `text` are encoded together as a single batch.
batch_size = len(text)

def one_hot_encode(sequence, dict_size, seq_len, batch_size):
    """One-hot encode integer sequences into a float32 array.

    Args:
        sequence: Indexable of indexables; ``sequence[i][u]`` is the class index
            of step ``u`` of row ``i``.
        dict_size: Size of the one-hot dimension.
        seq_len: Number of steps encoded per row.
        batch_size: Number of rows encoded.

    Returns:
        Array of shape ``(batch_size, seq_len, dict_size)`` with a single 1 per
        (row, step) at the position given by ``sequence``.
    """
    features = np.zeros((batch_size, seq_len, dict_size), dtype=np.float32)

    # Walk all (row, step) pairs in row-major order with one flat counter.
    for flat in range(batch_size * seq_len):
        row, step = divmod(flat, seq_len)
        features[row, step, sequence[row][step]] = 1
    return features

We also defined a helper function that creates arrays of zeros for each character and replaces the corresponding character index with a **1**.

In [9]:
input_seq = one_hot_encode(input_seq, dict_size, seq_len, batch_size)
print("Input shape: {} --> (Batch Size, Sequence Length, One-Hot Encoding Size)".format(input_seq.shape))

Input shape: (3, 14, 17) --> (Batch Size, Sequence Length, One-Hot Encoding Size)


Since we're done with all the data pre-processing, we can now move the data from numpy arrays to PyTorch's very own data structure - **Torch Tensors**

In [10]:
input_seq = torch.from_numpy(input_seq)
target_seq = torch.Tensor(target_seq)

Now we've reached the fun part of this project! We'll be defining the model using the Torch library, and this is where you can add or remove layers, be it fully connected layers, convolutional layers, vanilla RNN layers, LSTM layers, and many more! In this post, we'll be using the basic nn.RNN to demonstrate a simple example of how RNNs can be used.

Before we start building the model, let's use a built-in feature in PyTorch to check the device we're running on (CPU or GPU). This implementation will not require a GPU as the training is really simple. However, as you progress to larger datasets and models with millions of trainable parameters, using the GPU will be very important to speed up your training.

In [11]:
# Use the GPU when CUDA is usable, otherwise fall back to the CPU.
# `device` is reused by later cells to place the model and the tensors.
is_cuda = torch.cuda.is_available()
device = torch.device("cuda" if is_cuda else "cpu")
print("GPU is available" if is_cuda else "GPU not available, CPU used")

GPU is available


To start building our own neural network model, we can define a class that inherits from PyTorch’s base class for all neural network modules (nn.Module). After doing so, we can start defining some variables and also the layers for our model in the constructor. For this model, we’ll only be using 1 layer of RNN followed by a fully connected layer. The fully connected layer will be in charge of converting the RNN output to our desired output shape.

We’ll also have to define the forward pass in forward(), as a method of the class. The statements in forward() are executed sequentially, so we pass the inputs and the zero-initialized hidden state through the RNN layer first, and then pass the RNN outputs to the fully-connected layer. Note that we are using the layers that we defined in the constructor.

The last method that we have to define is the method that we called earlier to initialize the hidden state - init_hidden(). This basically creates a tensor of zeros in the shape of our hidden states.

In [12]:
class Model(nn.Module):
    """Vanilla RNN followed by a fully connected layer applied at every time step.

    Args:
        input_size: Feature size of each input step.
        output_size: Number of outputs produced per time step.
        hidden_dim: Size of the RNN hidden state.
        n_layers: Number of stacked RNN layers.
    """

    def __init__(self, input_size, output_size, hidden_dim, n_layers):
        super(Model, self).__init__()

        # Defining some parameters
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers

        #Defining the layers
        # RNN Layer
        self.rnn = nn.RNN(input_size, hidden_dim, n_layers, batch_first=True)
        # Fully connected layer
        self.fc = nn.Linear(hidden_dim, output_size)

    def forward(self, x):
        """Run the RNN over ``x`` and project every step with the linear layer.

        Args:
            x: Input tensor whose first dimension is the batch (batch_first=True).

        Returns:
            ``(out, hidden)``: ``out`` has one row per (batch, step) pair, i.e.
            shape ``(batch * seq_len, output_size)``; ``hidden`` is the final
            hidden state returned by the RNN.
        """
        batch_size = x.size(0)

        # Create the initial hidden state on the same device as the input, so the
        # model does not depend on a module-level `device` variable.
        hidden = self.init_hidden(batch_size, device=x.device)

        # Passing in the input and hidden state into the model and obtaining outputs
        out, hidden = self.rnn(x, hidden)

        # Flatten (batch, step) so the fully connected layer sees one row per step
        out = out.contiguous().view(-1, self.hidden_dim)
        out = self.fc(out)

        return out, hidden

    def init_hidden(self, batch_size, device=None):
        """Return a zero hidden state of shape ``(n_layers, batch_size, hidden_dim)``.

        Args:
            batch_size: Number of sequences in the batch.
            device: Device for the new tensor; defaults to the device that holds
                this module's parameters.
        """
        if device is None:
            device = next(self.parameters()).device
        return torch.zeros(self.n_layers, batch_size, self.hidden_dim, device=device)

After defining the model above, we'll have to instantiate the model with the relevant parameters and define our hyperparameters as well. The hyperparameters we're defining below are:

- *n_epochs*: Number of Epochs --> This refers to the number of times our model will go through the entire training dataset
- *lr*: Learning Rate --> This affects the rate at which our model updates the weights in the cells each time backpropagation is done
    - A smaller learning rate means that the model changes the values of the weight with a smaller magnitude
    - A larger learning rate means that the weights are updated to a larger extent for each time step

Similar to other neural networks, we have to define the optimizer and loss function as well. We’ll be using CrossEntropyLoss as the final output is basically a classification task.

In [13]:
# Instantiate the model with hyperparameters
model = Model(input_size=dict_size, output_size=dict_size, hidden_dim=12, n_layers=1)
# We'll also set the model to the device that we defined earlier (default is CPU)
model = model.to(device)

# Define hyperparameters
n_epochs = 100
lr=0.01

# Define Loss, Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

Now we can begin our training! As we only have a few sentences, this training process is very fast. However, as we progress, larger datasets and deeper models mean that the input data is much larger and the number of parameters within the model that we have to compute is much more.

In [14]:
# Training Run
# Move the data to the training device once: neither tensor changes between
# epochs, so there is no need to repeat the transfer inside the loop.
input_seq = input_seq.to(device)
target_seq = target_seq.to(device)
# CrossEntropyLoss expects one integer class index per output row.
targets = target_seq.view(-1).long()
for epoch in range(1, n_epochs + 1):
    optimizer.zero_grad() # Clears existing gradients from previous epoch
    # The model already lives on `device`, so its output needs no extra transfer.
    output, hidden = model(input_seq)
    loss = criterion(output, targets)
    loss.backward() # Does backpropagation and calculates gradients
    optimizer.step() # Updates the weights accordingly

    if epoch%10 == 0:
        print('Epoch: {}/{}.............'.format(epoch, n_epochs), end=' ')
        print("Loss: {:.4f}".format(loss.item()))

Epoch: 10/100............. Loss: 2.3973
Epoch: 20/100............. Loss: 2.1102
Epoch: 30/100............. Loss: 1.7422
Epoch: 40/100............. Loss: 1.3161
Epoch: 50/100............. Loss: 0.9390
Epoch: 60/100............. Loss: 0.6450
Epoch: 70/100............. Loss: 0.4317
Epoch: 80/100............. Loss: 0.2941
Epoch: 90/100............. Loss: 0.2117
Epoch: 100/100............. Loss: 0.1626


Let’s test our model now and see what kind of output we will get. Before that, let’s define some helper functions to convert our model output back to text.

In [15]:
def predict(model, character):
    """Predict the character that follows the sequence ``character``.

    Args:
        model: Network called as ``model(x)`` and returning ``(out, hidden)``.
        character: Sequence of characters, each a key of ``char2int``.

    Returns:
        ``(next_char, hidden)``: the character whose score is highest in the
        last row of ``out``, and the hidden state returned by the model.
    """
    # NOTE(review): `int2char` is not defined in any visible cell (the cell that
    # builds it is commented out) — confirm it exists and inverts `char2int`.
    # One-hot encoding our input to fit into the model
    character = np.array([[char2int[c] for c in character]])
    character = one_hot_encode(character, dict_size, character.shape[1], 1)
    character = torch.from_numpy(character).to(device)

    # Inference only: no_grad() avoids building an autograd graph and replaces
    # the legacy `.data` access.
    with torch.no_grad():
        out, hidden = model(character)
        # out[-1] is the last row of the model output, i.e. the newest step here
        prob = nn.functional.softmax(out[-1], dim=0)

    # Taking the class with the highest probability score from the output
    char_ind = torch.max(prob, dim=0)[1].item()

    return int2char[char_ind], hidden

In [16]:
def sample(model, out_len, start='hey'):
    """Extend ``start`` one predicted character at a time up to ``out_len`` characters.

    The model is switched to eval mode, ``start`` is lower-cased, and ``predict``
    is called on the growing character list until the target length is reached.
    If ``start`` is already at least ``out_len`` long it is returned lower-cased.
    """
    model.eval() # eval mode
    chars = list(start.lower())
    remaining = out_len - len(chars)
    # Feed everything generated so far back in to obtain each next character
    while remaining > 0:
        next_char, _hidden = predict(model, chars)
        chars.append(next_char)
        remaining -= 1

    return ''.join(chars)

In [23]:
sample(model, 15, 'good u')

'good u am fine '

As we can see, when we feed the model the prefix ‘good u’ it continues with ‘ am fine ‘, reproducing the ending of the training sentence ‘good i am fine ‘ — which is what we intended it to learn!