In [2]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torchtext
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from sklearn import model_selection
from sklearn import metrics
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer
import re


In [3]:
df_train = pd.read_csv('./data/train.csv')
df_test = pd.read_csv('./data/test.csv')
print(f"Rows in train.csv = {len(df_train)}")
print(f"Rows in test.csv = {len(df_test)}")
pd.set_option('display.max_colwidth', None)
df_train.head()

Rows in train.csv = 7613
Rows in test.csv = 3263


Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected,1
3,6,,,"13,000 people receive #wildfires evacuation orders in California",1
4,7,,,Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school,1


In [4]:
df_train_pos = df_train[df_train.target == 1]
df_train_neg = df_train[df_train.target == 0]
print(f"No. of positive training examples = {len(df_train_pos)}")
print(f"No. of negative training examples = {len(df_train_neg)}")
train_keywords_unique = df_train.keyword.unique()
print(f"No. of unique keywords = {len(train_keywords_unique)}")
df_train_notnull_keywords = df_train[~df_train.keyword.isnull()]
print(f"No of train examples with keyword not null = {len(df_train_notnull_keywords)}")
df_train_notnull_keywords.head()

No. of positive training examples = 3271
No. of negative training examples = 4342
No. of unique keywords = 222
No of train examples with keyword not null = 7552


Unnamed: 0,id,keyword,location,text,target
31,48,ablaze,Birmingham,@bbcmtd Wholesale Markets ablaze http://t.co/lHYXEOHY6C,1
32,49,ablaze,Est. September 2012 - Bristol,We always try to bring the heavy. #metal #RT http://t.co/YAo1e0xngw,0
33,50,ablaze,AFRICA,#AFRICANBAZE: Breaking news:Nigeria flag set ablaze in Aba. http://t.co/2nndBGwyEi,1
34,52,ablaze,"Philadelphia, PA",Crying out for more! Set me ablaze,0
35,53,ablaze,"London, UK",On plus side LOOK AT THE SKY LAST NIGHT IT WAS ABLAZE http://t.co/qqsmshaJ3N,0


In [5]:
df_test.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, stay safe everyone."
2,3,,,"there is a forest fire at spot pond, geese are fleeing across the street, I cannot save them all"
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [6]:
# split the training dataframe into kfolds for cross validation. We do this before any processing is done
# on the data. We use stratified kfold if the target distribution is unbalanced
def strat_kfold_dataframe(df, target_col_name, num_folds=5):
    # we create a new column called kfold and fill it with -1
    df["kfold"] = -1
    # randomize of shuffle the rows of dataframe before splitting is done
    df = df.sample(frac=1, random_state=42).reset_index(drop=True)
    # get the target data
    y = df["target"].values
    skf = model_selection.StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=42)
    for fold, (train_index, val_index) in enumerate(skf.split(X=df, y=y)):
        df.loc[val_index, "kfold"] = fold
    return df        

df_train = strat_kfold_dataframe(df_train, target_col_name="target", num_folds=5)    

Preprocess the tweets 

In [7]:
punct = "/-'?!.,#$%\'()*+-/:;<=>@[\\]^_`{|}~`" + '""“”’' + '∞θ÷α•à−β∅³π‘₹´°£€\×™√²—–&'
def clean_special_chars(text, punct):
    for p in punct:
        text = text.replace(p, ' ')
    return text

def process_tweet(df, text, keyword):
    lemmatizer = WordNetLemmatizer()    
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)    
    processed_text = []
    stop = stopwords.words("english")
    for tweet, keyword in zip(df[text], df[keyword]):
        tweets_clean = []        
        # remove stock market tickers like $GE
        #tweet = tweet + " " + keyword
        tweet = re.sub(r'\$\w*', '', tweet)
        # remove old style retweet text "RT"
        tweet = re.sub(r'^RT[\s]+', '', tweet)
        # remove hyperlinks
        tweet = re.sub(r'http\S+', '', tweet)
        # remove hashtags
        # only removing the hash #, @, ... sign from the word
        tweet = re.sub(r'\.{3}|@|#', '', tweet)    
        tweet = clean_special_chars(tweet, punct)
        # remove junk characters which don't have an ascii code
        tweet = tweet.encode("ascii", "ignore").decode("utf-8", "ignore")
        # tokenize tweets        
        tweet_tokens = tokenizer.tokenize(tweet)
        for word in tweet_tokens:
            # remove stopwords and punctuation
            #if (word.isalpha() and len(word) > 2 #and word not in stop 
            #    and word not in string.punctuation):
                #stem_word = stemmer.stem(word)  # stemming word            
                #lem_word = lemmatizer.lemmatize(word)
                #tweets_clean.append(lem_word) 
                tweets_clean.append(word)
        processed_text.append(" ".join(tweets_clean))        
    df['processed_text'] = np.array(processed_text)

In [8]:
df_train["keyword"] = df_train["keyword"].fillna("no_keyword")
df_test["keyword"] = df_test["keyword"].fillna("no_keyword")
process_tweet(df_train, 'text', "keyword")
process_tweet(df_test, 'text', "keyword")
df_train["prcsd_tweet_len"] = df_train["processed_text"].apply(lambda row: len(row.split()))
df_test["prcsd_tweet_len"] = df_test["processed_text"].apply(lambda row: len(row.split()))

In [9]:
df_train["prcsd_tweet_len"].mean()

14.597136477078681

In [10]:
df_train.iloc[50:52, :]

Unnamed: 0,id,keyword,location,text,target,kfold,processed_text,prcsd_tweet_len
50,5174,fatalities,Official Website,#HSE releases annual workplace facilities data. Have a look | http://t.co/h4UshEekxm http://t.co/jNHNX3oISN,0,4,hse releases annual workplace facilities data have a look,9
51,3126,debris,,#??? #?? #??? #??? MH370: Aircraft debris found on La Reunion is from missing Malaysia Airlines ... http://t.co/zxCORQ0A3a,1,0,mh370 aircraft debris found on la reunion is from missing malaysia airlines,12


In [11]:
df_test.head()

Unnamed: 0,id,keyword,location,text,processed_text,prcsd_tweet_len
0,0,no_keyword,,Just happened a terrible car crash,just happened a terrible car crash,6
1,2,no_keyword,,"Heard about #earthquake is different cities, stay safe everyone.",heard about earthquake is different cities stay safe everyone,9
2,3,no_keyword,,"there is a forest fire at spot pond, geese are fleeing across the street, I cannot save them all",there is a forest fire at spot pond geese are fleeing across the street i cannot save them all,19
3,9,no_keyword,,Apocalypse lighting. #Spokane #wildfires,apocalypse lighting spokane wildfires,4
4,11,no_keyword,,Typhoon Soudelor kills 28 in China and Taiwan,typhoon soudelor kills 28 in china and taiwan,8


## Let us try some deep learning techniques now

In [12]:
def get_word_embedding_dict(embedding_file_path):
    embedding_dict = {}
    with open(embedding_file_path, "r") as f:
        # https://stackoverflow.com/questions/8009882/how-to-read-a-large-file-line-by-line
        for line in f:
            values = line.split()
            word = values[0]
            word_vec = np.asarray(values[1:], "float32")
            embedding_dict[word] = word_vec
    return embedding_dict        

#glove_embedding_dict = get_word_embedding_dict("../../../glove.twitter.27B/glove.twitter.27B.200d.txt")

In [13]:
glove_emb = torchtext.vocab.GloVe(name="twitter.27B", dim=200)

In [14]:
# build tweets vocab from training data

from torchtext.vocab import build_vocab_from_iterator

def yield_tokens(df):
    for index, row in df.iterrows():
        yield row["processed_text"].split()
    
tweet_vocab = build_vocab_from_iterator(yield_tokens(df_train), specials=["<unk>", "<pad>"])    

In [19]:
vocab_dict = tweet_vocab.get_stoi()
vocab_dict["<unk>"]

0

In [20]:
# For the problem specific vocab, get the embedding vectors from the pre-trained embedding
# for each word in vocab and return a matrix of shape vocab_size, embedding_dim. This matrix
# will be the pretrained embedding weight matrix which we will use to create the embedding layer
def get_vocab_pt_emb_matrix(text_vocab, emb):
    embedding_matrix = []
    for token in text_vocab.get_itos():
        embedding_matrix.append(emb[token])
    return torch.stack(embedding_matrix)

pt_emb_weights = get_vocab_pt_emb_matrix(tweet_vocab, glove_emb)
pt_emb_layer = nn.Embedding.from_pretrained(pt_emb_weights)

In [144]:
# [(index, token) for index, token in enumerate(glove_emb.itos) if token == "<unk>"]
#pt_emb_layer(torch.LongTensor([1]))

In [21]:
from torch.nn.utils.rnn import pad_sequence

df_train["vectorized_tweet"] = df_train["processed_text"].apply(
    lambda row:torch.LongTensor(tweet_vocab.lookup_indices(row.split()))
    )

#x_seq = df_train["vectorized_tweet"].values.tolist()
# the index for 'pad' token in tweet_vocab is 1.
#x_padded_seq = pad_sequence(x_seq, batch_first=True, padding_value=1)

In [24]:
class VectorizedTweetDataSet(Dataset):
    def __init__(self, tweet_vecs, labels):
        self.tweet_vecs = tweet_vecs
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        tweet_vec = self.tweet_vecs[idx]
        label = self.labels[idx]
        tweet_len = len(tweet_vec)
        return (tweet_vec, label)


In [25]:
# If the goal is to train with mini-batches, one needs to pad the sequences in each batch. 
# In other words, given a mini-batch of size N, if the length of the largest sequence is L, 
# one needs to pad every sequence with a length of smaller than L with zeros and make their 
# lengths equal to L. Moreover, it is important that the sequences in the batch are in the 
# descending order.
def pad_collate(batch):
    # Each element in the batch is a tuple (data, label)
    # sort the batch (based on tweet word count) in descending order
    sorted_batch = sorted(batch, key=lambda x:x[0].shape[0], reverse=True)
    sequences = [x[0] for x in sorted_batch]
    sequences_padded = pad_sequence(sequences, batch_first=True, padding_value=0)
    # Also need to store the length of each sequence.This is later needed in order to unpad 
    # the sequences
    seq_len = torch.Tensor([len(x) for x in sequences])
    labels = torch.Tensor([x[1] for x in sorted_batch])
    return sequences_padded, seq_len, labels


### Bidirectional RNN
outputs is of size [src len, batch size, hid dim * num directions] where the first hid_dim elements in the third axis are the hidden states from the top layer forward RNN, and the last hid_dim elements are hidden states from the top layer backward RNN. We can think of the third axis as being the forward and backward hidden states concatenated together other

hidden is of size [n layers * num directions, batch size, hid dim], where [-2, :, :] gives the top layer forward RNN hidden state after the final time-step (i.e. after it has seen the last word in the sentence) and [-1, :, :] gives the top layer backward RNN hidden state after the final time-step (i.e. after it has seen the first word in the sentence).


In [148]:
import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

# https://galhever.medium.com/sentiment-analysis-with-pytorch-part-4-lstm-bilstm-model-84447f6c4525
class DisasterModel(nn.Module):
    """The RNN model."""
    def __init__(self, vocab_size, num_layers, is_bidirect, emb_dim, hidden_dim, out_dim, 
                pt_emb_weights, emb_wt_update=False, drop_prob=0.5, **kwargs):
        super(DisasterModel, self).__init__(**kwargs)
        
        self.vocab_size = vocab_size
        self.num_layers = num_layers        
        self.emb_dim = emb_dim
        self.hidden_dim = hidden_dim        
        self.output_dim = out_dim        
        self.is_bidirect = is_bidirect
        # Embedding layer
        self.emb_layer = nn.Embedding(self.vocab_size, emb_dim)
        # copy the vocab specific weights(emb vectors) from pretrained embeddings to model embedding layer
        self.emb_layer.weight.data.copy_(pt_emb_weights)    
        # whether to update the pretrained embedding layer weights during model training
        self.emb_layer.weight.requires_grad = emb_wt_update            
        # LSTM Layer        
        self.lstm_layer = nn.LSTM(
                        input_size=emb_dim, hidden_size=hidden_dim, batch_first=True, 
                        bidirectional=is_bidirect, num_layers=num_layers, dropout=drop_prob
                        )
        self.dropout = nn.Dropout(p = drop_prob)                        
        
        # If the RNN is bidirectional `num_directions` should be 2, else it should be 1.        
        if not is_bidirect:
            self.num_directions = 1
            # The linear layer is for making predictions 
            # input to linear output layer is of shape num_steps, batch_size, num_hiddens
            # and output is of shape num_steps, batch_size, output_dim
            # Wya is of shape (output_dim, num_hiddens), a_out is of shape (num_hiddens, 1)
            # For the last time step and one sample we have:
            # yt_pred = np.dot(Wya, a_out) + b is of shape (output_dim, 1)
            # replace 1 with m (batch_size) and add num_steps as the first dimension to have
            # vectorized form of the output (num_steps, batch_size, output_dim)
            self.linear = nn.Linear(self.hidden_dim, self.output_dim)
        else:       
            self.num_directions = 2     
            # We take the hidden state only from the last lstm layer, the output from lstm is [batch_size, hidden_dim * num_directions]
            # This is fed as input to FC layer which outputs logits in shape [batch_size, output_dim]
            self.linear = nn.Linear(self.hidden_dim * self.num_directions, self.output_dim)
            # If we are taking hidden states from each of the lstm layers then linear layer could be like this.
            # self.linear = nn.Linear(self.hidden_dim * self.num_directions * num_layers, self.output_dim)
        # The activation layer which converts output to 0 or 1            
        self.act = nn.Sigmoid()            

    def forward(self, inputs, input_lengths, state):        
        # inputs is of shape batch_size, num_steps(sequence length which is the length of
        # longest text sequence). Each row of inputs is 1d LongTensor array of length 
        # num_steps containing word index. Using the embedding layer we want to convert
        # each word index to its corresponding word vector of dimension emb_dim
        batch_size = inputs.size(0)
        num_steps = inputs.size(1)        
        # embeds is of shape batch_size * num_steps * emb_dim and is the input to lstm layer
        embeds = self.emb_layer(inputs)        
        # pack_padded_sequence before feeding into LSTM. This is required so pytorch knows
        # which elements of the sequence are padded ones and ignore them in computation.
        # This step is done only after the embedding step
        embeds_pack = pack_padded_sequence(embeds, input_lengths, batch_first=True)
        # lstm_out is of shape batch_size * num_steps * hidden_size and contains the output
        # features (h_t) from the last layer of LSTM for each t
        # h_n is of shape [(num_layers*num_directions), batch_size, hidden_size] and contains the final hidden 
        # state for each sample in the batch i.e. hidden state at t_end for each of the LSTM layers. Note that for a
        # biRNN (num_directions=2) the hidden state consists for both the forward and backward hidden states at each time step,
        # that is why num_layers is multiplied by num_directions to get the final hidden state for each lstm layer
        # The last element in h_n indexed by -1 like h_n[-1, :, :] represents the forward final hidden state of last lstm layer
        # The last but one element in h_n indexed by -2 like h_n[-2, :, :] represents backward final hidden state of last lstm layer
        # The last but two element in h_n indexed by -2 like h_n[-3, :, :] represents the forward final hidden state of 
        # previous to last lstm layer and so on...
        # same for c_n as h_n except that it is the final cell state (Ct (memory cell) at t=t_end)
        lstm_out_pack, (h_n, c_n) = self.lstm_layer(embeds_pack)
        # unpack the output
        lstm_out, lstm_out_len = pad_packed_sequence(lstm_out_pack, batch_first=True)                
        if self.is_bidirect:
            # For a birection LSTM the hidden state at each time step is a concatenation of the hidden
            # state from the forward pass (h_tfwd) and backward pass (h_tbwd). Also for a biRNN lstm_out is the 
            # hidden state from the last layer of LSTM for each t with shape (batch_size, num_steps, num_directions * hidden_size)
            # The hidden state at final time step has shape (batch_size, hidden_size * num_directions) and can be extracted like this
            lstm_out = lstm_out[:, -1, :]
            # If we want to extract both the individual hidden states at final time step for forward pass and backward pass
            # h_tfinal_fwd = lstm_out[:, -1, :hidden_size]
            # h_tfinal_fwd = lstm_out[:, -1, hidden_size:]
            
            # Another way to extract the last hidden state for the forward and backward lstm layers
            # in a BiRNN is to use h_n like this
            # h_tend_fwd = h_n[-1, :, :]
            # h_tend_bwd = h_n[-2, :, :]
            # lstm_out = torch.cat((h_tend_fwd, h_tend_bwd), dim=1)
            #print(f"lstm_out.shape = {lstm_out.shape}")
            # Now that we just consider the hidden state from the final time step of each lstm layer, the num_steps dimension
            # goes away
            # lstm_out.shape = [batch_size, hidden_size * num_directions]
        else:            
            # The output from lstm layer is the hidden state from last lstm layer with shape
            # of batch_size * hidden_dim. This can be extracted directly from h_n as below, -1 representing the last
            # last lstm_layer       
            lstm_out = h_n[-1, :, :]  
            # Another way to extract the final hidden state from the last lstm layer for unidirectional LSTM
            # lstm_out = lstm_out[:, -1, :]    
            # lstm_out.shape = [batch_size, hidden_dim]

            # The below logic is dubious and needs a relook  
            # or we can extract it from lstm_out and lstm_out_len. lstm_out is of shape
            # batch_size * num_steps * hidden_dim. Now num_steps is the max sequence length
            # in the batch, but for items in batch for which sequence length < max sequence length
            # we need to take the element at lstm_out_len - 1 position in dimension 2 
            # as elements after it are padded and should be ignored. Thus instead of num_steps
            # if for each batch item we pick the element at (lstm_out_len - 1) index we get
            # lstm_out in the shape batch_size * hidden_dim
            # lstm_out = [lstm_out[batch_item_index, seq_length_index, :] for batch_item_index, seq_length_index in enumerate(lstm_out_len)]
            # lstm_out = torch.cat(lstm_out, dim=0).reshape(batch_size, self.emb_size)
        
        # regularize lstm output by applying dropout
        out = self.dropout(lstm_out)        
        # The the output Y of fully connected rnn layer has the shape of 
        # (`num_steps` * `batch_size`, `num_hiddens`). This Y is then fed as input to the 
        # output fully connected linear layer which produces the prediction in the output shape of 
        # (`num_steps` * `batch_size`, `output_dim`).        
        output = self.linear(out)        
        # apply sigmoid activation to convert output to probability 
        output = self.act(output)
        return output, (h_n, c_n)

    def init_state(self, device, batch_size=1):
        """ Initialize the hidden state i.e. initialize all the neurons in all the hidden layers 
        to zero"""
        if not isinstance(self.lstm_layer, nn.LSTM):
            # `nn.GRU` takes a tensor as hidden state
            # For a biRNN the we have two initial hidden states, one for the forward direction and one for the backward direction
            # And each RNN layer has its own initial hidden state, hence num_directions * num_layers is the total number of initial 
            # hidden states for the network
            return torch.zeros((self.num_directions * self.num_layers,
                                batch_size, self.hidden_dim), device=device)
        else:
            # `nn.LSTM` takes a tuple of hidden states (h0, c0). h0 = initial
            # hidden state for each element in the batch, c0 = initial cell state
            # for each element in the batch
            return (torch.zeros((self.num_directions * self.num_layers,
                                 batch_size, self.hidden_dim), device=device),
                    torch.zeros((self.num_directions * self.num_layers,
                                 batch_size, self.hidden_dim), device=device))

In [None]:
class ModelExecutionEngine:
    def __init__(self, model, optimizer, device, batch_size, grad_clip, eval_metric="accuracy"):
        self.model = model
        self.optimizer = optimizer
        self.device = device
        self.batch_size = batch_size
        self.grad_clip = grad_clip
        self.eval_metric = eval_metric

    @staticmethod
    def loss_fn(outputs, targets):
        return nn.BCELoss()(outputs, targets)

    # metric to measure model performance
    def model_eval_metric(self, outputs, targets):
        # y_pred is in the range 0 to 1. Convert it to 0 or 1 by rounding
        outputs_round = torch.round(outputs.squeeze())
        correct = (outputs_round == targets.squeeze()).float()
        metric = 0.0
        if self.eval_metric.lower() == "accuracy":
            metric = correct.sum() / len(correct)
        elif self.eval_metric.lower() == "f1_score":
            metric = metrics.f1_score(targets.squeeze(), outputs_round)
        return metric

    def train_epoch(self, data_loader, init_hidden):
        self.model.train()
        loss_epoch = []
        metric_epoch = [] 
        for inputs, input_lengths, labels in data_loader:
            # create a new hidden state instance for each minibatch to avoid long gradient chains
            # involving all previous minibatches of an epoch
            h = tuple([e.data for e in init_hidden])
            inputs = inputs.to(self.device)
            input_lengths = input_lengths.to(self.device)
            labels = labels.to(self.device)
            # the last samples that do not fit in a batch will be discarded
            if inputs.shape[0] != self.batch_size:
                continue        
            # forward pass on one mini batch
            output, hidden = self.model(inputs, input_lengths, h)        
            # compute the loss
            loss = self.loss_fn(output.squeeze(), labels.float())
            metric = self.model_eval_metric(output, labels)        
            metric_epoch.append(metric.item())
            loss_epoch.append(loss.item())
            # zero out the model param (W and b) gradients before running backprop on this batch 
            # otherwise gradients will keep on aggregating from one batch to next
            self.optimizer.zero_grad()
            # run backprop to calculate param gradients (dW and db)
            loss.backward()
            # clip the param gradients if they exceed threshold
            nn.utils.clip_grad_norm_(self.model.parameters(), self.grad_clip)
            # update the parameters (W and b)
            self.optimizer.step()            
        return np.mean(loss_epoch), np.mean(metric_epoch)  

    def evaluate_epoch(self, data_loader, init_hidden):
        self.model.eval()
        loss_epoch = []
        metric_epoch = [] 
        with torch.no_grad():
            for inputs, input_lengths, labels in data_loader:
                h = tuple([e.data for e in init_hidden])
                inputs = inputs.to(self.device)
                input_lengths = input_lengths.to(self.device)
                labels = labels.to(self.device)
                if inputs.shape[0] != self.batch_size:
                    continue                    
                output, hidden = self.model(inputs, input_lengths, h)        
                loss = self.loss_fn(output.squeeze(), labels.float())
                metric = self.model_eval_metric(output, labels)
                metric_epoch.append(metric.item())
                loss_epoch.append(loss.item())            
        return np.mean(loss_epoch), np.mean(metric_epoch)  

In [150]:
def get_exec_time(exec_time_secs):
    if exec_time_secs < 60:
        return f"{round(exec_time_secs, 2)} seconds"
    exec_time_rem_sec = exec_time_secs % 60
    exec_time_min = int((exec_time_secs - exec_time_rem_sec) / 60)
    return f"{exec_time_min} min {round(exec_time_rem_sec, 2)} seconds"

def print_epoch_stats(epoch, epoch_run_time, train_loss, train_metric, val_loss, val_metric):
    print(f"=======================================================")
    print(f"Epoch {epoch} execution time = {epoch_run_time}:")    
    print(f"Training loss = {round(train_loss, 4)}, training accuracy = {round(train_metric, 4)}")    
    print(f"Validation loss = {round(val_loss, 4)}, validation accuracy = {round(val_metric, 4)}")        
    print(f"=======================================================")        

In [160]:
# model constants
VOCAB_SIZE = len(tweet_vocab)
EMB_DIM = 200    
OUT_DIM = 1    
BATCH_SIZE = 128
GRAD_CLIP = 5
NUM_FOLDS = 5
NUM_EPOCHS = 20 
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')    
MODEL_EVAL_METRIC = "accuracy"
# model hyperparameters
model_params = {
    "hidden_dim": 141, 
    "num_layers": 2, 
    "is_bidirectional": True, 
    "drop_out": 0.4258,
    "learning_rate": 0.000366
    }

In [152]:
def get_fold_dls(fold, df):
    train_df = df[df.kfold != fold].reset_index(drop=True)
    valid_df = df[df.kfold == fold].reset_index(drop=True)
    X_train = train_df["vectorized_tweet"].to_numpy()
    y_train = train_df["target"].to_numpy()
    X_valid = valid_df["vectorized_tweet"].to_numpy()
    y_valid = valid_df["target"].to_numpy()
    ds_train = VectorizedTweetDataSet(X_train, y_train)
    ds_valid = VectorizedTweetDataSet(X_valid, y_valid)
    dl_train = DataLoader(ds_train, batch_size=BATCH_SIZE, shuffle=True, collate_fn=pad_collate)
    dl_valid = DataLoader(ds_valid, batch_size=BATCH_SIZE, shuffle=True, collate_fn=pad_collate)
    return dl_train, dl_valid

In [161]:
def run_training(fold, df, params, save_model=False):
    train_dl, val_dl = get_fold_dls(fold, df)
    val_loss_min = np.Inf   
    model = DisasterModel(
                vocab_size = VOCAB_SIZE, 
                emb_dim = EMB_DIM, 
                out_dim = OUT_DIM, 
                pt_emb_weights = pt_emb_weights,
                num_layers = params["num_layers"], 
                is_bidirect = params["is_bidirectional"],  
                hidden_dim = params["hidden_dim"], 
                drop_prob = params["drop_out"]).to(DEVICE)
    #print(model)
    print(f"Running trainig for fold {fold}")
    optimizer = optim.Adam(model.parameters(), lr=params["learning_rate"])
    init_hidden = model.init_state(device = DEVICE, batch_size = BATCH_SIZE)        
    mee = ModelExecutionEngine(
            model=model, 
            optimizer=optimizer, 
            device=DEVICE, 
            batch_size=BATCH_SIZE,
            grad_clip=GRAD_CLIP,            
            eval_metric = MODEL_EVAL_METRIC
            )
    # number of epoch iterations with the validation loss not decreasing 
    # before training process is terminated without completing the total number of epochs             
    early_stopping_iter = 10            
    early_stopping_counter = 0     
    model_exec_stats = {
        "all_train_loss": [],
        "all_train_metric": [],
        "all_val_loss": [],
        "all_val_metric": []
        }       
    for epoch in range(NUM_EPOCHS):
        train_start_time = time.time()
        train_loss, train_metric = mee.train_epoch(train_dl, init_hidden)
        model_exec_stats["all_train_loss"].append(train_loss)
        model_exec_stats["all_train_metric"].append(train_metric)        
        val_loss, val_metric = mee.evaluate_epoch(val_dl, init_hidden)
        model_exec_stats["all_val_loss"].append(val_loss)
        model_exec_stats["all_val_metric"].append(val_metric)
        val_end_time = time.time()
        epoch_run_time = get_exec_time(val_end_time - train_start_time)
        print_epoch_stats(epoch, epoch_run_time, train_loss, train_metric, val_loss, val_metric)  
        if val_loss < val_loss_min:                        
            if save_model:
                print(f"Validation loss decreased from " +
                f"{round(val_loss_min, 6)} --> {round(val_loss, 6)}. Saving model...")                
                torch.save(model, "best_model.pt")                
            val_loss_min = val_loss
        else:
            early_stopping_counter += 1
        if early_stopping_counter > early_stopping_iter:
            break
    return val_loss_min, model_exec_stats

In [154]:
# from torch.utils.data import Subset
# from sklearn.model_selection import train_test_split

# tweet_ds = VectorizedTweetDataSet(df_train["vectorized_tweet"].values, df_train["target"].values)
# # split the tweet_ds into train and validation datasets with 80:20 ratio
# train_idx, val_idx = train_test_split(list(range(len(tweet_ds))), test_size=0.2, random_state=42)
# train_ds = Subset(tweet_ds, train_idx)
# val_ds = Subset(tweet_ds, val_idx)
# train_dl = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, collate_fn=pad_collate)
# val_dl = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=True, collate_fn=pad_collate)

In [155]:
import matplotlib.pyplot as plt

def plot_train_val_metrics(ax, train_metric, val_metric, metric_name):
    ax.plot(train_metric, label = f"training {metric_name}")
    ax.plot(val_metric, label = f"validation {metric_name}")
    ax.set_xlabel("epochs")
    ax.set_ylabel(metric_name)
    ax.set_title(f"{metric_name} vs epochs")
    ax.legend()
    ax.grid()

In [156]:
# train_loss, train_acc, val_loss, val_acc = training_loop(
#                                             train_dl, val_dl, model, loss_fn, optimizer, 
#                                             num_epochs=35, batch_size=batch_size
#                                             )

In [162]:
val_loss_min_folds = []
metric_min_val_loss_folds = []
for fold in range(NUM_FOLDS):
    val_loss_min, model_exec_stats = run_training(fold, df_train, params=model_params, save_model=True)
    min_loss_index = np.argmin(model_exec_stats["all_val_loss"])
    # metric corresponding to the minimum validation loss epoch 
    metric_min_val_loss = model_exec_stats["all_val_metric"][min_loss_index]
    val_loss_min_folds.append(val_loss_min)
    metric_min_val_loss_folds.append(metric_min_val_loss)

Running trainig for fold 0
Epoch 0 execution time = 42.55 seconds:
Training loss = 0.6223, training accuracy = 0.6469
Validation loss = 0.5302, validation accuracy = 0.7415
Validation loss decreased from inf --> 0.530228. Saving model...
Epoch 1 execution time = 42.28 seconds:
Training loss = 0.4676, training accuracy = 0.7944
Validation loss = 0.4515, validation accuracy = 0.7947
Validation loss decreased from 0.530228 --> 0.451535. Saving model...
Epoch 2 execution time = 42.91 seconds:
Training loss = 0.4324, training accuracy = 0.8088
Validation loss = 0.4381, validation accuracy = 0.8089
Validation loss decreased from 0.451535 --> 0.438099. Saving model...
Epoch 3 execution time = 44.21 seconds:
Training loss = 0.4196, training accuracy = 0.8165
Validation loss = 0.4245, validation accuracy = 0.8175
Validation loss decreased from 0.438099 --> 0.424528. Saving model...
Epoch 4 execution time = 35.87 seconds:
Training loss = 0.4044, training accuracy = 0.8243
Validation loss = 0.420

In [None]:
# fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
# plot_train_val_metrics(ax1, model_exec_stats["all_train_loss"], model_exec_stats["all_val_loss"], "Loss")
# plot_train_val_metrics(ax2, model_exec_stats["all_train_metric"], model_exec_stats["all_val_metric"], "Accuracy")

In [163]:
print("Minimum validation loss across cross validation folds:")
print(val_loss_min_folds)
print(f"{MODEL_EVAL_METRIC} across cv folds:")
print(metric_min_val_loss_folds)

Minimum validation loss across cross validation folds:
[0.42040781541304156, 0.4233617051081224, 0.42806999520822003, 0.422501956874674, 0.4417589388110421]
accuracy across cv folds:
[0.8117897727272727, 0.8181818181818182, 0.8167613636363636, 0.8167613636363636, 0.8089488636363636]


In [164]:
best_model = torch.load("best_model.pt")
print(best_model)
tweet_vocab.set_default_index(0)
df_test["vectorized_tweet"] = df_test["processed_text"].apply(
    lambda row:torch.LongTensor(tweet_vocab.lookup_indices(row.split()))
    )

# Do prediction with best performing model on the test set
def predict(df_test):
    test_output = []
    for index, row in df_test.iterrows():    
        vec_tweet = row["vectorized_tweet"]
        if len(vec_tweet) == 0:
            test_output.append(0)
            continue
        vec_tweet_len = torch.IntTensor([len(vec_tweet)])
        vec_tweet = vec_tweet.view(1, -1)    
        #print(vec_tweet, vec_tweet_len)
        output, (h_n,c_n) = best_model(vec_tweet, vec_tweet_len, state=None)
        #print(output)
        test_output.append(round(output.item()))    
    return test_output        

test_output = predict(df_test)
print(len(test_output))

df_submission = pd.read_csv('./data/submission.csv')
df_submission['target']= test_output
df_submission.to_csv('my_submission.csv',index=False)

DisasterModel(
  (emb_layer): Embedding(17120, 200)
  (lstm_layer): LSTM(200, 141, num_layers=2, batch_first=True, dropout=0.4258, bidirectional=True)
  (dropout): Dropout(p=0.4258, inplace=False)
  (linear): Linear(in_features=564, out_features=1, bias=True)
  (act): Sigmoid()
)
3263


In [121]:
# Wrapper method to run training for hyperparameter optimization as in this case the function 
# to be optimized needs to return one float value
def hyperparam_tune_run(train_dl, val_dl, params):
    min_val_loss, _ = run_training(train_dl, val_dl, params)
    return min_val_loss

In [122]:
import optuna
import time

#[I 2021-10-21 12:06:44,242] Trial 15 finished with value: 0.4088201341421708 and parameters: 
# {'hidden_dim': 141, 'drop_out': 0.4257934114073623, 'learning_rate': 0.0003660548388149779, 
# 'num_layers': 2}. Best is trial 15 with value: 0.4088201341421708.

def objective(trial):
    params = {
        "hidden_dim": trial.suggest_int("hidden_dim", 32, 512),
        "drop_out": trial.suggest_uniform("drop_out", 0.2, 0.7),
        "learning_rate": trial.suggest_loguniform("learning_rate", 1e-6, 1e-3),
        #"is_bidirectional": trial.suggest_int("is_bidirectional", 0, 1),
        "num_layers": trial.suggest_int("num_layers", 1, 2)
    }
    loss = hyperparam_tune_run(train_dl, val_dl, params)
    trial_num = trial.number
    print(f"loss at end of trial {trial_num} execution = {loss}")
    print(f"trial {trial_num} params = {trial.params}")
    return loss

study = optuna.create_study(direction="minimize", study_name="DisasterModelTuning")    
study.optimize(objective, n_trials=20)
print("Best trial:")
print(study.best_params)

#Best trial:
#{'hidden_dim': 141, 'drop_out': 0.4257934114073623, 'learning_rate': 0.0003660548388149779, 'num_layers': 2}

[32m[I 2021-10-21 17:42:51,187][0m A new study created in memory with name: DisasterModelTuning[0m


Epoch 0 :
Execution time on training set = 11.18 seconds 
Training loss = 0.6908, training accuracy = 0.5331
Execution time on validation set = 0.62 seconds 
Validation loss = 0.6884, validation accuracy = 0.6122
Epoch 1 :
Execution time on training set = 10.21 seconds 
Training loss = 0.6875, training accuracy = 0.5687
Execution time on validation set = 0.58 seconds 
Validation loss = 0.6852, validation accuracy = 0.6243
Epoch 2 :
Execution time on training set = 10.18 seconds 
Training loss = 0.6833, training accuracy = 0.5961
Execution time on validation set = 0.61 seconds 
Validation loss = 0.682, validation accuracy = 0.6186
Epoch 3 :
Execution time on training set = 10.4 seconds 
Training loss = 0.6816, training accuracy = 0.6012
Execution time on validation set = 0.56 seconds 
Validation loss = 0.6798, validation accuracy = 0.6158
Epoch 4 :
Execution time on training set = 10.02 seconds 
Training loss = 0.6784, training accuracy = 0.6109
Execution time on validation set = 0.58 s

[32m[I 2021-10-21 17:46:33,787][0m Trial 0 finished with value: 0.6011451211842623 and parameters: {'hidden_dim': 216, 'drop_out': 0.5405437552245265, 'learning_rate': 6.970240235440715e-06, 'num_layers': 1}. Best is trial 0 with value: 0.6011451211842623.[0m


Epoch 19 :
Execution time on training set = 12.39 seconds 
Training loss = 0.6085, training accuracy = 0.7151
Execution time on validation set = 0.63 seconds 
Validation loss = 0.6011, validation accuracy = 0.7379
loss at end of trial 0 execution = 0.6011451211842623
trial 0 params = {'hidden_dim': 216, 'drop_out': 0.5405437552245265, 'learning_rate': 6.970240235440715e-06, 'num_layers': 1}




Epoch 0 :
Execution time on training set = 26.76 seconds 
Training loss = 0.6721, training accuracy = 0.5926
Execution time on validation set = 1.22 seconds 
Validation loss = 0.6448, validation accuracy = 0.6236
Epoch 1 :
Execution time on training set = 24.75 seconds 
Training loss = 0.592, training accuracy = 0.7051
Execution time on validation set = 1.19 seconds 
Validation loss = 0.5336, validation accuracy = 0.7543
Epoch 2 :
Execution time on training set = 25.31 seconds 
Training loss = 0.4956, training accuracy = 0.7763
Execution time on validation set = 1.3 seconds 
Validation loss = 0.467, validation accuracy = 0.8004
Epoch 3 :
Execution time on training set = 25.5 seconds 
Training loss = 0.4642, training accuracy = 0.7949
Execution time on validation set = 1.09 seconds 
Validation loss = 0.4633, validation accuracy = 0.7976
Epoch 4 :
Execution time on training set = 25.5 seconds 
Training loss = 0.4463, training accuracy = 0.8032
Execution time on validation set = 1.09 seco

KeyboardInterrupt: 