### Text classification using attention (Seq to One)
We use a bidirectional LSTM as encoder and an attention layer

In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torchtext
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from sklearn import model_selection
from sklearn import metrics
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer
import re
from torchtext.vocab import build_vocab_from_iterator
from torch.nn.utils.rnn import pad_sequence
import pytorch_lightning as pl
from torch.nn.functional import binary_cross_entropy_with_logits, binary_cross_entropy
from torchmetrics import Accuracy, F1
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
import string
import statistics
import nltk

In [2]:
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/bk_anupam/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/bk_anupam/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### Configuration for training

In [3]:
class MODEL_EVAL_METRIC:
    accuracy = "accuracy"
    f1_score = "f1_score"

class Config:
    VOCAB_SIZE = 0
    BATCH_SIZE = 512
    EMB_SIZE = 200
    OUT_SIZE = 2
    NUM_FOLDS = 5
    NUM_EPOCHS = 20
    NUM_WORKERS = 8
    # Whether to update the pretrained embedding weights during training process
    EMB_WT_UPDATE = True
    DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    MODEL_EVAL_METRIC = MODEL_EVAL_METRIC.accuracy
    FAST_DEV_RUN = False    
    PATIENCE = 6    
    IS_BIDIRECTIONAL = True
    # model hyperparameters
    MODEL_HPARAMS = {
        "hidden_size": 141, 
        "num_layers": 2,         
        "drop_out": 0.4258,
        "lr": 0.000366,
        "weight_decay": 0.00001
    }

# For results reproducibility 
# sets seeds for numpy, torch, python.random and PYTHONHASHSEED.
pl.seed_everything(42, workers=True)

Global seed set to 42


42

### Load the data

In [4]:
df_train = pd.read_csv('./data/train.csv')
df_test = pd.read_csv('./data/test.csv')
print(f"Rows in train.csv = {len(df_train)}")
print(f"Rows in test.csv = {len(df_test)}")
pd.set_option('display.max_colwidth', None)
df_train.head()

Rows in train.csv = 7613
Rows in test.csv = 3263


Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected,1
3,6,,,"13,000 people receive #wildfires evacuation orders in California",1
4,7,,,Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school,1


### Some EDA

In [5]:
df_train_pos = df_train[df_train.target == 1]
df_train_neg = df_train[df_train.target == 0]
print(f"No. of positive training examples = {len(df_train_pos)}")
print(f"No. of negative training examples = {len(df_train_neg)}")
train_keywords_unique = df_train.keyword.unique()
print(f"No. of unique keywords = {len(train_keywords_unique)}")
df_train_notnull_keywords = df_train[~df_train.keyword.isnull()]
print(f"No of train examples with keyword not null = {len(df_train_notnull_keywords)}")

No. of positive training examples = 3271
No. of negative training examples = 4342
No. of unique keywords = 222
No of train examples with keyword not null = 7552


### K Fold CV

In [6]:
# split the training dataframe into kfolds for cross validation. We do this before any processing is done
# on the data. We use stratified kfold if the target distribution is unbalanced
def strat_kfold_dataframe(df, target_col_name, num_folds=5):
    # we create a new column called kfold and fill it with -1
    df["kfold"] = -1
    # randomize of shuffle the rows of dataframe before splitting is done
    df = df.sample(frac=1, random_state=42).reset_index(drop=True)
    # get the target data
    y = df["target"].values
    skf = model_selection.StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=42)
    for fold, (train_index, val_index) in enumerate(skf.split(X=df, y=y)):
        df.loc[val_index, "kfold"] = fold
    return df        

df_train = strat_kfold_dataframe(df_train, target_col_name="target", num_folds=5)    

### Tweet preprocessing

In [7]:
punct = "/-'?!.,#$%\'()*+-/:;<=>@[\\]^_`{|}~`" + '""“”’' + '∞θ÷α•à−β∅³π‘₹´°£€\×™√²—–&'
def clean_special_chars(text, punct):
    for p in punct:
        text = text.replace(p, ' ')
    return text

def process_tweet(df, text, keyword):
    lemmatizer = WordNetLemmatizer()    
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)    
    processed_text = []
    stop = stopwords.words("english")
    for tweet, keyword in zip(df[text], df[keyword]):
        tweets_clean = []        
        # remove stock market tickers like $GE
        #tweet = tweet + " " + keyword
        tweet = re.sub(r'\$\w*', '', tweet)
        # remove old style retweet text "RT"
        tweet = re.sub(r'^RT[\s]+', '', tweet)
        # remove hyperlinks
        tweet = re.sub(r'http\S+', '', tweet)
        # remove hashtags
        # only removing the hash #, @, ... sign from the word
        tweet = re.sub(r'\.{3}|@|#', '', tweet)    
        tweet = clean_special_chars(tweet, punct)
        # remove junk characters which don't have an ascii code
        tweet = tweet.encode("ascii", "ignore").decode("utf-8", "ignore")
        # tokenize tweets        
        tweet_tokens = tokenizer.tokenize(tweet)
        for word in tweet_tokens:
            # remove stopwords and punctuation
            #if (word.isalpha() and len(word) > 2 and word not in stop 
            #    and word not in string.punctuation):
                #stem_word = stemmer.stem(word)  # stemming word            
                #lem_word = lemmatizer.lemmatize(word)
                #tweets_clean.append(lem_word) 
                tweets_clean.append(word)
        processed_text.append(" ".join(tweets_clean))        
    df['processed_text'] = np.array(processed_text)

In [8]:
# Fill in missing values
df_train["keyword"] = df_train["keyword"].fillna("no_keyword")
df_test["keyword"] = df_test["keyword"].fillna("no_keyword")
process_tweet(df_train, 'text', "keyword")
process_tweet(df_test, 'text', "keyword")
# length of the processed tweet
df_train["prcsd_tweet_len"] = df_train["processed_text"].apply(lambda row: len(row.split()))
df_test["prcsd_tweet_len"] = df_test["processed_text"].apply(lambda row: len(row.split()))
df_train.iloc[50:52, :]

Unnamed: 0,id,keyword,location,text,target,kfold,processed_text,prcsd_tweet_len
50,5174,fatalities,Official Website,#HSE releases annual workplace facilities data. Have a look | http://t.co/h4UshEekxm http://t.co/jNHNX3oISN,0,4,hse releases annual workplace facilities data have a look,9
51,3126,debris,,#??? #?? #??? #??? MH370: Aircraft debris found on La Reunion is from missing Malaysia Airlines ... http://t.co/zxCORQ0A3a,1,0,mh370 aircraft debris found on la reunion is from missing malaysia airlines,12


### Model building starts from here

In [9]:
# Load the GloVe word embedding for tweets
glove_emb = torchtext.vocab.GloVe(name="twitter.27B", dim=200)

In [10]:
# build tweets vocab from training data
def yield_tokens(df):
    for index, row in df.iterrows():
        yield row["processed_text"].split()
    
tweet_vocab = build_vocab_from_iterator(yield_tokens(df_train), specials=["<unk>", "<pad>"])   
Config.VOCAB_SIZE = len(tweet_vocab)

In [11]:
# For the problem specific vocab, get the embedding vectors from the pre-trained embedding
# for each word in vocab and return a matrix of shape vocab_size, embedding_dim. This matrix
# will be the pretrained embedding weight matrix which we will use to create the embedding layer
def get_vocab_pt_emb_matrix(text_vocab, emb):
    embedding_matrix = []
    for token in text_vocab.get_itos():
        embedding_matrix.append(emb[token])
    return torch.stack(embedding_matrix)

pt_emb_weights = get_vocab_pt_emb_matrix(tweet_vocab, glove_emb)
pt_emb_layer = nn.Embedding.from_pretrained(pt_emb_weights)

In [12]:
# vectorize the processed tweet, i.e. replace each token in the tweet with its corresponding index
# in the tweet vocab
df_train["vectorized_tweet"] = df_train["processed_text"].apply(
    lambda row:torch.LongTensor(tweet_vocab.lookup_indices(row.split()))
    )

### Tweet dataset

In [13]:
class VectorizedTweetDataSet(Dataset):
    def __init__(self, tweet_vecs, labels):
        self.tweet_vecs = tweet_vecs
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        tweet_vec = self.tweet_vecs[idx]
        label = self.labels[idx]
        tweet_len = len(tweet_vec)
        return (tweet_vec, label)


### Get train and validation data for a fold

In [14]:
def get_fold_dls(fold, df):
    train_df = df[df.kfold != fold].reset_index(drop=True)
    valid_df = df[df.kfold == fold].reset_index(drop=True)
    X_train = train_df["vectorized_tweet"].to_numpy()
    y_train = train_df["target"].to_numpy()
    X_valid = valid_df["vectorized_tweet"].to_numpy()
    y_valid = valid_df["target"].to_numpy()
    ds_train = VectorizedTweetDataSet(X_train, y_train)
    ds_valid = VectorizedTweetDataSet(X_valid, y_valid)
    dl_train = DataLoader(ds_train, batch_size=Config.BATCH_SIZE, shuffle=True, collate_fn=pad_collate, num_workers=Config.NUM_WORKERS)
    dl_valid = DataLoader(ds_valid, batch_size=Config.BATCH_SIZE, collate_fn=pad_collate, num_workers=Config.NUM_WORKERS)
    return dl_train, dl_valid

### Pad the input sequence

In [15]:
# If the goal is to train with mini-batches, one needs to pad the sequences in each batch. 
# In other words, given a mini-batch of size N, if the length of the largest sequence is L, 
# one needs to pad every sequence with a length of smaller than L with zeros and make their 
# lengths equal to L. Moreover, it is important that the sequences in the batch are in the 
# descending order.
def pad_collate(batch):
    # Each element in the batch is a tuple (data, label)
    # sort the batch (based on tweet word count) in descending order
    sorted_batch = sorted(batch, key=lambda x:x[0].shape[0], reverse=True)
    sequences = [x[0] for x in sorted_batch]
    sequences_padded = pad_sequence(sequences, batch_first=True, padding_value=0)
    # Also need to store the length of each sequence.This is later needed in order to unpad 
    # the sequences
    seq_len = torch.Tensor([len(x) for x in sequences])
    labels = torch.Tensor([x[1] for x in sorted_batch])
    return sequences_padded, seq_len, labels


## Model architecture

<img src="temp.jpg" style="width:800px;height:500px;">

### Build the model 
Encoder (biLSTM) => Attention layer => Fully connected layer => Sigmoid 

**Bidirectional RNN as encoder** <br>
outputs is of size [src len, batch size, hid dim * num directions] where the first hid_dim elements in the third axis are the hidden states from the top layer forward RNN, and the last hid_dim elements are hidden states from the top layer backward RNN. We can think of the third axis as being the forward and backward hidden states concatenated together other. 

hidden is of size [n layers * num directions, batch size, hid dim], where [-2, :, :] gives the top layer forward RNN hidden state after the final time-step (i.e. after it has seen the last word in the sentence) and [-1, :, :] gives the top layer backward RNN hidden state after the final time-step (i.e. after it has seen the first word in the sentence).

The bidirectional rnn encoder returns the hidden state from each time step as well as the final hidden state (last time step). 

**Attention layer** <br>
Takes as input encoder outputs ( hidden state from each time step of last rnn layer) as well encoder final hidden state (fom the last time step)


In [16]:
import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

class Encoder(nn.Module):    
    def __init__(self, params, hparams):
        super().__init__()                
        self.num_layers = hparams["num_layers"]
        self.hidden_size = hparams["hidden_size"]
        self.is_bidirect = params["is_bidirect"]
        self.num_directions = 2 if self.is_bidirect else 1
        # Embedding layer
        self.emb_layer = nn.Embedding(params["vocab_size"], params["emb_size"])
        # copy the vocab specific weights(emb vectors) from pretrained embeddings to model embedding layer
        self.emb_layer.weight.data.copy_(params["pt_emb_weights"])
        # whether to update the pretrained embedding layer weights during model training
        self.emb_layer.weight.requires_grad = params["emb_wt_update"] 
        # LSTM Layer        
        self.lstm_layer = nn.LSTM(
                        input_size=params["emb_size"], 
                        hidden_size=self.hidden_size, 
                        batch_first=True, 
                        bidirectional=self.is_bidirect, 
                        num_layers=self.num_layers, 
                        dropout=hparams["drop_out"]
                        )
        
    def forward(self, inputs, input_lengths, state):        
        # inputs = [batch_size, batch_max_seq_length]        
        # embeds is of shape batch_size * num_steps * emb_dim and is the input to lstm layer
        embeds = self.emb_layer(inputs)        
        # final hidden state (from last time step)
        h_final = None        
        # embeds = [batch_size, max_seq_length, emb_dim]        
        embeds_pack = pack_padded_sequence(embeds, input_lengths.to("cpu"), batch_first=True)                
        lstm_out_pack, (h_n, c_n) = self.lstm_layer(embeds_pack)
        # h_n and c_n = [num_directions * num_layers, batch_size, hidden_size]
        # unpack the output
        lstm_out, lstm_out_len = pad_packed_sequence(lstm_out_pack, batch_first=True)        
        # print(f"lstm_out.shape = {lstm_out.shape}") # [batch_size, max_seq_length, hidden_size * num_directions]        
        if self.is_bidirect:                        
            h_tend_fwd = h_n[-2, :, :]
            h_tend_bwd = h_n[-1, :, :]
            h_final = torch.cat((h_tend_fwd, h_tend_bwd), dim=1)            
        else:                        
            h_final = h_n[-1, :, :]   

        # print(f"h_final.shape = {h_final.shape}") # [batch_size, hidden_size * num_directions]
        return lstm_out, lstm_out_len, h_final

    def init_state(self, batch_size=1):
        """ Initialize the hidden state i.e. initialize all the neurons in all the hidden layers 
        to zero"""
        if not isinstance(self.lstm_layer, nn.LSTM):
            # `nn.GRU` takes a tensor as hidden state
            return torch.zeros((self.num_directions * self.num_layers, batch_size, self.hidden_size))
        else:
            # `nn.LSTM` takes a tuple of hidden states (h0, c0). h0 = initial
            # hidden state for each element in the batch, c0 = initial cell state
            # for each element in the batch
            return (torch.zeros((self.num_directions * self.num_layers, batch_size, self.hidden_size)),
                    torch.zeros((self.num_directions * self.num_layers, batch_size, self.hidden_size)))

In [17]:
class AttentionLayer(nn.Module):
    def __init__(self, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size        
        self.attn = nn.Linear((hidden_size * 2) + (hidden_size * 2), hidden_size)
        self.v = nn.Linear(hidden_size, 1, bias=False)

    def forward(self, encoder_outputs, enc_final_hidden_state):        
        # print(f"encoder_outputs.shape = {encoder_outputs.shape}") #[batch_size, seq_length, enc_hidden_size * num_directions]
        # print(f"enc_final_hidden_state.shape = {enc_final_hidden_state.shape}") # [batch_size, enc_hidden_size * num_directions]
        batch_size, seq_length, _ = encoder_outputs.shape
        # add seq_length dim to enc_final_hiddden_state
        enc_final_hidden_state = enc_final_hidden_state.unsqueeze(1)
        # enc_final_hidden_state.shape = [batch_size, 1, enc_hidden_size * 2]
        # now repeat the final hidden state seq_length times across dim 1 so final hidden state and encoder outputs have same dimensions
        enc_final_hidden_state = enc_final_hidden_state.repeat(1, seq_length, 1)
        # print(f"enc_final_hidden_state.shape = {enc_final_hidden_state.shape}") # [batch_size, seq_length, enc_hidden_size * 2]
        # concat the enc final hidden state and the encoder outputs (which are nothing but enc hidden states at individual time steps)
        energy = torch.tanh(self.attn(torch.cat((encoder_outputs, enc_final_hidden_state), dim=2)))
        # print(f"energy.shape = {energy.shape}") # [batch_size, seq_length, enc_hidden_size]
        # get attention vector corresponding to each source time step
        attention = self.v(energy).squeeze(2)
        # print(f"(attention.shape = {attention.shape}") # [batch_size, seq_length]
        attn_weights = F.softmax(attention, dim=1)
        # print(f"(attn_weights.shape = {attn_weights.shape}") # [batch_size, seq_length]
        attn_weights = attn_weights.unsqueeze(1)
        # print(f"(attn_weights.shape = {attn_weights.shape}") # [batch_size, 1, seq_length]
        # apply attention weights to encoder outputs to get context vector
        context_vector = torch.bmm(attn_weights, encoder_outputs)
        # print(f"(context_vector.shape = {context_vector.shape}") # [batch_size, 1, enc_hidden_size * 2]
        return context_vector.squeeze(1), attn_weights

In [18]:
class RnnAttnClassifier(nn.Module):
    def __init__(self, params, hparams):
        super().__init__()
        self.encoder = Encoder(params, hparams)
        self.attention_layer = AttentionLayer(hparams["hidden_size"])
        self.fc = nn.Linear(hparams["hidden_size"] * 2, 2)
        self.act = nn.Sigmoid()

    def forward(self, inputs, input_lengths, state):
        enc_out, enc_out_len, enc_h_final = self.encoder(inputs, input_lengths, state)
        ctx_vec, attn_weights = self.attention_layer(enc_out, enc_h_final)
        out = self.fc(ctx_vec)
        return self.act(out)

    def init_state(self):
        return self.encoder.init_state()

### Pytorch lightning wrapper for model

In [19]:
class DisasterTweetLitModel(pl.LightningModule):
    def __init__(self, params, hparams, model_eval_metric=MODEL_EVAL_METRIC.accuracy):
        super().__init__()
        #self.save_hyperparameters()
        self.lr = hparams["lr"]
        self.weight_decay = hparams["weight_decay"]
        self.model_eval_metric = model_eval_metric
        self.network = RnnAttnClassifier(params, hparams)            

    def forward(self, tweets, tweet_lengths, state):
        return self.network(tweets, tweet_lengths, state)

    def configure_optimizers(self):
        model_optimizer = torch.optim.Adam(self.parameters(), lr=self.lr, weight_decay=self.weight_decay)
        lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(model_optimizer, mode="min")
        return {
            "optimizer": model_optimizer,
            "lr_scheduler": {
                "scheduler": lr_scheduler,
                "monitor": "val_loss",
                "frequency": 1
            }
        }

    def training_step(self, batch, batch_idx):
        tweets, tweet_lengths, targets = batch
        # initialize the hidden and cell state of the LSTM
        h0, c0 = self.network.init_state()
        targets_pred = self(tweets, tweet_lengths, (h0, c0))        
        #print(f"targets_pred.shape = {targets_pred.shape}")
        loss_targets = F.one_hot(targets.T.long(), num_classes=2)
        loss_targets = loss_targets.float()        
        train_loss = binary_cross_entropy(targets_pred, loss_targets)
        train_metric = None
        train_metric_str = ""
        if self.model_eval_metric == MODEL_EVAL_METRIC.accuracy:            
            targets_pred = torch.argmax(targets_pred, dim=1)            
            train_metric = Accuracy(num_classes=2)(targets_pred.cpu(), targets.long().cpu())
            train_metric_str = "train_acc"
        elif self.model_eval_metric == MODEL_EVAL_METRIC.f1_score:
            train_metric = F1(targets_pred, targets)            
            train_metric_str = "train_f1"
        self.log("train_loss", train_loss, prog_bar=True, logger=True, on_epoch=True, on_step=True)
        self.log(train_metric_str, train_metric, prog_bar=True, logger=True, on_epoch=True, on_step=True)
        return train_loss

    def validation_step(self, batch, batch_idx):
        tweets, tweet_lengths, targets = batch
        # initialize the hidden and cell state of the LSTM
        h0, c0 = self.network.init_state()
        targets_pred = self(tweets, tweet_lengths, (h0, c0))
        loss_targets = F.one_hot(targets.T.long(), num_classes=2)
        loss_targets = loss_targets.float()        
        val_loss = binary_cross_entropy(targets_pred, loss_targets)
        val_metric = None
        val_metric_str = ""
        if self.model_eval_metric == MODEL_EVAL_METRIC.accuracy:
            targets_pred = torch.argmax(targets_pred, dim=1)
            val_metric = Accuracy(num_classes=2)(targets_pred.cpu(), targets.long().cpu())
            val_metric_str = "val_acc"
        elif self.model_eval_metric == MODEL_EVAL_METRIC.f1_score:
            val_metric = F1(targets_pred, targets)            
            val_metric_str = "val_f1"
        self.log("val_loss", val_loss, prog_bar=True, logger=True, on_epoch=True, on_step=True)
        self.log(val_metric_str, val_metric, prog_bar=True, logger=True, on_epoch=True, on_step=True)
        return val_loss

### Custom lightning callback 
To record training and validation metric values at each epoch and the best metric values across all epochs

In [20]:
from pytorch_lightning.callbacks import Callback
from pytorch_lightning import LightningModule, Trainer
# Monitor multiple metric values that are calculated either in training or validation step and return the
# best metric values for each epoch
class MetricsAggCallback(Callback):
    def __init__(self, train_metrics_to_monitor, val_metrics_to_monitor):
        # dictionary with metric name as key and monitor mode (min, max) as the value
        # ( the same names used to log metric values in training and validation step)
        self.val_metrics_to_monitor = val_metrics_to_monitor
        self.train_metrics_to_monitor = train_metrics_to_monitor
        # dictionary with metric_name as key and list of metric value for each epoch
        self.train_metrics = {metric: [] for metric in train_metrics_to_monitor.keys()}
        self.val_metrics = {metric: [] for metric in val_metrics_to_monitor.keys()}
        # dictionary with metric_name as key and the best metric value for all epochs
        self.train_best_metric = {metric: None for metric in train_metrics_to_monitor.keys()}
        self.val_best_metric = {metric: None for metric in val_metrics_to_monitor.keys()}
        # dictionary with metric_name as key and the epoch number with the best metric value
        self.train_best_metric_epoch = {metric: None for metric in train_metrics_to_monitor.keys()}     
        self.val_best_metric_epoch = {metric: None for metric in val_metrics_to_monitor.keys()}     
        self.epoch_counter = 0           

    @staticmethod
    def process_metrics(metrics_to_monitor, metrics, best_metric, best_metric_epoch, trainer):
        metric_str = ""
        for metric, mode in metrics_to_monitor.items():
            metric_value = round(trainer.callback_metrics[metric].cpu().detach().item(), 4)            
            metric_str += f"{metric} = {metric_value}, "
            metrics[metric].append(metric_value)
            if mode == "max":
                best_metric[metric] = max(metrics[metric])            
            elif mode == "min":            
                best_metric[metric] = min(metrics[metric])            
            best_metric_epoch[metric] = metrics[metric].index(best_metric[metric]) 
        print(metric_str[:-2])

    def on_train_epoch_end(self, trainer: Trainer, pl_module: LightningModule):
        self.epoch_counter += 1        
        self.process_metrics(self.train_metrics_to_monitor, self.train_metrics, self.train_best_metric, self.train_best_metric_epoch, trainer)

    def on_validation_epoch_end(self, trainer: Trainer, pl_module: LightningModule):        
        print(f"For epoch {self.epoch_counter}")
        self.process_metrics(self.val_metrics_to_monitor, self.val_metrics, self.val_best_metric, self.val_best_metric_epoch, trainer)


In [None]:
model_params = {
        "vocab_size": Config.VOCAB_SIZE,
        "emb_size": Config.EMB_SIZE,
        "output_size": Config.OUT_SIZE,
        "pt_emb_weights": pt_emb_weights,
        "emb_wt_update": Config.EMB_WT_UPDATE,
        "is_bidirect": True
    }

In [21]:
def run_training(fold, dl_train, dl_val, find_lr=True):
    fold_str = f"fold{fold}"
    print(f"Running training for {fold_str}")    
    disaster_tweet_model = DisasterTweetLitModel(
        params=model_params,        
        hparams=Config.MODEL_HPARAMS,
        model_eval_metric=Config.MODEL_EVAL_METRIC                
        )
    tb_logger = pl.loggers.TensorBoardLogger(save_dir="logs")    
    chkpt_file_name = fold_str + "_best_model_{epoch}_{val_loss:.4f}"
    train_metrics_to_monitor = {
        "train_loss": "min",
        "train_acc": "max"
    }
    val_metrics_to_monitor = {
        "val_loss": "min",
        "val_acc": "max",
        }
    loss_chkpt_callback = ModelCheckpoint(dirpath="./model", verbose=True, monitor="val_loss", mode="min", filename=chkpt_file_name)    
    metric_chkpt_callback = MetricsAggCallback(train_metrics_to_monitor, val_metrics_to_monitor)
    early_stopping_callback = EarlyStopping(monitor="val_loss", patience=Config.PATIENCE, mode="min", verbose=True)
    trainer = pl.Trainer(
        gpus = 1,
        deterministic = True,
        auto_select_gpus = True,
        progress_bar_refresh_rate = 20,
        max_epochs = Config.NUM_EPOCHS,
        logger = tb_logger,
        auto_lr_find = True,    
        #precision = Config.PRECISION,   
        fast_dev_run = Config.FAST_DEV_RUN, 
        gradient_clip_val = 1.0,        
        callbacks = [loss_chkpt_callback, metric_chkpt_callback, early_stopping_callback]
    )        
    if find_lr:
        trainer.tune(model=disaster_tweet_model, train_dataloaders=dl_train)
        print(disaster_tweet_model.lr)
    trainer.fit(disaster_tweet_model, train_dataloaders=dl_train, val_dataloaders=dl_val)
    fold_train_metrics = {
        metric: (metric_chkpt_callback.train_best_metric[metric], metric_chkpt_callback.train_best_metric_epoch[metric]) 
        for metric in train_metrics_to_monitor.keys()
    }
    fold_val_metrics = {
        metric: (metric_chkpt_callback.val_best_metric[metric], metric_chkpt_callback.val_best_metric_epoch[metric]) 
        for metric in val_metrics_to_monitor.keys()
    }            
    del trainer, disaster_tweet_model, loss_chkpt_callback, metric_chkpt_callback 
    return fold_train_metrics, fold_val_metrics

In [22]:
find_lr = True
all_fold_val_loss = []
all_fold_val_acc = []

for fold in range(Config.NUM_FOLDS):
    dl_train, dl_val = get_fold_dls(fold, df_train)
    fold_train_metrics, fold_val_metrics = run_training(fold, dl_train, dl_val, find_lr=False)    
    all_fold_val_loss.append(fold_val_metrics["val_loss"][0])
    all_fold_val_acc.append(fold_val_metrics["val_acc"][0])
    print(f"Best train metrics values for fold{fold}")    
    print(fold_train_metrics)
    print(f"Best val metrics values for fold{fold}")    
    print(fold_val_metrics)            

Running training for fold0


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type              | Params
----------------------------------------------
0 | network | RnnAttnClassifier | 4.4 M 
----------------------------------------------
4.4 M     Trainable params
0         Non-trainable params
4.4 M     Total params
17.483    Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Global seed set to 42


For epoch 0
val_loss = 0.6955, val_acc = 0.4297


  rank_zero_warn(


Training: -1it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Metric val_loss improved. New best score: 0.666
Epoch 0, global step 11: val_loss reached 0.66647 (best 0.66647), saving model to "/home/bk_anupam/code/ML/NLP/Kaggle/DisasterTweetsPrediction/model/fold0_best_model_epoch=0_val_loss=0.6665.ckpt" as top 1


For epoch 0
val_loss = 0.6665, val_acc = 0.5653
train_loss = 0.6841, train_acc = 0.534


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.050 >= min_delta = 0.0. New best score: 0.616
Epoch 1, global step 23: val_loss reached 0.61607 (best 0.61607), saving model to "/home/bk_anupam/code/ML/NLP/Kaggle/DisasterTweetsPrediction/model/fold0_best_model_epoch=1_val_loss=0.6161.ckpt" as top 1


For epoch 1
val_loss = 0.6161, val_acc = 0.7085
train_loss = 0.6429, train_acc = 0.6108


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.036 >= min_delta = 0.0. New best score: 0.580
Epoch 2, global step 35: val_loss reached 0.58018 (best 0.58018), saving model to "/home/bk_anupam/code/ML/NLP/Kaggle/DisasterTweetsPrediction/model/fold0_best_model_epoch=2_val_loss=0.5802.ckpt" as top 1


For epoch 2
val_loss = 0.5802, val_acc = 0.7485
train_loss = 0.5978, train_acc = 0.7381


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.085 >= min_delta = 0.0. New best score: 0.496
Epoch 3, global step 47: val_loss reached 0.49556 (best 0.49556), saving model to "/home/bk_anupam/code/ML/NLP/Kaggle/DisasterTweetsPrediction/model/fold0_best_model_epoch=3_val_loss=0.4956.ckpt" as top 1


For epoch 3
val_loss = 0.4956, val_acc = 0.7859
train_loss = 0.5407, train_acc = 0.7739


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.050 >= min_delta = 0.0. New best score: 0.445
Epoch 4, global step 59: val_loss reached 0.44512 (best 0.44512), saving model to "/home/bk_anupam/code/ML/NLP/Kaggle/DisasterTweetsPrediction/model/fold0_best_model_epoch=4_val_loss=0.4451.ckpt" as top 1


For epoch 4
val_loss = 0.4451, val_acc = 0.8037
train_loss = 0.4519, train_acc = 0.8003


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.016 >= min_delta = 0.0. New best score: 0.429
Epoch 5, global step 71: val_loss reached 0.42925 (best 0.42925), saving model to "/home/bk_anupam/code/ML/NLP/Kaggle/DisasterTweetsPrediction/model/fold0_best_model_epoch=5_val_loss=0.4292.ckpt" as top 1


For epoch 5
val_loss = 0.4292, val_acc = 0.8162
train_loss = 0.4106, train_acc = 0.8215


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.001 >= min_delta = 0.0. New best score: 0.428
Epoch 6, global step 83: val_loss reached 0.42790 (best 0.42790), saving model to "/home/bk_anupam/code/ML/NLP/Kaggle/DisasterTweetsPrediction/model/fold0_best_model_epoch=6_val_loss=0.4279.ckpt" as top 1


For epoch 6
val_loss = 0.4279, val_acc = 0.8201
train_loss = 0.387, train_acc = 0.832


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.007 >= min_delta = 0.0. New best score: 0.421
Epoch 7, global step 95: val_loss reached 0.42062 (best 0.42062), saving model to "/home/bk_anupam/code/ML/NLP/Kaggle/DisasterTweetsPrediction/model/fold0_best_model_epoch=7_val_loss=0.4206.ckpt" as top 1


For epoch 7
val_loss = 0.4206, val_acc = 0.8201
train_loss = 0.3606, train_acc = 0.8461


Validating: 0it [00:00, ?it/s]

Epoch 8, global step 107: val_loss was not in top 1


For epoch 8
val_loss = 0.4512, val_acc = 0.8024
train_loss = 0.3378, train_acc = 0.858


Validating: 0it [00:00, ?it/s]

Epoch 9, global step 119: val_loss was not in top 1


For epoch 9
val_loss = 0.4362, val_acc = 0.8168
train_loss = 0.317, train_acc = 0.8677


Validating: 0it [00:00, ?it/s]

Epoch 10, global step 131: val_loss was not in top 1


For epoch 10
val_loss = 0.4558, val_acc = 0.8148
train_loss = 0.2906, train_acc = 0.8816


Validating: 0it [00:00, ?it/s]

Epoch 11, global step 143: val_loss was not in top 1


For epoch 11
val_loss = 0.4804, val_acc = 0.8142
train_loss = 0.2664, train_acc = 0.8928


Validating: 0it [00:00, ?it/s]

Epoch 12, global step 155: val_loss was not in top 1


For epoch 12
val_loss = 0.5062, val_acc = 0.803
train_loss = 0.2442, train_acc = 0.9061


Validating: 0it [00:00, ?it/s]

Monitored metric val_loss did not improve in the last 6 records. Best score: 0.421. Signaling Trainer to stop.
Epoch 13, global step 167: val_loss was not in top 1


For epoch 13
val_loss = 0.5529, val_acc = 0.7978
train_loss = 0.2175, train_acc = 0.919
Best train metrics values for fold0
{'train_loss': (0.2175, 13), 'train_acc': (0.919, 13)}
Best val metrics values for fold0
{'val_loss': (0.4206, 8), 'val_acc': (0.8201, 7)}


  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type              | Params
----------------------------------------------
0 | network | RnnAttnClassifier | 4.4 M 
----------------------------------------------
4.4 M     Trainable params
0         Non-trainable params
4.4 M     Total params
17.483    Total estimated model params size (MB)


Running training for fold1


Validation sanity check: 0it [00:00, ?it/s]

Global seed set to 42


For epoch 0
val_loss = 0.6901, val_acc = 0.585


  rank_zero_warn(


Training: -1it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Metric val_loss improved. New best score: 0.667
Epoch 0, global step 11: val_loss reached 0.66696 (best 0.66696), saving model to "/home/bk_anupam/code/ML/NLP/Kaggle/DisasterTweetsPrediction/model/fold1_best_model_epoch=0_val_loss=0.6670.ckpt" as top 1


For epoch 0
val_loss = 0.667, val_acc = 0.5706
train_loss = 0.6802, train_acc = 0.5703


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.062 >= min_delta = 0.0. New best score: 0.605
Epoch 1, global step 23: val_loss reached 0.60459 (best 0.60459), saving model to "/home/bk_anupam/code/ML/NLP/Kaggle/DisasterTweetsPrediction/model/fold1_best_model_epoch=1_val_loss=0.6046.ckpt" as top 1


For epoch 1
val_loss = 0.6046, val_acc = 0.7229
train_loss = 0.6402, train_acc = 0.6059


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.064 >= min_delta = 0.0. New best score: 0.540
Epoch 2, global step 35: val_loss reached 0.54019 (best 0.54019), saving model to "/home/bk_anupam/code/ML/NLP/Kaggle/DisasterTweetsPrediction/model/fold1_best_model_epoch=2_val_loss=0.5402.ckpt" as top 1


For epoch 2
val_loss = 0.5402, val_acc = 0.7761
train_loss = 0.5829, train_acc = 0.7678


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.081 >= min_delta = 0.0. New best score: 0.459
Epoch 3, global step 47: val_loss reached 0.45938 (best 0.45938), saving model to "/home/bk_anupam/code/ML/NLP/Kaggle/DisasterTweetsPrediction/model/fold1_best_model_epoch=3_val_loss=0.4594.ckpt" as top 1


For epoch 3
val_loss = 0.4594, val_acc = 0.8004
train_loss = 0.4898, train_acc = 0.7952


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.022 >= min_delta = 0.0. New best score: 0.437
Epoch 4, global step 59: val_loss reached 0.43702 (best 0.43702), saving model to "/home/bk_anupam/code/ML/NLP/Kaggle/DisasterTweetsPrediction/model/fold1_best_model_epoch=4_val_loss=0.4370.ckpt" as top 1


For epoch 4
val_loss = 0.437, val_acc = 0.803
train_loss = 0.4263, train_acc = 0.8169


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.012 >= min_delta = 0.0. New best score: 0.425
Epoch 5, global step 71: val_loss reached 0.42532 (best 0.42532), saving model to "/home/bk_anupam/code/ML/NLP/Kaggle/DisasterTweetsPrediction/model/fold1_best_model_epoch=5_val_loss=0.4253.ckpt" as top 1


For epoch 5
val_loss = 0.4253, val_acc = 0.8155
train_loss = 0.395, train_acc = 0.8296


Validating: 0it [00:00, ?it/s]

Epoch 6, global step 83: val_loss was not in top 1


For epoch 6
val_loss = 0.4331, val_acc = 0.8129
train_loss = 0.368, train_acc = 0.8443


Validating: 0it [00:00, ?it/s]

Epoch 7, global step 95: val_loss was not in top 1


For epoch 7
val_loss = 0.43, val_acc = 0.8148
train_loss = 0.3473, train_acc = 0.8527


Validating: 0it [00:00, ?it/s]

Epoch 8, global step 107: val_loss was not in top 1


For epoch 8
val_loss = 0.4386, val_acc = 0.8142
train_loss = 0.3252, train_acc = 0.8608


Validating: 0it [00:00, ?it/s]

Epoch 9, global step 119: val_loss was not in top 1


For epoch 9
val_loss = 0.4483, val_acc = 0.8135
train_loss = 0.304, train_acc = 0.8716


Validating: 0it [00:00, ?it/s]

Epoch 10, global step 131: val_loss was not in top 1


For epoch 10
val_loss = 0.491, val_acc = 0.8043
train_loss = 0.279, train_acc = 0.8856


Validating: 0it [00:00, ?it/s]

Monitored metric val_loss did not improve in the last 6 records. Best score: 0.425. Signaling Trainer to stop.
Epoch 11, global step 143: val_loss was not in top 1
  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type              | Params
----------------------------------------------
0 | network | RnnAttnClassifier | 4.4 M 
----------------------------------------------
4.4 M     Trainable params
0         Non-trainable params
4.4 M     Total params
17.483    Total estimated model params size (MB)


For epoch 11
val_loss = 0.5749, val_acc = 0.803
train_loss = 0.2532, train_acc = 0.901
Best train metrics values for fold1
{'train_loss': (0.2532, 11), 'train_acc': (0.901, 11)}
Best val metrics values for fold1
{'val_loss': (0.4253, 6), 'val_acc': (0.8155, 6)}
Running training for fold2


Validation sanity check: 0it [00:00, ?it/s]

Global seed set to 42


For epoch 0
val_loss = 0.6896, val_acc = 0.5713


  rank_zero_warn(


Training: -1it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Metric val_loss improved. New best score: 0.661
Epoch 0, global step 11: val_loss reached 0.66097 (best 0.66097), saving model to "/home/bk_anupam/code/ML/NLP/Kaggle/DisasterTweetsPrediction/model/fold2_best_model_epoch=0_val_loss=0.6610.ckpt" as top 1


For epoch 0
val_loss = 0.661, val_acc = 0.5706
train_loss = 0.6805, train_acc = 0.5703


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.066 >= min_delta = 0.0. New best score: 0.595
Epoch 1, global step 23: val_loss reached 0.59518 (best 0.59518), saving model to "/home/bk_anupam/code/ML/NLP/Kaggle/DisasterTweetsPrediction/model/fold2_best_model_epoch=1_val_loss=0.5952.ckpt" as top 1


For epoch 1
val_loss = 0.5952, val_acc = 0.738
train_loss = 0.6393, train_acc = 0.6169


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.070 >= min_delta = 0.0. New best score: 0.525
Epoch 2, global step 35: val_loss reached 0.52494 (best 0.52494), saving model to "/home/bk_anupam/code/ML/NLP/Kaggle/DisasterTweetsPrediction/model/fold2_best_model_epoch=2_val_loss=0.5249.ckpt" as top 1


For epoch 2
val_loss = 0.5249, val_acc = 0.7774
train_loss = 0.5813, train_acc = 0.7691


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.069 >= min_delta = 0.0. New best score: 0.456
Epoch 3, global step 47: val_loss reached 0.45633 (best 0.45633), saving model to "/home/bk_anupam/code/ML/NLP/Kaggle/DisasterTweetsPrediction/model/fold2_best_model_epoch=3_val_loss=0.4563.ckpt" as top 1


For epoch 3
val_loss = 0.4563, val_acc = 0.7886
train_loss = 0.4904, train_acc = 0.7926


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.023 >= min_delta = 0.0. New best score: 0.433
Epoch 4, global step 59: val_loss reached 0.43293 (best 0.43293), saving model to "/home/bk_anupam/code/ML/NLP/Kaggle/DisasterTweetsPrediction/model/fold2_best_model_epoch=4_val_loss=0.4329.ckpt" as top 1


For epoch 4
val_loss = 0.4329, val_acc = 0.8076
train_loss = 0.4314, train_acc = 0.8125


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.003 >= min_delta = 0.0. New best score: 0.430
Epoch 5, global step 71: val_loss reached 0.43010 (best 0.43010), saving model to "/home/bk_anupam/code/ML/NLP/Kaggle/DisasterTweetsPrediction/model/fold2_best_model_epoch=5_val_loss=0.4301.ckpt" as top 1


For epoch 5
val_loss = 0.4301, val_acc = 0.8116
train_loss = 0.3971, train_acc = 0.8315


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.002 >= min_delta = 0.0. New best score: 0.428
Epoch 6, global step 83: val_loss reached 0.42818 (best 0.42818), saving model to "/home/bk_anupam/code/ML/NLP/Kaggle/DisasterTweetsPrediction/model/fold2_best_model_epoch=6_val_loss=0.4282.ckpt" as top 1


For epoch 6
val_loss = 0.4282, val_acc = 0.8083
train_loss = 0.3715, train_acc = 0.8429


Validating: 0it [00:00, ?it/s]

Epoch 7, global step 95: val_loss was not in top 1


For epoch 7
val_loss = 0.4363, val_acc = 0.8129
train_loss = 0.3479, train_acc = 0.8581


Validating: 0it [00:00, ?it/s]

Epoch 8, global step 107: val_loss was not in top 1


For epoch 8
val_loss = 0.4327, val_acc = 0.8129
train_loss = 0.3309, train_acc = 0.866


Validating: 0it [00:00, ?it/s]

Epoch 9, global step 119: val_loss was not in top 1


For epoch 9
val_loss = 0.4672, val_acc = 0.8043
train_loss = 0.3083, train_acc = 0.877


Validating: 0it [00:00, ?it/s]

Epoch 10, global step 131: val_loss was not in top 1


For epoch 10
val_loss = 0.4548, val_acc = 0.8188
train_loss = 0.2832, train_acc = 0.8901


Validating: 0it [00:00, ?it/s]

Epoch 11, global step 143: val_loss was not in top 1


For epoch 11
val_loss = 0.4857, val_acc = 0.8043
train_loss = 0.262, train_acc = 0.8998


Validating: 0it [00:00, ?it/s]

Monitored metric val_loss did not improve in the last 6 records. Best score: 0.428. Signaling Trainer to stop.
Epoch 12, global step 155: val_loss was not in top 1
  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


For epoch 12
val_loss = 0.4963, val_acc = 0.8089
train_loss = 0.2358, train_acc = 0.9122
Best train metrics values for fold2
{'train_loss': (0.2358, 12), 'train_acc': (0.9122, 12)}
Best val metrics values for fold2
{'val_loss': (0.4282, 7), 'val_acc': (0.8188, 11)}
Running training for fold3


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type              | Params
----------------------------------------------
0 | network | RnnAttnClassifier | 4.4 M 
----------------------------------------------
4.4 M     Trainable params
0         Non-trainable params
4.4 M     Total params
17.483    Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Global seed set to 42


For epoch 0
val_loss = 0.6957, val_acc = 0.4258


  rank_zero_warn(


Training: -1it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Metric val_loss improved. New best score: 0.670
Epoch 0, global step 11: val_loss reached 0.66997 (best 0.66997), saving model to "/home/bk_anupam/code/ML/NLP/Kaggle/DisasterTweetsPrediction/model/fold3_best_model_epoch=0_val_loss=0.6700.ckpt" as top 1


For epoch 0
val_loss = 0.67, val_acc = 0.5637
train_loss = 0.6829, train_acc = 0.5484


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.048 >= min_delta = 0.0. New best score: 0.622
Epoch 1, global step 23: val_loss reached 0.62233 (best 0.62233), saving model to "/home/bk_anupam/code/ML/NLP/Kaggle/DisasterTweetsPrediction/model/fold3_best_model_epoch=1_val_loss=0.6223.ckpt" as top 1


For epoch 1
val_loss = 0.6223, val_acc = 0.7359
train_loss = 0.6425, train_acc = 0.6088


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.025 >= min_delta = 0.0. New best score: 0.598
Epoch 2, global step 35: val_loss reached 0.59766 (best 0.59766), saving model to "/home/bk_anupam/code/ML/NLP/Kaggle/DisasterTweetsPrediction/model/fold3_best_model_epoch=2_val_loss=0.5977.ckpt" as top 1


For epoch 2
val_loss = 0.5977, val_acc = 0.7444
train_loss = 0.5922, train_acc = 0.7429


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.091 >= min_delta = 0.0. New best score: 0.506
Epoch 3, global step 47: val_loss reached 0.50633 (best 0.50633), saving model to "/home/bk_anupam/code/ML/NLP/Kaggle/DisasterTweetsPrediction/model/fold3_best_model_epoch=3_val_loss=0.5063.ckpt" as top 1


For epoch 3
val_loss = 0.5063, val_acc = 0.772
train_loss = 0.5335, train_acc = 0.7682


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.055 >= min_delta = 0.0. New best score: 0.451
Epoch 4, global step 59: val_loss reached 0.45143 (best 0.45143), saving model to "/home/bk_anupam/code/ML/NLP/Kaggle/DisasterTweetsPrediction/model/fold3_best_model_epoch=4_val_loss=0.4514.ckpt" as top 1


For epoch 4
val_loss = 0.4514, val_acc = 0.7989
train_loss = 0.4503, train_acc = 0.8033


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.022 >= min_delta = 0.0. New best score: 0.430
Epoch 5, global step 71: val_loss reached 0.42962 (best 0.42962), saving model to "/home/bk_anupam/code/ML/NLP/Kaggle/DisasterTweetsPrediction/model/fold3_best_model_epoch=5_val_loss=0.4296.ckpt" as top 1


For epoch 5
val_loss = 0.4296, val_acc = 0.8029
train_loss = 0.4114, train_acc = 0.8202


Validating: 0it [00:00, ?it/s]

Epoch 6, global step 83: val_loss was not in top 1


For epoch 6
val_loss = 0.4299, val_acc = 0.8101
train_loss = 0.3848, train_acc = 0.838


Validating: 0it [00:00, ?it/s]

Epoch 7, global step 95: val_loss was not in top 1


For epoch 7
val_loss = 0.4302, val_acc = 0.8088
train_loss = 0.3637, train_acc = 0.845


Validating: 0it [00:00, ?it/s]

Epoch 8, global step 107: val_loss was not in top 1


For epoch 8
val_loss = 0.4362, val_acc = 0.8062
train_loss = 0.3384, train_acc = 0.86


Validating: 0it [00:00, ?it/s]

Epoch 9, global step 119: val_loss was not in top 1


For epoch 9
val_loss = 0.4414, val_acc = 0.8101
train_loss = 0.3167, train_acc = 0.8701


Validating: 0it [00:00, ?it/s]

Epoch 10, global step 131: val_loss was not in top 1


For epoch 10
val_loss = 0.4651, val_acc = 0.8108
train_loss = 0.2926, train_acc = 0.8802


Validating: 0it [00:00, ?it/s]

Monitored metric val_loss did not improve in the last 6 records. Best score: 0.430. Signaling Trainer to stop.
Epoch 11, global step 143: val_loss was not in top 1


For epoch 11
val_loss = 0.5038, val_acc = 0.8049
train_loss = 0.268, train_acc = 0.8957
Best train metrics values for fold3
{'train_loss': (0.268, 11), 'train_acc': (0.8957, 11)}
Best val metrics values for fold3
{'val_loss': (0.4296, 6), 'val_acc': (0.8108, 11)}
Running training for fold4


  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type              | Params
----------------------------------------------
0 | network | RnnAttnClassifier | 4.4 M 
----------------------------------------------
4.4 M     Trainable params
0         Non-trainable params
4.4 M     Total params
17.483    Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Global seed set to 42


For epoch 0
val_loss = 0.6897, val_acc = 0.5645


  rank_zero_warn(


Training: -1it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Metric val_loss improved. New best score: 0.660
Epoch 0, global step 11: val_loss reached 0.65971 (best 0.65971), saving model to "/home/bk_anupam/code/ML/NLP/Kaggle/DisasterTweetsPrediction/model/fold4_best_model_epoch=0_val_loss=0.6597.ckpt" as top 1


For epoch 0
val_loss = 0.6597, val_acc = 0.5703
train_loss = 0.6793, train_acc = 0.5703


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.059 >= min_delta = 0.0. New best score: 0.601
Epoch 1, global step 23: val_loss reached 0.60099 (best 0.60099), saving model to "/home/bk_anupam/code/ML/NLP/Kaggle/DisasterTweetsPrediction/model/fold4_best_model_epoch=1_val_loss=0.6010.ckpt" as top 1


For epoch 1
val_loss = 0.601, val_acc = 0.7543
train_loss = 0.6351, train_acc = 0.6278


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.059 >= min_delta = 0.0. New best score: 0.542
Epoch 2, global step 35: val_loss reached 0.54201 (best 0.54201), saving model to "/home/bk_anupam/code/ML/NLP/Kaggle/DisasterTweetsPrediction/model/fold4_best_model_epoch=2_val_loss=0.5420.ckpt" as top 1


For epoch 2
val_loss = 0.542, val_acc = 0.7733
train_loss = 0.5761, train_acc = 0.7711


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.064 >= min_delta = 0.0. New best score: 0.478
Epoch 3, global step 47: val_loss reached 0.47772 (best 0.47772), saving model to "/home/bk_anupam/code/ML/NLP/Kaggle/DisasterTweetsPrediction/model/fold4_best_model_epoch=3_val_loss=0.4777.ckpt" as top 1


For epoch 3
val_loss = 0.4777, val_acc = 0.7983
train_loss = 0.477, train_acc = 0.7935


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.017 >= min_delta = 0.0. New best score: 0.461
Epoch 4, global step 59: val_loss reached 0.46118 (best 0.46118), saving model to "/home/bk_anupam/code/ML/NLP/Kaggle/DisasterTweetsPrediction/model/fold4_best_model_epoch=4_val_loss=0.4612.ckpt" as top 1


For epoch 4
val_loss = 0.4612, val_acc = 0.8035
train_loss = 0.4269, train_acc = 0.8069


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.008 >= min_delta = 0.0. New best score: 0.453
Epoch 5, global step 71: val_loss reached 0.45297 (best 0.45297), saving model to "/home/bk_anupam/code/ML/NLP/Kaggle/DisasterTweetsPrediction/model/fold4_best_model_epoch=5_val_loss=0.4530.ckpt" as top 1


For epoch 5
val_loss = 0.453, val_acc = 0.8049
train_loss = 0.3965, train_acc = 0.8243


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.005 >= min_delta = 0.0. New best score: 0.448
Epoch 6, global step 83: val_loss reached 0.44833 (best 0.44833), saving model to "/home/bk_anupam/code/ML/NLP/Kaggle/DisasterTweetsPrediction/model/fold4_best_model_epoch=6_val_loss=0.4483.ckpt" as top 1


For epoch 6
val_loss = 0.4483, val_acc = 0.797
train_loss = 0.3716, train_acc = 0.8391


Validating: 0it [00:00, ?it/s]

Epoch 7, global step 95: val_loss was not in top 1


For epoch 7
val_loss = 0.4555, val_acc = 0.7996
train_loss = 0.3489, train_acc = 0.8514


Validating: 0it [00:00, ?it/s]

Epoch 8, global step 107: val_loss was not in top 1


For epoch 8
val_loss = 0.46, val_acc = 0.8075
train_loss = 0.3266, train_acc = 0.8624


Validating: 0it [00:00, ?it/s]

Epoch 9, global step 119: val_loss was not in top 1


For epoch 9
val_loss = 0.461, val_acc = 0.8029
train_loss = 0.3067, train_acc = 0.8726


Validating: 0it [00:00, ?it/s]

Epoch 10, global step 131: val_loss was not in top 1


For epoch 10
val_loss = 0.4931, val_acc = 0.8016
train_loss = 0.2872, train_acc = 0.8816


Validating: 0it [00:00, ?it/s]

Epoch 11, global step 143: val_loss was not in top 1


For epoch 11
val_loss = 0.5084, val_acc = 0.7996
train_loss = 0.2598, train_acc = 0.8938


Validating: 0it [00:00, ?it/s]

Monitored metric val_loss did not improve in the last 6 records. Best score: 0.448. Signaling Trainer to stop.
Epoch 12, global step 155: val_loss was not in top 1


For epoch 12
val_loss = 0.5411, val_acc = 0.7996
train_loss = 0.2353, train_acc = 0.9056
Best train metrics values for fold4
{'train_loss': (0.2353, 12), 'train_acc': (0.9056, 12)}
Best val metrics values for fold4
{'val_loss': (0.4483, 7), 'val_acc': (0.8075, 9)}


In [23]:
#all_fold_val_loss = [x[0] for x in all_fold_val_loss]
#all_fold_val_acc = [x[0] for x in all_fold_val_acc]
print(f"val loss across folds = {all_fold_val_loss}")
print(f"val accuracy across folds = {all_fold_val_acc}")
mean_loss = statistics.mean(all_fold_val_loss)
mean_acc = statistics.mean(all_fold_val_acc)
std_loss = statistics.stdev(all_fold_val_loss)
std_acc = statistics.stdev(all_fold_val_acc)
print(f"mean val loss across folds = {mean_loss}, val loss stdev across fold = {std_loss}")
print(f"mean val accuracy across folds = {mean_acc}, val accuracy stdev across fold = {std_acc}")

val loss across folds = [0.4206, 0.4253, 0.4282, 0.4296, 0.4483]
val accuracy across folds = [0.8201, 0.8155, 0.8188, 0.8108, 0.8075]
mean val loss across folds = 0.4304, val loss stdev across fold = 0.010582296537141636
mean val accuracy across folds = 0.81454, val accuracy stdev across fold = 0.005327569802452162


In [24]:
best_model = DisasterTweetLitModel.load_from_checkpoint(
    checkpoint_path="./model/fold0_best_model_epoch=7_val_loss=0.4206.ckpt",
    )
print(best_model)
tweet_vocab.set_default_index(0)
df_test["vectorized_tweet"] = df_test["processed_text"].apply(
    lambda row:torch.LongTensor(tweet_vocab.lookup_indices(row.split()))
    )

# Do prediction with best performing model on the test set
def predict(df_test):
    test_output = []
    for index, row in df_test.iterrows():    
        vec_tweet = row["vectorized_tweet"]
        if len(vec_tweet) == 0:
            test_output.append(0)
            continue
        vec_tweet_len = torch.IntTensor([len(vec_tweet)])
        vec_tweet = vec_tweet.view(1, -1)    
        #print(vec_tweet, vec_tweet_len)
        output, (h_n,c_n) = best_model(vec_tweet, vec_tweet_len, state=None)
        #print(output)
        test_output.append(round(output.item()))    
    return test_output        

test_output = predict(df_test)
print(len(test_output))

df_submission = pd.read_csv('./data/submission.csv')
df_submission['target']= test_output
df_submission.to_csv('my_submission.csv',index=False)