In [1]:
import os

import numpy as np
import pandas as pd
import torch
import re
import nltk
import torch.nn as nn
import pytorch_lightning as pl
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig
from torch.nn.utils.rnn import pad_sequence
from sklearn.model_selection import StratifiedKFold
from torchmetrics import Accuracy, F1
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from torch.nn import functional as F

In [2]:
os.environ["TOKENIZERS_PARALLELISM"] = "false"

### Configuration for training

In [3]:
class MODEL_EVAL_METRIC:
    accuracy = "accuracy"
    f1_score = "f1_score"

class Config:
    TWEET_COL = "text"
    RANDOM_STATE = 42
    BATCH_SIZE = 16
    OUT_SIZE = 2
    NUM_FOLDS = 3
    NUM_EPOCHS = 3
    NUM_WORKERS = 8
    TRANSFORMER_CHECKPOINT = "bert-base-uncased"
    # The hidden_size of the output of the last layer of the transformer model used
    TRANSFORMER_OUT_SIZE = 768
    PAD_TOKEN_ID = 0
    DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    MODEL_EVAL_METRIC = MODEL_EVAL_METRIC.accuracy
    FAST_DEV_RUN = False    
    PATIENCE = 5    
    IS_BIDIRECTIONAL = True
    # model hyperparameters
    MODEL_HPARAMS = {
        "learning_rate": 2e-5,
        "adam_epsilon": 1e-8,
        "weight_decay": 0.0,
        "warmup_steps": 0
    }

DATA_PATH = "./data/"

# For results reproducibility 
# sets seeds for numpy, torch, python.random and PYTHONHASHSEED.
pl.seed_everything(Config.RANDOM_STATE, workers=True)

Global seed set to 42


42

### Load the data

In [4]:
df_train = pd.read_csv('./data/train.csv')
df_test = pd.read_csv('./data/test.csv')
print(f"Rows in train.csv = {len(df_train)}")
print(f"Rows in test.csv = {len(df_test)}")
pd.set_option('display.max_colwidth', None)
df_train.head()

Rows in train.csv = 7613
Rows in test.csv = 3263


Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected,1
3,6,,,"13,000 people receive #wildfires evacuation orders in California",1
4,7,,,Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school,1


### K Fold CV
Split the training dataframe into kfolds for cross validation. We do this before any processing is done
on the data. We use stratified kfold if the target distribution is unbalanced

In [5]:
def strat_kfold_dataframe(df, num_folds=5):
    # we create a new column called kfold and fill it with -1
    df["kfold"] = -1
    # randomize of shuffle the rows of dataframe before splitting is done
    df.sample(frac=1, random_state=Config.RANDOM_STATE).reset_index(drop=True)
    y = df["target"].values
    skf = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=Config.RANDOM_STATE)
    # stratification is done on the basis of y labels, a placeholder for X is sufficient
    for fold, (train_idx, val_idx) in enumerate(skf.split(X=df, y=y)):
        df.loc[val_idx, "kfold"] = fold
    return df

df_train = strat_kfold_dataframe(df_train, num_folds=Config.NUM_FOLDS)            

### Tweet preprocessing

In [6]:
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/bk_anupam/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/bk_anupam/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [7]:
punct = "/-'?!.,#$%\'()*+-/:;<=>@[\\]^_`{|}~`" + '""“”’' + '∞θ÷α•à−β∅³π‘₹´°£€\×™√²—–&'
def clean_special_chars(text, punct):
    for p in punct:
        text = text.replace(p, ' ')
    return text

def process_tweet(df, text, keyword):
    lemmatizer = WordNetLemmatizer()    
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)    
    processed_text = []
    stop = stopwords.words("english")
    for tweet, keyword in zip(df[text], df[keyword]):
        tweets_clean = []        
        # remove stock market tickers like $GE        
        tweet = re.sub(r'\$\w*', '', tweet)
        # remove old style retweet text "RT"
        tweet = re.sub(r'^RT[\s]+', '', tweet)
        # remove hyperlinks
        tweet = re.sub(r'http\S+', '', tweet)
        # remove hashtags
        # only removing the hash #, @, ... sign from the word
        tweet = re.sub(r'\.{3}|@|#', '', tweet)    
        tweet = clean_special_chars(tweet, punct)
        # remove junk characters which don't have an ascii code
        tweet = tweet.encode("ascii", "ignore").decode("utf-8", "ignore")
        # tokenize tweets        
        tweet_tokens = tokenizer.tokenize(tweet)
        for word in tweet_tokens:
            # remove stopwords and punctuation
            #if (word.isalpha() and len(word) > 2 and word not in stop and word not in string.punctuation):
                #stem_word = stemmer.stem(word)  # stemming word            
                #lem_word = lemmatizer.lemmatize(word)
                #tweets_clean.append(lem_word) 
                tweets_clean.append(word)
        processed_text.append(" ".join(tweets_clean))        
    df['processed_text'] = np.array(processed_text)

In [8]:
# Fill in missing values
df_train["keyword"] = df_train["keyword"].fillna("no_keyword")
df_test["keyword"] = df_test["keyword"].fillna("no_keyword")
process_tweet(df_train, 'text', "keyword")
process_tweet(df_test, 'text', "keyword")
# length of the processed tweet
df_train["prcsd_tweet_len"] = df_train["processed_text"].apply(lambda row: len(row.split()))
df_test["prcsd_tweet_len"] = df_test["processed_text"].apply(lambda row: len(row.split()))
df_train.iloc[50:52, :]

Unnamed: 0,id,keyword,location,text,target,kfold,processed_text,prcsd_tweet_len
50,73,ablaze,"Sheffield Township, Ohio",Deputies: Man shot before Brighton home set ablaze http://t.co/gWNRhMSO8k,1,1,deputies man shot before brighton home set ablaze,8
51,74,ablaze,India,Man wife get six years jail for setting ablaze niece\nhttp://t.co/eV1ahOUCZA,1,0,man wife get six years jail for setting ablaze niece,10


## Dataset for transformer model
Converts tweets into pytorch dataset compatible with BERT and other transformers

In [9]:
# Convert tweets to data that the BERT model understands
class TransformerTweetDataset(Dataset):
    def __init__(self, tweets, targets):
        self.tweets = tweets
        self.targets = targets
        self.tokenizer = AutoTokenizer.from_pretrained(Config.TRANSFORMER_CHECKPOINT)

    def __len__(self):
        return len(self.tweets)

    def __getitem__(self, item_idx):
        inputs = self.tokenizer(self.tweets[item_idx])
        targets = self.targets[item_idx]
        return {
            "input_ids": torch.LongTensor(inputs["input_ids"]),
            "token_type_ids": torch.LongTensor(inputs["token_type_ids"]),
            "attention_mask": torch.LongTensor(inputs["attention_mask"]),
            "targets": targets
        }

### Pad the input data

In [10]:
def pad_input_seq(batch, key, padding_value):    
    seq = [dict_item[key] for dict_item in batch]    
    seq_sorted = sorted(seq, key=lambda x:x.shape[0], reverse=True)    
    return pad_sequence(seq_sorted, batch_first=True, padding_value=padding_value)

def pad_collate(batch):        
    # batch is a list of dictionaries where each each dictionary is one data row    
    input_ids_padded = pad_input_seq(batch, "input_ids", Config.PAD_TOKEN_ID)
    token_type_ids_padded = pad_input_seq(batch, "token_type_ids", Config.PAD_TOKEN_ID)
    attention_mask_padded = pad_input_seq(batch, "attention_mask", 0)
    batch_targets = torch.LongTensor([dict_item["targets"] for dict_item in batch])
    batch_padded = {
        "input_ids": input_ids_padded,
        "token_type_ids": token_type_ids_padded,
        "attention_mask": attention_mask_padded,
        "labels": batch_targets
    } 
    return batch_padded

### Get train and validation data for a fold

In [11]:
def get_fold_dls(fold, df, tweet_col="text"):
    train_df = df[df.kfold != fold].reset_index(drop=True)
    valid_df = df[df.kfold == fold].reset_index(drop=True)
    X_train = train_df[tweet_col].values.tolist()
    y_train = train_df.target.values.tolist()
    X_valid = valid_df[tweet_col].values.tolist()
    y_valid = valid_df["target"].values.tolist()
    ds_train = TransformerTweetDataset(X_train, y_train)
    ds_valid = TransformerTweetDataset(X_valid, y_valid)
    dl_train = DataLoader(ds_train, batch_size=Config.BATCH_SIZE, shuffle=True, collate_fn=pad_collate, num_workers=Config.NUM_WORKERS)
    dl_valid = DataLoader(ds_valid, batch_size=Config.BATCH_SIZE, collate_fn=pad_collate, num_workers=Config.NUM_WORKERS)
    return dl_train, dl_valid, ds_train, ds_valid

In [12]:
dl_train, dl_valid, ds_train, ds_valid = get_fold_dls(0, df_train, Config.TWEET_COL)

In [13]:
# my_dl = DataLoader(ds_train, batch_size=Config.BATCH_SIZE, shuffle=True, collate_fn=pad_collate, num_workers=1)

In [14]:
# dl_train_iter = iter(my_dl)
# item = next(dl_train_iter)

In [15]:
# targets = item["targets"]
# print(f"targets.shape={targets.shape}")
# labels = F.one_hot(targets.T.long(), num_classes=2)
# #labels = labels.float()        
# print(f"labels.shape = {labels.shape}")        

In [16]:
# my_tmodel = AutoModelForSequenceClassification.from_pretrained(Config.TRANSFORMER_CHECKPOINT, num_labels=2)

In [17]:
# output = my_tmodel(
#     input_ids = item["input_ids"], 
#     token_type_ids = item["token_type_ids"], 
#     attention_mask = item["attention_mask"], 
#     labels = targets.long()
#     )
# print(output.loss)
# print(output.logits.shape)

### Transformer model for tweet classification using pytorch lightning

In [25]:
from transformers import AdamW, get_linear_schedule_with_warmup

class TransformerTweetLitModel(pl.LightningModule):
    def __init__(self, params, hparams):
        super().__init__()
        self.save_hyperparameters()
        self.learning_rate = hparams["learning_rate"]     
        self.weight_decay = hparams["weight_decay"]
        self.adam_epsilon = hparams["adam_epsilon"]
        self.warmup_steps = hparams["warmup_steps"]   
        self.transformer_model = AutoModelForSequenceClassification.from_pretrained(
            Config.TRANSFORMER_CHECKPOINT, 
            num_labels=Config.OUT_SIZE
            )
        self.num_train_steps = params["num_train_steps"]
        self.model_eval_metric = params["model_eval_metric"]
        print(self.hparams)

    def forward(self, input_ids, token_type_ids, attention_mask, labels):
        return self.transformer_model(
            input_ids = input_ids,
            token_type_ids = token_type_ids,
            attention_mask = attention_mask,
            labels = labels
        )

    def configure_optimizers(self):
        """Prepare optimizer and schedule (linear warmup and decay)"""
        model = self.transformer_model
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
                "weight_decay": self.weight_decay,
            },
            {
                "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,
            },
        ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=self.learning_rate, eps=self.adam_epsilon)
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=self.warmup_steps,
            num_training_steps=self.num_train_steps,
        )
        scheduler = {"scheduler": scheduler, "interval": "step", "frequency": 1}
        return [optimizer], [scheduler]

    def training_step(self, batch, batch_idx):        
        # input_ids = batch["input_ids"]
        # token_type_ids = batch["token_type_ids"]
        # attention_mask = batch["attention_mask"]
        # targets = batch["targets"]
        output = self(**batch)     
        labels = batch["labels"]   
        train_metric = None
        train_metric_str = ""
        if self.model_eval_metric == MODEL_EVAL_METRIC.accuracy:    
            targets_pred = torch.argmax(output.logits, dim=1)            
            train_metric = Accuracy(num_classes=2)(targets_pred.cpu(), labels.cpu())
            train_metric_str = "train_acc"
        # elif self.model_eval_metric == MODEL_EVAL_METRIC.f1_score:
        #     train_metric = F1(output.logits, targets.float())            
        #     train_metric_str = "train_f1"
        self.log("train_loss", output.loss, prog_bar=True, logger=True, on_epoch=True, on_step=True)
        self.log(train_metric_str, train_metric, prog_bar=True, logger=True, on_epoch=True, on_step=True)
        return output.loss                    

    # def validation_step(self, batch, batch_idx):
    #     input_ids = batch["input_ids"]
    #     token_type_ids = batch["token_type_ids"]
    #     attention_mask = batch["attention_mask"]
    #     targets = batch["targets"]                
    #     output = self(input_ids, token_type_ids, attention_mask, targets)
    #     val_metric = None
    #     val_metric_str = ""
    #     if self.model_eval_metric == MODEL_EVAL_METRIC.accuracy:            
    #         targets_pred = torch.argmax(output.logits, dim=1)            
    #         val_metric = Accuracy(num_classes=2)(targets_pred.detach().cpu(), targets.detach().cpu())
    #         val_metric_str = "val_acc"
    #     # elif self.model_eval_metric == MODEL_EVAL_METRIC.f1_score:
    #     #     val_metric = F1(output.logits, targets.float())            
    #     #     val_metric_str = "val_f1"
    #     self.log("val_loss", output.loss, prog_bar=True, logger=True, on_epoch=True, on_step=True)
    #     self.log(val_metric_str, val_metric, prog_bar=True, logger=True, on_epoch=True, on_step=True)
    #     return output.loss                
     
    def validation_step(self, batch, batch_idx, dataloader_idx=0):
        outputs = self(**batch)
        val_loss, logits = outputs[:2]
        preds = torch.argmax(logits, axis=1)
        labels = batch["labels"]
        val_metric = None
        val_metric_str = ""
        if self.model_eval_metric == MODEL_EVAL_METRIC.accuracy:            
            targets_pred = torch.argmax(logits, dim=1)            
            val_metric = Accuracy(num_classes=2)(targets_pred.detach().cpu(), labels.detach().cpu())
            val_metric_str = "val_acc"
        self.log("val_loss", val_loss, prog_bar=True, logger=True, on_epoch=True, on_step=True)
        self.log(val_metric_str, val_metric, prog_bar=True, logger=True, on_epoch=True, on_step=True)
        return {"loss": val_loss, "preds": preds, "labels": labels}   

    # def validation_epoch_end(self, outputs):        
    #     preds = torch.cat([x["preds"] for x in outputs]).detach().cpu().numpy()
    #     labels = torch.cat([x["labels"] for x in outputs]).detach().cpu().numpy()
    #     loss = torch.stack([x["loss"] for x in outputs]).mean()
    #     self.log("val_loss", loss, prog_bar=True)
    #     self.log_dict(self.metric.compute(predictions=preds, references=labels), prog_bar=True)
    #     return loss         

### Custom lightning callback 
To record training and validation metric values at each epoch and the best metric values across all epochs

In [26]:
from pytorch_lightning.callbacks import Callback
from pytorch_lightning import LightningModule, Trainer
# Monitor multiple metric values that are calculated either in training or validation step and return the
# best metric values for each epoch
class MetricsAggCallback(Callback):
    def __init__(self, train_metrics_to_monitor, val_metrics_to_monitor):
        # dictionary with metric name as key and monitor mode (min, max) as the value
        # ( the same names used to log metric values in training and validation step)
        self.val_metrics_to_monitor = val_metrics_to_monitor
        self.train_metrics_to_monitor = train_metrics_to_monitor
        # dictionary with metric_name as key and list of metric value for each epoch
        self.train_metrics = {metric: [] for metric in train_metrics_to_monitor.keys()}
        self.val_metrics = {metric: [] for metric in val_metrics_to_monitor.keys()}
        # dictionary with metric_name as key and the best metric value for all epochs
        self.train_best_metric = {metric: None for metric in train_metrics_to_monitor.keys()}
        self.val_best_metric = {metric: None for metric in val_metrics_to_monitor.keys()}
        # dictionary with metric_name as key and the epoch number with the best metric value
        self.train_best_metric_epoch = {metric: None for metric in train_metrics_to_monitor.keys()}     
        self.val_best_metric_epoch = {metric: None for metric in val_metrics_to_monitor.keys()}     
        self.epoch_counter = 0           

    @staticmethod
    def process_metrics(metrics_to_monitor, metrics, best_metric, best_metric_epoch, trainer):
        metric_str = ""
        for metric, mode in metrics_to_monitor.items():
            metric_value = round(trainer.callback_metrics[metric].cpu().detach().item(), 4)            
            metric_str += f"{metric} = {metric_value}, "
            metrics[metric].append(metric_value)
            if mode == "max":
                best_metric[metric] = max(metrics[metric])            
            elif mode == "min":            
                best_metric[metric] = min(metrics[metric])            
            best_metric_epoch[metric] = metrics[metric].index(best_metric[metric]) 
        print(metric_str[:-2])

    def on_train_epoch_end(self, trainer: Trainer, pl_module: LightningModule):
        self.epoch_counter += 1        
        self.process_metrics(self.train_metrics_to_monitor, self.train_metrics, self.train_best_metric, self.train_best_metric_epoch, trainer)

    def on_validation_epoch_end(self, trainer: Trainer, pl_module: LightningModule):        
        print(f"For epoch {self.epoch_counter}")
        self.process_metrics(self.val_metrics_to_monitor, self.val_metrics, self.val_best_metric, self.val_best_metric_epoch, trainer)


In [27]:
# num_train_steps1 = (len(df_train) / Config.BATCH_SIZE) * Config.NUM_EPOCHS
# num_train_steps2 = len(dl_train) * Config.NUM_EPOCHS
# num_val_steps = len(dl_valid) * Config.NUM_EPOCHS
# print(f"num_train_steps1 = {num_train_steps1}")
# print(f"num_train_steps2 = {num_train_steps2}")
# print(f"num_val_steps = {num_val_steps}")


### The training function

In [28]:
def run_training(fold, dl_train, dl_val, find_lr=True):
    fold_str = f"fold{fold}"
    print(f"Running training for {fold_str}")
    num_train_steps = int(len(dl_train) * Config.NUM_EPOCHS)
    print(f"num_train_steps = {num_train_steps}")
    disaster_tweet_model = TransformerTweetLitModel(                
        params = {
            "model_eval_metric": Config.MODEL_EVAL_METRIC,
            "num_train_steps": num_train_steps
        },
        hparams=Config.MODEL_HPARAMS
    )
    tb_logger = pl.loggers.TensorBoardLogger(save_dir="logs")    
    chkpt_file_name = fold_str + "_best_model_{epoch}_{val_loss:.4f}"
    train_metrics_to_monitor = {
        "train_loss": "min",
        "train_acc": "max"
    }
    val_metrics_to_monitor = {
        "val_loss": "min",
        "val_acc": "max",
        }
    loss_chkpt_callback = ModelCheckpoint(dirpath="./model", verbose=True, monitor="val_loss", mode="min", filename=chkpt_file_name)    
    metric_chkpt_callback = MetricsAggCallback(train_metrics_to_monitor, val_metrics_to_monitor)
    early_stopping_callback = EarlyStopping(monitor="val_loss", patience=Config.PATIENCE, mode="min", verbose=True)
    trainer = pl.Trainer(
        gpus = 1,
        deterministic = True,
        auto_select_gpus = True,
        progress_bar_refresh_rate = 20,
        max_epochs = Config.NUM_EPOCHS,
        logger = tb_logger,
        auto_lr_find = True,    
        #precision = Config.PRECISION,   
        fast_dev_run = Config.FAST_DEV_RUN, 
        gradient_clip_val = 1.0,        
        callbacks = [loss_chkpt_callback, metric_chkpt_callback, early_stopping_callback]
    )        
    if find_lr:
        trainer.tune(model=disaster_tweet_model, train_dataloaders=dl_train)
        print(disaster_tweet_model.lr)
    trainer.fit(disaster_tweet_model, train_dataloaders=dl_train, val_dataloaders=dl_val)
    fold_train_metrics = {
        metric: (metric_chkpt_callback.train_best_metric[metric], metric_chkpt_callback.train_best_metric_epoch[metric]) 
        for metric in train_metrics_to_monitor.keys()
    }
    fold_val_metrics = {
        metric: (metric_chkpt_callback.val_best_metric[metric], metric_chkpt_callback.val_best_metric_epoch[metric]) 
        for metric in val_metrics_to_monitor.keys()
    }            
    best_model = loss_chkpt_callback.best_model_path
    del trainer, disaster_tweet_model, loss_chkpt_callback, metric_chkpt_callback 
    return fold_train_metrics, fold_val_metrics, best_model

In [29]:
find_lr = True
all_fold_val_loss = []
all_fold_val_acc = []

for fold in range(Config.NUM_FOLDS):
    dl_train, dl_val, ds_train, ds_val = get_fold_dls(fold, df_train, tweet_col=Config.TWEET_COL)
    fold_train_metrics, fold_val_metrics, chkpt_file_name = run_training(fold, dl_train, dl_val, find_lr=False)    
    all_fold_val_loss.append((fold_val_metrics["val_loss"][0], chkpt_file_name))
    all_fold_val_acc.append(fold_val_metrics["val_acc"][0])
    print(f"Best train metrics values for fold{fold}")    
    print(fold_train_metrics)
    print(f"Best val metrics values for fold{fold}")    
    print(fold_val_metrics)     
    break   

Running training for fold0
num_train_steps = 795


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

"hparams": {'learning_rate': 2e-05, 'adam_epsilon': 1e-08, 'weight_decay': 0.0, 'warmup_steps': 0}
"params":  {'model_eval_metric': 'accuracy', 'num_train_steps': 795}


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name              | Type                          | Params
--------------------------------------------------------------------
0 | transformer_model | BertForSequenceClassification | 109 M 
--------------------------------------------------------------------
109 M     Trainable params
0         Non-trainable params
109 M     Total params
437.935   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Global seed set to 42


For epoch 0
val_loss = 0.702, val_acc = 0.4688


Training: -1it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Metric val_loss improved. New best score: 0.684
Epoch 0, global step 158: val_loss reached 0.68447 (best 0.68447), saving model to "/home/bk_anupam/code/ML/NLP/Kaggle/DisasterTweetsPrediction/model/fold0_best_model_epoch=0_val_loss=0.6845.ckpt" as top 1


For epoch 0
val_loss = 0.6845, val_acc = 0.5701
train_loss = 0.6907, train_acc = 0.5513


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.001 >= min_delta = 0.0. New best score: 0.683
Epoch 1, global step 317: val_loss reached 0.68336 (best 0.68336), saving model to "/home/bk_anupam/code/ML/NLP/Kaggle/DisasterTweetsPrediction/model/fold0_best_model_epoch=1_val_loss=0.6834.ckpt" as top 1


For epoch 1
val_loss = 0.6834, val_acc = 0.5701
train_loss = 0.6858, train_acc = 0.5643


  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")


Best train metrics values for fold0
{'train_loss': (0.6858, 1), 'train_acc': (0.5643, 1)}
Best val metrics values for fold0
{'val_loss': (0.6834, 2), 'val_acc': (0.5701, 1)}
