In [15]:
import os

import numpy as np
import pandas as pd
import torch
import re
import nltk
import torch.nn as nn
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, set_seed
from transformers import TrainingArguments, Trainer, DataCollatorWithPadding
from sklearn.model_selection import StratifiedKFold
from torch.nn import functional as F

In [16]:
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["WANDB_DISABLED"] = "true"

### Configuration for training

In [17]:
class MODEL_EVAL_METRIC:
    accuracy = "accuracy"
    f1_score = "f1_score"

class Config:
    TWEET_COL = "processed_text"
    RANDOM_STATE = 42
    BATCH_SIZE = 16
    OUT_SIZE = 2
    NUM_FOLDS = 5
    NUM_EPOCHS = 4
    NUM_WORKERS = 8
    TRANSFORMER_CHECKPOINT = "bert-base-uncased"
    # The hidden_size of the output of the last layer of the transformer model used
    TRANSFORMER_OUT_SIZE = 768
    PAD_TOKEN_ID = 0
    DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    MODEL_EVAL_METRIC = MODEL_EVAL_METRIC.accuracy
    FAST_DEV_RUN = False    
    PATIENCE = 5    
    IS_BIDIRECTIONAL = True
    # model hyperparameters
    MODEL_HPARAMS = {
        "lr": 0.00003,
    }

DATA_PATH = "./data/"

# For results reproducibility 
# sets seeds for numpy, torch, python.random and PYTHONHASHSEED.
set_seed(Config.RANDOM_STATE)
print(Config.DEVICE)

cuda


### Load the data

In [18]:
df_train = pd.read_csv(DATA_PATH + 'train.csv')
df_test = pd.read_csv(DATA_PATH + 'test.csv')
print(f"Rows in train.csv = {len(df_train)}")
print(f"Rows in test.csv = {len(df_test)}")
pd.set_option('display.max_colwidth', None)
df_train.head()

Rows in train.csv = 7613
Rows in test.csv = 3263


Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected,1
3,6,,,"13,000 people receive #wildfires evacuation orders in California",1
4,7,,,Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school,1


### K Fold CV
Split the training dataframe into kfolds for cross validation. We do this before any processing is done
on the data. We use stratified kfold if the target distribution is unbalanced

In [19]:
def strat_kfold_dataframe(df, num_folds=5):
    # we create a new column called kfold and fill it with -1
    df["kfold"] = -1
    # randomize of shuffle the rows of dataframe before splitting is done
    df.sample(frac=1, random_state=Config.RANDOM_STATE).reset_index(drop=True)
    y = df["target"].values
    skf = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=Config.RANDOM_STATE)
    # stratification is done on the basis of y labels, a placeholder for X is sufficient
    for fold, (train_idx, val_idx) in enumerate(skf.split(X=df, y=y)):
        df.loc[val_idx, "kfold"] = fold
    return df

df_train = strat_kfold_dataframe(df_train, num_folds=Config.NUM_FOLDS)            

### Tweet preprocessing

In [20]:
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/bk_anupam/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/bk_anupam/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [21]:
punct = "/-'?!.,#$%\'()*+-/:;<=>@[\\]^_`{|}~`" + '""“”’' + '∞θ÷α•à−β∅³π‘₹´°£€\×™√²—–&'
def clean_special_chars(text, punct):
    for p in punct:
        text = text.replace(p, ' ')
    return text

def process_tweet(df, text, keyword):
    lemmatizer = WordNetLemmatizer()    
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)    
    processed_text = []
    stop = stopwords.words("english")
    for tweet, keyword in zip(df[text], df[keyword]):
        tweets_clean = []        
        # remove stock market tickers like $GE        
        tweet = re.sub(r'\$\w*', '', tweet)
        # remove old style retweet text "RT"
        tweet = re.sub(r'^RT[\s]+', '', tweet)
        # remove hyperlinks
        tweet = re.sub(r'http\S+', '', tweet)
        # remove hashtags
        # only removing the hash #, @, ... sign from the word
        tweet = re.sub(r'\.{3}|@|#', '', tweet)    
        tweet = clean_special_chars(tweet, punct)
        # remove junk characters which don't have an ascii code
        tweet = tweet.encode("ascii", "ignore").decode("utf-8", "ignore")
        # tokenize tweets        
        tweet_tokens = tokenizer.tokenize(tweet)
        for word in tweet_tokens:
            # remove stopwords and punctuation
            #if (word.isalpha() and len(word) > 2 and word not in stop and word not in string.punctuation):
                #stem_word = stemmer.stem(word)  # stemming word            
                #lem_word = lemmatizer.lemmatize(word)
                #tweets_clean.append(lem_word) 
                tweets_clean.append(word)
        processed_text.append(" ".join(tweets_clean))        
    df['processed_text'] = np.array(processed_text)

In [22]:
# Fill in missing values
df_train["keyword"] = df_train["keyword"].fillna("no_keyword")
df_test["keyword"] = df_test["keyword"].fillna("no_keyword")
process_tweet(df_train, 'text', "keyword")
process_tweet(df_test, 'text', "keyword")
# length of the processed tweet
df_train["prcsd_tweet_len"] = df_train["processed_text"].apply(lambda row: len(row.split()))
df_test["prcsd_tweet_len"] = df_test["processed_text"].apply(lambda row: len(row.split()))
df_train.iloc[50:52, :]

Unnamed: 0,id,keyword,location,text,target,kfold,processed_text,prcsd_tweet_len
50,73,ablaze,"Sheffield Township, Ohio",Deputies: Man shot before Brighton home set ablaze http://t.co/gWNRhMSO8k,1,2,deputies man shot before brighton home set ablaze,8
51,74,ablaze,India,Man wife get six years jail for setting ablaze niece\nhttp://t.co/eV1ahOUCZA,1,0,man wife get six years jail for setting ablaze niece,10


In [23]:
tokenizer = BertTokenizer.from_pretrained(Config.TRANSFORMER_CHECKPOINT)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

## Dataset for transformer model
Converts tweets into pytorch dataset compatible with BERT and other transformers

In [24]:
# Convert tweets to data that the BERT model understands
class TransformerTweetDataset(Dataset):
    def __init__(self, tweets, targets, tokenizer, with_labels=True):
        self.tweets = tweets
        self.targets = targets
        self.tokenizer = tokenizer
        self.with_labels = with_labels

    def __len__(self):
        return len(self.tweets)

    def __getitem__(self, item_idx):
        inputs = self.tokenizer(self.tweets[item_idx], padding="longest", truncation=True)
        item = {
            "input_ids": torch.LongTensor(inputs["input_ids"]),
            "token_type_ids": torch.LongTensor(inputs["token_type_ids"]),
            "attention_mask": torch.LongTensor(inputs["attention_mask"])            
        }
        if self.with_labels:
            targets = self.targets[item_idx]
            item["labels"] = torch.LongTensor([targets])
        return item

In [25]:
from datasets import Dataset

# Instead of creating a custom pytorch dataset, we use the HF dataset library
ds_train = Dataset.from_pandas(df=df_train)
raw_ds_col_names = ds_train.column_names
print(raw_ds_col_names)
print(ds_train)

['id', 'keyword', 'location', 'text', 'target', 'kfold', 'processed_text', 'prcsd_tweet_len']
Dataset({
    features: ['id', 'keyword', 'location', 'text', 'target', 'kfold', 'processed_text', 'prcsd_tweet_len'],
    num_rows: 7613
})


In [26]:
# Function to tokenize tweets, to be passed to HF datasets.map for tokenization in batches (performed parallely)
def tokenize_tweets(tokenizer, with_labels, data_row):
    encoding = tokenizer(
        text = data_row["text"],
        # we don't want to do padding while tokenizing the text (for the entire dataset) but while preparing batches
        # using DataCollatorWithPadding. 
        padding = False,
        # If the text length exceeds than what model can handle, truncate the text
        truncation = True
    )
    # For train and validation data the encoding needs to contain the labels as well in addition to input_ids, token_type_ids 
    # and attention_mask. For test data, labels are not present in encoding.
    if with_labels:
        encoding["labels"] = data_row["target"]
    return encoding

In [27]:
from functools import partial

preprocess_train_data = partial(tokenize_tweets, tokenizer, True)  
preprocess_test_data = partial(tokenize_tweets, tokenizer, False)  

In [28]:
# Create fold specific datasets using HF datasets library
def get_fold_data_hf(fold, df):
    train_df = df[df.kfold != fold].reset_index(drop=True)
    valid_df = df[df.kfold == fold].reset_index(drop=True)
    ds_train_raw = Dataset.from_pandas(train_df)
    ds_valid_raw = Dataset.from_pandas(valid_df)
    raw_ds_col_names = ds_train_raw.column_names    
    ds_train = ds_train_raw.map(preprocess_train_data, batched=True, batch_size=1000, remove_columns=raw_ds_col_names)
    ds_valid = ds_valid_raw.map(preprocess_train_data, batched=True, batch_size=1000, remove_columns=raw_ds_col_names)    
    return ds_train, ds_valid

In [29]:
def get_fold_data(fold, df, tokenizer, tweet_col="text"):
    train_df = df[df.kfold != fold].reset_index(drop=True)
    valid_df = df[df.kfold == fold].reset_index(drop=True)
    X_train = train_df[tweet_col].values.tolist()
    y_train = train_df.target.values.tolist()
    X_valid = valid_df[tweet_col].values.tolist()
    y_valid = valid_df["target"].values.tolist()
    ds_train = TransformerTweetDataset(X_train, y_train, tokenizer)
    ds_valid = TransformerTweetDataset(X_valid, y_valid, tokenizer)    
    return ds_train, ds_valid

In [34]:
ds_train, ds_val = get_fold_data_hf(0, df_train)

Map:   0%|          | 0/6090 [00:00<?, ? examples/s]

Map:   0%|          | 0/1523 [00:00<?, ? examples/s]

In [36]:
ds_train

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 6090
})

In [14]:
from sklearn.metrics import accuracy_score

def compute_metrics(pred):
  labels = pred.label_ids
  preds = pred.predictions.argmax(-1)
  # calculate accuracy using sklearn's function
  acc = accuracy_score(labels, preds)
  return {
      'accuracy': acc,
  }

In [15]:
def get_fold_training_args(fold):
    fold_out_dir = f"./fold_{fold}_results"
    return TrainingArguments(
        output_dir=fold_out_dir,
        evaluation_strategy="epoch",
        save_strategy='epoch',
        warmup_ratio=0.1,
        num_train_epochs=Config.NUM_EPOCHS,
        per_device_train_batch_size=Config.BATCH_SIZE,
        per_device_eval_batch_size=Config.BATCH_SIZE,
        weight_decay=0.01,
        learning_rate=2e-5,    
        gradient_accumulation_steps=8    
    )

In [15]:
for fold in range(Config.NUM_FOLDS):
    fold_str = f"fold{fold}"
    print(f"Running training for {fold_str}")
    ds_train, ds_val = get_fold_data(fold, df_train, tokenizer, tweet_col=Config.TWEET_COL)
    model = BertForSequenceClassification.from_pretrained(
                Config.TRANSFORMER_CHECKPOINT, 
                num_labels=Config.OUT_SIZE
            )
    trainer = Trainer(
        model=model,                         # the instantiated Transformers model to be trained
        args=get_fold_training_args(fold),   # training arguments, defined above
        train_dataset=ds_train,              # training dataset
        eval_dataset=ds_val,                 # evaluation dataset
        compute_metrics=compute_metrics,     # the callback that computes metrics of interest
        data_collator=data_collator,
        tokenizer=tokenizer
    )
    trainer.train()
    del trainer, model

Running training for fold0


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Epoch,Training Loss,Validation Loss,Accuracy
0,No log,0.424999,0.818122
1,No log,0.384461,0.840446
2,No log,0.398279,0.833224
3,No log,0.410809,0.83782


***** Running Evaluation *****
  Num examples = 1523
  Batch size = 16
Saving model checkpoint to ./fold_0_results/checkpoint-47
Configuration saved in ./fold_0_results/checkpoint-47/config.json
Model weights saved in ./fold_0_results/checkpoint-47/pytorch_model.bin
tokenizer config file saved in ./fold_0_results/checkpoint-47/tokenizer_config.json
Special tokens file saved in ./fold_0_results/checkpoint-47/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 1523
  Batch size = 16
Saving model checkpoint to ./fold_0_results/checkpoint-94
Configuration saved in ./fold_0_results/checkpoint-94/config.json
Model weights saved in ./fold_0_results/checkpoint-94/pytorch_model.bin
tokenizer config file saved in ./fold_0_results/checkpoint-94/tokenizer_config.json
Special tokens file saved in ./fold_0_results/checkpoint-94/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 1523
  Batch size = 16
Saving model checkpoint to ./fold_0_results/checkpoint-141


Running training for fold1


loading configuration file https://huggingface.co/bert-base-uncased/resolve/main/config.json from cache at /home/bk_anupam/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.12.5",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file https://huggingface.co/bert-base-uncased/resolve/main/pytorch_model.bin f

Epoch,Training Loss,Validation Loss,Accuracy
0,No log,0.428781,0.824688
1,No log,0.38842,0.840446
2,No log,0.378548,0.853578
3,No log,0.392753,0.848326


***** Running Evaluation *****
  Num examples = 1523
  Batch size = 16
Saving model checkpoint to ./fold_1_results/checkpoint-47
Configuration saved in ./fold_1_results/checkpoint-47/config.json
Model weights saved in ./fold_1_results/checkpoint-47/pytorch_model.bin
tokenizer config file saved in ./fold_1_results/checkpoint-47/tokenizer_config.json
Special tokens file saved in ./fold_1_results/checkpoint-47/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 1523
  Batch size = 16
Saving model checkpoint to ./fold_1_results/checkpoint-94
Configuration saved in ./fold_1_results/checkpoint-94/config.json
Model weights saved in ./fold_1_results/checkpoint-94/pytorch_model.bin
tokenizer config file saved in ./fold_1_results/checkpoint-94/tokenizer_config.json
Special tokens file saved in ./fold_1_results/checkpoint-94/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 1523
  Batch size = 16
Saving model checkpoint to ./fold_1_results/checkpoint-141


Running training for fold2


loading configuration file https://huggingface.co/bert-base-uncased/resolve/main/config.json from cache at /home/bk_anupam/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.12.5",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file https://huggingface.co/bert-base-uncased/resolve/main/pytorch_model.bin f

Epoch,Training Loss,Validation Loss,Accuracy
0,No log,0.455982,0.805647
1,No log,0.417909,0.821405
2,No log,0.423304,0.828628
3,No log,0.424576,0.827971


***** Running Evaluation *****
  Num examples = 1523
  Batch size = 16
Saving model checkpoint to ./fold_2_results/checkpoint-47
Configuration saved in ./fold_2_results/checkpoint-47/config.json
Model weights saved in ./fold_2_results/checkpoint-47/pytorch_model.bin
tokenizer config file saved in ./fold_2_results/checkpoint-47/tokenizer_config.json
Special tokens file saved in ./fold_2_results/checkpoint-47/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 1523
  Batch size = 16
Saving model checkpoint to ./fold_2_results/checkpoint-94
Configuration saved in ./fold_2_results/checkpoint-94/config.json
Model weights saved in ./fold_2_results/checkpoint-94/pytorch_model.bin
tokenizer config file saved in ./fold_2_results/checkpoint-94/tokenizer_config.json
Special tokens file saved in ./fold_2_results/checkpoint-94/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 1523
  Batch size = 16
Saving model checkpoint to ./fold_2_results/checkpoint-141


Running training for fold3


loading configuration file https://huggingface.co/bert-base-uncased/resolve/main/config.json from cache at /home/bk_anupam/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.12.5",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file https://huggingface.co/bert-base-uncased/resolve/main/pytorch_model.bin f

Epoch,Training Loss,Validation Loss,Accuracy
0,No log,0.438206,0.81866
1,No log,0.395755,0.8318
2,No log,0.402718,0.827201


***** Running Evaluation *****
  Num examples = 1522
  Batch size = 16
Saving model checkpoint to ./fold_3_results/checkpoint-47
Configuration saved in ./fold_3_results/checkpoint-47/config.json
Model weights saved in ./fold_3_results/checkpoint-47/pytorch_model.bin
tokenizer config file saved in ./fold_3_results/checkpoint-47/tokenizer_config.json
Special tokens file saved in ./fold_3_results/checkpoint-47/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 1522
  Batch size = 16
Saving model checkpoint to ./fold_3_results/checkpoint-94
Configuration saved in ./fold_3_results/checkpoint-94/config.json
Model weights saved in ./fold_3_results/checkpoint-94/pytorch_model.bin
tokenizer config file saved in ./fold_3_results/checkpoint-94/tokenizer_config.json
Special tokens file saved in ./fold_3_results/checkpoint-94/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 1522
  Batch size = 16
Saving model checkpoint to ./fold_3_results/checkpoint-141


KeyboardInterrupt: 

In [25]:
ds_test = TransformerTweetDataset(df_test[Config.TWEET_COL].values.tolist(), None, tokenizer, with_labels=False)

In [16]:
best_model = BertForSequenceClassification.from_pretrained("./fold_1_results/checkpoint-141")

In [17]:
from datasets import Dataset

test_dataset = Dataset.from_pandas(df_test)
test_dataset

Dataset({
    features: ['id', 'keyword', 'location', 'text', 'processed_text', 'prcsd_tweet_len'],
    num_rows: 3263
})

In [19]:
def tokenize_tweets(row):
    return tokenizer(row[Config.TWEET_COL], truncation=True)

In [13]:
test_inputs = tokenizer(df_test[Config.TWEET_COL].values.tolist(), padding="longest", truncation=True, return_tensors="pt")

In [20]:
tokenized_test_dataset = test_dataset.map(tokenize_tweets, batched=True)

  0%|          | 0/4 [00:00<?, ?ba/s]

In [21]:
tokenized_test_dataset

Dataset({
    features: ['attention_mask', 'id', 'input_ids', 'keyword', 'location', 'prcsd_tweet_len', 'processed_text', 'text', 'token_type_ids'],
    num_rows: 3263
})

In [29]:
test_args = TrainingArguments(
        output_dir="./predict",
        do_train=False,
        do_eval=False,
        do_predict=True
    )

test_trainer = Trainer(
    model=best_model,    
    args=test_args,
    eval_dataset=ds_test,
    data_collator=data_collator,
    tokenizer=tokenizer
)

test_preds = test_trainer.predict(ds_test)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
***** Running Prediction *****
  Num examples = 3263
  Batch size = 8


In [31]:
test_pred_labels = np.argmax(test_preds.predictions, axis=1)

In [33]:
df_submission = pd.read_csv(DATA_PATH + 'submission.csv')
df_submission['target']= test_pred_labels
df_submission.to_csv('my_submission.csv',index=False)