Implementation of RoBERTa

In [13]:
import numpy as np
import torch
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from torch import nn
from torch.optim import Adam
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm
import random
import re
import nltk
import string
import matplotlib.pyplot as plt
import sys
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\am87383\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [14]:
#pip install torch

In [15]:
sample_submission = pd.read_csv("sample_submission.csv")
train_df = pd.read_csv("train_nlp.csv")
test_df = pd.read_csv("test_nlp.csv")

In [16]:
keywords = train_df.keyword.values
texts = train_df.text.values
contained_keyword_num = 0
nan_num = 0
for keyword,text in zip(keywords,texts):
    if isinstance(keyword, str):
        if keyword.lower() in text.lower():
            contained_keyword_num += 1
    else:
        nan_num += 1
print(f'Total data num = {keywords.shape[0]}\n'
      f'Num of contained-keyword sentence = {contained_keyword_num}\n'
      f'Num of nan keyword = {nan_num}\n'
      f'The ratio of keyword in sentence = {contained_keyword_num / (keywords.shape[0] - nan_num)}')

Total data num = 7613
Num of contained-keyword sentence = 5973
Num of nan keyword = 61
The ratio of keyword in sentence = 0.790916313559322


In [18]:
keywords.shape

(7613,)

In [19]:
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)

In [20]:
print('#### training set ####')
display(train_df)
print('\n#### testing set ####')
display(test_df)
print('\n#### sample_submission ####')
display(sample_submission)

#### training set ####


Unnamed: 0,id,keyword,location,text,target
4996,7128,military,Texas,Courageous and honest analysis of need to use ...,1
3263,4688,engulfed,,@ZachZaidman @670TheScore wld b a shame if tha...,0
4907,6984,massacre,Cottonwood Arizona,Tell @BarackObama to rescind medals of 'honor'...,1
2855,4103,drought,"Spokane, WA",Worried about how the CA drought might affect ...,1
4716,6706,lava,"Medan,Indonesia",@YoungHeroesID Lava Blast &amp; Power Red #Pan...,0
...,...,...,...,...,...
5226,7470,obliteration,Merica!,@Eganator2000 There aren't many Obliteration s...,0
5390,7691,panic,,just had a panic attack bc I don't have enough...,0
860,1242,blood,,Omron HEM-712C Automatic Blood Pressure Monito...,0
7603,10862,,,Officials say a quarantine is in place at an A...,1



#### testing set ####


Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan
...,...,...,...,...
3258,10861,,,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...
3259,10865,,,Storm in RI worse than last hurricane. My city...
3260,10868,,,Green Line derailment in Chicago http://t.co/U...
3261,10874,,,MEG issues Hazardous Weather Outlook (HWO) htt...



#### sample_submission ####


Unnamed: 0,id,target
0,0,0
1,2,0
2,3,0
3,9,0
4,11,0
...,...,...
3258,10861,0
3259,10865,0
3260,10868,0
3261,10874,0


In [21]:
class BertTweetDataset(Dataset):
    def __init__(self, dataframe, tokenizer):
        sentences = dataframe.text.values.tolist()
        sentences = [self._preprocess(sentence) for sentence in sentences]
        self.sentences = sentences
        self.tokenized_sentences = [tokenizer(sentence, padding='max_length',
                                max_length=70,
                                truncation=True,
                                return_tensors="pt")
                      for sentence in sentences]

        if 'target' in dataframe:
            classes = dataframe.target.values.tolist()
            self.labels = classes
    def _preprocess(self, sentence):
        sentence = self._remove_amp(sentence)
        sentence = self._remove_links(sentence)
        sentence = self._remove_hashes(sentence)
        sentence = self._remove_retweets(sentence)
        sentence = self._remove_mentions(sentence)
        sentence = self._remove_multiple_spaces(sentence)
#         sentence = self._lowercase(sentence)
        sentence = self._remove_punctuation(sentence)
        tokens = self._tokenize(sentence)
#         tokens = self._stopword_filtering(tokens)
#         tokens = self._lemmatization(tokens)
        sentence = self._stitch_text_tokens_together(tokens)
        return sentence.strip()
    def _remove_amp(self, sentence):
        return sentence.replace("&amp;", " ")
    
    def _remove_links(self, sentence):
        return re.sub(r'https?:\/\/[^\s\n\r]+', ' ', sentence)
    
    def _remove_hashes(self, sentence):
        return re.sub(r'#', ' ', sentence)
    
    def _remove_retweets(self, sentence):
        return re.sub(r'^RT[\s]+', ' ', sentence)

    def _remove_mentions(self, sentence):
        return re.sub(r'(@.*?)[\s]', ' ', sentence)
    
    def _remove_multiple_spaces(self, sentence):
        return re.sub(r'\s+', ' ', sentence)
    
    def _lowercase(self, sentence):
        return sentence.lower()
    
    def _remove_punctuation(self, sentence):
        return ''.join(character for character in sentence if character not in string.punctuation)
    
    def _tokenize(self, sentence):
        return nltk.word_tokenize(sentence, language="english")
    
    def _stopword_filtering(self, tokens):
        stop_words = nltk.corpus.stopwords.words('english')
        return [token for token in tokens if token not in stop_words]
    
    def _lemmatization(self, tokens):
        wordnet_lemmatizer = nltk.stem.WordNetLemmatizer()
        return [wordnet_lemmatizer.lemmatize(token, pos='v') for token in tokens]

    def _stemming(self, tokens):
        porter = nltk.stem.porter.PorterStemmer()
        return [porter.stem(token) for token in tokens]

    def _stitch_text_tokens_together(self, text_tokens):
        return " ".join(text_tokens)

    def __len__(self):
        return len(self.tokenized_sentences)

    def __getitem__(self, idx):
        sentence = self.tokenized_sentences[idx]
        label = -1
        if hasattr(self, 'labels'):
            label = self.labels[idx]
        return sentence, label

In [22]:
class BertBasedTweetClassifier(nn.Module):
    def __init__(self, bert_based_model):
        super().__init__()
        self.bert = bert_based_model
        self.dropout1 = nn.Dropout(0.6)
        self.linear1 = nn.Linear(768, 1)
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, input_ids, attention_mask):
        bert_out = self.bert(input_ids=input_ids, attention_mask=attention_mask)[0][:, 0] #self.bert(...)[0] -> (batch_size, seq_len, hidden_state_size)
        x = self.dropout1(bert_out)
        x = self.linear1(x)
        x = self.sigmoid(x)
        return x

In [23]:
def train_with_bert(model, train_dataloader, val_dataloader, learning_rate, epochs):
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    loss_func = nn.BCELoss()
    optimizer = Adam(model.parameters(), lr=learning_rate)
    model = model.to(device)
    loss_func = loss_func.to(device)
    best_val_loss = sys.float_info.max
    no_promotion_count = 0
    for epoch in range(epochs):
        count_for_successfully_pred_in_training_set = 0
        total_train_loss = 0
        model.train()
        for train_input, train_label in tqdm(train_dataloader):
            attention_mask = train_input['attention_mask'].to(device)
            input_ids = train_input['input_ids'].squeeze(1).to(device) #from (batch_size,1,150) -> (batch_size, 150)
            train_label = train_label.to(device)
            output = model(input_ids, attention_mask)
            loss = loss_func(output, train_label.float().unsqueeze(1))
            total_train_loss += loss.item()
            acc = ((output >= 0.5).int() == train_label.unsqueeze(1)).sum().item()
            count_for_successfully_pred_in_training_set += acc
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        with torch.no_grad():
            count_for_successfully_pred_in_val_set = 0
            total_val_loss = 0
            model.eval()
            for val_input, val_label in tqdm(val_dataloader):
                attention_mask = val_input['attention_mask'].to(device)
                input_ids = val_input['input_ids'].squeeze(1).to(device)
                val_label = val_label.to(device)
                output = model(input_ids, attention_mask)
                loss = loss_func(output, val_label.float().unsqueeze(1))
                total_val_loss += loss.item()
                acc = ((output >= 0.5).int() == val_label.unsqueeze(1)).sum().item()
                count_for_successfully_pred_in_val_set += acc

            avg_train_loss = total_train_loss / len(train_dataloader)
            avg_val_loss = total_val_loss / len(val_dataloader)
            avg_train_acc = count_for_successfully_pred_in_training_set / len(train_dataloader.dataset)
            avg_val_acc = count_for_successfully_pred_in_val_set / len(val_dataloader.dataset)
            print(f'Epochs: {epoch + 1} '
                  f'| Train Loss: {avg_train_loss: .4f} '
                  f'| Train Accuracy: {avg_train_acc: .4f} '
                  f'| Val Loss: {avg_val_loss: .4f} '
                  f'| Val Accuracy: {avg_val_acc: .4f}')
            
            if best_val_loss > avg_val_loss:
                best_val_loss = avg_val_loss
                torch.save(model, f"best_bert_model.pt")
                print("Saved model")
                no_promotion_count = 0
            else:
                no_promotion_count += 1
                
            if no_promotion_count >= 10:
                print("Early stopping")
                break

In [24]:
BERT_MODEL_TYPE = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(BERT_MODEL_TYPE)
bert_based_model = AutoModel.from_pretrained(BERT_MODEL_TYPE)
bert_based_tweet_classifier = BertBasedTweetClassifier(bert_based_model)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [25]:
train_dataloader = DataLoader(BertTweetDataset(train_df, tokenizer), batch_size=16, shuffle=True)
val_dataloader = DataLoader(BertTweetDataset(val_df, tokenizer), batch_size=16)
test_dataloader = DataLoader(BertTweetDataset(test_df, tokenizer), batch_size=16)

In [26]:
# Check length
max_num_of_tokens = 0
keep_sentence = ''
count = 0
loader = val_dataloader
for sentence in loader.dataset.sentences:
    num_of_tokens = tokenizer(sentence, return_tensors="pt")['input_ids'].shape[1]
    count += num_of_tokens
    keep_sentence = sentence if max_num_of_tokens < num_of_tokens else keep_sentence
    max_num_of_tokens = num_of_tokens if max_num_of_tokens < num_of_tokens else max_num_of_tokens
print(f'After text preprocessing:\nMax length of tokens in one sentence= {max_num_of_tokens} | Average length of tokens in one sentence= {count / len(loader.dataset.sentences)}')

After text preprocessing:
Max length of tokens in one sentence= 53 | Average length of tokens in one sentence= 19.514773473407747


In [27]:
learning_rate = 1e-5
epochs = 16
train_with_bert(bert_based_tweet_classifier, train_dataloader, val_dataloader, learning_rate, epochs)

100%|████████████████████████████████████████████████████████████████████████████████| 381/381 [32:56<00:00,  5.19s/it]
100%|██████████████████████████████████████████████████████████████████████████████████| 96/96 [02:35<00:00,  1.62s/it]


Epochs: 1 | Train Loss:  0.4734 | Train Accuracy:  0.7749 | Val Loss:  0.3771 | Val Accuracy:  0.8431
Saved model


100%|████████████████████████████████████████████████████████████████████████████████| 381/381 [32:33<00:00,  5.13s/it]
100%|██████████████████████████████████████████████████████████████████████████████████| 96/96 [02:28<00:00,  1.54s/it]


Epochs: 2 | Train Loss:  0.3625 | Train Accuracy:  0.8514 | Val Loss:  0.3780 | Val Accuracy:  0.8372


100%|████████████████████████████████████████████████████████████████████████████████| 381/381 [32:27<00:00,  5.11s/it]
100%|██████████████████████████████████████████████████████████████████████████████████| 96/96 [02:33<00:00,  1.60s/it]


Epochs: 3 | Train Loss:  0.3068 | Train Accuracy:  0.8762 | Val Loss:  0.4093 | Val Accuracy:  0.8326


100%|████████████████████████████████████████████████████████████████████████████████| 381/381 [32:27<00:00,  5.11s/it]
100%|██████████████████████████████████████████████████████████████████████████████████| 96/96 [02:33<00:00,  1.60s/it]


Epochs: 4 | Train Loss:  0.2446 | Train Accuracy:  0.9018 | Val Loss:  0.5105 | Val Accuracy:  0.8424


100%|████████████████████████████████████████████████████████████████████████████████| 381/381 [32:33<00:00,  5.13s/it]
100%|██████████████████████████████████████████████████████████████████████████████████| 96/96 [02:28<00:00,  1.55s/it]


Epochs: 5 | Train Loss:  0.1900 | Train Accuracy:  0.9241 | Val Loss:  0.6361 | Val Accuracy:  0.8070


100%|████████████████████████████████████████████████████████████████████████████████| 381/381 [32:14<00:00,  5.08s/it]
100%|██████████████████████████████████████████████████████████████████████████████████| 96/96 [02:28<00:00,  1.54s/it]


Epochs: 6 | Train Loss:  0.1541 | Train Accuracy:  0.9399 | Val Loss:  0.6199 | Val Accuracy:  0.8214


100%|████████████████████████████████████████████████████████████████████████████████| 381/381 [31:59<00:00,  5.04s/it]
100%|██████████████████████████████████████████████████████████████████████████████████| 96/96 [02:35<00:00,  1.62s/it]


Epochs: 7 | Train Loss:  0.1210 | Train Accuracy:  0.9535 | Val Loss:  0.6371 | Val Accuracy:  0.8194


100%|████████████████████████████████████████████████████████████████████████████████| 381/381 [31:48<00:00,  5.01s/it]
100%|██████████████████████████████████████████████████████████████████████████████████| 96/96 [02:31<00:00,  1.58s/it]


Epochs: 8 | Train Loss:  0.1068 | Train Accuracy:  0.9603 | Val Loss:  0.7290 | Val Accuracy:  0.8083


100%|████████████████████████████████████████████████████████████████████████████████| 381/381 [31:56<00:00,  5.03s/it]
100%|██████████████████████████████████████████████████████████████████████████████████| 96/96 [02:30<00:00,  1.56s/it]


Epochs: 9 | Train Loss:  0.0809 | Train Accuracy:  0.9673 | Val Loss:  0.8699 | Val Accuracy:  0.8070


100%|████████████████████████████████████████████████████████████████████████████████| 381/381 [31:53<00:00,  5.02s/it]
100%|██████████████████████████████████████████████████████████████████████████████████| 96/96 [02:30<00:00,  1.57s/it]


Epochs: 10 | Train Loss:  0.0797 | Train Accuracy:  0.9693 | Val Loss:  0.8843 | Val Accuracy:  0.8135


100%|████████████████████████████████████████████████████████████████████████████████| 381/381 [31:56<00:00,  5.03s/it]
100%|██████████████████████████████████████████████████████████████████████████████████| 96/96 [02:30<00:00,  1.57s/it]

Epochs: 11 | Train Loss:  0.0639 | Train Accuracy:  0.9736 | Val Loss:  0.8605 | Val Accuracy:  0.8102
Early stopping





In [28]:
def disaster_predictions_for_bert(model, loader):
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    model = model.to(device)
    results_predictions = []
    with torch.no_grad():
        model.eval()
        for data_input, _ in tqdm(loader):
            attention_mask = data_input['attention_mask'].to(device)
            input_ids = data_input['input_ids'].squeeze(1).to(device)
            output = model(input_ids, attention_mask)
            output = (output > 0.5).int()
            results_predictions.append(output)
    return torch.cat(results_predictions, 0).cpu().numpy()

In [29]:
predict_model = torch.load("best_bert_model.pt")
test_data_prediction = disaster_predictions_for_bert(predict_model, test_dataloader)
test_data_prediction = test_data_prediction.reshape(-1,)
sample_submission["target"] = test_data_prediction
display(sample_submission.head(10))

100%|████████████████████████████████████████████████████████████████████████████████| 204/204 [05:36<00:00,  1.65s/it]


Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1
5,12,1
6,21,0
7,22,0
8,27,0
9,29,0


In [30]:
sample_submission.to_csv("submission_bert5.csv", index=False)

Below, we test for our scraped dataset: 

In [52]:
test2_df = pd.read_csv("scrape_test.csv", encoding= 'unicode_escape')

In [57]:
test_dataloader = DataLoader(BertTweetDataset(test2_df, tokenizer), batch_size=16)
predict_model = torch.load("best_bert_model.pt")
test_data_prediction = disaster_predictions_for_bert(predict_model, test_dataloader)
test_data_prediction = test_data_prediction.reshape(-1,)


100%|████████████████████████████████████████████████████████████████████████████████| 128/128 [03:32<00:00,  1.66s/it]


In [35]:
test_data_prediction

array([1, 1, 0, ..., 0, 0, 0])

In [53]:
y_true = test2_df["target"]

In [54]:
y_true_arr = y_true.to_numpy()

In [55]:
y_true_arr

array([1, 1, 1, ..., 0, 0, 1], dtype=int64)

In [56]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_true_arr, test_data_prediction)

array([[1027,  279],
       [ 126,  605]], dtype=int64)