## Bert: Neraul Network Model for NLP

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
os.getcwd()

'/content'

In [3]:
import torch

In [4]:
torch.cuda.is_available()

True

In [5]:
!pip install transformers
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.19.2-py3-none-any.whl (4.2 MB)
[K     |████████████████████████████████| 4.2 MB 8.9 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.7.0-py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 5.9 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 51.3 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 57.2 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling PyYA

+ We are gonna use modules Transformers and datasets from Hugging Face

In [6]:
import datasets
import transformers

In [7]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string
import re

In [8]:
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

## Load data set & Preprocessing

+ Won't remove stop word here

In [9]:
#remove punctuation, remove marks, remove urls, lower case,remove numbers, lowercase
#remove ascii chatachters
def removePuncStr(s):
    for c in string.punctuation + 'Ã'+'±'+'ã'+'¼'+'â'+'»'+'§':
        s = s.replace(c, ' ').strip()
    return s

def removePunc(array):
    return [removePuncStr(str_) for str_ in array]

#remove numbers
def removeNumbersStr(s):
    for d in range(10):
        s = s.replace(str(d), ' ')
    return s

def removeNumbers(array):
    return [removeNumbersStr(str_) for str_ in array]

def lemmatize(text_array):
    WNlemmatizer = WordNetLemmatizer()
    lemmatized_text = []
    for h in text_array:
        words = word_tokenize(h)
        h2 = ''
        for w in words:
            h2 = h2 + ' ' + WNlemmatizer.lemmatize(w)
        lemmatized_text.append(h2)
    return lemmatized_text

def removeSpacesStr(s):
    spaces = ['\t', '\r', '\n']
    for space in spaces:
        s = s.replace(space, ' ')
        
def removeSpaces(array):
    return [removeNumbersStr(sent) for sent in array]

def Lowercase(array):
    return [str(sent).lower() for sent in array]

def removeUrl(text_array):
    pattern = r"https://.+\S*"
    removed = []
    for s in text_array:
        s = re.sub(pattern, ' ', s)
        removed.append(s)
    return removed
def removeAscii(text_array):
    return [re.sub(r'[^\x00-\x7f]', '', s) for s in text_array]

In [10]:
def clean(text_array):
    text_array = removeUrl(text_array)
    text_array = removePunc(text_array)
    text_array = removeNumbers(text_array)
    text_array = removeSpaces(text_array)
    text_array = Lowercase(text_array)
    text_array = lemmatize(text_array)
    text_array = removeAscii(text_array)
    return text_array

In [11]:
label_mapper = {
    'Neutral' : 0,
    'Positive' : 1,
    'Extremely Positive' : 2,
    'Negative' : 3,
    'Extremely Negative' : 4
}

In [12]:
# prepare the data for initializing a dataset for Bert
def makeDataset(dataset):
    dataset = dataset[['OriginalTweet', 'Sentiment']]
    text_array = dataset['OriginalTweet']
    text_array = clean(text_array)
    dataset['Text'] = text_array
    dataset = dataset.drop(columns=['OriginalTweet'])
    dataset = dataset.rename(columns = {'Sentiment': 'Labels'})
    dataset['Labels'] = dataset['Labels'].map(label_mapper)
    return dataset

In [13]:
train_set = pd.read_csv('drive/MyDrive/CoronaNLP/Corona_NLP_train.csv', encoding='ISO-8859-1')
test_set = pd.read_csv('drive/MyDrive/CoronaNLP/Corona_NLP_test.csv', encoding='ISO-8859-1')

train_set = makeDataset(train_set)
test_set = makeDataset(test_set)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [14]:
train_set.head()

Unnamed: 0,Labels,Text
0,0,menyrbie phil gahan chrisitv
1,1,advice talk to your neighbour family to excha...
2,1,coronavirus australia woolworth to give elder...
3,1,my food stock is not the only one which is em...
4,4,me ready to go at supermarket during the covi...


## Convert Pandas DataFrame to Dataset object for training

In [15]:
from datasets import Dataset

train_data = Dataset.from_pandas(train_set)
test_data = Dataset.from_pandas(test_set)

In [16]:
label_id = {
    'Neutral' : 0,
    'Positive' : 1,
    'Extremely Positive' : 2,
    'Negative' : 3,
    'Extremely Negative' : 4
}

In [17]:
id2label = {id_:name for name, id_ in label_id.items()}
id2label

{0: 'Neutral',
 1: 'Positive',
 2: 'Extremely Positive',
 3: 'Negative',
 4: 'Extremely Negative'}

In [18]:
#splits train and validation set
#use train_test_split in Datasets module

splitted = train_data.train_test_split(test_size=0.2, shuffle=True, seed=222)
train_data, val_data = splitted['train'], splitted['test']

from datasets import DatasetDict
datasets = DatasetDict({'train' : train_data, 'val' : val_data, 'test' : test_data})


In [19]:
datasets

DatasetDict({
    train: Dataset({
        features: ['Labels', 'Text'],
        num_rows: 32925
    })
    val: Dataset({
        features: ['Labels', 'Text'],
        num_rows: 8232
    })
    test: Dataset({
        features: ['Labels', 'Text'],
        num_rows: 3798
    })
})

## Preprocessing: Tokenize the corpus

In [20]:
## Load Bert Model and Tokenizer
from transformers import BertForSequenceClassification, BertTokenizer

bertModel = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(label_id), id2label=id2label, label2id=label_id)

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

In [21]:
## find max sequence length
max_length = 0
max_text = ''

for text in train_set['Text']:
    if(len(text.split(' ')) > max_length):
        max_length = len(text.split(' '))
        max_text = text
print(max_length)
print(max_text)

66
 i want to go shopping i want to walk w o the thought i might get the covid virus i want a real hug i want to go eat at a dine in restaurant ohhh to be free w o the thought of that darn virus i want to go shopping for shoe i don t need just want to go shopping no more online


In [22]:
%%time

max_sequence_length = 128

def tokenization(dataset):
    
    args = ((dataset['Text'], ))
    result = tokenizer(*args, padding='max_length', max_length=max_sequence_length, truncation=True)
    result['label'] = [label for label in dataset['Labels']]
    
    return result

datasets = datasets.map(tokenization, batched=True, desc='Tokenization')



Tokenization:   0%|          | 0/33 [00:00<?, ?ba/s]

Tokenization:   0%|          | 0/9 [00:00<?, ?ba/s]

Tokenization:   0%|          | 0/4 [00:00<?, ?ba/s]

CPU times: user 40.9 s, sys: 262 ms, total: 41.1 s
Wall time: 41.2 s


In [23]:
datasets['train']

Dataset({
    features: ['Labels', 'Text', 'input_ids', 'token_type_ids', 'attention_mask', 'label'],
    num_rows: 32925
})

In [24]:
for i in range(len(datasets['train'])):
    if(i<=3):
        print('keys: ', datasets['train'][i].keys())
        print('text: ', datasets['train'][i]['Text'])
        print(len(datasets['train'][i]['input_ids']))
        print('tokenized sequence :', tokenizer.decode(datasets['train'][i]['input_ids']))
        print('end-----------------------------------------------------------------------')

keys:  dict_keys(['Labels', 'Text', 'input_ids', 'token_type_ids', 'attention_mask', 'label'])
text:   overheard at the grocery store i like frozen food better than fresh anyways  boyyyy do i have an ex for you girl coronavirus
128
tokenized sequence : [CLS] overheard at the grocery store i like frozen food better than fresh anyways boyyyy do i have an ex for you girl coronavirus [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]
end--------------------

In [25]:
## Training

from datasets import load_metric

metric_name = 'accuracy'
metric = load_metric(metric_name)

def compute_metrics(returned):
    y_pred = returned.predictions[0] if isinstance(returned.predictions, tuple) else returned.predictions
    y_pred = np.argmax(y_pred, axis=1)
    
    return {"accuracy" : (y_pred == returned.label_ids).astype(np.float32).mean().item()}

Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

In [26]:
## Set Training Arguments

from transformers import Trainer, TrainingArguments

batch_size = 32

training_args = TrainingArguments(
'Corona_NLP_Bert',
    evaluation_strategy='epoch',
    save_strategy='epoch',
    learning_rate= 1e-4,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=20,
    weight_decay=0.01,
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    report_to='none'
)

In [27]:
## Initialize Trainer
debug = False

trainer = Trainer(
    model = bertModel,
    args = training_args,
    train_dataset=datasets['train'].select(range(10)) if debug else datasets['train'],
    eval_dataset = datasets['val'].select(range(10)) if debug else datasets['val'],
    compute_metrics=compute_metrics,
    tokenizer=tokenizer
)

In [28]:
## Train
trainer.train()

The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: Text, Labels. If Text, Labels are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 32925
  Num Epochs = 20
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 20580


Epoch,Training Loss,Validation Loss,Accuracy
1,0.6827,0.602485,0.775996
2,0.4947,0.536611,0.811346
3,0.3773,0.527391,0.82896
4,0.2964,0.534616,0.83467
5,0.2385,0.577377,0.831633
6,0.1992,0.642801,0.815476
7,0.1619,0.65638,0.828231
8,0.1459,0.764589,0.82896
9,0.1289,0.760856,0.834913
10,0.1016,0.891111,0.812925


The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: Text, Labels. If Text, Labels are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 8232
  Batch size = 32
Saving model checkpoint to Corona_NLP_Bert/checkpoint-1029
Configuration saved in Corona_NLP_Bert/checkpoint-1029/config.json
Model weights saved in Corona_NLP_Bert/checkpoint-1029/pytorch_model.bin
tokenizer config file saved in Corona_NLP_Bert/checkpoint-1029/tokenizer_config.json
Special tokens file saved in Corona_NLP_Bert/checkpoint-1029/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: Text, Labels. If Text, Labels are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Runnin

TrainOutput(global_step=20580, training_loss=0.16450459276071674, metrics={'train_runtime': 15063.6357, 'train_samples_per_second': 43.715, 'train_steps_per_second': 1.366, 'total_flos': 4.3315824203136e+16, 'train_loss': 0.16450459276071674, 'epoch': 20.0})

In [29]:
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: Text, Labels. If Text, Labels are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 8232
  Batch size = 32


{'epoch': 20.0,
 'eval_accuracy': 0.8426870703697205,
 'eval_loss': 1.248214840888977,
 'eval_runtime': 61.1969,
 'eval_samples_per_second': 134.517,
 'eval_steps_per_second': 4.216}

In [30]:

test_set = datasets['test']
test_set = test_set.remove_columns("Labels")
y_pred = trainer.predict(test_set, metric_key_prefix='predict').predictions
y_pred = np.argmax(y_pred, axis=1)

df_test = test_set.to_pandas()
df_test['pred_sent'] = [id2label[item] for item in y_pred]
output_predict_file = os.path.join(training_args.output_dir, 'predict_results.csv')
df_test.to_csv(output_predict_file, index=False)

from sklearn.metrics import accuracy_score, f1_score, classification_report

y_true = [l for l in df_test['label']]
y_pred = list(y_pred)
print(classification_report(y_true, y_pred))

The following columns in the test set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: Text. If Text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 3798
  Batch size = 32


              precision    recall  f1-score   support

           0       0.87      0.80      0.84       619
           1       0.76      0.82      0.79       947
           2       0.88      0.78      0.83       599
           3       0.77      0.83      0.80      1041
           4       0.87      0.81      0.84       592

    accuracy                           0.81      3798
   macro avg       0.83      0.81      0.82      3798
weighted avg       0.82      0.81      0.81      3798



In [31]:
## We can see the performance of the classification model here
for i in range(10):
  text = df_test.iloc[i]['Text']
  print('######Original Text: ', text)
  encoding = tokenizer(text, return_tensors='pt')
  encoding = {k: v.to(trainer.model.device) for k, v in encoding.items()}

  outputs = trainer.model(**encoding)
  softmax = torch.nn.Softmax(dim=-1)
  label_pred = softmax(outputs.logits).argmax().item()
  print('######Predicted Label: ', id2label[label_pred])
  print('######Original Label: ', id2label[df_test.iloc[i]['label']])
  print("-----------------------------------------------------------")

######Original Text:   trending new yorkers encounter empty supermarket shelf pictured wegmans in brooklyn sold out online grocer foodkick maxdelivery a coronavirus fearing shopper stock up
######Predicted Label:  Negative
######Original Label:  Extremely Negative
-----------------------------------------------------------
######Original Text:   when i couldn t find hand sanitizer at fred meyer i turned to amazon but for a pack of purell check out how coronavirus concern are driving up price
######Predicted Label:  Positive
######Original Label:  Positive
-----------------------------------------------------------
######Original Text:   find out how you can protect yourself and loved one from coronavirus
######Predicted Label:  Extremely Positive
######Original Label:  Extremely Positive
-----------------------------------------------------------
######Original Text:   panic buying hit newyork city a anxious shopper stock up on food amp medical supply after healthcare worker in her s b