# BERT Testing Notebook
This notebook contains the code to test the bert model

## Modules

In [1]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from my_dataset import My_Dataset
import torch
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
import pandas as pd
import dataframe_image as dfi
import time
import evaluate

  from .autonotebook import tqdm as notebook_tqdm
2023-04-08 12:40:34.343343: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
MODEL = "bert-base-uncased"

In [3]:
# Getting the tokenzier to use
tokenizer = AutoTokenizer.from_pretrained(MODEL)

# Defining function to compute the tokenization
def tokenize_function(data):
    # value = tokenizer(data["sequence"], padding="max_length", truncation=True)
    # return value['input_ids'], value['token_type_ids'], value['attention_mask']
    return tokenizer(data["sequence"], padding="max_length", truncation=True)

def prep_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Defining function that preps the data
    """
    
    # Applying the tokenizer on the inputs
    tokenized_values = df.apply(tokenize_function, axis=1)
    
    # Splitting the results into a dataframe
    tokenized_values = tokenized_values.apply(pd.Series)
    
    # Merging the tokenized values together
    df = pd.concat([df,tokenized_values], axis=1)
    
    # Renaming columns
    df = df.rename(columns = {"label": "labels"})
    
    # Dropping columns
    df = df.drop(['sequence'],axis=1)
    
    
    
    # Returning the dataset
    return df

## Model Loading

In [4]:
MODEL_CHECKPOINT = "checkpoints/bert-base-uncased_4000_0.pt"

In [5]:
model = AutoModelForSequenceClassification.from_pretrained(MODEL, num_labels=2)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [6]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = model.to(device)

In [7]:
state_dict = torch.load(MODEL_CHECKPOINT)

In [8]:
model.load_state_dict(state_dict['model_state_dict'])

<All keys matched successfully>

### Testing (All Data)

In [9]:
NUM_SAMPLES = 4000

In [10]:
# Loading the in the splits for each dataset
train = pd.read_csv(f"data/train_{NUM_SAMPLES}.csv")
test = pd.read_csv(f"data/test_{NUM_SAMPLES}.csv")
val = pd.read_csv(f"data/val_{NUM_SAMPLES}.csv")

In [11]:
# Prepping the data
train = prep_data(train)
test = prep_data(test)
val = prep_data(val)

In [12]:
# Defining the data as pytorch datasets
train_dataset = My_Dataset(train)
val_dataset = My_Dataset(val)
test_dataset = My_Dataset(test)

In [13]:
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=1)
val_dataloader = DataLoader(val_dataset, batch_size=1)
test_dataloader = DataLoader(test_dataset, batch_size=1)

In [14]:
# Defining list to hold the results
results_all = []

#### Validation

In [15]:
softmax = torch.nn.Softmax(dim=-1)

In [16]:
f1 = evaluate.load("f1")
accuracy = evaluate.load("accuracy")
precision = evaluate.load("precision")
recall = evaluate.load("recall")
model.eval()
for batch in val_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    f1.add_batch(predictions=predictions, references=batch["labels"])
    accuracy.add_batch(predictions=predictions, references=batch["labels"])
    precision.add_batch(predictions=predictions, references=batch["labels"])
    recall.add_batch(predictions=predictions, references=batch["labels"])

results_all.append({"Data Split": "Validation", "F1": f1.compute()['f1'], "Accuracy": accuracy.compute()['accuracy'], "Precision": precision.compute()['precision'], "Recall": recall.compute()['recall']})

In [17]:
results_all

[{'Data Split': 'Validation',
  'F1': 0.8179487179487179,
  'Accuracy': 0.8225,
  'Precision': 0.7935323383084577,
  'Recall': 0.843915343915344}]

#### Train

In [18]:
f1 = evaluate.load("f1")
accuracy = evaluate.load("accuracy")
precision = evaluate.load("precision")
recall = evaluate.load("recall")
model.eval()
for batch in train_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    f1.add_batch(predictions=predictions, references=batch["labels"])
    accuracy.add_batch(predictions=predictions, references=batch["labels"])
    precision.add_batch(predictions=predictions, references=batch["labels"])
    recall.add_batch(predictions=predictions, references=batch["labels"])

results_all.append({"Data Split": "Train", "F1": f1.compute()['f1'], "Accuracy": accuracy.compute()['accuracy'], "Precision": precision.compute()['precision'], "Recall": recall.compute()['recall']})

#### Test

In [19]:
test_dataloader = DataLoader(test_dataset, batch_size=1)

In [20]:
f1 = evaluate.load("f1")
accuracy = evaluate.load("accuracy")
precision = evaluate.load("precision")
recall = evaluate.load("recall")
model.eval()
for batch in test_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    
    f1.add_batch(predictions=predictions, references=batch["labels"])
    accuracy.add_batch(predictions=predictions, references=batch["labels"])
    precision.add_batch(predictions=predictions, references=batch["labels"])
    recall.add_batch(predictions=predictions, references=batch["labels"])

results_all.append({"Data Split": "Test", "F1": f1.compute()['f1'], "Accuracy": accuracy.compute()['accuracy'], "Precision": precision.compute()['precision'], "Recall": recall.compute()['recall']})

In [21]:
# Converting to dataframe
results_all = pd.DataFrame(results_all)

In [22]:
# Writing to disk
results_all.to_csv("results/bert_results_all_data.csv")

### Testing (Speed)

In [23]:
t0 = time.time()
for batch in test_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    
t1 = time.time()

print("TIME: ", t1 - t0)
    

TIME:  15.02766466140747


In [24]:
# Formulating DF
rows = [
    {"Model": "BERT", "Data Split": "Test", "Number of Sequences": 1600, "Time (seconds)": 14.875643253326416, "Sequences per Second": 107.55837396424629},
    {"Model": "LSTM", "Data Split": "Test", "Number of Sequences": 1600, "Time (seconds)": 0.5176246166229248, "Sequences per Second": 3091.0431007680527}
]

time_df = pd.DataFrame(rows)

In [25]:
# Rounding values
time_df["Sequences per Second"] = round(time_df["Sequences per Second"], 2)
time_df["Sequences per Second"] = time_df["Sequences per Second"].astype(str)

time_df["Time (seconds)"] = round(time_df["Time (seconds)"], 2)
time_df["Time (seconds)"] = time_df["Time (seconds)"].astype(str)

In [26]:
# Setting the index
time_df = time_df.set_index("Model")

In [27]:
# Setting styles and writing to disk
time_df = time_df.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])
time_df.set_properties(**{'text-align': 'center'})
dfi.export(time_df, 'results/test_timing.png')

/usr/bin/google-chrome


### Testing (News Data)

In [28]:
# Loading the in the splits for each dataset
test = pd.read_csv(f"data/test_{NUM_SAMPLES}_news.csv")
val = pd.read_csv(f"data/val_{NUM_SAMPLES}_news.csv")
train = pd.read_csv(f"data/train_{NUM_SAMPLES}_news.csv")

In [29]:
# Prepping the data
train = prep_data(train)
test = prep_data(test)
val = prep_data(val)

In [30]:
# Defining the data as pytorch datasets
val_dataset = My_Dataset(val)
test_dataset = My_Dataset(test)
train_dataset = My_Dataset(train)

In [31]:
# Creaing list to hold results
results_news = []

In [32]:
val_dataloader = DataLoader(val_dataset, batch_size=8)
test_dataloader = DataLoader(test_dataset, batch_size=8)
train_dataloader = DataLoader(train_dataset, batch_size=8)

#### Validation

In [33]:
f1 = evaluate.load("f1")
accuracy = evaluate.load("accuracy")
precision = evaluate.load("precision")
recall = evaluate.load("recall")
model.eval()
for batch in train_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    # print(softmax(logits))
    f1.add_batch(predictions=predictions, references=batch["labels"])
    accuracy.add_batch(predictions=predictions, references=batch["labels"])
    precision.add_batch(predictions=predictions, references=batch["labels"])
    recall.add_batch(predictions=predictions, references=batch["labels"])

results_news.append({"Data Split": "Train", "F1": f1.compute()['f1'], "Accuracy": accuracy.compute()['accuracy'], "Precision": precision.compute()['precision'], "Recall": recall.compute()['recall']})

In [34]:
f1 = evaluate.load("f1")
accuracy = evaluate.load("accuracy")
precision = evaluate.load("precision")
recall = evaluate.load("recall")
model.eval()
for batch in val_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    # print(softmax(logits))
    f1.add_batch(predictions=predictions, references=batch["labels"])
    accuracy.add_batch(predictions=predictions, references=batch["labels"])
    precision.add_batch(predictions=predictions, references=batch["labels"])
    recall.add_batch(predictions=predictions, references=batch["labels"])

results_news.append({"Data Split": "Validation", "F1": f1.compute()['f1'], "Accuracy": accuracy.compute()['accuracy'], "Precision": precision.compute()['precision'], "Recall": recall.compute()['recall']})

#### Test

In [35]:
f1 = evaluate.load("f1")
accuracy = evaluate.load("accuracy")
precision = evaluate.load("precision")
recall = evaluate.load("recall")
model.eval()
for batch in test_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    # print(softmax(logits))
    f1.add_batch(predictions=predictions, references=batch["labels"])
    accuracy.add_batch(predictions=predictions, references=batch["labels"])
    precision.add_batch(predictions=predictions, references=batch["labels"])
    recall.add_batch(predictions=predictions, references=batch["labels"])

results_news.append({"Data Split": "Test", "F1": f1.compute()['f1'], "Accuracy": accuracy.compute()['accuracy'], "Precision": precision.compute()['precision'], "Recall": recall.compute()['recall']})

In [36]:
# Converting to dataframe
results_news = pd.DataFrame(results_news)

In [37]:
# Writing to disk
results_news.to_csv("results/bert_results_news_data.csv")

### Testing (Twitter Data)

In [38]:
# Loading the in the splits for each dataset
test = pd.read_csv(f"data/test_{NUM_SAMPLES}_tweets.csv")
val = pd.read_csv(f"data/val_{NUM_SAMPLES}_tweets.csv")
train = pd.read_csv(f"data/train_{NUM_SAMPLES}_tweets.csv")

In [39]:
# Prepping the data
test = prep_data(test)
val = prep_data(val)
train = prep_data(train)

In [40]:
# Creating list to hold results
results_twitter = []

In [41]:
# Defining the data as pytorch datasets
val_dataset = My_Dataset(val)
test_dataset = My_Dataset(test)
train_dataset = My_Dataset(train)

In [42]:
val_dataloader = DataLoader(val_dataset, batch_size=8)
test_dataloader = DataLoader(test_dataset, batch_size=8)
train_dataloader = DataLoader(train_dataset, batch_size=8)

#### Validation

In [43]:
f1 = evaluate.load("f1")
accuracy = evaluate.load("accuracy")
precision = evaluate.load("precision")
recall = evaluate.load("recall")
model.eval()
for batch in val_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    # print(softmax(logits))
    f1.add_batch(predictions=predictions, references=batch["labels"])
    accuracy.add_batch(predictions=predictions, references=batch["labels"])
    precision.add_batch(predictions=predictions, references=batch["labels"])
    recall.add_batch(predictions=predictions, references=batch["labels"])

results_twitter.append({"Data Split": "Validation", "F1": f1.compute()['f1'], "Accuracy": accuracy.compute()['accuracy'], "Precision": precision.compute()['precision'], "Recall": recall.compute()['recall']})

#### Train

In [44]:
f1 = evaluate.load("f1")
accuracy = evaluate.load("accuracy")
precision = evaluate.load("precision")
recall = evaluate.load("recall")
model.eval()
for batch in train_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    # print(softmax(logits))
    f1.add_batch(predictions=predictions, references=batch["labels"])
    accuracy.add_batch(predictions=predictions, references=batch["labels"])
    precision.add_batch(predictions=predictions, references=batch["labels"])
    recall.add_batch(predictions=predictions, references=batch["labels"])

results_twitter.append({"Data Split": "Train", "F1": f1.compute()['f1'], "Accuracy": accuracy.compute()['accuracy'], "Precision": precision.compute()['precision'], "Recall": recall.compute()['recall']})

#### Test

In [45]:
f1 = evaluate.load("f1")
accuracy = evaluate.load("accuracy")
precision = evaluate.load("precision")
recall = evaluate.load("recall")
model.eval()
for batch in test_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    # print(softmax(logits))
    f1.add_batch(predictions=predictions, references=batch["labels"])
    accuracy.add_batch(predictions=predictions, references=batch["labels"])
    precision.add_batch(predictions=predictions, references=batch["labels"])
    recall.add_batch(predictions=predictions, references=batch["labels"])

results_twitter.append({"Data Split": "Test", "F1": f1.compute()['f1'], "Accuracy": accuracy.compute()['accuracy'], "Precision": precision.compute()['precision'], "Recall": recall.compute()['recall']})

In [46]:
results_twitter = pd.DataFrame(results_twitter)

In [47]:
results_twitter.to_csv("results/bert_results_twitter_data.csv")

## Results Merging

In [48]:
# Reading in the data
results_twitter = pd.read_csv("results/bert_results_twitter_data.csv", index_col=0).drop(["Precision", "Recall"], axis=1)
results_news = pd.read_csv("results/bert_results_news_data.csv", index_col=0).drop(["Precision", "Recall"], axis=1)
results_all = pd.read_csv("results/bert_results_all_data.csv", index_col=0).drop(["Precision", "Recall"], axis=1)

In [49]:
# Merging the results together
results = pd.merge(pd.merge(results_all,results_twitter, on="Data Split"), results_news, on="Data Split")

In [50]:
# Rounding the datra
results = results.round(2)

In [51]:
# Setting the index
results.set_index("Data Split", inplace=True)

In [52]:
# Creating multi-indexed columns
types = ["All Data" for i in range(0,2)] + ["Sentiment140 Split" for i in range(0,2)] + ["NewsMTSC Split" for i in range(0,2)]
columns = ["Accuracy", "F1"] + ["Accuracy", "F1"]  + ["Accuracy", "F1"]
multi_index_columns = list(zip(types, columns))
results.columns = pd.MultiIndex.from_tuples(multi_index_columns)

In [53]:
results = results.astype(str)

In [54]:
# Reordering rows
results = results.reindex(["Train", "Validation", "Test"])

In [55]:
results

Unnamed: 0_level_0,All Data,All Data,Sentiment140 Split,Sentiment140 Split,NewsMTSC Split,NewsMTSC Split
Unnamed: 0_level_1,Accuracy,F1,Accuracy,F1,Accuracy,F1
Data Split,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Train,0.89,0.9,0.87,0.87,0.92,0.94
Validation,0.82,0.82,0.81,0.8,0.82,0.84
Test,0.8,0.82,0.8,0.8,0.81,0.84


In [56]:
# Setting styles and writing to disk
results = results.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])
results.set_properties(**{'text-align': 'center'})
dfi.export(results, 'results/bert_results_table.png')

/usr/bin/google-chrome
