## BERT Real Data Testing Notebook

This notebook contains the code to test the trained Bert model on the twitter and NYT data

## Modules

In [3]:
from my_dataset import My_Dataset_Test
import torch
from torch.utils.data import DataLoader, Dataset
import evaluate
import pandas as pd 
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from tqdm.auto import tqdm

## Model Loading

In [4]:
model_path = "checkpoints/bert-base-uncased_4000_0.pt"
MODEL = "bert-base-uncased"
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [5]:
# Getting the tokenzier to use
tokenizer = AutoTokenizer.from_pretrained(MODEL)

In [6]:
# Defining function to compute the tokenization
def tokenize_function(data):
    # value = tokenizer(data["sequence"], padding="max_length", truncation=True)
    # return value['input_ids'], value['token_type_ids'], value['attention_mask']
    return tokenizer(data["sequence"], padding="max_length", truncation=True)

def prep_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Defining function that preps the data
    """
    
    # Applying the tokenizer on the inputs
    tokenized_values = df.apply(tokenize_function, axis=1)
    
    # Splitting the results into a dataframe
    tokenized_values = tokenized_values.apply(pd.Series)
    
    # Merging the tokenized values together
    df = pd.concat([df,tokenized_values], axis=1)
    
    # Renaming columns
    df = df.rename(columns = {"label": "labels"})
    
    # Dropping columns
    df = df.drop(['sequence'],axis=1)
    
    # Returning the dataset
    return df

In [7]:
model = AutoModelForSequenceClassification.from_pretrained(MODEL, num_labels=2)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [8]:
state_dict = torch.load(model_path)

In [9]:
model.load_state_dict(state_dict['model_state_dict'])

<All keys matched successfully>

In [10]:
model = model.to(device)

In [11]:
model = model.eval()

In [12]:
softmax = torch.nn.Softmax(dim=-1)

## Twitter Data

### Running Model

In [13]:
# Loading data
twitter = pd.read_csv(f"data/twitter_scraped_cleaned.csv", index_col=0)

In [14]:
# Renaming column to match training data
twitter = twitter.rename({"text":"sequence", "twitter_handle": "company"}, axis=1)

In [15]:
# Prepping the data
twitter = prep_data(twitter)

In [16]:
# Creating dataset
twitter_dataset = My_Dataset_Test(twitter)

In [17]:
# Formulating data loader
dataloader = DataLoader(twitter_dataset, batch_size=1, shuffle=False)

In [18]:
progress_bar = tqdm(range(len(twitter)))

results = []

for batch in dataloader:
    # batch = {k: v.to(device) for k, v in batch.items()}
    batch_labels = [batch["company"][0], batch["date"][0]]
    batch_to_pass = {k: v.to(device) for k, v in batch.items() if k in ["labels", "input_ids", "token_type_ids", "attention_mask"]}

    with torch.no_grad():
        outputs = model(**batch_to_pass)

    logits = outputs.logits
    
    distribution = softmax(logits).to("cpu")[0]
    
    prediction = torch.argmax(logits, dim=-1).to("cpu")[0]
    
    batch_labels.append(prediction)
    batch_labels.append(distribution[prediction])
    
    results.append(batch_labels)
    
    progress_bar.update(1)

    

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉| 10205/10209 [01:47<00:00, 87.58it/s]

In [19]:
# Converting the results into a dataframe
results_df = pd.DataFrame(results,columns = ["Company", "Date", "Prediction", "Confidence"])

In [20]:
# Converting some data types
results_df["Prediction"] = results_df["Prediction"].astype(int)
results_df["Confidence"] = results_df["Confidence"].astype(float)

In [22]:
# Writing to disk
results_df.to_csv("results/twitter_bert_results.csv")

## NYT Data

In [23]:
# Loading data
nyt = pd.read_csv(f"data/Api_data.csv", header=None)
nyt.columns = columns=["company", "source", "date", "text"]

In [24]:
# Renaming column to match training data
nyt = nyt.rename({"text":"sequence"}, axis=1)

In [25]:
# Dropping NAs
nyt = nyt[nyt["sequence"].isna() == False]

In [26]:
# Prepping data
nyt = prep_data(nyt)

In [27]:
# Creating dataset
nyt_dataset = My_Dataset_Test(nyt)

In [28]:
# Creating data loader
dataloader = DataLoader(nyt_dataset, batch_size=1, shuffle=False)

In [29]:
progress_bar = tqdm(range(len(nyt)))

results = []

for batch in dataloader:
    # batch = {k: v.to(device) for k, v in batch.items()}
    batch_labels = [batch["company"][0], batch["date"][0]]
    batch_to_pass = {k: v.to(device) for k, v in batch.items() if k in ["labels", "input_ids", "token_type_ids", "attention_mask"]}

    with torch.no_grad():
        outputs = model(**batch_to_pass)

    logits = outputs.logits
    
    distribution = softmax(logits).to("cpu")[0]
    
    prediction = torch.argmax(logits, dim=-1).to("cpu")[0]
    
    batch_labels.append(prediction)
    batch_labels.append(distribution[prediction])
    
    results.append(batch_labels)
    
    progress_bar.update(1)

    


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10209/10209 [04:20<00:00, 39.19it/s][A

  0%|▏                                                                                                                                                                                                                                      | 7/9257 [00:00<02:20, 65.84it/s][A
  0%|▍                                                                                                                                                                                                                                     | 17/9257 [00:00<01:54, 80.48it/s][A
  0%|▋                                                                                                                                                                             

In [30]:
# Converting results to data frame
results_df = pd.DataFrame(results,columns = ["Company", "Date", "Prediction", "Confidence"])

In [31]:
# Converting some data types
results_df["Prediction"] = results_df["Prediction"].astype(int)
results_df["Confidence"] = results_df["Confidence"].astype(float)

In [32]:
# Writing to disk
results_df.to_csv("results/nyt_bert_results.csv")


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9257/9257 [01:49<00:00, 97.27it/s][A