In [44]:
import pandas as pd
import torch
import logging
import os
import time
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, DistilBertTokenizer, DistilBertForSequenceClassification, AlbertTokenizer, AlbertForSequenceClassification, RobertaTokenizer, RobertaForSequenceClassification
from sklearn.model_selection import train_test_split
from transformers import Trainer, TrainingArguments
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.preprocessing import OneHotEncoder
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
from tqdm import tqdm
from datetime import datetime
import numpy as np

In [78]:
log_filename = f'testing_logs/testing_log_{datetime.now().strftime("%Y-%m-%d_%H-%M-%S")}.log'
logging.basicConfig(filename=log_filename, filemode='w', format='%(asctime)s - %(message)s', level=logging.INFO)

In [46]:
def load_category_data(file_path: str, category: str):
    df = pd.read_feather(file_path)
    # add a column for the category
    df['category'] = category
    return df

In [47]:
# load the testing datasets
testing_datasets = {
  'crime': ['snope'],
  'health': ['covid_claims'],
  'politics': ['pheme', 'liar_dataset', ],
  'science': ['climate_dataset'],
  'social_media': ['gossipcop']
}

data = pd.concat([load_category_data(os.path.join(os.path.realpath('.'), f'..\data\{category}\{dataset}.feather'), category) for category, datasets in testing_datasets.items() for dataset in datasets])
print(data.groupby('category').count())

               text  label  metadata  title
category                                   
crime           315    315       315      0
health         2814   2821      2821   2821
politics      41913  41952     41952      0
science         907    907       907      0
social_media  44898  44898     44898      0


In [79]:
# keep only the text and category columns
data = data[['text', 'category']]
data.dropna(inplace=True)
print(data.groupby('category').count())
logging.info(f'Number of samples: {len(data)}')
logging.info(data.groupby('category').count())

               text
category           
crime           315
health         2814
politics      41913
science         907
social_media  44898


In [49]:
data['encoded_category'] = data['category'].astype('category').cat.codes
data.head()

Unnamed: 0,text,category,encoded_category
174,Was an Italian Economist Removed from a Plane ...,crime,0
8524,Shelby Township Meijer Human Trafficking Warning,crime,0
8523,Shelby Township Meijer Human Trafficking Warning,crime,0
8522,Shelby Township Meijer Human Trafficking Warning,crime,0
8521,Shelby Township Meijer Human Trafficking Warning,crime,0


In [50]:
# load the model
model_state_dir = 'models/distilbert-base-uncased-2023-12-13_17-42-42.pt'
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=len(testing_datasets))

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.bias', 'classifier.weight', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [51]:
# evaluate the model on the test data
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
data_texts = data['text'].tolist()
data_labels = data['encoded_category'].tolist()

test_encodings = tokenizer(list(data_texts), truncation=True, padding=True)

test_inputs = torch.tensor(test_encodings['input_ids'])
test_masks = torch.tensor(test_encodings['attention_mask'])
test_labels = torch.tensor(data_labels)

test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_dataloader = DataLoader(test_data, batch_size=32)

In [59]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)
model.load_state_dict(torch.load(model_state_dir, map_location=device))
model.to(device)
test_inputs = test_inputs.to(device)
test_masks = test_masks.to(device)
test_labels = test_labels.to(device)

cuda


In [62]:
# put the model in evaluation mode
model.eval()

# track variables
predictions, true_labels = [], []

# predict
eval_start_time = time.time()
for batch in test_dataloader:
    inputs = {'input_ids': batch[0].to(device), 'attention_mask': batch[1].to(device), 'labels': None}
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    predictions.extend(logits.argmax(dim=1).cpu().tolist())
    label_ids = batch[2].cpu().tolist()
    true_labels.append(label_ids)
eval_end_time = time.time()

In [71]:
# combine the results across all batches
predictions = np.array(predictions)
true_labels = np.array(true_labels)

In [72]:
# calculate additional metrics
precision = precision_score(true_labels, predictions, average='weighted')
recall = recall_score(true_labels, predictions, average='weighted')
f1 = f1_score(true_labels, predictions, average='weighted')
accuracy = accuracy_score(true_labels, predictions)
g_mean = (recall*accuracy)**0.5

In [77]:
# Log the results
logging.info(f"{model_state_dir}: Evaluation Results:")
logging.info(f"Inference time: {eval_end_time - eval_start_time} seconds")
logging.info(f"Precision: {precision}")
logging.info(f"Recall: {recall}")
logging.info(f"F-score: {f1}")
logging.info(f"Accuracy: {accuracy}")
logging.info(f"G-mean: {g_mean}")

# Print the evaluation results
print(f"{model_state_dir}: Evaluation Results:")
print(f"Inference time: {eval_end_time - eval_start_time} seconds")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F-score: {f1}")
print(f"Accuracy: {accuracy}")
print(f"G-mean: {g_mean}")

models/distilbert-base-uncased-2023-12-13_17-42-42.pt: Evaluation Results:
Inference time: 8960.384637355804 seconds
Precision: 0.9597576979234526
Recall: 0.9554195515537112
F-score: 0.9563099380122453
Accuracy: 0.9554195515537112
G-mean: 0.9554195515537112
