In [45]:
import os 
# Configure the PyTorch CUDA allocator through its environment variable.
# This cell runs before the cell that imports torch.
os.environ.update({"PYTORCH_CUDA_ALLOC_CONF": "max_split_size_mb:64"})

In [46]:
# Install the PyTorch packages from the "cu121" wheel index, then the remaining notebook dependencies.
# NOTE(review): no versions are pinned, so a fresh install may resolve different releases — consider pinning.
%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
# NOTE(review): `transformers` is listed twice (plain and with the [torch] extra) — presumably redundant; confirm.
%pip install transformers pandas scikit-learn pyarrow accelerate transformers[torch] ipywidgets tqdm 

Looking in indexes: https://download.pytorch.org/whl/cu121Note: you may need to restart the kernel to use updated packages.




[notice] A new release of pip is available: 23.2.1 -> 23.3.2
[notice] To update, run: python.exe -m pip install --upgrade pip


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.2.1 -> 23.3.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [47]:
import pandas as pd
import torch
import logging
import os
import time
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, DistilBertTokenizerFast, DistilBertForSequenceClassification, AlbertTokenizer, AlbertForSequenceClassification, RobertaTokenizer, RobertaForSequenceClassification
from sklearn.model_selection import train_test_split
from transformers import Trainer, TrainingArguments
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.preprocessing import OneHotEncoder
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
from tqdm import tqdm
from datetime import datetime

In [48]:
# Report whether PyTorch can use a CUDA device; the cell displays the returned boolean.
torch.cuda.is_available()

True

In [49]:
# Write one timestamped log file per run under logs/.
log_filename = f'logs/training_log_{datetime.now().strftime("%Y-%m-%d_%H-%M-%S")}.log'
# The file handler opens the log file immediately and does not create missing
# parent directories, so make sure logs/ exists on a fresh checkout.
os.makedirs(os.path.dirname(log_filename), exist_ok=True)
logging.basicConfig(filename=log_filename, filemode='w', format='%(asctime)s - %(message)s', level=logging.INFO)

In [50]:
def load_category_data(file_path: str, category: str):
    """Read a Feather file and tag every row with its category.

    Args:
        file_path: Path of the Feather file to read.
        category: Value stored in the ``category`` column of every row.

    Returns:
        The loaded DataFrame with a ``category`` column set to ``category``.
    """
    return pd.read_feather(file_path).assign(category=category)

In [51]:
# Registry of supported checkpoints: checkpoint name -> the model class and
# tokenizer class used to load it with `from_pretrained`.
model_map = {
    checkpoint: {'model': model_cls, 'tokenizer': tokenizer_cls}
    for checkpoint, model_cls, tokenizer_cls in (
        ('bert-base-uncased', BertForSequenceClassification, BertTokenizer),
        ('distilbert-base-uncased', DistilBertForSequenceClassification, DistilBertTokenizerFast),
        ('distilbert-base-uncased-finetuned-sst-2-english', DistilBertForSequenceClassification, DistilBertTokenizerFast),
        ('albert-base-v2', AlbertForSequenceClassification, AlbertTokenizer),
        ('roberta-base', RobertaForSequenceClassification, RobertaTokenizer),
    )
}

In [52]:
# Every dataset available per topic category. The number of keys is used as
# the number of classification labels when the model is created.
categories = dict(
    crime=['FA-KES-Dataset', 'snope'],
    health=['covid_claims', 'covid_fake_news_dataset', 'covid_FNIR'],
    politics=['politifact_dataset', 'liar_dataset', 'fake_news_dataset', 'pheme'],
    science=['climate_dataset'],
    social_media=['isot_dataset', 'gossipcop'],
)

In [53]:
# Datasets used to fine-tune the classifier, grouped by category.
training_datasets = dict(
    crime=['FA-KES-Dataset'],
    health=['covid_FNIR', 'covid_fake_news_dataset'],
    politics=['politifact_dataset', 'fake_news_dataset'],
    science=['climate_dataset'],
    social_media=['isot_dataset'],
)

# Datasets held back for the second evaluation, grouped by category.
# NOTE(review): 'climate_dataset' is listed for both training and testing, so
# the science rows of the held-out evaluation are presumably not unseen data — confirm.
testing_datasets = dict(
    crime=['snope'],
    health=['covid_claims'],
    politics=['pheme', 'liar_dataset'],
    science=['climate_dataset'],
    social_media=['gossipcop'],
)

In [54]:
# Load every dataset of each split and show per-category row counts.
# The path is assembled from separate components so it works with any path
# separator; the previous backslash-separated f-string only resolved on
# Windows and contained invalid escape sequences (`\d`, `\{`).
data_root = os.path.join(os.path.realpath('.'), '..', 'data')

training_data = pd.concat([
    load_category_data(os.path.join(data_root, category, f'{dataset}.feather'), category)
    for category, datasets in training_datasets.items()
    for dataset in datasets
])
testing_data = pd.concat([
    load_category_data(os.path.join(data_root, category, f'{dataset}.feather'), category)
    for category, datasets in testing_datasets.items()
    for dataset in datasets
])

print(training_data.groupby('category').count())
print(testing_data.groupby('category').count())

               text  label  metadata  title
category                                   
crime           804    804       804      0
health        10707  10707     10707   3118
politics      41913  41952     41952      0
science         907    907       907      0
social_media  44898  44898     44898      0
               text  label  metadata  title  author
category                                           
crime           315    315       315      0       0
health         2814   2821      2821   2821       0
politics      19260  19260     19260      0    6424
science         907    907       907      0       0
social_media  22140  22140     22140      0       0


In [55]:
# Keep only the text and category columns and drop rows with a missing value.
# The selection and dropna are chained so each name is bound to a fresh frame,
# rather than calling dropna(inplace=True) on a column slice of the loaded
# data (which can raise pandas' SettingWithCopyWarning).
training_data = training_data[['text', 'category']].dropna()
testing_data = testing_data[['text', 'category']].dropna()

In [56]:
# Record the shape and per-category row counts of both splits,
# once in the log file and once in the cell output.
training_shape_msg = f'Training data: {training_data.shape}'
training_counts = training_data.groupby('category').count()
logging.info(training_shape_msg)
print(training_shape_msg)
logging.info(training_counts)
print(training_counts)

testing_shape_msg = f'Testing data: {testing_data.shape}'
testing_counts = testing_data.groupby('category').count()
logging.info(testing_shape_msg)
print(testing_shape_msg)
logging.info(testing_counts)
print(testing_counts)

Training data: (99229, 2)
               text
category           
crime           804
health        10707
politics      41913
science         907
social_media  44898
Testing data: (45436, 2)
               text
category           
crime           315
health         2814
politics      19260
science         907
social_media  22140


In [57]:
# Encode category names as integer labels using ONE shared, sorted label
# space for both frames. Encoding each frame independently assigns codes by
# the categories present in that frame only, so the same category could get
# a different code in training and testing if either frame lacked one.
label_dtype = pd.CategoricalDtype(
    categories=sorted(set(training_data['category']) | set(testing_data['category']))
)
training_data['encoded_category'] = training_data['category'].astype(label_dtype).cat.codes
testing_data['encoded_category'] = testing_data['category'].astype(label_dtype).cat.codes

# Only the last expression of a cell is displayed.
testing_data.head()

Unnamed: 0,text,category,encoded_category
174,Was an Italian Economist Removed from a Plane ...,crime,0
8524,Shelby Township Meijer Human Trafficking Warning,crime,0
8523,Shelby Township Meijer Human Trafficking Warning,crime,0
8522,Shelby Township Meijer Human Trafficking Warning,crime,0
8521,Shelby Township Meijer Human Trafficking Warning,crime,0


In [58]:
# Plain Python lists of the texts and their integer labels, used for the split below.
df_texts = training_data['text'].tolist()
df_labels = training_data['encoded_category'].tolist()

In [59]:
# Hold out 20% of the training datasets for in-distribution evaluation
# (fixed seed so the split is repeatable).
split_parts = train_test_split(
    df_texts,
    df_labels,
    test_size=0.2,
    random_state=7623,
)
train_texts, test_texts, train_labels, test_labels = split_parts

In [60]:
# Select the checkpoint to fine-tune and timestamp this run; the timestamp is
# reused for the saved model file name and in the result logs.
model_name = 'distilbert-base-uncased'
model_time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
logging.info('====================================================')

# Look up the classes registered for this checkpoint, then load the tokenizer
# and a classification model with one output label per category.
model_classes = model_map[model_name]
tokenizer = model_classes['tokenizer'].from_pretrained(model_name)
model = model_classes['model'].from_pretrained(model_name, num_labels=len(categories))

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.weight', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [61]:
# Tokenize both splits, truncating over-long texts and padding each split to a
# common length within that split.
# NOTE(review): padding=True here is applied over a whole split at once, so
# presumably every row is padded to the longest text in that split — confirm
# this is intended, as it affects memory use and step time.
tokenize_kwargs = dict(truncation=True, padding=True)
train_encodings = tokenizer(train_texts, **tokenize_kwargs)
test_encodings = tokenizer(test_texts, **tokenize_kwargs)

In [62]:
# Convert the encodings and labels to PyTorch tensors
train_inputs = torch.tensor(train_encodings['input_ids'])
train_masks = torch.tensor(train_encodings['attention_mask'])
train_labels = torch.tensor(train_labels)

test_inputs = torch.tensor(test_encodings['input_ids'])
test_masks = torch.tensor(test_encodings['attention_mask'])
test_labels = torch.tensor(test_labels)

# Training batches are reshuffled every epoch so multi-epoch runs do not see
# the examples in the same fixed order each time.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_dataloader = DataLoader(train_data, batch_size=8, shuffle=True)

# Evaluation keeps the original order so predictions line up with test_labels.
# Note: the name `data_test_dataloader` is rebound later in the notebook to the
# held-out testing datasets.
test_data = TensorDataset(test_inputs, test_masks, test_labels)
data_test_dataloader = DataLoader(test_data, batch_size=8)

In [63]:
# Move the model to the GPU when one is available, otherwise stay on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# Assigned (rather than left as a bare expression) so the cell does not
# display the whole module repr.
model = model.to(device)

# The full input/mask/label tensors deliberately stay on the CPU. The
# DataLoaders were built from the CPU tensors, and the training and evaluation
# loops move each batch to `device` themselves, so whole-dataset GPU copies
# were never read and only consumed GPU memory. The metrics cell calls
# `test_labels.cpu()`, which also works on a CPU tensor.

In [64]:
# Use PyTorch's AdamW instead of the deprecated AdamW exported by transformers.
# eps and weight_decay are set explicitly to the defaults of the transformers
# implementation so the optimizer settings stay the same.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)
epochs = 1

# Fine-tune the pre-trained model selected by `model_name`
train_start_time = time.time()
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in tqdm(train_dataloader):
        # The DataLoader yields CPU tensors; move each batch to the model's device.
        batch = tuple(t.to(device) for t in batch)
        inputs, masks, labels = batch
        optimizer.zero_grad()
        # Passing labels makes the model return the classification loss.
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
    # Mean of the per-batch losses for this epoch
    avg_train_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch+1} average train loss: {avg_train_loss}")
    logging.info(f"Epoch {epoch+1} average train loss: {avg_train_loss}")
train_end_time = time.time()

  2%|▏         | 174/9923 [19:34<18:16:55,  6.75s/it]


KeyboardInterrupt: 

In [None]:
# Persist the fine-tuned weights. torch.save does not create missing parent
# directories, so make sure models/ exists first.
os.makedirs('models', exist_ok=True)
torch.save(model.state_dict(), f'models/{model_name}-{model_time}.pt')

In [None]:
# Predict a class for every batch of the current `data_test_dataloader` and
# time the whole pass.
eval_start_time = time.time()
model.eval()
predictions = []
with torch.no_grad():  # inference only, no gradients needed
    for batch in data_test_dataloader:
        inputs = dict(
            input_ids=batch[0].to(device),
            attention_mask=batch[1].to(device),
            labels=None,
        )
        outputs = model(**inputs)
        logits = outputs.logits
        # The highest-scoring logit per row is the predicted class.
        predictions.extend(logits.argmax(dim=1).cpu().tolist())
eval_end_time = time.time()

In [None]:
# Macro-averaged precision/recall/F1 plus accuracy on the held-out 20% split
true_labels = test_labels.cpu()
macro = dict(average='macro')
precision = precision_score(true_labels, predictions, **macro)
recall = recall_score(true_labels, predictions, **macro)
f1 = f1_score(true_labels, predictions, **macro)
accuracy = accuracy_score(true_labels, predictions)
# NOTE(review): this is sqrt(macro recall * accuracy); G-mean is usually
# defined from per-class recalls (or sensitivity and specificity) — confirm
# this is the intended definition.
g_mean = (recall*accuracy)**0.5

In [None]:
# Build the report once, then write it to the log file and to the cell output
report_lines = [
    f"{model_name} {model_time} with {epochs} epochs: Evaluation Results:",
    f"Training time: {train_end_time - train_start_time} seconds",
    f"Inference time: {eval_end_time - eval_start_time} seconds",
    f"Precision: {precision}",
    f"Recall: {recall}",
    f"F-score: {f1}",
    f"Accuracy: {accuracy}",
    f"G-mean: {g_mean}",
]

# Log the results
for report_line in report_lines:
    logging.info(report_line)

# Print the evaluation results
for report_line in report_lines:
    print(report_line)

distilbert-base-uncased 2023-12-14_10-10-31 with 7 epochs: Evaluation Results:
Training time: 1758.0164849758148 seconds
Inference time: 31.92399549484253 seconds
Precision: 0.9687700728774725
Recall: 0.9679539862164452
F-score: 0.9682843239433515
Accuracy: 0.9682352941176471
G-mean: 0.9680946299492776


In [None]:
# Evaluate the model further on the separate testing datasets:
# build the texts and expected labels from `testing_data`.
data_test_texts = testing_data['text'].tolist()
data_test_labels = testing_data['encoded_category'].tolist()

# Tokenize with the same settings used for the training data
data_test_encodings = tokenizer(data_test_texts, truncation=True, padding=True)

# Token ids and attention masks as PyTorch tensors
data_test_inputs, data_test_masks = (
    torch.tensor(data_test_encodings[field]) for field in ('input_ids', 'attention_mask')
)

# Batches carry only inputs and masks; the labels stay in `data_test_labels`.
# This rebinds `data_test_dataloader` to the held-out data.
data_test_data = TensorDataset(data_test_inputs, data_test_masks)
data_test_dataloader = DataLoader(data_test_data, batch_size=8)

In [None]:

# Predict a class for every batch of the held-out data and time the whole pass
eval_start_time = time.time()
model.eval()
predictions = []
with torch.no_grad():  # inference only, no gradients needed
    for batch in data_test_dataloader:
        inputs = dict(
            input_ids=batch[0].to(device),
            attention_mask=batch[1].to(device),
            labels=None,
        )
        outputs = model(**inputs)
        logits = outputs.logits
        # The highest-scoring logit per row is the predicted class.
        predictions.extend(logits.argmax(dim=1).cpu().tolist())
eval_end_time = time.time()

# Macro-averaged precision/recall/F1 plus accuracy against the held-out labels
macro = dict(average='macro')
precision = precision_score(data_test_labels, predictions, **macro)
recall = recall_score(data_test_labels, predictions, **macro)
f1 = f1_score(data_test_labels, predictions, **macro)
accuracy = accuracy_score(data_test_labels, predictions)
# NOTE(review): this is sqrt(macro recall * accuracy); G-mean is usually
# defined from per-class recalls (or sensitivity and specificity) — confirm
# this is the intended definition.
g_mean = (recall*accuracy)**0.5

# Build the report once, then write it to the log file and to the cell output
report_lines = [
    f"{model_name} {model_time} with {epochs} epochs: Evaluation Results (completely new data):",
    f"Training time: {train_end_time - train_start_time} seconds",
    f"Inference time: {eval_end_time - eval_start_time} seconds",
    f"Precision: {precision}",
    f"Recall: {recall}",
    f"F-score: {f1}",
    f"Accuracy: {accuracy}",
    f"G-mean: {g_mean}",
]

# Log the results
for report_line in report_lines:
    logging.info(report_line)

# Print the evaluation results
for report_line in report_lines:
    print(report_line)

distilbert-base-uncased 2023-12-14_10-10-31 with 7 epochs: Evaluation Results (completely new data):
Training time: 1758.0164849758148 seconds
Inference time: 38.20261216163635 seconds
Precision: 0.9674714467233013
Recall: 0.9678126143521029
F-score: 0.967560059860937
Accuracy: 0.968
G-mean: 0.9679063026413433


: 