# **Summary of the results of the models**

## Import

In [7]:
import torch
import torch.nn as nn
from transformers import BertModel, DistilBertModel
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from transformers import DistilBertModel, DistilBertTokenizer, BertTokenizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
import random
from tqdm import tqdm
import numpy as np
import os
import re
import time
import datetime

random.seed(42)

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

## Utils

In [8]:
class IrishTimesDataset(Dataset):
    def __init__(self, texts, labels, tokenizer):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.texts)  # Return the length of the texts

    def __getitem__(self, idx):
        text = str(self.texts.iloc[idx])  # Access the element using .iloc[idx]
        label = self.labels[idx]

        encoded_inputs = self.tokenizer.encode_plus(
            text,
            truncation=True,
            padding='max_length',
            max_length=128,
            return_tensors='pt'
        )

        input_ids = encoded_inputs['input_ids'].squeeze()
        attention_mask = encoded_inputs['attention_mask'].squeeze()

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'label': torch.tensor(label)
        }

## Load data

In [9]:
# Load the dataset
df = pd.read_csv('data/ireland-news-headlines-cleaned-6.csv')
df.rename(columns={'clean_headline_text': 'text', 'headline_category': 'category'}, inplace=True)

df = df[['text', 'category']]  # Keep only 'text' and 'category' columns
class_counts = df['category'].value_counts()

df = df.dropna(subset=['text', 'category'])  # Remove rows with missing values in 'text' or 'category'
df.drop_duplicates(inplace=True)

# Shuffle the data
shuffled_data = df.sample(frac=1).reset_index(drop=True)

# Define the train-test split ratio
train_ratio = 0.9  # 90% for training, 10% for testing

# Calculate the split index
split_index = int(train_ratio * len(shuffled_data))

# Split the data into train and test sets
train_df = shuffled_data[:split_index]
test_df = shuffled_data[split_index:]

## Load Models

In [10]:
# Load the pre-trained BERT model
bert_model = BertModel.from_pretrained('bert-base-uncased')
bert_model_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_classifier = nn.Linear(bert_model.config.hidden_size, df['category'].nunique())
unique_classes = df['category'].unique()
print(unique_classes)

# Load model dicts
bert_path = 'models/bset_model/BERT_6.pt'
bert_model.load_state_dict(torch.load(bert_path, map_location=device))
bert_model.to(device)
bert_model.eval()
bert_classifer_path = 'models/bset_model/Classifier_6.pt'
bert_classifier.load_state_dict(torch.load(bert_classifer_path, map_location=device))
bert_classifier.to(device)
bert_classifier.eval()


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


['news' 'culture' 'opinion' 'business' 'sport' 'lifestyle']


Linear(in_features=768, out_features=6, bias=True)

## Arrange the dataset

In [11]:
label_encoder = LabelEncoder()
y_test = label_encoder.fit_transform(test_df['category'])
# Create data loaders
test_dataset = IrishTimesDataset(test_df['text'], y_test, bert_model_tokenizer)
test_loader = DataLoader(test_dataset, batch_size=128)

## Models Test

### Evaluate the best bert model
BERT - 6 Classes, End-to-End, cleaned and balanced data.

In [12]:
bert_model.eval()
predictions = []
true_labels = []

with torch.no_grad():
    for batch in tqdm(test_loader, total=len(test_loader)):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        outputs = bert_model(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.last_hidden_state[:, 0]
        logits = bert_classifier(pooled_output)
        predicted_labels = logits.argmax(dim=1).cpu().numpy()

        predictions.extend(predicted_labels)
        true_labels.extend(labels.cpu().numpy())

test_accuracy = accuracy_score(true_labels, predictions)
print('Test Accuracy on the best BERT Model:', test_accuracy)

100%|██████████| 1175/1175 [04:20<00:00,  4.52it/s]

Test Accuracy on the best BERT Model: 0.8332224499028661





-------------------------------------------------------------------
------------------------------------------------------------------------