In [16]:
import pandas as pd
import numpy as np
import os

import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split


from tabulate import tabulate
from tqdm import trange
import random

In [5]:
# Define the path to the main folder
main_folder = './data/BBC News Summary/News Articles'

data = []  # List to store the data from text files
subfolder_names = []  # List to store the names of subfolders


# Dictionary for converting the classes to numeric representation for the usage in the model
classes_mapping = {
    'business': 0,
    'entertainment': 1,
    'politics': 2,
    'sport': 3,
    'tech': 4
}

for subdir in os.listdir(main_folder):
    subfolder_path = os.path.join(main_folder, subdir)
    if os.path.isdir(subfolder_path):
        subfolder_names.append(subdir)  # Append subfolder name to the list
        
        # Iterate over the text files in the subfolder
        for file in os.listdir(subfolder_path):
            file_path = os.path.join(subfolder_path, file)
            if file.endswith('.txt'):
                try:
                    # Read the contents of the text file with UTF-8 encoding
                    with open(file_path, 'r', encoding='utf-8') as f:
                        file_contents = f.read()
                    
                    data.append((classes_mapping[subdir], file_contents))  # Append class name as number and file contents to the list
                except UnicodeDecodeError:
                    print(f"Error reading file: {file_path}. Skipping...")
                except Exception as e:
                    print(f"Error occurred while processing file: {file_path}. Error message: {str(e)}")

# Create a DataFrame from the collected data
df = pd.DataFrame(data, columns=['Class', 'Text'])

# Display the DataFrame
df.head()

Error reading file: ./data/BBC News Summary/News Articles/sport/199.txt. Skipping...


Unnamed: 0,Class,Text
0,1,Musicians to tackle US red tape\n\nMusicians' ...
1,1,"U2's desire to be number one\n\nU2, who have w..."
2,1,Rocker Doherty in on-stage fight\n\nRock singe...
3,1,Snicket tops US box office chart\n\nThe film a...
4,1,Ocean's Twelve raids box office\n\nOcean's Twe...


In [25]:
df = df.sample(n=500, random_state=1)

In [26]:
text = df.Text.values
classes = df.Class.values

In [27]:
# Load the BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case = True
    )

model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    # set number of classes to classes in df['Class']
    num_labels = len(df['Class'].unique()),
    output_attentions = False,
    output_hidden_states = False,
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [39]:
token_id = []
attention_masks = []

def preprocessing(text, tokenizer):

  return tokenizer.encode_plus(
                        text,
                        add_special_tokens = True,
                        max_length = 128,
                        pad_to_max_length = True,
                        return_attention_mask = True,
                        return_tensors = 'pt'
                   )


for sample in text:
  encoding_dict = preprocessing(sample, tokenizer)
  token_id.append(encoding_dict['input_ids']) 
  attention_masks.append(encoding_dict['attention_mask'])


token_id = torch.cat(token_id, dim = 0)
attention_masks = torch.cat(attention_masks, dim = 0)
classes = torch.tensor(classes)

  classes = torch.tensor(classes)


In [40]:
test_ratio = 0.2

batch_size = 16

# Indices of the train and test splits stratified by labels
train_idx, test_idx = train_test_split(
    np.arange(len(classes)),
    test_size = test_ratio,
    shuffle = True,
    stratify = classes)

# Train and validation sets
train_set = TensorDataset(token_id[train_idx], 
                          attention_masks[train_idx], 
                          classes[train_idx])

test_set = TensorDataset(token_id[test_idx], 
                        attention_masks[test_idx], 
                        classes[test_idx])

# Prepare DataLoader
train_dataloader = DataLoader(
            train_set,
            sampler = RandomSampler(train_set),
            batch_size = batch_size
        )

test_dataloader = DataLoader(
            test_set,
            sampler = SequentialSampler(test_set),
            batch_size = batch_size
        )

In [41]:
optimizer = torch.optim.AdamW(model.parameters(), 
                              # set learning rate
                              lr = 5e-5,
                              eps = 1e-08
                              )

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

num_epochs = 2

for epoch in range(num_epochs):
    
    # ========== Training ==========
    
    # Set model to training mode
    model.train()
    
    # Tracking variables
    tr_loss = 0
    tr_accuracy = 0
    nb_tr_examples, nb_tr_steps = 0, 0

    for step, batch in enumerate(train_dataloader):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        optimizer.zero_grad()
        # Forward pass
        train_output = model(b_input_ids, 
                             token_type_ids = None, 
                             attention_mask = b_input_mask, 
                             labels = b_labels)
        # Backward pass
        train_output.loss.backward()
        optimizer.step()
        # Update tracking variables
        tr_loss += train_output.loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1

    avg_loss = tr_loss / nb_tr_steps

    print(f"Epoch {epoch+1}:")
    print(f"  Loss: {avg_loss:.4f}")

Epoch 1:
  Loss: 0.1248
Epoch 2:
  Loss: 0.0443


In [42]:
# Set model to evaluation mode
model.eval()

total_correct = 0
total_samples = 0

for batch in test_dataloader:
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_labels = batch
    with torch.no_grad():
        # Forward pass
        eval_output = model(b_input_ids, 
                        token_type_ids = None, 
                        attention_mask = b_input_mask)
        logits = eval_output.logits
        label_ids = b_labels.to('cpu').numpy()
        # Calculate validation metrics

        _, predicted_labels = torch.max(logits, 1)

        total_correct += (predicted_labels == b_labels).sum().item()
        total_samples += b_labels.size(0)

accuracy = total_correct / total_samples
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.9800


In [43]:
import pickle
# save the model to disk
filename = './models/finalized_model.sav'
pickle.dump(model, open(filename, 'wb'))