# Trans4News: Multiclass News Classifier

## Install Packages

In [1]:
!pip install --upgrade nltk

Collecting nltk
  Using cached nltk-3.8.1-py3-none-any.whl (1.5 MB)
Installing collected packages: nltk
Successfully installed nltk-3.8.1


In [5]:
!pip install seaborn

Collecting seaborn
  Downloading seaborn-0.13.1-py3-none-any.whl.metadata (5.4 kB)
Downloading seaborn-0.13.1-py3-none-any.whl (294 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.8/294.8 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: seaborn
Successfully installed seaborn-0.13.1


In [1]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.36.2-py3-none-any.whl.metadata (126 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.8/126.8 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting huggingface-hub<1.0,>=0.19.3 (from transformers)
  Downloading huggingface_hub-0.20.2-py3-none-any.whl.metadata (12 kB)
Collecting tokenizers<0.19,>=0.14 (from transformers)
  Downloading tokenizers-0.15.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)
Downloading transformers-4.36.2-py3-none-any.whl (8.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.2/8.2 MB[0m [31m70.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m0:01[0m
[?25hDownloading huggingface_hub-0.20.2-py3-none-any.whl (330 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0

## Import Packages

In [68]:
import os
import random
import re
import subprocess
import warnings

import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
import torch.nn.functional as F

warnings.filterwarnings("ignore")

from matplotlib import pyplot as plt
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset, random_split

# from tqdm import tqdm
from torch.utils.tensorboard import SummaryWriter
from tqdm.notebook import tqdm
from transformers import (
    BertConfig,
    BertForSequenceClassification,
    BertModel,
    BertTokenizer,
    DistilBertConfig,
    DistilBertForSequenceClassification,
    DistilBertTokenizer,
)

In [8]:
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

## Configuration Class

In [50]:
class Config:
    def __init__(self) -> None:
        self.seed = 42
        self.exp_name = "Trans4News: Multiclass News Classifier"
        self.model_name = "bert"
        self.pre_model = "bert-base-uncased"
        self.pre_distil_model = "distilbert-base-uncased"
        self.train_PATH = "train.csv"
        self.test_PATH = "test.csv"
        self.max_length = 50
        self.num_workers = 0
        self.epochs = 5
        self.patience = 2
        self.num_classes = 4
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.batch_size = 32
        self.shuffle_train = True
        self.dropout = 0.2

## Set Random Seed

In [51]:
def set_seed(seed):
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    os.environ["PYTHONHASHSEED"] = str(seed)
    print(">> SEEDING DONE")


set_seed(Config().seed)

>> SEEDING DONE


## Data Loading and Preprocessing 

In [52]:
# Save all infos about training data in one place
class DataContext:
    def __init__(self, config) -> None:
        self.bert_tokenizer = BertTokenizer.from_pretrained(config.pre_model)
        self.distil_bert_tokenizer = DistilBertTokenizer.from_pretrained(
            config.pre_distil_model
        )
        self.vectorizer = None
        self.train_dataset = None
        self.valid_dataset = None
        self.test_dataset = None
        self.train_dataloader = None
        self.valid_dataloader = None
        self.test_dataloader = None
        self.df_train = None
        self.df_test = None
        self.df = None

    # preprocessing method for all texts
    def preprocess_texts(self) -> None:
        preprocessed_texts_ls = []
        for ix, row in self.df.iterrows():
            text = row.text

            # Convert to lowercase
            text = text.lower()

            # Remove URLs
            text = re.sub(r"http\S+|www\S+|https\S+", "", text, flags=re.MULTILINE)

            # Remove mentions and hashtags
            text = re.sub(r"@\w+|\#", "", text)

            # Remove special characters and punctuation
            text = re.sub(r"[^\w\s]", "", text)

            # Remove spaces
            text = re.sub(r"\s+", " ", text)

            # Remove unnecessary dots
            text = re.sub(r"\.{2,}", ".", text)

            # Remove dots at the beginning or end of the sentence
            text = text.strip(".")

            # Remove spaces at the beginning or end of the sentence
            text = text.strip(" ")

            preprocessed_texts_ls.append(text)

        # Create new columns in our main df
        self.df["preprocessed_news"] = preprocessed_texts_ls
        return

In [53]:
# Custom dataset class
class CustomDataset(Dataset):
    def __init__(self, dataframe):
        self.dataframe = dataframe
        # Subtract 1 from each value in the 'labels' column
        self.dataframe["labels"] = self.dataframe["labels"] - 1

    def __len__(self):
        return len(self.dataframe.labels)

    def __getitem__(self, idx):
        if "token_type_ids" in self.dataframe.columns.tolist():
            sample = {
                "input_ids": self.dataframe["input_ids"].iloc[idx],
                "attention_mask": self.dataframe["attention_mask"].iloc[idx],
                "token_type_ids": self.dataframe["token_type_ids"].iloc[idx],
                "label": torch.tensor(
                    self.dataframe["labels"].iloc[idx], dtype=torch.long
                ),
            }
        else:
            sample = {
                "input_ids": self.dataframe["input_ids"].iloc[idx],
                "attention_mask": self.dataframe["attention_mask"].iloc[idx],
                "label": torch.tensor(
                    self.dataframe["labels"].iloc[idx], dtype=torch.long
                ),
            }
        return sample

In [54]:
def do_tokenization(config, context, model_name):
    # Tokenize each news in the DataFrame
    def tokenize_news(news_text):
        if model_name == "BertToken":
            tokens = context.bert_tokenizer(
                news_text,
                truncation=True,
                padding="max_length",
                max_length=config.max_length,
                return_tensors="pt",
            )
            for key in ["input_ids", "attention_mask", "token_type_ids"]:
                tokens[key] = torch.LongTensor((tokens[key]))
        elif model_name == "DistilBertToken":
            tokens = context.distil_bert_tokenizer(
                news_text,
                truncation=True,
                padding="max_length",
                max_length=config.max_length,
                return_tensors="pt",
            )
            for key in ["input_ids", "attention_mask"]:
                tokens[key] = torch.LongTensor((tokens[key]))
        return tokens

    tokenized_news = context.df["preprocessed_news"].apply(tokenize_news)
    return tokenized_news

In [55]:
# Dataset can be either train, test or valid
def load_dataset(config) -> DataContext:
    context = DataContext(config)

    # Read the dataset from config.train_PATH and config.test_PATH
    context.df_train = pd.read_csv(config.train_PATH, encoding="utf-8")

    context.df_test = pd.read_csv(config.test_PATH, encoding="utf-8")

    # Merge DataFrames
    context.df = pd.concat([context.df_train, context.df_test], ignore_index=True)

    # Assuming df is your DataFrame
    context.df["text"] = context.df["Title"] + " " + context.df["Description"]

    # Rename 'Class Index' to 'label'
    context.df = context.df.rename(columns={"Class Index": "labels"})

    # Print a log
    print(">> CSV LOADING DONE.")

    # Preprocess and create encodings for the dataset
    context.preprocess_texts()

    # Print a log
    print(">> DATA PREPROCESSING DONE.")

    # Tokenize each news and add the 'input_ids', 'attention_mask', and 'token_type_ids' columns
    tokenized_news = do_tokenization(
        config, context, "DistilBertToken"
    )  # you can change it here for BertToken
    context.df = pd.concat([context.df, pd.DataFrame(tokenized_news.tolist())], axis=1)

    # Print a log
    print(">> DATA TOKENIZATION DONE.")

    # Create a custom dataset instance
    dataset = CustomDataset(context.df)

    # Define the sizes for train, validation, and test sets
    train_size = int(0.7 * len(dataset))
    val_size = int(0.15 * len(dataset))
    test_size = len(dataset) - train_size - val_size

    # Split the dataset into train, validation, and test sets
    train_dataset, val_dataset, test_dataset = random_split(
        dataset,
        [train_size, val_size, test_size],
        generator=torch.Generator().manual_seed(config.seed),
    )

    # Create context dataset for hyperparameter tuning
    context.train_dataset = train_dataset
    context.valid_dataset = val_dataset
    context.test_dataset = test_dataset

    # Create data loaders for train, validation, and test sets
    context.train_dataloader = DataLoader(
        train_dataset,
        batch_size=config.batch_size,
        shuffle=config.shuffle_train,
        num_workers=config.num_workers,
    )
    context.valid_dataloader = DataLoader(
        val_dataset,
        batch_size=config.batch_size,
        shuffle=False,
        num_workers=config.num_workers,
    )
    context.test_dataloader = DataLoader(
        test_dataset,
        batch_size=config.batch_size,
        shuffle=False,
        num_workers=config.num_workers,
    )

    # Print a log
    print(">> DATALOADER AND VALIDATION FRAMEWORK CREATED.")

    return context

In [56]:
config_obj = Config()

In [57]:
context = load_dataset(config_obj)

>> CSV LOADING DONE.
>> DATA PREPROCESSING DONE.
>> DATA TOKENIZATION DONE.
>> DATALOADER AND VALIDATION FRAMEWORK CREATED.


In [58]:
context.df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 127600 entries, 0 to 127599
Data columns (total 7 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   labels             127600 non-null  int64 
 1   Title              127600 non-null  object
 2   Description        127600 non-null  object
 3   text               127600 non-null  object
 4   preprocessed_news  127600 non-null  object
 5   attention_mask     127600 non-null  object
 6   input_ids          127600 non-null  object
dtypes: int64(1), object(6)
memory usage: 6.8+ MB


In [189]:
def showLengthCount(context):
    df = context.df.copy()
    for i, row in enumerate(df.iterrows()):
        df.loc[i, "length"] = len(df.loc[i, "preprocessed_news"].split())
    print("Count of news : ", len(df))
    labels = list(df.labels.unique())
    for label in labels:
        df_label = df[df["labels"] == label]
        print(f"Max length for class: {label} is : {df_label.length.unique().max()}")

In [190]:
showLengthCount(context)

Count of news :  127600
Max length for class: 2 is : 134.0
Max length for class: 3 is : 177.0
Max length for class: 1 is : 148.0
Max length for class: 0 is : 143.0


## Model Definition & Model Training

In [60]:
# Instantiate a Tensorboard SummaryWriter for logging
writer = SummaryWriter()

for model_name in [
    "DistilBertForSequenceClassification",
    "BertForSequenceClassification",
]:
    for lr in [1e-5, 1e-3]:
        # Model definition
        if model_name == "BertForSequenceClassification":
            # Bert configuration
            config_bert = BertConfig.from_pretrained(
                config_obj.pre_model,
                num_labels=config_obj.num_classes,
            )
            model = BertForSequenceClassification(config_bert)
        elif model_name == "DistilBertForSequenceClassification":
            # DistilBert configuration
            config_distilbert = DistilBertConfig.from_pretrained(
                config_obj.pre_model,
                num_labels=config_obj.num_classes,
            )
            model = DistilBertForSequenceClassification(config_distilbert)

        # Model to device
        model = model.to(config_obj.device)

        # Define optimizer and criterion
        optimizer = torch.optim.Adam(model.parameters(), lr=lr)
        criterion = nn.CrossEntropyLoss()

        # Training loop
        best_val_loss = float("inf")
        early_stopping_counter = 0
        for epoch in range(config_obj.epochs):
            model.train()
            train_loss = 0.0
            for batch in tqdm(
                context.train_dataloader, desc=f"Epoch {epoch + 1}/{config_obj.epochs}"
            ):
                input_ids = batch["input_ids"].squeeze(1).to(config_obj.device)
                attention_mask = (
                    batch["attention_mask"].squeeze(1).to(config_obj.device)
                )
                labels = batch["label"].to(config_obj.device)

                optimizer.zero_grad()

                if model_name == "DistilBertForSequenceClassification":
                    outputs = model(input_ids, attention_mask)
                elif model_name == "BertForSequenceClassification":
                    token_type_ids = (
                        batch["token_type_ids"].squeeze(1).to(config_obj.device)
                    )
                    outputs = model(input_ids, attention_mask, token_type_ids)

                loss = criterion(outputs.logits, labels)
                loss.backward()
                optimizer.step()

                train_loss += loss.item()

            # Validation
            model.eval()
            val_loss = 0.0
            y_true = []
            y_pred = []

            with torch.no_grad():
                for batch in tqdm(context.valid_dataloader, desc=f"Validation:"):
                    input_ids = batch["input_ids"].squeeze(1).to(config_obj.device)
                    attention_mask = (
                        batch["attention_mask"].squeeze(1).to(config_obj.device)
                    )
                    labels = batch["label"].to(config_obj.device)

                    if model_name == "DistilBertForSequenceClassification":
                        outputs = model(input_ids, attention_mask)
                    elif model_name == "BertForSequenceClassification":
                        token_type_ids = (
                            batch["token_type_ids"].squeeze(1).to(config_obj.device)
                        )
                        outputs = model(input_ids, attention_mask, token_type_ids)

                    loss = criterion(outputs.logits, labels)
                    val_loss += loss.item()

                    y_true.extend(labels.cpu().numpy())
                    y_pred.extend(torch.argmax(outputs.logits, dim=1).cpu().numpy())

            val_loss /= len(context.valid_dataloader)
            accuracy_val = accuracy_score(y_true, y_pred)
            f1_val = f1_score(y_true, y_pred, average="micro")

            print(
                f"Epoch {epoch + 1}/{config_obj.epochs}, Train Loss: {train_loss}, Validation Loss: {val_loss}, Validation Accuracy: {accuracy_val}, Validation F1: {f1_val}"
            )

            # Tensorboard logging
            writer.add_scalar("Loss/Train", train_loss, epoch)
            writer.add_scalar("Loss/Val", val_loss, epoch)
            writer.add_scalar("Accuracy/Val", accuracy_val, epoch)
            writer.add_scalar("F1/Val", f1_val, epoch)

            # Early stopping and model checkpoint
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                early_stopping_counter = 0
                torch.save(
                    model.state_dict(), f"best_model_{model_name}_{lr}.bin"
                )  # Save the best model
            else:
                early_stopping_counter += 1

            if early_stopping_counter >= config_obj.patience:
                print("Early stopping. Training stopped.")
                break

# Close the Tensorboard SummaryWriter
writer.close()

You are using a model of type bert to instantiate a model of type distilbert. This is not supported for all configurations of models and can yield errors.


Epoch 1/5:   0%|          | 0/2792 [00:00<?, ?it/s]

Validation::   0%|          | 0/599 [00:00<?, ?it/s]

Epoch 1/5, Train Loss: 1379.465988598764, Validation Loss: 0.3294312631670939, Validation Accuracy: 0.8881922675026124, Validation F1: 0.8881922675026124


Epoch 2/5:   0%|          | 0/2792 [00:00<?, ?it/s]

Validation::   0%|          | 0/599 [00:00<?, ?it/s]

Epoch 2/5, Train Loss: 778.7553430832922, Validation Loss: 0.2871452818417887, Validation Accuracy: 0.9029258098223616, Validation F1: 0.9029258098223615


Epoch 3/5:   0%|          | 0/2792 [00:00<?, ?it/s]

Validation::   0%|          | 0/599 [00:00<?, ?it/s]

Epoch 3/5, Train Loss: 634.8332336228341, Validation Loss: 0.2677471737046174, Validation Accuracy: 0.9102403343782655, Validation F1: 0.9102403343782655


Epoch 4/5:   0%|          | 0/2792 [00:00<?, ?it/s]

Validation::   0%|          | 0/599 [00:00<?, ?it/s]

Epoch 4/5, Train Loss: 537.0085411891341, Validation Loss: 0.272242692437564, Validation Accuracy: 0.9078369905956113, Validation F1: 0.9078369905956113


Epoch 5/5:   0%|          | 0/2792 [00:00<?, ?it/s]

Validation::   0%|          | 0/599 [00:00<?, ?it/s]

You are using a model of type bert to instantiate a model of type distilbert. This is not supported for all configurations of models and can yield errors.


Epoch 5/5, Train Loss: 468.593920183368, Validation Loss: 0.3084229033017447, Validation Accuracy: 0.9032915360501568, Validation F1: 0.9032915360501568
Early stopping. Training stopped.


Epoch 1/5:   0%|          | 0/2792 [00:00<?, ?it/s]

Validation::   0%|          | 0/599 [00:00<?, ?it/s]

Epoch 1/5, Train Loss: 3881.613509297371, Validation Loss: 1.3863279548829703, Validation Accuracy: 0.24869383490073146, Validation F1: 0.24869383490073146


Epoch 2/5:   0%|          | 0/2792 [00:00<?, ?it/s]

Validation::   0%|          | 0/599 [00:00<?, ?it/s]

Epoch 2/5, Train Loss: 3872.951962351799, Validation Loss: 1.3863223424936972, Validation Accuracy: 0.24843260188087773, Validation F1: 0.24843260188087773


Epoch 3/5:   0%|          | 0/2792 [00:00<?, ?it/s]

Validation::   0%|          | 0/599 [00:00<?, ?it/s]

Epoch 3/5, Train Loss: 3871.3625857830048, Validation Loss: 1.3863081145963208, Validation Accuracy: 0.24869383490073146, Validation F1: 0.24869383490073146


Epoch 4/5:   0%|          | 0/2792 [00:00<?, ?it/s]

Validation::   0%|          | 0/599 [00:00<?, ?it/s]

Epoch 4/5, Train Loss: 3870.7452610731125, Validation Loss: 1.386400209643407, Validation Accuracy: 0.24869383490073146, Validation F1: 0.24869383490073146


Epoch 5/5:   0%|          | 0/2792 [00:00<?, ?it/s]

Validation::   0%|          | 0/599 [00:00<?, ?it/s]

Epoch 5/5, Train Loss: 3871.361182808876, Validation Loss: 1.386343927932701, Validation Accuracy: 0.24843260188087773, Validation F1: 0.24843260188087773
Early stopping. Training stopped.


## Findings

DistilBertForSequenceClassification performed well with 1e-5 learning rate.

## Evaluation on Test-set

In [77]:
# Instantiate the DistilBERT model with the number of classes in your dataset
config_distilbert = DistilBertConfig.from_pretrained(
    config_obj.pre_model,
    num_labels=config_obj.num_classes,
)
best_model = DistilBertForSequenceClassification(config_distilbert)


# Load the trained model state_dict
best_model.load_state_dict(torch.load("best_model_DistilBertForSequenceClassification_1e-05.bin"))

best_model = best_model.to(config_obj.device)

best_model

You are using a model of type bert to instantiate a model of type distilbert. This is not supported for all configurations of models and can yield errors.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-11): 12 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)

In [79]:
# Evaluate the model on the test set
best_model.eval()
y_true = []
y_pred = []

with torch.no_grad():
    for batch in tqdm(
                context.test_dataloader, desc=f"Epoch {epoch + 1}/{config_obj.epochs}"
            ):
        input_ids = batch['input_ids'].squeeze(1).to(config_obj.device)
        attention_mask = batch['attention_mask'].squeeze(1).to(config_obj.device)
        labels = batch['label'].to(config_obj.device)
        
        outputs = best_model(input_ids, attention_mask)
        y_true.extend(labels.cpu().numpy())
        y_pred.extend(torch.argmax(outputs.logits, dim=1).cpu().numpy())

# Calculate and print metrics
accuracy_test = accuracy_score(y_true, y_pred)
f1_test = f1_score(y_true, y_pred, average='micro')
classification_report_test = classification_report(y_true, y_pred)

print(f"Test Accuracy: {accuracy_test}")
print(f"Test F1 Score: {f1_test}")
print("Classification Report:\n", classification_report_test)

Epoch 5/5:   0%|          | 0/599 [00:00<?, ?it/s]

Test Accuracy: 0.9108672936259143
Test F1 Score: 0.9108672936259143
Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.89      0.91      4792
           1       0.95      0.98      0.97      4767
           2       0.88      0.88      0.88      4843
           3       0.88      0.90      0.89      4738

    accuracy                           0.91     19140
   macro avg       0.91      0.91      0.91     19140
weighted avg       0.91      0.91      0.91     19140

