In [1]:
import pandas as pd
from datasets import Dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, f1_score
import torch

In [4]:
# Load the CSV files (adjust the path if needed)
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

# Drop rows with any missing values
train_df = train_df.dropna()
test_df = test_df.dropna()

# Remove header rows accidentally read as data (if present)
train_df = train_df[train_df["Class Index"] != "Class Index"]
test_df = test_df[test_df["Class Index"] != "Class Index"]

# Convert labels to int and rename to "labels"
train_df["labels"] = train_df["Class Index"].astype(int)
test_df["labels"] = test_df["Class Index"].astype(int)

# Drop unused columns
train_df = train_df.drop(columns=["Class Index", "Title"])
test_df = test_df.drop(columns=["Class Index", "Title"])

# Preview
print(train_df.head())


                                         Description  labels
0  Reuters - Short-sellers, Wall Street's dwindli...       3
1  Reuters - Private investment firm Carlyle Grou...       3
2  Reuters - Soaring crude prices plus worries\ab...       3
3  Reuters - Authorities have halted oil export\f...       3
4  AFP - Tearaway world oil prices, toppling reco...       3


In [5]:
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

In [7]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def tokenize(batch):
    return tokenizer(batch["Description"], padding="max_length", truncation=True, max_length=256)

train_dataset = train_dataset.map(tokenize, batched=True)
test_dataset = test_dataset.map(tokenize, batched=True)

Map:   0%|          | 0/120000 [00:00<?, ? examples/s]

Map:   0%|          | 0/7600 [00:00<?, ? examples/s]

In [9]:
# Set format for PyTorch
train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
test_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

In [16]:
from transformers import BertForSequenceClassification, Trainer, TrainingArguments

# Get the number of unique labels from the train dataset
num_labels = len(set(train_dataset["labels"]))

# Load pre-trained BERT model for classification
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_labels)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
from sklearn.metrics import accuracy_score
import numpy as np

# Metric function
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    return {"accuracy": accuracy_score(labels, preds)}

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

In [18]:
# Train the model
trainer.train()

Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 