In [1]:
import os
from pathlib import Path

import pandas as pd
import torch


In [2]:
print("Torch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())

if torch.cuda.is_available():
    print("Using GPU:", torch.cuda.get_device_name(0))
else:
    print("⚠️ Using CPU")


Torch version: 2.5.1+cu121
CUDA available: True
Using GPU: NVIDIA GeForce RTX 3050 Laptop GPU


In [3]:
print("Current working directory:")
print(os.getcwd())

print("\nFiles here:")
print(os.listdir())


Current working directory:
c:\Users\chint\OneDrive\Documents\Newsumm\news_model

Files here:
['news_model', 'news_model.ipynb']


In [4]:
PROJECT_ROOT = Path(os.getcwd()).parent
DATA_PATH = PROJECT_ROOT / "NewsSumm Dataset.xlsx"

print("Resolved path:", DATA_PATH)
print("File exists:", DATA_PATH.exists())


Resolved path: c:\Users\chint\OneDrive\Documents\Newsumm\NewsSumm Dataset.xlsx
File exists: True


In [5]:
df = pd.read_excel(DATA_PATH)

print("Dataset shape:", df.shape)
print("Columns:")
for c in df.columns:
    print("-", c)

df.head()


Dataset shape: (348766, 6)
Columns:
- newspaper_name
- published_date

- headline
- article_text
- human_summary
- news_category


Unnamed: 0,newspaper_name,published_date\n,headline,article_text,human_summary,news_category
0,Indian Express,2020-06-01 00:00:00,Virus may be invisible enemy but COVID warrior...,Prime Minister Narendra Modi Monday hailed the...,Prime Minister of India said that the Virus ma...,National News
1,Economic Times,2013-02-11 00:00:00,"Economy can bounce back, says PM Modi","ALLAHABAD: At least 20 persons were killed, an...","In Maha Kumbh, nearly 20 persons were killed. ...",National News
2,Business Standard,2013-02-11 00:00:00,At least 20 killed in stampede in Allahabad,"At least 20 people were killed, and scores of ...",As per the sources 20 people died and scores w...,National News
3,Money Control,2013-02-11 00:00:00,Maha Kumbh: Over 20 dead in Allahabad station ...,More than 20 people were feared dead and 30 ot...,At least 20 people killed and 20 people are in...,National News
4,The Mint,2023-10-02 00:00:00,Gandhian wisdom,"This Gandhi Jayanti, we should reflect upon an...","In this article, the author reflects on Mahatm...",National News


In [6]:
df.columns = df.columns.str.strip().str.lower()
print("Normalized columns:", list(df.columns))


Normalized columns: ['newspaper_name', 'published_date', 'headline', 'article_text', 'human_summary', 'news_category']


In [7]:
COLUMN_MAP = {}

# text column
if "headline" in df.columns:
    COLUMN_MAP["headline"] = "text"
elif "text" in df.columns:
    COLUMN_MAP["text"] = "text"

# category column
if "clean_category" in df.columns:
    COLUMN_MAP["clean_category"] = "category"
elif "news_category" in df.columns:
    COLUMN_MAP["news_category"] = "category"
elif "category" in df.columns:
    COLUMN_MAP["category"] = "category"

df = df.rename(columns=COLUMN_MAP)

print("Columns after mapping:", list(df.columns))


Columns after mapping: ['newspaper_name', 'published_date', 'text', 'article_text', 'human_summary', 'category']


In [8]:
df = df.dropna(subset=["text", "category"])

df["text"] = df["text"].astype(str)
df["category"] = df["category"].astype(str)

print("Cleaned shape:", df.shape)
print(df["category"].value_counts().head())


Cleaned shape: (348755, 6)
category
Politics                48796
Business and Finance    33855
National News           33012
Local News              29853
International News      26443
Name: count, dtype: int64


In [9]:
TOP_K = 10

top_categories = df["category"].value_counts().nlargest(TOP_K).index

df["category"] = df["category"].apply(
    lambda x: x if x in top_categories else "Other"
)

print(df["category"].value_counts())


category
Other                   79046
Politics                48796
Business and Finance    33855
National News           33012
Local News              29853
International News      26443
Crime and Justice       24418
Sports                  23290
Entertainment           19898
Health and Wellness     16434
Education               13710
Name: count, dtype: int64


In [10]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
df["label"] = label_encoder.fit_transform(df["category"])

print("Number of classes:", len(label_encoder.classes_))
print("Classes:", label_encoder.classes_)


Number of classes: 11
Classes: ['Business and Finance' 'Crime and Justice' 'Education' 'Entertainment'
 'Health and Wellness' 'International News' 'Local News' 'National News'
 'Other' 'Politics' 'Sports']


In [11]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
df["label"] = label_encoder.fit_transform(df["category"])

print("Number of classes:", len(label_encoder.classes_))
print("Classes:", label_encoder.classes_)


Number of classes: 11
Classes: ['Business and Finance' 'Crime and Justice' 'Education' 'Entertainment'
 'Health and Wellness' 'International News' 'Local News' 'National News'
 'Other' 'Politics' 'Sports']


In [12]:
from transformers import AutoTokenizer

MODEL_NAME = "distilbert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

print("Tokenizer loaded:", MODEL_NAME)


  from .autonotebook import tqdm as notebook_tqdm


Tokenizer loaded: distilbert-base-uncased


In [13]:
from transformers import AutoTokenizer

MODEL_NAME = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

print("Tokenizer ready")


Tokenizer ready


In [14]:
from sklearn.model_selection import train_test_split

X = df["text"].tolist()
y = df["label"].tolist()

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
    stratify=y
)

print("Train size:", len(X_train))
print("Test size:", len(X_test))


Train size: 313879
Test size: 34876


In [15]:
def tokenize_texts(texts):
    return tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=128
    )

X_train_tokens = tokenize_texts(X_train)
X_test_tokens = tokenize_texts(X_test)

print("Tokenization complete")


Tokenization complete


In [16]:
import torch
from torch.utils.data import Dataset

class NewsDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item


In [17]:
train_dataset = NewsDataset(X_train_tokens, y_train)
test_dataset = NewsDataset(X_test_tokens, y_test)

print("Train dataset size:", len(train_dataset))
print("Test dataset size:", len(test_dataset))


Train dataset size: 313879
Test dataset size: 34876


In [18]:
from transformers import AutoModelForSequenceClassification

num_labels = len(label_encoder.classes_)

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=num_labels
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

print("Model loaded with", num_labels, "classes")


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded with 11 classes


In [19]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./news_model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_steps=200,
    save_strategy="epoch",
    report_to="none",
    fp16=torch.cuda.is_available()
)

print("Training arguments ready")


Training arguments ready


In [21]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer
)

print("Trainer initialized")


Trainer initialized


  trainer = Trainer(


In [25]:
trainer.train()





Step,Training Loss
200,1.2145
400,1.2027
600,1.1802
800,1.1649
1000,1.1717
1200,1.1282
1400,1.1276
1600,1.1274
1800,1.1311
2000,1.1246


TrainOutput(global_step=39236, training_loss=1.0579102994810126, metrics={'train_runtime': 5135.1089, 'train_samples_per_second': 122.248, 'train_steps_per_second': 7.641, 'total_flos': 2.0792704051047936e+16, 'train_loss': 1.0579102994810126, 'epoch': 2.0})

In [20]:
# Save final trained model
trainer.save_model("./news_model/final_model")

# Save tokenizer (CRITICAL for inference)
tokenizer.save_pretrained("./news_model/final_model")

print("Final model and tokenizer saved successfully!")


NameError: name 'trainer' is not defined

In [22]:
from datasets import Dataset

test_dataset = Dataset.from_dict({
    "input_ids": X_test_tokens["input_ids"],
    "attention_mask": X_test_tokens["attention_mask"],
    "labels": y_test
})

print("Test dataset ready:", test_dataset)


Test dataset ready: Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 34876
})


In [23]:
metrics = trainer.evaluate(test_dataset)
print(metrics)


{'eval_loss': 2.3891608715057373, 'eval_model_preparation_time': 0.0061, 'eval_runtime': 88.8439, 'eval_samples_per_second': 392.554, 'eval_steps_per_second': 49.075}


In [24]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = AutoModelForSequenceClassification.from_pretrained("./news_model/final_model")
tokenizer = AutoTokenizer.from_pretrained("./news_model/final_model")

model.to(device)
model.eval()

print("Model loaded on:", device)


Model loaded on: cuda


In [25]:
def predict_category(text):
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=128
    ).to(device)

    with torch.no_grad():
        outputs = model(**inputs)

    pred_label_id = outputs.logits.argmax(dim=1).item()
    return label_encoder.inverse_transform([pred_label_id])[0]


In [26]:
headline = "The film is a flop"
print("Predicted category:", predict_category(headline))


Predicted category: Entertainment
