In [99]:
#Imports 

#Basic
import pandas as pd
import numpy as np

#Models
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.preprocessing import normalize

#Evaluation
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora import Dictionary


In [100]:
#Download data

data = pd.read_csv('india-news-headlines.csv')

### Data exploration

In [101]:
data.head(5)

Unnamed: 0,publish_date,headline_category,headline_text
0,20010101,sports.wwe,win over cena satisfying but defeating underta...
1,20010102,bollywood,Raju Chacha
2,20010102,unknown,Status quo will not be disturbed at Ayodhya; s...
3,20010102,unknown,Fissures in Hurriyat over Pak visit
4,20010102,unknown,America's unwanted heading for India?


In [102]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1999 entries, 0 to 1998
Data columns (total 3 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   publish_date       1999 non-null   int64 
 1   headline_category  1999 non-null   object
 2   headline_text      1999 non-null   object
dtypes: int64(1), object(2)
memory usage: 47.0+ KB


In [103]:
headline_category.unique()

array(['sports.wwe', 'bollywood', 'unknown',
       'entertainment.hindi.bollywood.news', 'business.india-business',
       'city.bengaluru', 'city.delhi', 'city.patna', 'hollywood', 'india',
       'removed', 'sports.football'], dtype=object)

In [104]:
unknown_count = (data['headline_category'] == "unknown").sum()
print("Number of 'unknown' values:", unknown_count)

Number of 'unknown' values: 1975


### Model fitting

In [105]:
# Load texts
texts = data['headline_text']

# Convert to bag-of-words
vectorizer = CountVectorizer(stop_words='english', max_df=0.9, min_df=5)
X = vectorizer.fit_transform(texts)

In [106]:
lda = LatentDirichletAllocation(
    n_components=9,
    random_state=42,
    learning_method='batch'
)

lda.fit(X)

In [107]:
# Convert your data for Gensim
texts = [t.split() for t in data['headline_text']]
id2word = Dictionary(texts)
corpus = [id2word.doc2bow(text) for text in texts]

# Use words from your LDA model
topics_words = [[vectorizer.get_feature_names_out()[i] 
                 for i in topic.argsort()[:-10 - 1:-1]] 
                for topic in lda.components_]

coherence_model = CoherenceModel(
    topics=topics_words,
    texts=texts,
    dictionary=id2word,
    coherence='c_v'
)

coherence_score = coherence_model.get_coherence()
print("Topic Coherence Score:", coherence_score)

Topic Coherence Score: 0.46703584728938474


In [85]:
# Check how many bigramms and trigramms

vocab = vectorizer.get_feature_names_out()

unigrams = [w for w in vocab if len(w.split()) == 1]
bigrams = [w for w in vocab if len(w.split()) == 2]
trigrams = [w for w in vocab if len(w.split()) == 3]

print("Unigrams:", len(unigrams))
print("Bigrams:", len(bigrams))
print("Trigrams:", len(trigrams))


Unigrams: 353
Bigrams: 29
Trigrams: 17


In [113]:
# Best number of headlines

# Prepare text for coherence calculation
texts_raw = data['headline_text'].tolist()
texts_split = [t.split() for t in texts_raw]

# Gensim dictionary + corpus
id2word = Dictionary(texts_split)
corpus = [id2word.doc2bow(t) for t in texts_split]

# My TF-IDF + ngram vectorizer
feature_names = vectorizer.get_feature_names_out()

# multiple K values
k_values = [3, 5, 7, 8, 10, 12, 15]
results = []

for k in k_values:
    print(f"\nFitting LDA with K={k} topics...")

    lda_model = LatentDirichletAllocation(
        n_components=k,
        max_iter=20,
        learning_method='batch',
        random_state=42
    )
    lda_model.fit(X)

    # Extract top words per topic
    topics_words = [
        [feature_names[i] for i in topic.argsort()[:-15 - 1:-1]]
        for topic in lda_model.components_
    ]

    # Compute coherence
    cm = CoherenceModel(
        topics=topics_words,
        texts=texts_split,
        dictionary=id2word,
        coherence='c_v'
    )
    coherence = cm.get_coherence()

    print(f"Coherence for K={k}: {coherence:.4f}")

    results.append((k, coherence))



Fitting LDA with K=3 topics...
Coherence for K=3: 0.6355

Fitting LDA with K=5 topics...
Coherence for K=5: 0.6496

Fitting LDA with K=7 topics...
Coherence for K=7: 0.6555

Fitting LDA with K=8 topics...
Coherence for K=8: 0.6412

Fitting LDA with K=10 topics...
Coherence for K=10: 0.6286

Fitting LDA with K=12 topics...
Coherence for K=12: 0.6465

Fitting LDA with K=15 topics...
Coherence for K=15: 0.6249


In [108]:
#Use TF-IDF instead of raw counts AND try bigrams and trigramms

# Prepare your text data
texts = data['headline_text']

# TF-IDF Vectorizer with Bigrams
vectorizer = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1,3),
    max_df=0.9,
    min_df=2      # ← use 2 instead of 5
)


X = vectorizer.fit_transform(texts)

print("TF-IDF matrix shape:", X.shape)  # for debugging


TF-IDF matrix shape: (1999, 3707)


In [109]:
lda = LatentDirichletAllocation(
    n_components=7,      # number of topics — adjust later
    max_iter=20,          # more iterations for better convergence
    random_state=42,
    learning_method='batch'
)

lda.fit(X)


In [111]:
# Convert your data for Gensim
texts = [t.split() for t in data['headline_text']]
id2word = Dictionary(texts)
corpus = [id2word.doc2bow(text) for text in texts]

# Use words from your LDA model
topics_words = [[vectorizer.get_feature_names_out()[i] 
                 for i in topic.argsort()[:-10 - 1:-1]] 
                for topic in lda.components_]

coherence_model = CoherenceModel(
    topics=topics_words,
    texts=texts,
    dictionary=id2word,
    coherence='c_v'
)

coherence_score = coherence_model.get_coherence()
print("Topic Coherence Score:", coherence_score)

Topic Coherence Score: 0.5263294630056531


### Converting prediction to the actual result

In [112]:
# Display prediction

def show_topics(model, feature_names, n_top_words=10):
    for topic_idx, topic in enumerate(model.components_):
        top_features = [feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]
        print(f"Topic {topic_idx}: {', '.join(top_features)}")

feature_names = vectorizer.get_feature_names_out()
show_topics(lda, feature_names)


Topic 0: govt, city, man, traffic, decision, good, need, peace, men, cm
Topic 1: time, goa, court, star, water, quake, law, bhuj, mother, new
Topic 2: census, rs, hc, minister, child, disaster, times, win, day, rescue
Topic 3: life, death, talks, bangalore, india, work, night, today, officials, indian
Topic 4: new, india, gets, khan, reality, quake, power, karnataka, relief, face
Topic 5: year, safe, govt, shock, state, govinda, air, economic, gujarat, caught
Topic 6: straight answers, straight, answers, pm, ceasefire, just, people, help, boy, medical


In [68]:
topic_distribution = lda.transform(X)
data['topic'] = topic_distribution.argmax(axis=1)

In [125]:
# Human labeling 

topic_distribution = lda.transform(X)
data['topic'] = topic_distribution.argmax(axis=1)

topic_labels = {
    0: "Bollywood",
    1: "Gov Policy",
    2: "State Politics/Protests",
    3: "Sports",
    4: "Administration/Social",
    5: "Sport",
    6: "Indian-Pakistan",
    7: "Social Issues",
    8: "Urban Development/Business",
    9: "International Politics/Economy",
}

data["topic_label"] = data["topic"].map(topic_labels)

In [126]:
topic_distribution = lda.transform(X)
data['topic'] = topic_distribution.argmax(axis=1)


In [130]:
data.head(10)

Unnamed: 0,publish_date,headline_category,headline_text,topic,topic_label
0,20010101,sports.wwe,win over cena satisfying but defeating underta...,5,Sport
1,20010102,bollywood,Raju Chacha,0,Bollywood
2,20010102,unknown,Status quo will not be disturbed at Ayodhya; s...,6,Indian-Pakistan
3,20010102,unknown,Fissures in Hurriyat over Pak visit,6,Indian-Pakistan
4,20010102,unknown,America's unwanted heading for India?,2,State Politics/Protests
5,20010102,unknown,For bigwigs; it is destination Goa,1,Gov Policy
6,20010102,unknown,Extra buses to clear tourist traffic,4,Administration/Social
7,20010102,unknown,Dilute the power of transfers; says Riberio,2,State Politics/Protests
8,20010102,unknown,Focus shifts to teaching of Hindi,0,Bollywood
9,20010102,unknown,IT will become compulsory in schools,4,Administration/Social
