In [2]:
import os
import glob
import pathlib
import pandas as pd
import string
import nltk
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [3]:
# defining function that contains punctuation removal
def remove_punctuation(text):
    no_punct = "".join([c for c in text if c not in string.punctuation])
    return no_punct

## Extracting files


In [4]:
def extract_files(folder_path):
    csv_files = glob.glob(os.path.join(folder_path, "*.csv"))
    dataframes = []

    for file in csv_files:
        df = pd.read_csv(file)
        dataframes.append(df)
    return dataframes


extracted = extract_files("Topic_modelling")

In [5]:
alldata = pd.DataFrame()
for listitems in extracted:
    alldata = pd.concat([alldata, listitems])

Remove punctuation, numbers and stopwords from the text. Then, apply stemming to the words.

In [6]:
alldata["remove_puntuations"] = alldata["headlines"].apply(lambda x: remove_punctuation(x))
alldata["to_lower"] = alldata["remove_puntuations"].apply(lambda x: x.lower())

### BerTopic Modeling


In [10]:
topic_model_df = pd.DataFrame()
topic_model_df["headlines"] = alldata["to_lower"]

In [11]:
# Load api key
import os
from dotenv import load_dotenv

load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")

In [103]:
import openai
from bertopic import BERTopic
from bertopic.representation import ZeroShotClassification
#from bertopic.representation import OpenAI
from transformers.pipelines import pipeline
#from sentence_transformers import SentenceTransformer
from hdbscan import HDBSCAN
from umap import UMAP


client = openai.OpenAI(api_key=api_key)

# Define the models
#representation_model = OpenAI(client, model="gpt-3.5-turbo", chat=True)
candidate_topics = ['business', 'politics', 'sports', 'health', 'technology', 'entertainment', 'science', 'world', 'economy', 'education']
representation_model = ZeroShotClassification(candidate_topics, model="facebook/bart-large-mnli")
#embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
embedding_model = pipeline("feature-extraction", model="distilbert-base-cased")
umap_model = UMAP(n_neighbors=20, n_components=5, min_dist=1.5, 
                 metric='cosine', random_state=42)
hdbscan_model = HDBSCAN(min_cluster_size=100, metric='euclidean',
                        cluster_selection_method='eom', 
                        prediction_data=True)

# NOTE: A higher min_cluster_size will lead to a more conservative topic extraction

loading configuration file config.json from cache at C:\Users\abiro\.cache\huggingface\hub\models--facebook--bart-large-mnli\snapshots\d7645e127eaf1aefc7862fd59a17a5aa8558b8ce\config.json
Model config BartConfig {
  "_name_or_path": "facebook/bart-large-mnli",
  "_num_labels": 3,
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_final_layer_norm": false,
  "architectures": [
    "BartForSequenceClassification"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classif_dropout": 0.0,
  "classifier_dropout": 0.0,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 12,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 4096,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 12,
  "eos_token_id": 2,
  "forced_eos_token_id": 2,
  "gradient_checkpointing": false,
  "id2label": {
    "0": "contradiction",
    "1": "neutral",
    "2": "entailment

In [95]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer_model = CountVectorizer(stop_words="english", ngram_range=(1, 1))

In [104]:
# Train the model
model = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    representation_model=representation_model,

    top_n_words=5,
    verbose=True
)



In [105]:
topics, probs = model.fit_transform(topic_model_df["headlines"])

2024-10-15 22:48:31,091 - BERTopic - Embedding - Transforming documents to embeddings.
100%|██████████| 10000/10000 [11:11<00:00, 14.90it/s]
2024-10-15 22:59:42,366 - BERTopic - Embedding - Completed ✓
2024-10-15 22:59:42,366 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm

divide by zero encountered in power



ValueError: min_dist must be less than or equal to spread

In [101]:
model.get_document_info(topic_model_df["headlines"]).head(10)

Unnamed: 0,Document,Topic,Name,Representation,Representative_Docs,Top_n_words,Probability,Representative_document
0,nirmala sitharaman to equal morarji desai’s re...,-1,-1_india_2023_rs_khan,"[india, 2023, rs, khan, says, new, 2024, watch...",[dunki box office collection day 2 early repor...,india - 2023 - rs - khan - says - new - 2024 -...,0.0,False
1,‘will densify network want to be at least no 2...,-1,-1_india_2023_rs_khan,"[india, 2023, rs, khan, says, new, 2024, watch...",[dunki box office collection day 2 early repor...,india - 2023 - rs - khan - says - new - 2024 -...,0.0,False
2,air india group to induct an aircraft every si...,-1,-1_india_2023_rs_khan,"[india, 2023, rs, khan, says, new, 2024, watch...",[dunki box office collection day 2 early repor...,india - 2023 - rs - khan - says - new - 2024 -...,0.0,False
3,red sea woes exporters seek increased credit a...,-1,-1_india_2023_rs_khan,"[india, 2023, rs, khan, says, new, 2024, watch...",[dunki box office collection day 2 early repor...,india - 2023 - rs - khan - says - new - 2024 -...,0.0,False
4,air india group to induct a plane every 6 days...,-1,-1_india_2023_rs_khan,"[india, 2023, rs, khan, says, new, 2024, watch...",[dunki box office collection day 2 early repor...,india - 2023 - rs - khan - says - new - 2024 -...,0.0,False
5,q3 earnings results jsw steel pnb acc report m...,-1,-1_india_2023_rs_khan,"[india, 2023, rs, khan, says, new, 2024, watch...",[dunki box office collection day 2 early repor...,india - 2023 - rs - khan - says - new - 2024 -...,0.0,False
6,blackstone’s fourthquarter earnings rise 4 as ...,3,3_india_rbi_profit_high,"[india, rbi, profit, high, inflation, demand, ...",[india to clock 65 gdp growth in fy24 despite ...,india - rbi - profit - high - inflation - dema...,0.969884,False
7,zomato gets rbi’s approval to operate as onlin...,-1,-1_india_2023_rs_khan,"[india, 2023, rs, khan, says, new, 2024, watch...",[dunki box office collection day 2 early repor...,india - 2023 - rs - khan - says - new - 2024 -...,0.0,False
8,indiauk fta final push for deal as window clos...,-1,-1_india_2023_rs_khan,"[india, 2023, rs, khan, says, new, 2024, watch...",[dunki box office collection day 2 early repor...,india - 2023 - rs - khan - says - new - 2024 -...,0.0,False
9,govt close to saturation in implementing socia...,-1,-1_india_2023_rs_khan,"[india, 2023, rs, khan, says, new, 2024, watch...",[dunki box office collection day 2 early repor...,india - 2023 - rs - khan - says - new - 2024 -...,0.0,False


In [102]:
model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,7762,-1_india_2023_rs_khan,"[india, 2023, rs, khan, says, new, 2024, watch...",[dunki box office collection day 2 early repor...
1,0,1198,0_says_kapoor_watch_reveals,"[says, kapoor, watch, reveals, vs, khan, india...",[ranbir kapoor says he uses wife alia bhatt’s ...
2,1,413,1_technology___,"[technology, , , , , , , , , ]",[tech news today google pixel 7 available for ...
3,2,352,2_2023_2024_registration_check,"[2023, 2024, registration, check, neet, jee, e...",[neet pg 2023 counselling registrations for ro...
4,3,275,3_india_rbi_profit_high,"[india, rbi, profit, high, inflation, demand, ...",[india to clock 65 gdp growth in fy24 despite ...


In [79]:
model.get_topic_info(0)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,0,2632,0_says_vs_khan_watch,"[says, vs, khan, watch, kapoor, india, test, r...",[ind vs aus has david warner played his last t...


In [44]:
model.merge_topics(topic_model_df["headlines"], topics_to_merge=[[4, 5], [3, 6]])



100%|██████████| 6/6 [00:02<00:00,  2.28it/s]


In [45]:
model.get_topic_info()

Unnamed: 0,Topic,Count,Name,CustomName,Representation,Representative_Docs
0,-1,220,-1_Rupee falls against US dollar trend in Indi...,-1_Rupee falls against US dollar in trading fl...,[Rupee falls against US dollar trend in India ...,[rupee falls 4 paise to 8332 against us dollar...
1,0,2084,0_Box office collections of Shah Rukh Khan's f...,Entertainment,[Box office collections of Shah Rukh Khan's fi...,[sam bahadur box office collection day 20 vick...
2,1,2005,1_neet ug 2023 counselling updates,Education,[neet ug 2023 counselling updates],[tamil nadu neet ug counselling 2023 registrat...
3,2,1970,2_India's Performance in Various Cricket Match...,Technology,[India's Performance in Various Cricket Matche...,[watch england awarded five penalty runs in wo...
4,3,1904,3_tech news today - new launches and updates f...,Business,[tech news today - new launches and updates fr...,[tech news today apple vision pro glass costs ...
5,4,1817,4_Quarterly Earnings Growth in Indian Banks an...,Sports,[Quarterly Earnings Growth in Indian Banks and...,[indusind bank q2 profit jumps 22 to rs 2202 c...


In [46]:
topic_labels = {0: "Entertainment", 1: "Education", 2: "Sports", 3: "Technology", 4: "Business"}
model.set_topic_labels(topic_labels)

In [47]:
model.get_topic_info()

Unnamed: 0,Topic,Count,Name,CustomName,Representation,Representative_Docs
0,-1,220,-1_Rupee falls against US dollar trend in Indi...,-1_Rupee falls against US dollar in trading fl...,[Rupee falls against US dollar trend in India ...,[rupee falls 4 paise to 8332 against us dollar...
1,0,2084,0_Box office collections of Shah Rukh Khan's f...,Entertainment,[Box office collections of Shah Rukh Khan's fi...,[sam bahadur box office collection day 20 vick...
2,1,2005,1_neet ug 2023 counselling updates,Education,[neet ug 2023 counselling updates],[tamil nadu neet ug counselling 2023 registrat...
3,2,1970,2_India's Performance in Various Cricket Match...,Sports,[India's Performance in Various Cricket Matche...,[watch england awarded five penalty runs in wo...
4,3,1904,3_tech news today - new launches and updates f...,Technology,[tech news today - new launches and updates fr...,[tech news today apple vision pro glass costs ...
5,4,1817,4_Quarterly Earnings Growth in Indian Banks an...,Business,[Quarterly Earnings Growth in Indian Banks and...,[indusind bank q2 profit jumps 22 to rs 2202 c...


In [48]:
model.visualize_topics()


In [49]:
model.get_document_info(topic_model_df["headlines"]).head(5)

Unnamed: 0,Document,Topic,Name,CustomName,Representation,Representative_Docs,Top_n_words,Probability,Representative_document
0,nirmala sitharaman to equal morarji desai’s re...,4,4_Quarterly Earnings Growth in Indian Banks an...,Business,[Quarterly Earnings Growth in Indian Banks and...,[indusind bank q2 profit jumps 22 to rs 2202 c...,Quarterly Earnings Growth in Indian Banks and ...,0.710833,False
1,‘will densify network want to be at least no 2...,-1,-1_Rupee falls against US dollar trend in Indi...,-1_Rupee falls against US dollar in trading fl...,[Rupee falls against US dollar trend in India ...,[rupee falls 4 paise to 8332 against us dollar...,Rupee falls against US dollar trend in India t...,0.0,False
2,air india group to induct an aircraft every si...,-1,-1_Rupee falls against US dollar trend in Indi...,-1_Rupee falls against US dollar in trading fl...,[Rupee falls against US dollar trend in India ...,[rupee falls 4 paise to 8332 against us dollar...,Rupee falls against US dollar trend in India t...,0.0,False
3,red sea woes exporters seek increased credit a...,4,4_Quarterly Earnings Growth in Indian Banks an...,Business,[Quarterly Earnings Growth in Indian Banks and...,[indusind bank q2 profit jumps 22 to rs 2202 c...,Quarterly Earnings Growth in Indian Banks and ...,0.63404,False
4,air india group to induct a plane every 6 days...,-1,-1_Rupee falls against US dollar trend in Indi...,-1_Rupee falls against US dollar in trading fl...,[Rupee falls against US dollar trend in India ...,[rupee falls 4 paise to 8332 against us dollar...,Rupee falls against US dollar trend in India t...,0.0,False


In [82]:
model.save("topic_model", serialization="safetensors", save_ctfidf=True)

In [50]:
def target(data):
    if data == 0:
        return "Entertainment"
    elif data == 1:
        return "Education"
    elif data == 2:
        return "Sports"
    elif data == 3:
        return "Technology"
    elif data == 4:
        return "Business"
    else:
        return "Unknown"


In [53]:
# New documents for prediction
new_documents = ["This is a new document about data science and machine learning.",
                 "Another document discussing the impact of AI on healthcare."]

# Predict topics for the new documents
topics, probs = model.transform(new_documents)

# Print predicted topics and their probabilities
for topic, prob in enumerate(zip(topics, probs)):
    print(f" predicted topic: {target(topic)} with probability: {prob}")


100%|██████████| 2/2 [00:00<00:00, 143.79it/s]
2024-10-15 19:34:34,081 - BERTopic - Dimensionality - Reducing dimensionality of input embeddings.
2024-10-15 19:34:34,758 - BERTopic - Dimensionality - Completed ✓
2024-10-15 19:34:34,758 - BERTopic - Clustering - Approximating new points with `hdbscan_model`
2024-10-15 19:34:34,760 - BERTopic - Cluster - Completed ✓


 predicted topic: 0 with probability: (1, 1.0)
 predicted topic: 1 with probability: (3, 1.0)


In [98]:
topic_info = model.get_topic_info()
print(topic_info)


   Topic  Count                                               Name  \
0     -1    261                   -1_rupee falls against US dollar   
1      0   2012  0_Bollywood Box Office Collection Reports - Sh...   
2      1   1997  1_Admission and Counselling Updates for Variou...   
3      2   1983  2_Latest Tech News Highlights - AI Innovations...   
4      3   1917  3_Indian companies' profit growth in Q1 and Q2...   
5      4   1830  4_India-Australia World Cup Matches and Live S...   

                         CustomName  \
0  -1_rupee falls against US dollar   
1                     Entertainment   
2                         Education   
3                        Technology   
4                          Business   
5                            Sports   

                                      Representation  \
0                    [rupee falls against US dollar]   
1  [Bollywood Box Office Collection Reports - Sha...   
2  [Admission and Counselling Updates for Various...   
3  [Latest 