Data Loading and initial preprocessing

In [1]:
import pandas as pd

# Load the dataset
file_path = '/content/newsdata.csv'
df = pd.read_csv(file_path)

# Preview the dataset
print(df.head())


          Date                                              Title  \
0  28 Sep 2024  Forvia Hella begins making RGB LED rear combin...   
1  28 Sep 2024  Continental named among top 500 sustainable co...   
2  28 Sep 2024  Stellantis honours 68 global and regional supp...   
3  27 Sep 2024  Schaeffler names new CFO ahead of expected mer...   
4  27 Sep 2024  China requests US to stop 'unreasonable suppre...   

                  Author                                               Tags  \
0  Autocar Pro News Desk  ['Lynk & Co', 'Geely Auto', 'Forvia Hella', 'R...   
1  Autocar Pro News Desk  ['Continental', 'Continental Tires', "world's ...   
2  Autocar Pro News Desk  ['Stellantis', 'Stellantis Supplier of the Yea...   
3  Autocar Pro News Desk             ['Vitesco Technologies', 'Schaeffler']   
4  Autocar Pro News Desk                                                 []   

   Related Organzation                                            Content  \
0                  NaN  The rear 

In [3]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')

# Initializing stopwords and lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()


def preprocess_text(text):
    # Lowercase
    text = text.lower()

    # Remove special characters
    text = re.sub(r'\W+', ' ', text)

    # Tokenize
    words = word_tokenize(text)

    # Removing stopwords and lemmatizing
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]

    return ' '.join(words)

df['Cleaned_Content'] = df['Content'].apply(preprocess_text)

print(df[['Content', 'Cleaned_Content']].head())


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


                                             Content  \
0  The rear lamp is used in the Geely Auto Group’...   
1  Continental ranked 265th in Time and Statista’...   
2  Fourth annual Stellantis Supplier of the Year ...   
3  Schaeffler has also announced the appointment ...   
4  "The U.S. move has no factual basis, violates ...   

                                     Cleaned_Content  
0  rear lamp used geely auto group new electric l...  
1  continental ranked 265th time statista list 50...  
2  fourth annual stellantis supplier year event l...  
3  schaeffler also announced appointment vitesco ...  
4  u move factual basis violates principle market...  


In [None]:
df

Unnamed: 0,Date,Title,Author,Tags,Content,Link,Cleaned_Content
0,04-Sep-24,New 425hp Nissan Patrol debuts in Abu Dhabi,Autocar Pro News Desk,[],"Seventh-generation sports bold new design, pow...",https://www.autocarpro.in/news-international/n...,seventh generation sport bold new design power...
1,03-Sep-24,BMW calling back Mini Cooper SE electric model...,Autocar Pro News Desk,['BMW'],The company stated that issues in battery syst...,https://www.autocarpro.in/news-international/b...,company stated issue battery system could resu...
2,03-Sep-24,Hyundai Mobis reveals three-pronged strategy t...,Autocar Pro News Desk,"['Hyundai Mobis', 'sustainable materials']",Next-generation core materials strategy to foc...,https://www.autocarpro.in/news-international/h...,next generation core material strategy focus s...
3,02-Sep-24,BYD reaches agreement to acquire Hedin Mobilit...,Autocar Pro News Desk,"['BYD', 'Hedin Mobility Group']",The transaction is subject to regulatory appro...,https://www.autocarpro.in/news-international/b...,transaction subject regulatory approval closin...
4,31-Aug-24,Volkswagen explores use of 100% bio-based leat...,Autocar Pro News Desk,['Revoltech GmbH'],Volkswagen brand partners with Revoltech GmbH ...,https://www.autocarpro.in/news-international/v...,volkswagen brand partner revoltech gmbh start ...
...,...,...,...,...,...,...,...
1003,19-Apr-23,Elektrobit to provide next-gen software archit...,Autocar Pro News Desk,"['Jaguar Land Rover', 'Elektrobit', 'EVA Conti...",Elektrobit will partner Jaguar Land Rover to b...,https://www.autocarpro.in/news-international/e...,elektrobit partner jaguar land rover build nex...
1004,19-Apr-23,UK’s HVS unveils 40-tonne hydrogen truck with ...,Autocar Pro News Desk,"['Hydrogen Vehicle Systems', 'Advanced Propuls...","Reinvention of commercial vehicle design, aime...",https://www.autocarpro.in/news-international/u...,reinvention commercial vehicle design aimed op...
1005,19-Apr-23,Nissan reveals Pathfinder concept SUV for Chin...,Autocar Pro News Desk,['Nissan Pathfinder'],"The Pathfinder Concept, like the all-electric ...",https://www.autocarpro.in/news-international/n...,pathfinder concept like electric arizon suv co...
1006,18-Apr-23,Faurecia showcases breakthrough hydrogen solut...,Autocar Pro News Desk,"['Symbio', 'Forvia', 'Faurecia', 'Faurecia Hyd...",While Containerised Hydrogen Solution can stor...,https://www.autocarpro.in/news-international/f...,containerised hydrogen solution store 1 ton us...


In [4]:
#keywords for each category
keywords = {
    'financial': ['bankruptcies', 'payment trends', 'quarterly reports', 'financial', 'loss'],
    'geopolitical': ['civil unrest', 'conflict', 'tariffs', 'embargo', 'labor disputes', 'geopolitical'],
    'natural_disasters': ['fires', 'earthquake', 'weather', 'flood', 'storm', 'natural disaster'],
    'regulatory': ['import restrictions', 'export restrictions', 'anti-bribery', 'corruption', 'material compliance', 'regulatory']
}

# Function to check if any keyword exists in the text
def contains_keyword(text, keywords):
    return any(keyword in text for keyword in keywords)

df['Category_Match'] = df['Cleaned_Content'].apply(lambda x: any(contains_keyword(x, kw_list) for kw_list in keywords.values()))

# Filtering the rows that match any category keyword
filtered_df = df[df['Category_Match']]

print(filtered_df[['Date', 'Title', 'Cleaned_Content']].head())


           Date                                              Title  \
5   27 Sep 2024  EDAG Group opens cutting-edge lighting laborat...   
10  26 Sep 2024  Antolin and PersiSkin develop plant-based alte...   
18  20 Sep 2024  European auto industry calls for urgent action...   
22  19 Sep 2024   EU car sales at three year low in August: Report   
38  12 Sep 2024  Pathbreaking ‘massless’ energy storage promise...   

                                      Cleaned_Content  
5   unique selling point new light laboratory desi...  
10  persiskin auto composed high content persimmon...  
18  european automobile manufacturer association c...  
22  data pointed fourth continuous monthly drop ev...  
38  world leading advance researcher chalmers univ...  


In [5]:
filtered_df

Unnamed: 0,Date,Title,Author,Tags,Related Organzation,Content,Link,Cleaned_Content,Category_Match
5,27 Sep 2024,EDAG Group opens cutting-edge lighting laborat...,Autocar Pro News Desk,"['EDAG Group', 'Zero Prototype Lab', 'light la...",,The unique selling point of the new light labo...,https://www.autocarpro.in/news-international/e...,unique selling point new light laboratory desi...,True
10,26 Sep 2024,Antolin and PersiSkin develop plant-based alte...,Autocar Pro News Desk,"['Antolin', 'PersiSkin Auto', 'leather alterna...",,PersiSkin Auto is composed of a high content o...,https://www.autocarpro.in/news-international/a...,persiskin auto composed high content persimmon...,True
18,20 Sep 2024,European auto industry calls for urgent action...,Autocar Pro News Desk,"['ACEA', 'BMW Group', 'Daimler Trucks', 'Toyot...",,The European Automobile Manufacturers’ Associa...,https://www.autocarpro.in/news-international/e...,european automobile manufacturer association c...,True
22,19 Sep 2024,EU car sales at three year low in August: Report,Autocar Pro News Desk,['European Union'],,The data pointed to the fourth continuous mont...,https://www.autocarpro.in/news-international/-...,data pointed fourth continuous monthly drop ev...,True
38,12 Sep 2024,Pathbreaking ‘massless’ energy storage promise...,Autocar Pro News Desk,"['Chalmers University of Technology', 'structu...",,World-leading advance by researchers at the Ch...,https://www.autocarpro.in/news-international/p...,world leading advance researcher chalmers univ...,True
...,...,...,...,...,...,...,...,...,...
1979,"Mar 08, 2023",Epicor acquires cloud-based financial planning...,,"['Planning', 'Manufacturing', 'Process']",,"Epicor has acquired DSPanel, a global provide...",https://www.logisticsit.com/articles/2023/03/0...,epicor acquired dspanel global provider cloud ...,True
1997,"Feb 22, 2023",Shifting geopolitical and economic dynamics pu...,,"['Supply Chain', 'Critical Issues', 'Retail']",,"By Craig Summers, Managing Director, Manhatta...",https://www.logisticsit.com/articles/2023/02/2...,craig summer managing director manhattan assoc...,True
2004,"Feb 20, 2023",Serious Waste boosts growth with BigChange Tech,,"['Transport Management', 'Process']",,"Serious, a nationwide sewage, grease and wast...",https://www.logisticsit.com/articles/2023/02/2...,serious nationwide sewage grease waste water m...,True
2005,"Feb 20, 2023",Make UK National Manufacturing Conference,,"['Exhibitions and Events', 'Manufacturing', 'P...",,"Make UK’s National Manufacturing Conference, ...",https://www.logisticsit.com/articles/2023/02/2...,make uk national manufacturing conference insp...,True


Tf-idf vectorizer

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(max_features=1000)

tfidf_matrix = tfidf_vectorizer.fit_transform(filtered_df['Cleaned_Content'])

print(tfidf_matrix.shape)


(448, 1000)


LSI

In [11]:
from sklearn.decomposition import TruncatedSVD

# Initialize TruncatedSVD (LSI)
lsi_model = TruncatedSVD(n_components=5, random_state=42)


lsi_topics = lsi_model.fit_transform(tfidf_matrix)

#the resulting topic vectors
print(lsi_topics[:5])

# the top terms per topic
terms = tfidf_vectorizer.get_feature_names_out()
for i, component in enumerate(lsi_model.components_):
    terms_in_topic = [terms[i] for i in component.argsort()[:-10 - 1:-1]]
    print(f"Topic {i}: {terms_in_topic}")


[[ 0.2944796  -0.21142586  0.00251383  0.14628768  0.04723579]
 [ 0.22083448 -0.1674205   0.04541624  0.05893141  0.03965006]
 [ 0.27944435 -0.33932766  0.17652719 -0.03765133  0.02977873]
 [ 0.19081682 -0.2643255   0.06691578 -0.08150451  0.02620036]
 [ 0.26134504 -0.33038798  0.11943915  0.11877009  0.01530619]]
Topic 0: ['chain', 'supply', 'business', 'data', 'company', 'customer', 'ai', 'technology', 'uk', 'system']
Topic 1: ['chain', 'supply', 'ai', 'data', 'disruption', 'planning', 'organisation', 'visibility', 'supplier', 'business']
Topic 2: ['chain', 'supply', 'vehicle', 'battery', 'electric', 'ev', 'disruption', 'supplier', 'global', 'truck']
Topic 3: ['ai', 'system', 'data', 'vehicle', 'solution', 'management', 'battery', 'technology', 'safety', 'driver']
Topic 4: ['retailer', 'customer', 'delivery', 'return', 'retail', 'consumer', 'brand', 'product', 'store', 'chain']


LDA

In [12]:
import gensim
from gensim import corpora

texts = [content.split() for content in filtered_df['Cleaned_Content']]

dictionary = corpora.Dictionary(texts)

corpus = [dictionary.doc2bow(text) for text in texts]

# Applying LDA
lda_model = gensim.models.LdaModel(corpus=corpus, id2word=dictionary, num_topics=5, random_state=42, passes=10)

# the top terms in each topic
topics = lda_model.print_topics(num_words=10)
for topic in topics:
    print(topic)


(0, '0.007*"new" + 0.006*"company" + 0.006*"vehicle" + 0.005*"customer" + 0.005*"market" + 0.005*"technology" + 0.005*"solution" + 0.005*"year" + 0.004*"business" + 0.004*"also"')
(1, '0.006*"business" + 0.006*"vehicle" + 0.006*"new" + 0.005*"battery" + 0.005*"year" + 0.005*"technology" + 0.005*"industry" + 0.004*"material" + 0.004*"also" + 0.004*"company"')
(2, '0.009*"system" + 0.007*"vehicle" + 0.005*"year" + 0.004*"data" + 0.004*"time" + 0.004*"also" + 0.004*"battery" + 0.004*"car" + 0.003*"new" + 0.003*"percent"')
(3, '0.013*"chain" + 0.013*"supply" + 0.010*"data" + 0.009*"business" + 0.007*"company" + 0.006*"ai" + 0.006*"time" + 0.006*"customer" + 0.006*"technology" + 0.005*"management"')
(4, '0.006*"business" + 0.005*"employee" + 0.005*"year" + 0.004*"container" + 0.004*"threat" + 0.004*"cost" + 0.004*"2023" + 0.004*"also" + 0.004*"uk" + 0.004*"attack"')


In [13]:
from gensim.models import CoherenceModel

# Compute Coherence Score for LDA
coherence_model_lda = CoherenceModel(model=lda_model, texts=[doc.split() for doc in filtered_df['Cleaned_Content']], dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print(f'LDA Coherence Score: {coherence_lda}')


LDA Coherence Score: 0.32297143283829327


In [None]:

!pip install bertopic sentence-transformers pandas


Collecting bertopic
  Downloading bertopic-0.16.4-py3-none-any.whl.metadata (23 kB)
Collecting hdbscan>=0.8.29 (from bertopic)
  Downloading hdbscan-0.8.39-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (15 kB)
Collecting umap-learn>=0.5.0 (from bertopic)
  Downloading umap_learn-0.5.7-py3-none-any.whl.metadata (21 kB)
Collecting pynndescent>=0.5 (from umap-learn>=0.5.0->bertopic)
  Downloading pynndescent-0.5.13-py3-none-any.whl.metadata (6.8 kB)
Downloading bertopic-0.16.4-py3-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.7/143.7 kB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading hdbscan-0.8.39-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.2/4.2 MB[0m [31m104.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading umap_learn-0.5.7-py3-none-any.whl (88 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m88.8/88.8 k

In [None]:
import nltk

# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Check current NLTK data paths
print("Current NLTK data paths:", nltk.data.path)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


Current NLTK data paths: ['/root/nltk_data', '/usr/nltk_data', '/usr/share/nltk_data', '/usr/lib/nltk_data', '/usr/share/nltk_data', '/usr/local/share/nltk_data', '/usr/lib/nltk_data', '/usr/local/lib/nltk_data']


In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string

# Preprocessing function
def preprocess_text(text):
    # Tokenization
    tokens = word_tokenize(text)

    # Remove stop words
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word.lower() not in stop_words]

    # Remove punctuation
    tokens = [word for word in tokens if word not in string.punctuation]

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    # Join tokens back into a single string
    return ' '.join(tokens)

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Load your dataset
# Assuming your dataset is in a CSV file named 'articles.csv'
df = pd.read_csv('/content/newsdata.csv')

# Extract the content column for topic modeling
docs1 = df['Content'].tolist()

# Define seed keywords
seed_keywords = [
    ["bankruptcy", "financial", "payment", "trend", "report"],  # Financial
    ["civil unrest", "geopolitical", "tariff", "conflict", "currency"],  # Geopolitical
    ["fire", "earthquake", "weather", "disaster"],  # Natural disasters
    ["import", "export", "corruption", "compliance", "regulation"]  # Regulatory
]


In [None]:
df

Unnamed: 0,Date,Title,Author,Tags,Related Organzation,Content,Link
0,28 Sep 2024,Forvia Hella begins making RGB LED rear combin...,Autocar Pro News Desk,"['Lynk & Co', 'Geely Auto', 'Forvia Hella', 'R...",,The rear lamp is used in the Geely Auto Group’...,https://www.autocarpro.in/news-international/f...
1,28 Sep 2024,Continental named among top 500 sustainable co...,Autocar Pro News Desk,"['Continental', 'Continental Tires', ""world's ...",,Continental ranked 265th in Time and Statista’...,https://www.autocarpro.in/news-international/c...
2,28 Sep 2024,Stellantis honours 68 global and regional supp...,Autocar Pro News Desk,"['Stellantis', 'Stellantis Supplier of the Yea...",,Fourth annual Stellantis Supplier of the Year ...,https://www.autocarpro.in/news-international/s...
3,27 Sep 2024,Schaeffler names new CFO ahead of expected mer...,Autocar Pro News Desk,"['Vitesco Technologies', 'Schaeffler']",,Schaeffler has also announced the appointment ...,https://www.autocarpro.in/news-international/s...
4,27 Sep 2024,China requests US to stop 'unreasonable suppre...,Autocar Pro News Desk,[],,"""The U.S. move has no factual basis, violates ...",https://www.autocarpro.in/news-international/c...
...,...,...,...,...,...,...,...
2003,"Feb 21, 2023",No more sleepless nights for IT Director at wh...,,"['Manufacturing', 'Process']",,moveero is a global manufacturer of wheels fo...,https://www.logisticsit.com/articles/2023/02/2...
2004,"Feb 20, 2023",Serious Waste boosts growth with BigChange Tech,,"['Transport Management', 'Process']",,"Serious, a nationwide sewage, grease and wast...",https://www.logisticsit.com/articles/2023/02/2...
2005,"Feb 20, 2023",Make UK National Manufacturing Conference,,"['Exhibitions and Events', 'Manufacturing', 'P...",,"Make UK’s National Manufacturing Conference, ...",https://www.logisticsit.com/articles/2023/02/2...
2006,"Feb 20, 2023",AR 'Magic Mirror' from Vyking launches for UK ...,,"['Datacapture', 'Retail']",,Virtual try-on startup Vyking.io has launched...,https://www.logisticsit.com/articles/2023/02/2...


In [None]:
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import nltk
nltk.download('wordnet')

# Continue with the rest of your code
lemmatizer = WordNetLemmatizer()

# Tokenize the text in the 'Content' column
df['Tokens'] = df['Content'].apply(word_tokenize)

print(df['Tokens'])

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


0       [The, rear, lamp, is, used, in, the, Geely, Au...
1       [Continental, ranked, 265th, in, Time, and, St...
2       [Fourth, annual, Stellantis, Supplier, of, the...
3       [Schaeffler, has, also, announced, the, appoin...
4       [``, The, U.S., move, has, no, factual, basis,...
                              ...                        
2003    [moveero, is, a, global, manufacturer, of, whe...
2004    [Serious, ,, a, nationwide, sewage, ,, grease,...
2005    [Make, UK, ’, s, National, Manufacturing, Conf...
2006    [Virtual, try-on, startup, Vyking.io, has, lau...
2007    [By, Robert, Lewis, ,, freelance, writer, ., W...
Name: Tokens, Length: 2008, dtype: object


In [None]:
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Preprocessing the documents using defined function
docs1 = [preprocess_text(doc) for doc in docs1]

# Creating sentence embeddings using a pretrained Sentence Transformer model
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = sentence_model.encode(docs1, show_progress_bar=True)

# Creating a custom vectorizer with seed keywords
vectorizer_model = CountVectorizer(vocabulary=sum(seed_keywords, []))

# Creating the BERTopic model with the custom vectorizer
topic_model = BERTopic(vectorizer_model=vectorizer_model)

# Fitting the BERTopic model using the preprocessed documents and embeddings
topics, probs = topic_model.fit_transform(docs1, embeddings)

# Adding topics to the original dataframe
df['topic'] = topics

#top words for each topic
topic_info = topic_model.vectorizer_model.get_feature_names_out()
topics = topic_model.get_topics()

for topic_id, words in topics.items():
    # Sorting words by their scores in descending order and get the top 10 words
    top_words = [word for word, score in sorted(words, key=lambda item: item[1], reverse=True)[:10]]
    print(f"Topic {topic_id}: {', '.join(top_words)}")

# Saving the updated dataframe with topics
df.to_csv('articles_with_topics.csv', index=False)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/63 [00:00<?, ?it/s]

  idf = np.log((avg_nr_samples / df) + 1)


Topic -1: payment, trend, compliance, financial, regulation, geopolitical, report, export, fire, weather
Topic 0: fire, conflict, export, regulation, weather, financial, geopolitical, trend, report, compliance
Topic 1: currency, payment, trend, tariff, report, financial, regulation, weather, compliance, export
Topic 2: export, financial, weather, disaster, geopolitical, fire, import, regulation, report, trend
Topic 3: regulation, trend, report, compliance, geopolitical, financial, , , , 
Topic 4: tariff, export, import, earthquake, report, geopolitical, trend, regulation, disaster, fire
Topic 5: regulation, weather, financial, compliance, trend, report, , , , 
Topic 6: compliance, weather, fire, report, trend, financial, regulation, , , 
Topic 7: disaster, report, currency, trend, financial, compliance, conflict, geopolitical, regulation, payment
Topic 8: trend, regulation, compliance, report, weather, financial, , , , 
Topic 9: geopolitical, weather, import, regulation, conflict, expo

Intertopic distance map

In [None]:
import matplotlib.pyplot as plt

# Visualize the topics
fig = topic_model.visualize_topics()
fig.show()

# Save the visualization to a file
fig.write_html("topics_visualization.html")

Adjusting/Fine tuning the BertTopic model

In [None]:
 # Printing top words for each topic to evaluate topic quality
topic_info = topic_model.get_topic_info()
print(topic_info)  # Display the number of topics and their sizes

#adjusting the BERTopic model
topic_model = BERTopic(vectorizer_model=vectorizer_model, n_gram_range=(1, 2), min_topic_size=10)

# Re-fitting the model with adjust parameters
topics, probs = topic_model.fit_transform(docs1, embeddings)

# Re-check the topic quality after adjustments
print(topic_model.get_topic_info())


    Topic  Count                                       Name  \
0      -1    760      -1_payment_trend_compliance_financial   
1       0     92          0_fire_conflict_export_regulation   
2       1     80            1_currency_payment_trend_tariff   
3       2     68        2_export_financial_weather_disaster   
4       3     57       3_regulation_trend_report_compliance   
5       4     55          4_tariff_export_import_earthquake   
6       5     51  5_regulation_weather_financial_compliance   
7       6     49           6_compliance_weather_fire_report   
8       7     48           7_disaster_report_currency_trend   
9       8     47       8_trend_regulation_compliance_report   
10      9     46   9_geopolitical_weather_import_regulation   
11     10     43        10_report_disaster_compliance_trend   
12     11     42    11_export_report_geopolitical_financial   
13     12     41        12_import_export_regulation_weather   
14     13     38        13_payment_conflict_import_curr


divide by zero encountered in divide



    Topic  Count                                       Name  \
0      -1    762      -1_payment_trend_financial_compliance   
1       0    127       0_fire_financial_export_geopolitical   
2       1     82            1_currency_payment_trend_tariff   
3       2     60       2_regulation_trend_report_compliance   
4       3     57          3_trend_regulation_weather_report   
5       4     55         4_report_compliance_disaster_trend   
6       5     47  5_regulation_weather_financial_compliance   
7       6     47           6_disaster_report_currency_trend   
8       7     46          7_fire_conflict_export_regulation   
9       8     46           8_compliance_weather_fire_report   
10      9     44     9_export_report_geopolitical_financial   
11     10     37        10_import_export_regulation_weather   
12     11     34     11_import_compliance_weather_financial   
13     12     33        12_payment_conflict_currency_import   
14     13     31          13_tariff_export_financial_re

Defining class labels

In [None]:
# Define the refined topic labels
topic_labels = {
    -1: "Finance,Geopolitical, Natural Disasters, Regulatory",
    0: "Finance, Regulatory, Geopolitical",
    1: "Natural Disasters, Geopolitical, Regulatory",
    2: "Finance, Geopolitical, Regulatory",
    3: "Finance, Natural Disasters, Geopolitical",
    4: "Geopolitical, Natural Disasters, Regulatory",
    5: "Regulatory, Finance, Natural Disasters",
    6: "Natural Disasters, Finance, Regulatory",
    7: "Natural Disasters, Finance, Geopolitical, Regulatory",
    8: "Regulatory, Finance, Natural Disasters",
    9: "Geopolitical, Natural Disasters, Regulatory",
    10: "Natural Disasters, Geopolitical, Finance, Regulatory",
    11: "Geopolitical, Finance, Regulatory",
    12: "Regulatory, Geopolitical, Natural Disasters",
    13: "Geopolitical, Finance, Regulatory, Natural Disasters",
    14: "Finance, Geopolitical, Regulatory",
    15: "Natural Disasters, Finance, Regulatory",
    16: "Regulatory, Geopolitical, Finance, Natural Disasters",
    17: "Geopolitical, Natural Disasters, Regulatory",
    18: "Geopolitical, Finance",
    19: "Regulatory, Finance, Natural Disasters",
    20: "Geopolitical, Finance, Regulatory",
    21: "Natural Disasters, Regulatory",
    22: "Finance, Regulatory, Natural Disasters",
    23: "Finance",
    24: "Natural Disasters, Regulatory, Finance",
    25: "Natural Disasters, Geopolitical, Regulatory",
    26: "Regulatory, Geopolitical",
    27: "Geopolitical, Regulatory, Natural Disasters",
    28: "Geopolitical, Finance",
    29: "Finance",
    30: "Finance, Geopolitical, Regulatory",
    31: "Natural Disasters, Finance",
    32: "Natural Disasters",
    33: "Finance",
    34: "Geopolitical, Regulatory",
    35: "Natural Disasters, Finance"
}

# Applying the refined labels to dataframe
df['topic_label'] = df['topic'].map(topic_labels)

# Printing the relevant columns
print(df[['Content', 'topic', 'topic_label']])


                                                Content  topic  \
0     The rear lamp is used in the Geely Auto Group’...     35   
1     Continental ranked 265th in Time and Statista’...     27   
2     Fourth annual Stellantis Supplier of the Year ...     29   
3     Schaeffler has also announced the appointment ...     28   
4     "The U.S. move has no factual basis, violates ...     32   
...                                                 ...    ...   
2003   moveero is a global manufacturer of wheels fo...      7   
2004   Serious, a nationwide sewage, grease and wast...     -1   
2005   Make UK’s National Manufacturing Conference, ...     11   
2006   Virtual try-on startup Vyking.io has launched...      1   
2007   By Robert Lewis, freelance writer. When it co...     -1   

                                            topic_label  
0                            Natural Disasters, Finance  
1           Geopolitical, Regulatory, Natural Disasters  
2                                

In [None]:
df['probability'] = probs  # Adds probability scores for each topic
print(df[['Content', 'topic', 'topic_label', 'probability']])

                                                Content  topic  \
0     The rear lamp is used in the Geely Auto Group’...     35   
1     Continental ranked 265th in Time and Statista’...     27   
2     Fourth annual Stellantis Supplier of the Year ...     29   
3     Schaeffler has also announced the appointment ...     28   
4     "The U.S. move has no factual basis, violates ...     32   
...                                                 ...    ...   
2003   moveero is a global manufacturer of wheels fo...      7   
2004   Serious, a nationwide sewage, grease and wast...     -1   
2005   Make UK’s National Manufacturing Conference, ...     11   
2006   Virtual try-on startup Vyking.io has launched...      1   
2007   By Robert Lewis, freelance writer. When it co...     -1   

                                            topic_label  probability  
0                            Natural Disasters, Finance     1.000000  
1           Geopolitical, Regulatory, Natural Disasters     1.000

In [None]:
df.to_csv('classified_documents_with_topics.csv', index=False)


App

In [None]:
import pandas as pd

# Loading dataset
file_path = '/content/classified_documents_with_topics.csv'
data = pd.read_csv(file_path)

# Displaying the first few rows
print(data.head())
# Check for missing values and data types
print(data.info())
print(data['topic_label'].value_counts())  # Assuming 'Label' column for category labels


          Date                                              Title  \
0  28 Sep 2024  Forvia Hella begins making RGB LED rear combin...   
1  28 Sep 2024  Continental named among top 500 sustainable co...   
2  28 Sep 2024  Stellantis honours 68 global and regional supp...   
3  27 Sep 2024  Schaeffler names new CFO ahead of expected mer...   
4  27 Sep 2024  China requests US to stop 'unreasonable suppre...   

                  Author                                               Tags  \
0  Autocar Pro News Desk  ['Lynk & Co', 'Geely Auto', 'Forvia Hella', 'R...   
1  Autocar Pro News Desk  ['Continental', 'Continental Tires', "world's ...   
2  Autocar Pro News Desk  ['Stellantis', 'Stellantis Supplier of the Yea...   
3  Autocar Pro News Desk             ['Vitesco Technologies', 'Schaeffler']   
4  Autocar Pro News Desk                                                 []   

   Related Organzation                                            Content  \
0                  NaN  The rear 

In [None]:
import pandas as pd

# Load dataset
file_path = '/content/classified_documents_with_topics.csv'
data = pd.read_csv(file_path)

# Define categories and create binary columns for multi-label classification
categories = ["Finance", "Geopolitical", "Natural Disasters", "Regulatory"]

# Split labels and create binary columns
for category in categories:
    data[category] = data['topic_label'].apply(lambda x: int(category in x))

# Verify binary columns
print(data[categories].head())


   Finance  Geopolitical  Natural Disasters  Regulatory
0        1             0                  1           0
1        0             1                  1           1
2        1             0                  0           0
3        1             1                  0           0
4        0             0                  1           0


In [None]:
import re

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Basic text preprocessing
data['Content'] = data['Content'].apply(lambda x: re.sub(r'\W', ' ', str(x)).lower())

# Split the data for training and validation
X_train, X_val, y_train, y_val = train_test_split(
    data['Content'], data[categories], test_size=0.2, random_state=42
)

# TF-IDF feature extraction
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_val_tfidf = tfidf.transform(X_val)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Use class_weight='balanced' to adjust for imbalance
model = OneVsRestClassifier(LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42))
model.fit(X_train_tfidf, y_train)
# Make predictions
y_pred = model.predict(X_val_tfidf)

# Evaluation
print(classification_report(y_val, y_pred, target_names=categories))


                   precision    recall  f1-score   support

          Finance       0.95      0.88      0.91       333
     Geopolitical       0.95      0.88      0.92       330
Natural Disasters       0.94      0.93      0.93       329
       Regulatory       0.98      0.96      0.97       366

        micro avg       0.95      0.92      0.93      1358
        macro avg       0.95      0.91      0.93      1358
     weighted avg       0.95      0.92      0.93      1358
      samples avg       0.96      0.92      0.93      1358



In [None]:
from sklearn.metrics import hamming_loss, f1_score

# Calculate Hamming loss
print("Hamming Loss:", hamming_loss(y_val, y_pred))

# Calculate F1 score
print("F1 Score (micro):", f1_score(y_val, y_pred, average='micro'))
print("F1 Score (macro):", f1_score(y_val, y_pred, average='macro'))


Hamming Loss: 0.10758706467661691
F1 Score (micro): 0.9349868470499813
F1 Score (macro): 0.9337978092855136
