## Imports

In [2]:
from bertopic import BERTopic
import pandas as pd
import re
import json
from sentence_transformers import SentenceTransformer
import torch
from sklearn.decomposition import LatentDirichletAllocation, NMF
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from top2vec import Top2Vec

## Import Data

In [2]:
combined_df = pd.read_csv("data/csv/combined_terms.csv")

combined_df.head()

Unnamed: 0,date,topic,text,term
0,07-05-1999,1. VOTES,President . – We shall now move on to the vote...,94_99
1,07-05-1999,2. Potato starch,President . – The next item is the proposal fo...,94_99
2,07-05-1999,3. Hannover 2000,President . – The next item is the report (A4‐...,94_99
3,07-05-1999,4. European textiles market,President . – The next item is the joint debat...,94_99
4,06-05-1999,1. Approval of the Minutes,President . – The Minutes of yesterday's sitti...,94_99


In [3]:
print(combined_df.shape)

(23408, 4)


## BERTopic Model

In [None]:
# for CPU usage
# bertopic_model = BERTopic(embedding_model="all-MiniLM-L6-v2")

In [21]:
print(torch.cuda.is_available())  # Should be True
print(torch.cuda.get_device_name(0))

True
NVIDIA GeForce RTX 4070 Laptop GPU


In [22]:
if torch.cuda.is_available():
    # run with GPU
    print("Using GPU")
    embedding_model = SentenceTransformer("all-MiniLM-L6-v2", device="cuda")
    bertopic_model = BERTopic(embedding_model=embedding_model)
else:
    # run with CPU
    print("Using CPU")
    bertopic_model = BERTopic(embedding_model="all-MiniLM-L6-v2")


Using GPU


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [5]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
stop_words = ENGLISH_STOP_WORDS

In [9]:
with open('data/custom_stopwords.json', 'r', encoding='utf-8') as f:
    custom_stopwords = set(json.load(f))

In [11]:
print(list(custom_stopwords)[:10])

['decision', 'proposal', 'joint', 'nominee', 'honourable', 'floor', 'question', 'appendices', 'subject', 'vicepresident']


In [12]:
# Combine with sklearn stopwords
stop_words = ENGLISH_STOP_WORDS.union(custom_stopwords)

In [13]:
def clean_text(text):
    # Lowercase
    text = text.lower()
    # Remove punctuation, numbers, etc.
    text = re.sub(r"[^a-z\s]", "", text)
    # Remove stopwords
    text = " ".join([word for word in text.split() if word not in stop_words])
    return text

In [14]:
df = combined_df.copy()

In [15]:
df["clean_text"] = df["text"].apply(clean_text)
docs = df["clean_text"].tolist()

In [16]:
df["clean_text"].head()

0    shall votes regulation ecsc ec euratom incorpo...
1    regulation amending regulation ec establishing...
2    hoppenstedt culture youth education media comm...
3    following b ferrer peijs chanterie group peopl...
4    distributed spencer ppe chairman foreign affai...
Name: clean_text, dtype: object

In [18]:
for idx in range(10):
    text = df.iloc[idx]["clean_text"]
    word_count = len(str(text).split())
    print(f"Row {idx}: {word_count} words")

Row 0: 514 words
Row 1: 562 words
Row 2: 1201 words
Row 3: 2649 words
Row 4: 480 words
Row 5: 2324 words
Row 6: 3796 words
Row 7: 6870 words
Row 8: 8430 words
Row 9: 111 words


This might be an issue for BERTopic as it can only handle 512 tokens, so some information will be lost as the text will get truncated.

In [23]:
topics, probs = bertopic_model.fit_transform(docs)

In [30]:
topic_info = bertopic_model.get_topic_info()

topic_info

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,6917,-1_europe_states_eu_countries,"[europe, states, eu, countries, new, citizens,...",[outcome december presidentinoffice activity f...
1,0,488,0_fishing_fisheries_fish_fishermen,"[fishing, fisheries, fish, fishermen, stocks, ...",[following reports stevenson fisheries communi...
2,1,409,1_women_gender_equality_violence,"[women, gender, equality, violence, men, women...",[explanations voting michaela ojdrov ppe consi...
3,2,329,2_budget_budgets_budgetary_appropriations,"[budget, budgets, budgetary, appropriations, e...",[following reports budgets costa neves draft g...
4,3,297,3_la_di_le_il,"[la, di, le, il, che, et, que, en, die, und]",[presidente lordine del giorno reca discussion...
...,...,...,...,...,...
334,333,11,333_reads_objections_human_cautions,"[reads, objections, human, cautions, rights, b...",[violations human rights democracy rule law ec...
335,334,11,334_budget_refugee_million_flexibility,"[budget, refugee, million, flexibility, migrat...",[voting isabella adinolfi efdd writing followi...
336,335,11,335_verbal_distributed_approved_observations,"[verbal, distributed, approved, observations, ...",[verbal process approved texts distributed obs...
337,336,10,336_disease_vaccination_footandmouth_outbreak,"[disease, vaccination, footandmouth, outbreak,...",[following b van dam edd group footandmouth di...


In [25]:
# Add topic assignments to dataframe
df["bertopic_topic"] = topics
df["bertopic_prob"] = probs

In [27]:
df.head()

Unnamed: 0,date,topic,text,term,clean_text,bertopic_topic,bertopic_prob
0,07-05-1999,1. VOTES,President . – We shall now move on to the vote...,94_99,shall votes regulation ecsc ec euratom incorpo...,11,0.155931
1,07-05-1999,2. Potato starch,President . – The next item is the proposal fo...,94_99,regulation amending regulation ec establishing...,4,0.273608
2,07-05-1999,3. Hannover 2000,President . – The next item is the report (A4‐...,94_99,hoppenstedt culture youth education media comm...,39,0.194263
3,07-05-1999,4. European textiles market,President . – The next item is the joint debat...,94_99,following b ferrer peijs chanterie group peopl...,235,1.0
4,06-05-1999,1. Approval of the Minutes,President . – The Minutes of yesterday's sitti...,94_99,distributed spencer ppe chairman foreign affai...,-1,0.0


In [26]:
for topic_num in bertopic_model.get_topics().keys():
    if topic_num == -1:  # skip outlier topic
        continue
    print(f"\nTopic {topic_num}")
    print([word for word, _ in bertopic_model.get_topic(topic_num)])


Topic 0
['fishing', 'fisheries', 'fish', 'fishermen', 'stocks', 'sea', 'tuna', 'waters', 'marine', 'species']

Topic 1
['women', 'gender', 'equality', 'violence', 'men', 'womens', 'equal', 'girls', 'sexual', 'female']

Topic 2
['budget', 'budgets', 'budgetary', 'appropriations', 'expenditure', 'eur', 'financial', 'year', 'funds', 'billion']

Topic 3
['la', 'di', 'le', 'il', 'che', 'et', 'que', 'en', 'die', 'und']

Topic 4
['farmers', 'agricultural', 'agriculture', 'rural', 'milk', 'production', 'cap', 'producers', 'food', 'dairy']

Topic 5
['energy', 'renewable', 'gas', 'electricity', 'efficiency', 'sources', 'supply', 'market', 'prices', 'energies']

Topic 6
['delegation', 'gallery', 'interparliamentary', 'pleasure', 'warm', 'visit', 'strasbourg', 'fruitful', 'led', 'wish']

Topic 7
['que', 'la', 'en', 'el', 'los', 'para', 'las', 'por', 'se', 'una']

Topic 8
['la', 'et', 'les', 'le', 'des', 'en', 'que', 'nous', 'pour', 'qui']

Topic 9
['social', 'employment', 'workers', 'labour', 'un

In [13]:
# import keywords file to see which topics could be transport related
with open('data/keywords_full.json', 'r', encoding='utf-8') as f:
    keywords_full = set(json.load(f))

In [36]:
# Find transport-related topics
def is_transport_topic(topic_id, min_keywords=3):
    if topic_id == -1:  # Skip outlier topic
        return False
    
    # Get topic words
    topic_words = bertopic_model.get_topic(topic_id)
    words = ' '.join([word.lower() for word, score in topic_words])
    
    # Count matching keywords
    match_count = sum(1 for keyword in keywords_full if keyword in words)
    
    return match_count >= min_keywords

# Identify transport topics
transport_topic_ids = []
for topic_id in topic_info['Topic']:
    if is_transport_topic(topic_id):
        transport_topic_ids.append(topic_id)
        topic_words = bertopic_model.get_topic(topic_id)
        print(f"Topic {topic_id}: {[word for word, score in topic_words[:10]]}")

print(f"\nFound {len(transport_topic_ids)} transport-related topics: {transport_topic_ids}")



Topic 21: ['air', 'aviation', 'airlines', 'passengers', 'airports', 'aircraft', 'safety', 'airport', 'flight', 'transport']
Topic 36: ['road', 'vehicles', 'drivers', 'ecall', 'safety', 'traffic', 'roads', 'driving', 'transport', 'vehicle']
Topic 64: ['transport', 'infrastructure', 'mobility', 'traffic', 'transeuropean', 'projects', 'networks', 'road', 'network', 'multimodal']
Topic 69: ['railway', 'rail', 'transport', 'railways', 'freight', 'passenger', 'interoperability', 'package', 'train', 'passengers']

Found 4 transport-related topics: [21, 36, 64, 69]


In [37]:
df["is_bertopic"] = df["bertopic_topic"].isin(transport_topic_ids)

print(f"\nRows flagged as transport by BERTopic: {df['is_bertopic'].sum()}")


Rows flagged as transport by BERTopic: 371


In [40]:
df[df["is_bertopic"] == True].head()

Unnamed: 0,date,topic,text,term,clean_text,bertopic_topic,bertopic_prob,is_bertopic
36,04-05-1999,11. Charging of heavy goods vehicles,President . – The next item is the recommendat...,94_99,recommendation transport tourism common positi...,36,0.650979,True
63,15-04-1999,4. Transport infrastructure charging,President . – The next item is the report (A4‐...,94_99,schmidbauer transport tourism white paper enti...,64,1.0,True
65,15-04-1999,6. Transport infrastructure charging (continua...,President . – The next item is the continuatio...,94_99,continuation schmidbauer transport tourism whi...,64,0.914965,True
70,15-04-1999,11. TOPICAL AND URGENT DEBATE,President . – The next item is the debate on t...,94_99,topical urgent subjects major importance follo...,36,0.995196,True
156,09-03-1999,14. Transportable pressure equipment,President . – The next item is the recommendat...,94_99,recommendation transport tourism common positi...,36,0.658336,True


### Additional Analysis

In [41]:
bertopic_model.get_representative_docs(21)

 'reasons voting stanislav polk ppe thank granting word previous point consider important explain votei supported proposed regard fact lost thousand jobs years airlinesand think significantly raised finger feel air transportin opinion necessary deal increase competitiveness air industry suggestsbut liked example mentions elimination unnecessary burden security measures unnecessary administrative measures proposed measuresfor reasons voted message grateful gave opportunity explain written reasons voting william earl dartmouth efdd writing ukip voted wishing interfere function countries determining competitiveness industry calling input efficient running airports uk efficient airport operations world furthermore looking increasing responsibilities easa necessity caa regulatory responsibilities passed easa responsibilities taken away marina albiol guzmn guengl writing voted raises privatization does offer solutions main problems packed sectorwith exclusive approach competitiveness interna

In [42]:
bertopic_model.visualize_topics()

Based on this visualization we can notice even more topics potentially related to transport, based on their proximity compared to the transport related topics we already detected, therefore they should probably be counted as well. This brings us to an expanded topic list: 21, 27, 36, 64, 69, 227, 315, 329

In [49]:
bertopic_model.visualize_barchart(topics=[21, 27, 36, 64, 69, 227, 315, 329], n_words=7)

In [50]:
print(transport_topic_ids)

[21, 36, 64, 69]


In [51]:
# add more topics based on intertopic distance map
transport_topic_ids = [21, 27, 36, 64, 69, 227, 315, 329]

print(transport_topic_ids)

[21, 27, 36, 64, 69, 227, 315, 329]


In [52]:
# update is_bertopic column
df["is_bertopic"] = df["bertopic_topic"].isin(transport_topic_ids)

print(f"\nRows flagged as transport by BERTopic: {df['is_bertopic'].sum()}")


Rows flagged as transport by BERTopic: 536


In [54]:
df[df["is_bertopic"] == True].tail()

Unnamed: 0,date,topic,text,term,clean_text,bertopic_topic,bertopic_prob,is_bertopic
22926,12-03-2025,4. Action Plan for the Automotive Industry (de...,President. – The next item on the agenda is th...,24_25,action plan automotive industry rsp adam szapk...,27,0.596925,True
23114,17-12-2024,18. Towards a shared vision for European touri...,Chairman.- The next item on the agenda is the ...,24_25,chairman commissions announcement common visio...,227,1.0,True
23242,23-10-2024,18. Need to strengthen rail travel and the rai...,President.- The agenda is questioned by the Co...,24_25,questioned commissions theme need strengthen r...,69,0.315742,True
23270,21-10-2024,14. Implementation of the Single European Sky ...,Chairwoman.- Another point of the daytime is t...,24_25,chairwoman point daytime recommendation drawn ...,21,0.87858,True
23307,08-10-2024,9. The crisis facing the EU’s automotive indus...,President. – The next item on the agenda is th...,24_25,crisis facing eus automotive industry potentia...,27,0.596925,True


In [None]:
# save updated dataframe with betrtopic results
# if needed as checkpoint
# df.to_csv("data/csv/combined_terms_with_bertopic.csv", index=False)

## LDA Model

In [None]:
# load previously saved dataframe with bertopic results if needed
# df = pd.read_csv("data/csv/combined_terms_with_bertopic.csv")

In [57]:
# shared functions for LDA and NMF

def display_topics(model, feature_names, n_top_words=10):
    """Display top words for each topic"""
    for topic_idx, topic in enumerate(model.components_):
        top_indices = topic.argsort()[-n_top_words:][::-1]
        top_words = [feature_names[i] for i in top_indices]
        print(f"Topic {topic_idx}: {', '.join(top_words)}")

def assign_transport_topics(df, topic_col, transport_topic_ids, flag_col):
    """Flag documents belonging to transport topics"""
    df[flag_col] = df[topic_col].isin(transport_topic_ids)
    print(f"Transport documents: {df[flag_col].sum()}")
    return df

In [74]:
# using the same cleaned text from the previous model
count_vectorizer = CountVectorizer(max_df=0.90, min_df=5, max_features=5000, stop_words=list(stop_words))
count_data = count_vectorizer.fit_transform(docs)

In [75]:
# for number of topics we need to put a higher number than usual
# since the dataset is quite large and bertopic found more than 300 topics
n_topics = 100

In [76]:
# Train LDA
lda_model = LatentDirichletAllocation(n_components=n_topics, random_state=42, n_jobs=-1)
lda_topics = lda_model.fit_transform(count_data)

In [77]:
print("=== LDA Topics ===")
display_topics(lda_model, count_vectorizer.get_feature_names_out(), n_top_words=10)

=== LDA Topics ===
Topic 0: vehicles, tunisia, oil, vehicle, car, cars, olive, motor, tunisian, road
Topic 1: ireland, irish, uk, northern, british, government, kingdom, people, peace, brexit
Topic 2: treaty, europe, citizens, political, national, lisbon, states, constitution, parliaments, democratic
Topic 3: people, employment, young, unemployment, social, jobs, youth, states, europe, work
Topic 4: regulation, writing, voted, ppe, rules, legislation, new, legal, products, sd
Topic 5: data, protection, states, personal, citizens, information, security, privacy, united, agreement
Topic 6: social, economic, employment, growth, policies, policy, strategy, states, poverty, market
Topic 7: states, legislation, legal, animals, animal, need, rules, law, important, issue
Topic 8: market, single, internal, companies, services, economy, competition, new, important, consumers
Topic 9: presidency, countries, states, presidentinoffice, europe, policy, new, enlargement, summit, time
Topic 10: die, u

In [82]:
df["lda_topic"] = lda_topics.argmax(axis=1)
df["lda_prob"] = lda_topics.max(axis=1)

In [83]:
df.head()

Unnamed: 0,date,topic,text,term,clean_text,bertopic_topic,bertopic_prob,is_bertopic,lda_topic,lda_prob
0,07-05-1999,1. VOTES,President . – We shall now move on to the vote...,94_99,shall votes regulation ecsc ec euratom incorpo...,11,0.155931,False,85,0.246812
1,07-05-1999,2. Potato starch,President . – The next item is the proposal fo...,94_99,regulation amending regulation ec establishing...,4,0.273608,False,17,0.404205
2,07-05-1999,3. Hannover 2000,President . – The next item is the report (A4‐...,94_99,hoppenstedt culture youth education media comm...,39,0.194263,False,62,0.431773
3,07-05-1999,4. European textiles market,President . – The next item is the joint debat...,94_99,following b ferrer peijs chanterie group peopl...,235,1.0,False,85,0.289002
4,06-05-1999,1. Approval of the Minutes,President . – The Minutes of yesterday's sitti...,94_99,distributed spencer ppe chairman foreign affai...,-1,0.0,False,85,0.670585


In [84]:
# transport topics identified after manual review
lda_transport_topics = [0, 19, 72, 90, 93]
df = assign_transport_topics(df, "lda_topic", lda_transport_topics, "is_lda")

Transport documents: 442


In [85]:
df[df["is_lda"] == True].tail()

Unnamed: 0,date,topic,text,term,clean_text,bertopic_topic,bertopic_prob,is_bertopic,lda_topic,lda_prob,is_lda
22866,02-04-2025,13. The importance of trans-European transport...,President.- The agenda bears the declarations ...,24_25,bears importance transe transport infrastructu...,-1,0.0,False,19,0.252174,True
23114,17-12-2024,18. Towards a shared vision for European touri...,Chairman.- The next item on the agenda is the ...,24_25,chairman commissions announcement common visio...,227,1.0,True,93,0.210438,True
23220,24-10-2024,12. Protecting our oceans: persistent threats ...,President.- The agenda questioned the declarat...,24_25,questioned protection oceans persistent threat...,0,0.43822,False,90,0.280474,True
23242,23-10-2024,18. Need to strengthen rail travel and the rai...,President.- The agenda is questioned by the Co...,24_25,questioned commissions theme need strengthen r...,69,0.315742,True,19,0.36362,True
23270,21-10-2024,14. Implementation of the Single European Sky ...,Chairwoman.- Another point of the daytime is t...,24_25,chairwoman point daytime recommendation drawn ...,21,0.87858,True,72,0.336446,True


In [None]:
# save updated dataframe with LDA results (and bertopic results if they were already present)
# if needed as checkpoint
# df.to_csv("data/csv/combined_terms_with_lda.csv", index=False)

## NMF Model

In [None]:
# load some of the previously saved dataframes if needed
# df = pd.read_csv("data/csv/combined_terms_with_bertopic.csv")
# df = pd.read_csv("data/csv/combined_terms_with_lda.csv")

In [78]:
# Vectorize with TF-IDF instead of CountVectorizer for NMF
# same hypermarameter settings as for CountVectorizer and LDA
tfidf_vectorizer = TfidfVectorizer(max_df=0.90, min_df=5, max_features=5000, stop_words=list(stop_words))
tfidf_data = tfidf_vectorizer.fit_transform(docs)

In [79]:
# same number of topics as LDA for comparison
n_topics = 100

In [80]:
# Train NMF
nmf_model = NMF(n_components=n_topics, random_state=42, init='nndsvda')
nmf_topics = nmf_model.fit_transform(tfidf_data)

In [81]:
print("=== NMF Topics ===")
display_topics(nmf_model, tfidf_vectorizer.get_feature_names_out(), n_top_words=10)

=== NMF Topics ===
Topic 0: europe, people, citizens, today, world, new, future, years, political, democracy
Topic 1: que, la, en, los, el, las, para, por, una, se
Topic 2: details, results, concerning, time, proceed, follows, shall, point, bears, shift
Topic 3: pm, explain, open, period, explanations, dos, martnez, opens, night, coordination
Topic 4: elections, political, country, democratic, government, democracy, election, opposition, venezuela, situation
Topic 5: die, der, und, ist, wir, das, ich, den, zu, prsident
Topic 6: discharge, agency, budget, execution, financial, auditors, agencies, accounts, year, control
Topic 7: conference, presidents, thursday, wednesday, group, partsession, tuesday, tomorrow, afternoon, groups
Topic 8: writing, voted, ppe, favor, eu, sd, favour, guengl, supported, pt
Topic 9: suspended, moments, deputy, pedro, martnez, noon, sylvie, roberta, pending, karas
Topic 10: fishing, fish, fishermen, tuna, species, sea, waters, vessels, stocks, sustainable
Top

In [87]:
df["nmf_topic"] = nmf_topics.argmax(axis=1)
df["nmf_prob"] = nmf_topics.max(axis=1)

In [88]:
# transport topics identified after manual review
nmf_transport_topics = [22, 53, 76]
df = assign_transport_topics(df, "nmf_topic", nmf_transport_topics, "is_nmf")

Transport documents: 592


In [89]:
df[df["is_nmf"] == True].tail()

Unnamed: 0,date,topic,text,term,clean_text,bertopic_topic,bertopic_prob,is_bertopic,lda_topic,lda_prob,is_lda,nmf_topic,nmf_prob,is_nmf
22810,06-05-2025,6.1. CO2 emission performance standards for ne...,The president.-The first vote relates to perfo...,24_25,presidentthe relates performance standards emi...,27,1.0,True,0,0.398724,True,53,0.03123,True
22864,02-04-2025,11. European oceans pact (debate),Chairman.- Another point of the program is the...,24_25,chairman point program ocean pact rsp adam sza...,0,0.41764,False,90,0.287918,True,76,0.039847,True
22866,02-04-2025,13. The importance of trans-European transport...,President.- The agenda bears the declarations ...,24_25,bears importance transe transport infrastructu...,-1,0.0,False,19,0.252174,True,22,0.063462,True
23242,23-10-2024,18. Need to strengthen rail travel and the rai...,President.- The agenda is questioned by the Co...,24_25,questioned commissions theme need strengthen r...,69,0.315742,True,19,0.36362,True,22,0.067017,True
23270,21-10-2024,14. Implementation of the Single European Sky ...,Chairwoman.- Another point of the daytime is t...,24_25,chairwoman point daytime recommendation drawn ...,21,0.87858,True,72,0.336446,True,22,0.026686,True


In [90]:
# save updated dataframe with NMF results (and bertopic and/or LDA results if they were already present)
# if needed as checkpoint
df.to_csv("data/csv/combined_terms_with_nmf.csv", index=False)

## Top2Vec Model

In [None]:
# load some of the previously saved dataframes if needed
# df = pd.read_csv("data/csv/combined_terms_with_bertopic.csv")
# df = pd.read_csv("data/csv/combined_terms_with_lda.csv")
# df = pd.read_csv("data/csv/combined_terms_with_nmf.csv")

In [None]:
# again use same docs as before
# docs = df["clean_text"].tolist()

In [6]:
# train model

top2vec_model = Top2Vec(
    documents=docs,
    speed="learn",
    workers=-1,
    verbose=True
)

2025-10-18 14:31:24,014 - top2vec - INFO - Pre-processing documents for training
2025-10-18 14:33:16,414 - top2vec - INFO - Downloading all-MiniLM-L6-v2 model
2025-10-18 14:33:21,144 - top2vec - INFO - Creating joint document/word embedding
2025-10-18 14:35:11,004 - top2vec - INFO - Creating lower dimension embedding of documents
2025-10-18 14:35:35,960 - top2vec - INFO - Finding dense areas of documents
2025-10-18 14:35:47,131 - top2vec - INFO - Finding topics


In [7]:
print(f"Number of topics found: {top2vec_model.get_num_topics()}")

Number of topics found: 227


In [8]:
topic_sizes, topic_nums = top2vec_model.get_topic_sizes()

In [12]:
topic_words, word_scores, topic_nums = top2vec_model.get_topics()

print("\n=== Top2Vec Topics ===")
for words, scores, num in zip(topic_words, word_scores, topic_nums):
    print(f"Topic {num}: {', '.join(words[:10])}")


=== Top2Vec Topics ===
Topic 0: votethis, votes, committeethis, voting, electionsi, parliamentthis, deliberation, ballots, chairmanthis, voteit
Topic 1: budgetas, budgeti, budgetisation, budgetary, reformsi, budgetand, managementi, agenciesi, committees, budgeting
Topic 2: legislations, legislationthis, legislationi, parliaments, committees, legislationit, parliamentary, legislative, legislationthe, legislation
Topic 3: reformsthe, legislations, compliance, enforcement, enforces, reformsi, securityi, legislationthis, reforms, safeguards
Topic 4: fisheries, fishery, fishingthe, fishingthis, fishing, fishermens, fishermen, fishingi, fishingit, fished
Topic 5: feminista, feminism, womenit, womeni, suffrage, genders, feminists, womens, discriminationi, genderrelated
Topic 6: agricultures, agriculture, agricultural, agricultureit, agriculturethe, farmers, crops, farms, agrarian, farmersthe
Topic 7: wordings, translations, translating, amendements, rewording, agreementas, translation, agree

In [15]:
# assign topics to documents
topic_nums_assigned, topic_scores, topics_words, word_scores = top2vec_model.get_documents_topics(list(range(len(docs))))


In [16]:
df["top2vec_topic"] = topic_nums_assigned
df["top2vec_score"] = topic_scores

In [None]:
top2vec_transport_topics = []

# too many topics for manual inspection, use keyword matching instead
print("\n=== Top2Vec Transport Topics ===")
for words, scores, num in zip(topic_words, word_scores, topic_nums):
    words_str = ' '.join([word.lower() for word in words])
    match_count = sum(1 for keyword in keywords_full if keyword in words_str)
    
    if match_count >= 3: # try out different thresholds
        top2vec_transport_topics.append(num)
        print(f"Topic {num}: {', '.join(words[:10])}")

print(f"\nFound {len(top2vec_transport_topics)} transport-related topics: {top2vec_transport_topics}")


=== Top2Vec Transport Topics ===
Topic 15: transportation, transportthis, transporti, transport, transportthe, motorways, transporters, transportul, highways, transporter
Topic 41: airlines, airline, aviation, airports, passengers, regulationsi, flights, regulationsthis, regulationi, airport
Topic 42: maritimes, maritime, tankers, seaports, seafarers, ferries, tanker, shipbuilding, harbours, ships
Topic 100: railways, railway, rail, railroad, trains, locomotives, transporti, transportation, train, transportthis

Found 4 transport-related topics: [15, 41, 42, 100]


In [21]:
df["is_top2vec"] = df["top2vec_topic"].isin(top2vec_transport_topics)

In [22]:
print(f"\nRows flagged as transport by Top2Vec: {df['is_top2vec'].sum()}")


Rows flagged as transport by Top2Vec: 672


In [24]:
df[df["is_top2vec"] == True].tail()

Unnamed: 0,date,topic,text,term,clean_text,bertopic_topic,bertopic_prob,is_bertopic,lda_topic,lda_prob,is_lda,nmf_topic,nmf_prob,is_nmf,top2vec_topic,top2vec_score,is_top2vec
22723,17-06-2025,"18. EU framework conditions for competitive, e...",President. – The next item is the debate on th...,24_25,eu framework conditions competitive efficient ...,64,0.695874,True,34,0.255012,False,22,0.084211,True,15,0.804255,True
22724,17-06-2025,20. Latest developments on the revision of the...,The President.- The next point of the agenda f...,24_25,point follows explanations latest developments...,21,0.161534,True,93,0.553292,True,17,0.026698,False,41,0.836053,True
22866,02-04-2025,13. The importance of trans-European transport...,President.- The agenda bears the declarations ...,24_25,bears importance transe transport infrastructu...,-1,0.0,False,19,0.252174,True,22,0.063462,True,100,0.787515,True
23242,23-10-2024,18. Need to strengthen rail travel and the rai...,President.- The agenda is questioned by the Co...,24_25,questioned commissions theme need strengthen r...,69,0.315742,True,19,0.36362,True,22,0.067017,True,100,0.879598,True
23270,21-10-2024,14. Implementation of the Single European Sky ...,Chairwoman.- Another point of the daytime is t...,24_25,chairwoman point daytime recommendation drawn ...,21,0.87858,True,72,0.336446,True,22,0.026686,True,41,0.796716,True


In [25]:
# save updated dataframe with Top2Vec results (and other model results if they were already present)
# if needed as checkpoint
df.to_csv("data/csv/combined_terms_with_top2vec.csv", index=False)