In [None]:
import pandas as pd
import torch
import numpy as np
import tqdm
from bertopic import BERTopic

### Global Topic Modeling Task - Analysing each community as a whole

In [13]:
# Read the two CSV inputs used by the rest of the notebook.
text_csv_path = '../../../src/nlp/cleaned_dataset.csv'
community_csv_path = '../../../src/data/distribuitions/hub_bridge_df.csv'

text_df = pd.read_csv(text_csv_path)
community_df = pd.read_csv(community_csv_path)

In [14]:
# Inner-join the text rows to the community table: text rows are matched on
# `author`, community rows on `id`; rows without a match on either side are dropped.
merged_df = pd.merge(
    text_df,
    community_df,
    how='inner',
    left_on='author',
    right_on='id',
)

# For every community_id, collect its clean_text values into a Python list,
# then turn the group key back into a regular column.
texts_by_community = merged_df.groupby('community_id')['clean_text']
community_texts = texts_by_community.apply(list).reset_index()

# Alternatively, to get a single joined string per community instead of a list:
# .apply(lambda x: " ".join(x))

In [15]:
# Display the aggregated frame: one row per community_id with its list of clean_text values.
community_texts

Unnamed: 0,community_id,clean_text
0,0.0,[trump announced new weapons ukraine monday th...
1,1.0,[trump doge making massive cuts federal workfo...
2,2.0,[every friday posts withheld review moderators...
3,3.0,[terrorist defined person uses unlawful violen...
4,4.0,[usually hears controlled opposition it s rega...
...,...,...
73,73.0,[point 2 say reddit prides open discourse howe...
74,74.0,[ok disney hot hectic exhausting often unneces...
75,75.0,[point four say something anything rapist jail...
76,76.0,[nta complaining ready step pay additional ser...


In [21]:
# Select the rows belonging to community 0. The explicit `.copy()` makes this an
# independent frame, so the column assignment below does not write into a
# slice of merged_df (which triggers pandas' SettingWithCopyWarning).
community_0 = merged_df.loc[merged_df['community_id'] == 0.0].copy()

# Cast to str so every document handed to BERTopic is a string.
# NOTE(review): a missing text value would become the literal 'nan' here —
# confirm whether such rows exist and should be dropped instead.
community_0['clean_text'] = community_0['clean_text'].astype(str)

# Fit a default-configured BERTopic model on this community's documents.
topic_model = BERTopic()
topics, probs = topic_model.fit_transform(community_0['clean_text'].tolist())



In [23]:
# Show the fitted model's per-topic summary table (topic id, document count, name, top terms, sample docs).
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,12576,-1_trump_us_right_iran,"[trump, us, right, iran, going, way, party, mu...",[immigration enforcement driving wages goal tr...
1,0,393,0_moderate_progressive_progressives_democrats,"[moderate, progressive, progressives, democrat...",[wrote somewhere else yes lot progressive poli...
2,1,319,1_tariffs_trade_tariff_manufacturing,"[tariffs, trade, tariff, manufacturing, countr...",[reciprocal tariffs often lead tariffs countri...
3,2,307,2_tax_taxes_income_federal,"[tax, taxes, income, federal, rate, pay, rates...",[can t eliminate deficit level taxation rich f...
4,3,301,3_thanks_thank_ok_yeah,"[thanks, thank, ok, yeah, okay, oh, question, ...","[interesting thank, ok gotcha thanks, yeah thi..."
...,...,...,...,...,...
241,240,11,240_economists_economics_monopolies_askeconomics,"[economists, economics, monopolies, askeconomi...",[economists glorified tarot card readers say e...
242,241,11,241_contempt_court_comply_order,"[contempt, court, comply, order, jail, ordered...",[shit post article states administration cited...
243,242,10,242_regime_change_split_south,"[regime, change, split, south, diem, holland, ...",[question practical strategic peaceful steps m...
244,243,10,243_plan_cbo_civically_peoplemwho,"[plan, cbo, civically, peoplemwho, pf, sellout...",[peoplemwho know away shit rewarding friends e...


In [25]:
# Inspect the top terms and their scores for topic -1.
# NOTE(review): per BERTopic's documentation, -1 is the outlier topic — confirm.
topic_model.get_topic(-1)

[('trump', np.float64(0.0038465007919400466)),
 ('us', np.float64(0.0031451766527744282)),
 ('right', np.float64(0.003023903134446666)),
 ('iran', np.float64(0.0029418798153620055)),
 ('going', np.float64(0.0026531606211747144)),
 ('way', np.float64(0.0026448898537439315)),
 ('party', np.float64(0.0026392663035120006)),
 ('much', np.float64(0.002616948610481152)),
 ('make', np.float64(0.0025407126947250723)),
 ('know', np.float64(0.002537601302137159))]