In [1]:
import glob
import pandas as pd

from bertopic import BERTopic

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
regions = ["UK", "US", "MiddleEast", "India", "Israel"]

In [3]:
def consolidate_links_data(region: str) -> None:
    df_list = []
    for month in glob.glob(f"../data/links/WayBackMachine/{region}/*"):
            for file in glob.glob(f"{month}/*.csv"):
                df_list.append(pd.read_csv(file))
    combined_df = pd.concat(df_list)
    combined_df.to_csv(f"../data/processed/WayBackMachine/{region}.csv", index=False)
            

In [None]:
for reg in regions:
    consolidate_links_data(reg)

In [22]:
selected_domains = {"UK":['dailymail.co.uk', 'independent.co.uk', 'theguardian.com', 'bbc.co.uk'],
"US":['nytimes.com', 'cbsnews.com', 'foxnews.com', 'nypost.com', 'npr.org', 'breitbart.com'],
"MiddleEast":['mehrnews.com', 'cumhuriyet.com.tr', 'dailysabah.com', 'aksam.com.tr'],
"India":['indiatimes.com', 'tribuneindia.com', 'hindustantimes.com', 'firstpost.com', 'thehindu.com'],
"Israel":['timesofisrael.com', 'jpost.com', 'ynetnews.com', 'israelhayom.co.il', 'maariv.co.il']}

In [23]:
for reg in regions:
    df = pd.read_csv(f"../data/links/WayBackMachine/consolidated/{reg}.csv")
    df = df[df.language == "en"]
    df = df[df.media_name.isin(selected_domains[reg])]
    print(reg, len(df))
    df.to_csv(f"../data/links/WayBackMachine/selected/{reg}.csv", index=False)

UK 10519
US 13879
MiddleEast 4010
India 32642
Israel 24532


In [4]:
all_df = []
for reg in regions:
    df = pd.read_csv(f"../data/links/WayBackMachine/selected/{reg}.csv")
    all_df.append(df)
all_df = pd.concat(all_df)

In [6]:
titles = all_df.title.values

In [7]:
topic_model = BERTopic(language="english", min_topic_size=100)
topics, probs = topic_model.fit_transform(titles)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [9]:
len(all_df)

85582

In [8]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,37461,-1_in_to_of_the,"[in, to, of, the, gaza, for, hamas, israel, an...",[Israel says war in Gaza could go on for month...
1,0,2478,0_sea_red_houthi_houthis,"[sea, red, houthi, houthis, yemen, ship, attac...",[Houthis claim new attacks on Red Sea shipping...
2,1,2219,1_harvard_students_antisemitism_university,"[harvard, students, antisemitism, university, ...",[Harvard sued by Jewish students over ‘rampant...
3,2,2169,2_the_we_our_you,"[the, we, our, you, my, and, war, of, cohen, m...",[We Must Have Hope That Everything Will Be Goo...
4,3,2157,3_iran_iraq_syria_us,"[iran, iraq, syria, us, drone, iranian, irans,...",[Iran strikes targets in northern Iraq and Syr...
...,...,...,...,...,...
106,105,108,105_grapevine_susan_sarandon_dropped,"[grapevine, susan, sarandon, dropped, taste, 2...",[Susan Sarandon Apologizes for Comment About J...
107,106,107,106_education_schools_exams_teachers,"[education, schools, exams, teachers, exam, st...","[Appearing for class 10, 12 board exams twice ..."
108,107,106,107_iran_irans_khamenei_hamas,"[iran, irans, khamenei, hamas, attack, war, is...",[Iran's Khamenei says Tehran was not behind Ha...
109,108,104,108_trump_trial_fraud_donald,"[trump, trial, fraud, donald, judge, trumps, i...",[Eye Opener: Former President Donald Trump cla...


In [10]:
topic_model.save("../models/", serialization="pytorch", save_ctfidf=True)