In [9]:
import sys
import os

# Make the project's local modules importable from this notebook.
# Both paths are resolved relative to the current working directory, so they
# only point at the right place when the kernel is started from the
# notebook's own folder.
sys.path.append(
    os.path.abspath(os.path.join(os.getcwd(), "..", ".."))
)  # directory two levels up; lets `from scripts.<module> import ...` resolve
sys.path.append(
    os.path.abspath(os.path.join(os.getcwd(), "..", "..", "scripts"))
)  # the `scripts` folder itself; presumably for imports made inside those modules - verify
import pandas as pd
import numpy as np
from scripts.my_text_cleaning import clean_dataframe
from scripts.parallel_topic_model import deduplicate_text_and_embeddings
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.representation import MaximalMarginalRelevance
from nltk.corpus import stopwords

In [10]:
# Dataset to process. Other available choices: "covid_tweets_en", "ukraine_tweets_en".
chosen_dataset = "cop26_tweets_en"

# Raw tweets for the chosen dataset.
ur_df = pd.read_parquet("./../../data/raw/" + chosen_dataset + ".parquet")

# Per-document table (text, assigned topic, ...); only the columns used below are kept.
doc_info = pd.read_csv(
    "./../../data/processed/document_info_" + chosen_dataset + ".csv"
)[["Document", "Topic", "Representative_document", "Name"]]
doc_info["Topic"] = doc_info["Topic"].astype(int)

# Per-topic table. A stray "Unnamed: 0" index column is dropped when present
# (errors="ignore" makes the drop a no-op when the column is absent).
topic_info = pd.read_csv(
    "./../../data/processed/topic_info_" + chosen_dataset + ".csv"
).drop(columns=["Unnamed: 0"], errors="ignore")

# Precomputed embeddings; the printed lengths below show one row per raw tweet.
embeddings = np.load("./../../data/processed/" + chosen_dataset + ".parquet.npy")

# Clean the raw text; the result carries an additional 'Cleantext' column.
cln_df = clean_dataframe(
    ur_df,
    'text',
    phrases_to_remove=["&gt;", "&lt;", "&amp;", "RT : "],
    remove_empty=False,
    remove_urls=True,
    normalize_hashtags=True,
    normalize_mentions=True,
    user_placeholder="user",
    strip_punctuation=False,
    lowercase=False,
)
unique_docs, unique_embeddings = deduplicate_text_and_embeddings(
    cln_df, embeddings, 'Cleantext'
)

# Load the fitted topic model once, from the path that follows `chosen_dataset`.
# (An earlier extra load from a hard-coded cop26 "with_hashtags" path was
# immediately overwritten by this one and has been removed.)
topic_model = BERTopic.load(
    "./../../models/" + chosen_dataset + ".parquet.topic_model",
    embedding_model="all-mpnet-base-v2",
)

# Sanity checks: embeddings should match cln_df in length, and the
# deduplicated documents should match doc_info.
print(f"{len(embeddings)=}")
print(f"{len(cln_df)=}")
print(f"{len(unique_docs)=}")
print(f"{len(doc_info)=}")
print(cln_df.columns)
print(len(unique_docs), len(unique_embeddings))



len(embeddings)=135636
len(cln_df)=135636
len(unique_docs)=105383
len(doc_info)=105383
Index(['id', 'author_id', 'created_at', 'lang', 'text', 'retweeted_id',
       'quoted_id', 'replied_id', 'url', 'expanded_url', 'hashtags',
       'retweet_count', 'reply_count', 'like_count', 'quote_count', 'username',
       'individual_or_org', 'category', 'Cleantext'],
      dtype='object')
105383 105383


In [11]:
# Drop the references to the embedding arrays (they are not used by the cells
# below) while keeping both names defined.
embeddings = None
unique_embeddings = None

In [12]:
# NLTK stopword lists for English and Spanish.
stopwords_en = stopwords.words("english")
stopwords_es = stopwords.words("spanish")

# Extra stopwords per dataset. The commented-out words are further candidates
# that are currently disabled.
stopwords_by_dataset = {
    "covid_tweets_en": [
        "covid",
        "covid19",
        "coronavirus",
        # "pandemic", "virus", "people", "get", "like", "one", "new", "cases",
        # "health", "vaccine", "vaccines", "vaccinated", "deaths", "time",
        # "year", "day", "years",
    ],
    "ukraine_tweets_en": [
        "ukraine",
        "russia",
        # "war", "russian", "people", "like", "one", "get", "just", "know",
        # "time", "day", "year", "years", "donbas", "ukrainian", "military",
        # "ukrainians", "today",
    ],
    "cop26_tweets_en": [
        "cop26",
        # "climate", "people", "like", "one", "get", "just", "know", "time",
        # "day", "year", "years", "action", "change", "global", "world",
        # "new", "need",
    ],
}
# Unknown dataset names fall back to no dataset-specific stopwords.
data_specific_stopwords = stopwords_by_dataset.get(chosen_dataset, [])

# URL fragments, plus "user" (the mention placeholder used when cleaning) and "rt".
web_and_placeholder_tokens = ["http", "https", "amp", "www", "com"] + ["user", 'rt']

custom_stopwords = set(stopwords_en)
custom_stopwords.update(stopwords_es)
custom_stopwords.update(web_and_placeholder_tokens)
custom_stopwords.update(data_specific_stopwords)
print(custom_stopwords)

{'teníamos', 'esta', 'don', 'estuvo', 'fuerais', 'tendrían', 'estábamos', 'tengas', 'doing', "should've", 'tú', 'estado', 'its', 'fui', 'habrían', 'should', 'en', 'somos', 'que', 'estéis', 'she', 'unos', 'tuviese', 'tendríamos', 'todo', 'contra', "you'll", "mustn't", 'los', 'con', 'up', 'vosotras', 'so', 'most', 'habidos', 'tuvieses', 'shan', 'sentido', 'vosotros', 'own', 'only', 'ya', "mightn't", 'estas', 'estaréis', 'estuvimos', 'hubiera', "he'd", 'than', 'out', 'user', 'i', "it's", 'itself', 'needn', "we're", 'those', 'estaban', 'is', 'fuiste', 'algo', 'before', 'ella', 'os', 'tuvieras', 'been', 'seáis', 'míos', 'erais', 'after', 'me', "couldn't", 'few', 'am', "we'll", 'because', 'suyo', 'isn', 'otras', 'under', 'al', 'com', 'fueras', 'he', 'as', 'estuviese', 'estaríais', 'su', 'him', 'estar', 'hasta', 'sintiendo', 'estarías', 'será', 'es', 'habríamos', "she'll", 'tendrías', 'not', 'my', 'our', 'tenéis', 'vuestra', 'couldn', 'habiendo', 'esto', 'nor', "shan't", 'tuvierais', 'above',

In [13]:
# Vectorizer that ignores the custom stopwords, and an extra "MMR"
# representation (diversity 0.7) to compute alongside the default one.
vectorizer_model = CountVectorizer(stop_words=list(custom_stopwords))
representation_models = {"MMR": MaximalMarginalRelevance(diversity=0.7)}

# Documents and their topic ids, taken row by row from doc_info.
docs_clean = doc_info["Document"].tolist()
topics = doc_info["Topic"].tolist()

# Update the model's topic representations in place for these documents/topics.
topic_model.update_topics(
    docs=docs_clean,
    topics=topics,
    vectorizer_model=vectorizer_model,
    representation_model=representation_models,
)




In [14]:
# Topic table after the update; it includes the new "MMR" keyword column.
new_topic_info = topic_model.get_topic_info()

# Label each topic as "<topic id>_<MMR keywords joined by underscores>".
new_topic_info['New_Name'] = [
    str(topic_id) + "_" + "_".join(mmr_words)
    for topic_id, mmr_words in zip(new_topic_info['Topic'], new_topic_info['MMR'])
]

new_topic_info

Unnamed: 0,Topic,Count,Name,Representation,MMR,Representative_Docs,New_Name
0,-1,41783,-1_climate_glasgow_world_change,"[climate, glasgow, world, change, action, summ...","[glasgow, world, change, today, new, climateac...",,-1_glasgow_world_change_today_new_climateactio...
1,0,7872,0_draft_cop_going_us,"[draft, cop, going, us, time, leaders, negotia...","[draft, going, leaders, plenary, week, world, ...",,0_draft_going_leaders_plenary_week_world_still...
2,1,5538,1_fossil_coal_fuel_oil,"[fossil, coal, fuel, oil, fuels, phase, gas, n...","[subsidies, industry, power, stopcambo, countr...",,1_subsidies_industry_power_stopcambo_countries...
3,2,2586,2_youth_young_people_children,"[youth, young, people, children, youth4climate...","[youth4climate, education, leaders, world, cli...",,2_youth4climate_education_leaders_world_climat...
4,3,2164,3_transport_cycling_electric_vehicles,"[transport, cycling, electric, vehicles, trave...","[electric, closures, glasgow, public, active, ...",,3_electric_closures_glasgow_public_active_mobi...
...,...,...,...,...,...,...,...
73,72,158,72_sgk_university_football_sustainability,"[sgk, university, football, sustainability, ed...","[sgk, university, sustainability, summer, maga...",,72_sgk_university_sustainability_summer_magazi...
74,73,158,73_rwanda_greenrwanda_investinrwanda_ngirente,"[rwanda, greenrwanda, investinrwanda, ngirente...","[rwanda, investment, green, edouard, delegatio...",,73_rwanda_investment_green_edouard_delegation_...
75,74,155,74_wheelchair_disabled_israeli_israel,"[wheelchair, disabled, israeli, israel, minist...","[israel, elharrar, karine, attend, disabilitie...",,74_israel_elharrar_karine_attend_disabilities_...
76,75,153,75_hydrogen_green_ambulance_energy,"[hydrogen, green, ambulance, energy, forrest, ...","[ambulance, scotlandisnow, zero, reneweconomy,...",,75_ambulance_scotlandisnow_zero_reneweconomy_s...


In [15]:
# Join the previously saved topic table with the refreshed columns on "Topic".
# The old "Representation" column is dropped so the new one replaces it.
merged_topic_info = pd.merge(
    topic_info.drop(columns=["Representation"]),
    new_topic_info[["Topic", 'MMR', "New_Name", "Representation"]],
    on="Topic",
    suffixes=("_old", "_new"),
)
merged_topic_info

Unnamed: 0,Topic,Count,Name,Representative_Docs,MMR,New_Name,Representation
0,-1,41783,-1_user_cop26_climate_rt,"['RT user: Cop26 climate strike', 'RT user: us...","[glasgow, world, change, today, new, climateac...",-1_glasgow_world_change_today_new_climateactio...,"[climate, glasgow, world, change, action, summ..."
1,0,7872,0_user user_user_cop26_rt user,['user user user user user user user user user...,"[draft, going, leaders, plenary, week, world, ...",0_draft_going_leaders_plenary_week_world_still...,"[draft, cop, going, us, time, leaders, negotia..."
2,1,5538,1_fossil_coal_fossil fuel_fuel,['user user user says if the final COP26 text ...,"[subsidies, industry, power, stopcambo, countr...",1_subsidies_industry_power_stopcambo_countries...,"[fossil, coal, fuel, oil, fuels, phase, gas, n..."
3,2,2586,2_youth_young_young people_people,['RT user: Young people like me have made hist...,"[youth4climate, education, leaders, world, cli...",2_youth4climate_education_leaders_world_climat...,"[youth, young, people, children, youth4climate..."
4,3,2164,3_transport_cycling_electric_vehicles,"[""It's Transport Day COP26. Cycling is a cost-...","[electric, closures, glasgow, public, active, ...",3_electric_closures_glasgow_public_active_mobi...,"[transport, cycling, electric, vehicles, trave..."
...,...,...,...,...,...,...,...
73,72,158,72_sgk planet_sgk_university_education university,['SGK-PLANET MAGAZINE SERIES: The ParisAgreeme...,"[sgk, university, sustainability, summer, maga...",72_sgk_university_sustainability_summer_magazi...,"[sgk, university, football, sustainability, ed..."
74,73,158,73_rwanda_greenrwanda_investinrwanda_greenrwan...,['Attending the COP26 UN Climate Change Summit...,"[rwanda, investment, green, edouard, delegatio...",73_rwanda_investment_green_edouard_delegation_...,"[rwanda, greenrwanda, investinrwanda, ngirente..."
75,74,155,74_wheelchair_disabled_israeli_minister,['RT user: UK apologises to Israeli minister f...,"[israel, elharrar, karine, attend, disabilitie...",74_israel_elharrar_karine_attend_disabilities_...,"[wheelchair, disabled, israeli, israel, minist..."
76,75,153,75_hydrogen_green hydrogen_green_energy,"[""Opening our Climate Ambition Zone hydrogen e...","[ambulance, scotlandisnow, zero, reneweconomy,...",75_ambulance_scotlandisnow_zero_reneweconomy_s...,"[hydrogen, green, ambulance, energy, forrest, ..."


In [16]:
# Write the merged topic table next to the other processed files.
# NOTE(review): list-valued columns (e.g. MMR, Representation) are written to
# CSV as their string form - confirm that downstream readers parse them back.
output_path = f"./../../data/processed/topic_info_{chosen_dataset}_with_MMR.csv"
merged_topic_info.to_csv(output_path, index=False)