In [None]:
import sys
import os

# Extend the module search path with two directories, both resolved relative to
# the current working directory: the directory two levels up, and that
# directory's "scripts" subfolder. The `scripts.*` imports below presumably
# rely on the first entry; the second presumably lets modules inside scripts/
# import each other by bare name — TODO confirm.
# NOTE: this depends on the notebook being started from its own folder, since
# os.getcwd() is the anchor.
sys.path.append(
    os.path.abspath(os.path.join(os.getcwd(), "..", ".."))
)  # Adjust as needed
sys.path.append(
    os.path.abspath(os.path.join(os.getcwd(), "..", "..", "scripts"))
)  # Adjust as needed
import pandas as pd
import numpy as np
from scripts.my_text_cleaning import clean_dataframe
from scripts.parallel_topic_model import deduplicate_text_and_embeddings
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.representation import MaximalMarginalRelevance
from nltk.corpus import stopwords

In [2]:
# Dataset to process. Exactly one assignment so the active choice is unambiguous;
# the other options handled further down are "covid_tweets_en" and "cop26_tweets_en".
chosen_dataset = "ukraine_tweets_en"

# Load the raw tweets and the processed per-document / per-topic tables for the
# chosen dataset.
ur_df = pd.read_parquet("./../../data/raw/" + chosen_dataset + ".parquet")
doc_info = pd.read_csv(
    "./../../data/processed/document_info_" + chosen_dataset + ".csv"
)[["Document", "Topic", "Representative_document", "Name"]]
topic_info = pd.read_csv(
    "./../../data/processed/topic_info_" + chosen_dataset + ".csv"
)
# Drop the stray index column a previous to_csv may have written. Reassigning
# (rather than inplace=True) keeps the cell idempotent on re-run.
topic_info = topic_info.drop(columns=["Unnamed: 0"], errors="ignore")
doc_info["Topic"] = doc_info["Topic"].astype(int)
embeddings = np.load("./../../data/processed/" + chosen_dataset + ".parquet.npy")

# Re-create the cleaned text column and deduplicate it together with the
# embeddings. The results are only used for the size checks printed below.
cln_df = clean_dataframe(
    ur_df,
    'text',
    phrases_to_remove=["&gt;", "&lt;", "&amp;", "RT : "],
    remove_empty=False,
    remove_urls=True,
    normalize_hashtags=True,
    normalize_mentions=True,
    user_placeholder="user",
    strip_punctuation=False,
    lowercase=False,
)
unique_docs, unique_embeddings = deduplicate_text_and_embeddings(cln_df, embeddings, 'Cleantext')

# Load the topic model once, for the chosen dataset. (A second load of a
# hard-coded cop26 "with_hashtags" model used to precede this one; it was
# overwritten here before ever being used, so it has been removed.)
topic_model = BERTopic.load(
    "./../../models/" + chosen_dataset + ".parquet.topic_model",
    embedding_model="all-mpnet-base-v2",
)

# Size checks: embeddings should line up with cln_df, and the deduplicated
# documents with doc_info.
print(f"{len(embeddings)=}")
print(f"{len(cln_df)=}")
print(f"{len(unique_docs)=}")
print(f"{len(doc_info)=}")
print(cln_df.columns)
print(len(unique_docs), len(unique_embeddings))



len(embeddings)=916955
len(cln_df)=916955
len(unique_docs)=787872
len(doc_info)=787872
Index(['id', 'author_id', 'created_at', 'lang', 'in_reply_to_user_id',
       'conversation_id', 'text', 'reply_settings', 'possibly_sensitive',
       'retweeted_id', 'quoted_id', 'replied_id', 'url', 'expanded_url',
       'mention_name', 'hashtags', 'retweet_count', 'reply_count',
       'like_count', 'quote_count', 'username', 'individual_or_org',
       'category', 'Cleantext'],
      dtype='object')
787872 787872


In [3]:
# Drop the references to the large embedding arrays so their memory can be
# reclaimed; both names stay defined (as None) for any later cell.
embeddings = None
unique_embeddings = None

In [4]:
# Base vocabulary: NLTK's English and Spanish stopword lists.
stopwords_en = stopwords.words("english")
stopwords_es = stopwords.words("spanish")

# Extra stopwords per dataset; an unknown dataset name gets no extra terms.
# Candidates that are deliberately NOT active at the moment, kept for reference:
#   covid_tweets_en:   pandemic, virus, people, get, like, one, new, cases, health,
#                      vaccine, vaccines, vaccinated, deaths, time, year, day, years
#   ukraine_tweets_en: war, russian, people, like, one, get, just, know, time, day,
#                      year, years, donbas, ukrainian, military, ukrainians, today
#   cop26_tweets_en:   climate, people, like, one, get, just, know, time, day, year,
#                      years, action, change, global, world, new, need
extra_terms_by_dataset = {
    "covid_tweets_en": ["covid", "covid19", "coronavirus"],
    "ukraine_tweets_en": ["ukraine", "russia"],
    "cop26_tweets_en": ["cop26"],
}
data_specific_stopwords = extra_terms_by_dataset.get(chosen_dataset, [])

# Union of the language lists, URL/HTML residue, the mention placeholder and
# retweet marker, and the dataset-specific terms.
custom_stopwords = {
    *stopwords_en,
    *stopwords_es,
    "http", "https", "amp", "www", "com",
    "user", 'rt',
    *data_specific_stopwords,
}
print(custom_stopwords)

{'o', 'una', 'son', 'so', "we'll", 'al', 'le', 'tendré', "didn't", 'otros', 'seré', 'hubieseis', 'hube', 'tendrían', 'habrían', 'tuvisteis', "weren't", 'is', 'tenidas', 't', "you'll", 'hers', 'your', 'seremos', 'estuvieron', 'ours', 'sean', 'estuve', 'd', "they'd", 'tuvierais', 'seréis', 'seáis', 'habéis', 'myself', 'yourselves', 'por', "they've", 'habida', 'estuvo', 'tendríais', 'ella', 'nuestras', 'eran', 'shan', 'sentidos', 'tendrás', "he'll", 'nada', 'because', 'serás', 'then', 'esas', 'fuéramos', "shan't", 'fueron', 'me', 'estuvisteis', 'mightn', 'lo', 'serán', 'seríais', "should've", 'la', 'soy', "they're", 'tuvimos', 'ma', 'teniendo', 'should', 'esta', 'esos', 'ti', 'estarías', 'hubieron', 'en', 'seas', 'at', 'by', 'user', 'of', 'estuvimos', 'to', 'mías', 'we', 'hasta', 'él', 'any', 'down', 'did', "aren't", "shouldn't", 'been', 'había', 'haven', 'won', 'once', 'tendréis', 'estuviesen', 'for', 'estaría', 'esa', 'it', 'también', 'sería', 'están', 'habíais', 'se', 'este', 'hubiésem

In [5]:
# Documents and their topic assignments, in the row order of doc_info.
# pandas.read_csv parses empty fields as NaN, so a document that was empty when
# the CSV was written would arrive here as a float and break the vectorizer.
# Mapping NaN back to "" is a no-op when the column has no missing values.
docs_clean = doc_info.Document.fillna("").tolist()
topics = doc_info.Topic.tolist()

# Bag-of-words vectorizer that ignores the custom stopword set built earlier.
vectorizer_model = CountVectorizer(stop_words=list(custom_stopwords))

# Passing a dict adds a named representation ("MMR") next to the model's main
# one; diversity=0.7 is the MaximalMarginalRelevance diversity setting.
representation_models = {
    "MMR": MaximalMarginalRelevance(diversity=0.7)
}

# Recompute the topic representations in place on the loaded model, using the
# existing document-to-topic assignments.
topic_model.update_topics(
    docs=docs_clean, topics=topics, vectorizer_model=vectorizer_model, representation_model=representation_models
)




In [6]:
# Pull the refreshed per-topic table from the model and derive a label of the
# form "<topic id>_<MMR word 1>_<MMR word 2>_..." for every topic.
new_topic_info = topic_model.get_topic_info()

mmr_labels = []
for topic_id, mmr_words in zip(new_topic_info['Topic'], new_topic_info['MMR']):
    prefix = str(topic_id) + "_"
    mmr_labels.append(prefix + "_".join(mmr_words))
new_topic_info['New_Name'] = mmr_labels

new_topic_info

Unnamed: 0,Topic,Count,Name,Representation,MMR,Representative_Docs,New_Name
0,-1,382950,-1_russian_war_ukrainian_putin,"[russian, war, ukrainian, putin, us, military,...","[war, one, new, ukrainians, support, today, sa...",,-1_war_one_new_ukrainians_support_today_said_a...
1,0,20528,0_forces_kherson_offensive_troops,"[forces, kherson, offensive, troops, kharkiv, ...","[forces, kherson, liberated, severodonetsk, se...",,0_forces_kherson_liberated_severodonetsk_settl...
2,1,17306,1_putin_west_kremlin_propaganda,"[putin, west, kremlin, propaganda, twitter, wa...","[putin, media, win, western, like, world, end,...",,1_putin_media_win_western_like_world_end_must_...
3,2,17266,2_thread_read_good_great,"[thread, read, good, great, thank, tweet, oh, ...","[thread, read, tweet, true, yes, piece, time, ...",,2_thread_read_tweet_true_yes_piece_time_lol_im...
4,3,16111,3_tank_destroyed_captured_artillery,"[tank, destroyed, captured, artillery, brigade...","[tank, captured, btr, oblast, video, atgm, stu...",,3_tank_captured_btr_oblast_video_atgm_stugna_m...
...,...,...,...,...,...,...,...
134,133,543,133_passports_citizenship_passport_decree,"[passports, citizenship, passport, decree, res...","[occupied, simplified, passportization, region...",,133_occupied_simplified_passportization_region...
135,134,542,134_poll_support_levada_polls,"[poll, support, levada, polls, russians, opini...","[levada, majority, center, data, putin, sociol...",,134_levada_majority_center_data_putin_sociolog...
136,135,513,135_art_artist_mural_artists,"[art, artist, mural, artists, exhibition, pain...","[murals, created, repin, works, cultural, kyiv...",,135_murals_created_repin_works_cultural_kyiv_w...
137,136,503,136_christmas_merry_santa_celebrate,"[christmas, merry, santa, celebrate, tree, hal...","[christmas, tree, nicholas, eve, orthodox, st,...",,136_christmas_tree_nicholas_eve_orthodox_st_uk...


In [7]:
# Combine the previously saved topic table (minus its old Representation
# column) with the refreshed MMR keywords, new names and new Representation,
# matching rows on the topic id.
previous_topic_info = topic_info.drop(columns=["Representation"])
refreshed_columns = new_topic_info[["Topic", 'MMR', "New_Name", "Representation"]]
merged_topic_info = pd.merge(
    previous_topic_info,
    refreshed_columns,
    on="Topic",
    suffixes=("_old", "_new"),
)
merged_topic_info

Unnamed: 0,Topic,Count,Name,Representative_Docs,MMR,New_Name,Representation
0,-1,382950,-1_user_ukraine_rt_rt user,"['user user user Putin winning 👇👇👇🤣🤣🤣', 'user ...","[war, one, new, ukrainians, support, today, sa...",-1_war_one_new_ukrainians_support_today_said_a...,"[russian, war, ukrainian, putin, us, military,..."
1,0,20528,0_forces_kherson_offensive_russian forces,"[""On the Svatove-Kreminna battlefront, Ukraini...","[forces, kherson, liberated, severodonetsk, se...",0_forces_kherson_liberated_severodonetsk_settl...,"[forces, kherson, offensive, troops, kharkiv, ..."
2,1,17306,1_putin_user_west_user user,"['RT user: user pUtIn PuPpEt!', 'user There is...","[putin, media, win, western, like, world, end,...",1_putin_media_win_western_like_world_end_must_...,"[putin, west, kremlin, propaganda, twitter, wa..."
3,2,17266,2_user_user user_rt_rt user,"['RT user: THIS⬇️⬇️⬇️', 'RT user: THIS!! 👇🏽👇🏽👇...","[thread, read, tweet, true, yes, piece, time, ...",2_thread_read_tweet_true_yes_piece_time_lol_im...,"[thread, read, good, great, thank, tweet, oh, ..."
4,3,16111,3_tank_destroyed_ukrainian_captured,['RT user: Ukraine: A Russian BMP-1 infantry f...,"[tank, captured, btr, oblast, video, atgm, stu...",3_tank_captured_btr_oblast_video_atgm_stugna_m...,"[tank, destroyed, captured, artillery, brigade..."
...,...,...,...,...,...,...,...
134,133,543,133_passports_citizenship_passport_russian pas...,['RT user: Russian President Putin signed a de...,"[occupied, simplified, passportization, region...",133_occupied_simplified_passportization_region...,"[passports, citizenship, passport, decree, res..."
135,134,542,134_poll_support_russians support_russians,['RT user: ⚡️ Poll: 83% of Russians support Pu...,"[levada, majority, center, data, putin, sociol...",134_levada_majority_center_data_putin_sociolog...,"[poll, support, levada, polls, russians, opini..."
136,135,513,135_art_artist_mural_artists,['RT user: A mural dedicated to Ukraine in Chi...,"[murals, created, repin, works, cultural, kyiv...",135_murals_created_repin_works_cultural_kyiv_w...,"[art, artist, mural, artists, exhibition, pain..."
137,136,503,136_christmas_merry_merry christmas_santa,"['Merry Christmas, I’m from Ukraine! 🇺🇦 Enjoy ...","[christmas, tree, nicholas, eve, orthodox, st,...",136_christmas_tree_nicholas_eve_orthodox_st_uk...,"[christmas, merry, santa, celebrate, tree, hal..."


In [8]:
# Write the merged topic table next to the other processed files, without the
# DataFrame index.
mmr_output_path = "./../../data/processed/topic_info_" + chosen_dataset + "_with_MMR.csv"
merged_topic_info.to_csv(mmr_output_path, index=False)