In [1]:
import umap
import nltk
import pandas as pd
from nltk.tokenize import sent_tokenize, word_tokenize
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
from sentence_transformers import SentenceTransformer
import matplotlib.pyplot as plt
import hdbscan
import numpy as np
import re
import emoji
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.representation import KeyBERTInspired


In [2]:
def clean_tweet(tweet):
    tweet = re.sub("@[A-Za-z0-9]+","",tweet) #Remove @ sign
    tweet = re.sub(r"(?:\@|http?\://|https?\://|www)\S+", "", tweet) #Remove http links
    tweet = " ".join(tweet.split())
    tweet = ''.join(c for c in tweet if c not in emoji.EMOJI_DATA)
    tweet = tweet.replace("#", "").replace("_", " ").replace("RT ", "").replace('&amp;', '&') #Remove hashtag sign but keep the text
    return tweet

In [3]:
tweets_csv = pd.read_csv("../../tweets.csv")
tweets_csv_en = tweets_csv[tweets_csv["lang"] == "en"]

In [4]:
texts = tweets_csv_en["full_text"]
tweets = []
for t in texts:
    tweets.append(clean_tweet(t))

In [5]:
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
representation_model = KeyBERTInspired()
vectorizer_model = CountVectorizer(stop_words="english")
topic_model = BERTopic(embedding_model="all-mpnet-base-v2", vectorizer_model=vectorizer_model, ctfidf_model=ctfidf_model, representation_model=representation_model)
topics, probs = topic_model.fit_transform(tweets)
new_topics = topic_model.reduce_outliers(tweets, topics)
topic_model.update_topics(tweets, topics=new_topics)
info = topic_model.get_topic_info()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [6]:
topic_names = [topic_model.get_topic_info(t)["Name"] for t in topics]
topic_names = [t.values[0] for t in topic_names]
tweets_csv_en["topic"] = topic_names
tweets_csv_merged = tweets_csv.merge(tweets_csv_en[["id_str", "topic"]], on=["id_str"], how="outer")
tweets_csv_merged.to_csv("tweets_with_topic.csv", index=False)

In [20]:
similar_topics, similarity = topic_model.find_topics("president", top_n=5)
topic_model.get_topic(similar_topics[4])

[('treasury', 0.06089266924581921),
 ('airport', 0.05462471185117023),
 ('janet', 0.05049849812441027),
 ('governor', 0.048536289681674345),
 ('yellen', 0.04770318271514561),
 ('afternoon', 0.04584626549520099),
 ('newsom', 0.04431454639487716),
 ('california', 0.043920964656346845),
 ('gavin', 0.040888442327267675),
 ('officials', 0.03808765040131693)]

In [10]:
info

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,11,-1_itiswhatitis_texte_complet_sacked,"[itiswhatitis, texte, complet, sacked, invigor...",[The 2022 GDP of southwest China's Xizang Auto...
1,0,559,0_gaza_israel_israeli_palestine,"[gaza, israel, israeli, palestine, palestinian...",[China is deeply saddened by the civilian casu...
2,1,288,1_music_dance_orchestra_opera,"[music, dance, orchestra, opera, philadelphia,...","[On the afternoon of November 11, the Philadel..."
3,2,230,2_covid19_covid_coronavirus_virus,"[covid19, covid, coronavirus, virus, vaccine, ...","[Amidst the novel coronavirus epidemic, discus..."
4,3,226,3_space_shenzhou_crew_station,"[space, shenzhou, crew, station, astronauts, m...",[China on Thursday launched the Shenzhou-17 ma...
...,...,...,...,...,...
335,334,36,334_schumer_leader_senate_majority,"[schumer, leader, senate, majority, delegation...",[What has transpired so far this year suggests...
336,335,28,335_francisco_san_g20_biden,"[francisco, san, g20, biden, arrived, joe, tue...",[Chinese President Xi Jinping's trip to San Fr...
337,336,33,336_beach_birthday_colleague_smiles,"[beach, birthday, colleague, smiles, moments, ...",[dailypost Celebrating a double delight today!...
338,337,34,337_keynote_interconnected_inclusive_speech,"[keynote, interconnected, inclusive, speech, o...","[Building an Open, Inclusive and Interconnecte..."
