In [1]:
import umap
import nltk
import pandas as pd
from nltk.tokenize import sent_tokenize, word_tokenize
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
from sentence_transformers import SentenceTransformer
import matplotlib.pyplot as plt
import hdbscan
import numpy as np
import re
import emoji
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.representation import KeyBERTInspired


In [2]:
def clean_tweet(tweet):
    tweet = re.sub("@[A-Za-z0-9]+","",tweet) #Remove @ sign
    tweet = re.sub(r"(?:\@|http?\://|https?\://|www)\S+", "", tweet) #Remove http links
    tweet = " ".join(tweet.split())
    tweet = ''.join(c for c in tweet if c not in emoji.EMOJI_DATA)
    tweet = tweet.replace("#", "").replace("_", " ").replace("RT ", "").replace('&amp;', '&') #Remove hashtag sign but keep the text
    return tweet

In [3]:
tweets_csv = pd.read_csv("../../tweets.csv")
tweets_csv_en = tweets_csv[tweets_csv["lang"] == "en"]

In [4]:
texts = tweets_csv_en["full_text"]
tweets = []
for t in texts:
    tweets.append(clean_tweet(t))

In [5]:
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
representation_model = KeyBERTInspired()
vectorizer_model = CountVectorizer(stop_words="english")
topic_model = BERTopic(embedding_model="all-mpnet-base-v2", vectorizer_model=vectorizer_model, ctfidf_model=ctfidf_model, representation_model=representation_model)
topics, probs = topic_model.fit_transform(tweets)
new_topics = topic_model.reduce_outliers(tweets, topics)
topic_model.update_topics(tweets, topics=new_topics)
info = topic_model.get_topic_info()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [6]:
topic_names = [topic_model.get_topic_info(t)["Name"] for t in topics]

In [7]:
topic_names = [t.values[0] for t in topic_names]

In [8]:
tweets_csv_en["topic"] = topic_names

In [12]:
tweets_csv_merged = tweets_csv.merge(tweets_csv_en[["id_str", "topic"]], on=["id_str"], how="outer")

In [19]:
tweets_csv_merged.to_csv("tweets_with_topic.csv", index=False)