In [7]:
import umap
import os
import nltk
import pandas as pd
from nltk.tokenize import sent_tokenize, word_tokenize
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
from sentence_transformers import SentenceTransformer
import matplotlib.pyplot as plt
import hdbscan
import numpy as np
import re
import emoji
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.representation import KeyBERTInspired


In [8]:
def clean_tweet(tweet):
    tweet = re.sub("@[A-Za-z0-9]+","",tweet) #Remove @ sign
    tweet = re.sub(r"(?:\@|http?\://|https?\://|www)\S+", "", tweet) #Remove http links
    tweet = " ".join(tweet.split())
    tweet = ''.join(c for c in tweet if c not in emoji.EMOJI_DATA)
    tweet = tweet.replace("#", "").replace("_", " ").replace("RT ", "").replace('&amp;', '&') #Remove hashtag sign but keep the text
    return tweet

In [9]:
tweets_csv = pd.read_csv("../../tweets.csv")
tweets_csv_en = tweets_csv[tweets_csv["lang"] == "en"]

In [10]:
texts = tweets_csv_en["full_text"]
tweets = []
for t in texts:
    tweets.append(clean_tweet(t))

In [15]:
if os.path.exists("./topic.pkl"):
    topic_model = BERTopic.load("./topic.pkl", embedding_model="all-mpnet-base-v2")
else:
    ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
    representation_model = KeyBERTInspired()
    vectorizer_model = CountVectorizer(stop_words="english")
    topic_model = BERTopic(embedding_model="all-mpnet-base-v2", vectorizer_model=vectorizer_model, ctfidf_model=ctfidf_model, representation_model=representation_model)
    topics, probs = topic_model.fit_transform(tweets)
    new_topics = topic_model.reduce_outliers(tweets, topics)
    topic_model.update_topics(tweets, topics=new_topics)
    topic_model.save("./topic.pkl", save_embedding_model=False)
info = topic_model.get_topic_info()

In [16]:
topic_names = [topic_model.get_topic_info(t)["Name"] for t in topics]
topic_names = [t.values[0] for t in topic_names]
tweets_csv_en["topic"] = topic_names
tweets_csv_merged = tweets_csv.merge(tweets_csv_en[["id_str", "topic"]], on=["id_str"], how="outer")
tweets_csv_merged.to_csv("tweets_with_topic.csv", index=False)

In [17]:
def get_users(text, df):
    similar_topics, similarity = topic_model.find_topics(text, top_n=3)
    topic_names = [topic_model.get_topic_info(t)["Name"] for t in similar_topics]
    topic_names = [t.values[0] for t in topic_names]
    return df[df["topic"].isin(topic_names)]['screen_name'].unique()

In [18]:
get_users("war", tweets_csv_merged)

array(['ShanghaiEye', 'ChineseCon_Mel', 'ChinaDaily', 'ChineseEmbinUS',
       'ChinaEmbInNZ', 'ChinaEmbNL', 'ChinaEmbKSA', 'YXiusheng',
       'ChinaInDenmark', 'GlobalTimesLife', 'ChinaEmbPeru', 'CathayPak',
       'ChinaEmbassy_MW', 'ziyi_zeng', 'ChinaEmbSL', 'PDChina',
       'ChinaAmbSA', 'ChinaAmbUN', 'ChinaEmbinRW', 'HU_Bin_CHN',
       'ChineseEmb_PNG', 'embassy_chinese', 'China_Amb_India',
       'ChenPingMFA', 'ChinaEmbKuwait', 'chinacgedi', 'CaoYi_MFA',
       'ChineAmbassade', 'XHNews', 'ChinaEmbPoland', 'Echinanews',
       'ChinaEmb_Juba', 'CdOpinion', 'TheLinkOnCGTN', 'jtao98',
       'zhanhao668', 'AmbQinGang', 'Ambassador_Liu', 'YuqiaoJi',
       'ZhengJunfeng1', 'PeoplesDailyapp', 'ChinaEmbinGH',
       'AmbassadorLei', 'haiwainet', 'dupingCHN', 'AmbLiuXiaoMing',
       'MahuiChina', 'AmbJiaGuide', 'china_emb_ng', 'SpokespersonCHN',
       'ChinaCG_CC', 'ChineseEmbinUK', 'HuXijin_GT', 'ChineseEmbTZ',
       'XIEYongjun_CHN', 'ModernExpressEN', 'ConsulateSan',
       '