In [7]:
import re
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from topic_clustering.cluster import GraphCluster

## Prepare the toy-dataset

In [8]:
df = pd.read_csv("dataset/toy_dataset.csv")
df.head(1)

Unnamed: 0,tweet_id,url,author,publish_time,content
0,1492891431379341324,https://twitter.com/peterpobjecky/status/14928...,"{""user_id"": 771820890, ""name"": ""peter pobjecky...",2022-02-14 00:00:28,@CorkyBottle @GeromanAT @YouTube Those nations...


In [9]:
url_pattern = re.compile("(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}|www\.[a-zA-Z0-9]+\.[^\s]{2,})")
at_pattern = re.compile("@[^ ]+")
hash_pattern = re.compile("#[^ ]+")

pattern_to_token = {
    url_pattern: "",
    at_pattern: "",
    hash_pattern: ""
}

def clean_text(text,):
    for pattern, token in pattern_to_token.items():
        text = re.sub(pattern, token, text)
    return text

In [10]:
text_column = "content"
cleaned_texts = df[text_column].astype(str).apply(clean_text)

## TFIDF Vectorization

In [11]:
corpus = set(cleaned_texts)
tfidf_vectorizer = TfidfVectorizer(stop_words='english', min_df=10,
                                   lowercase=True, max_features=200000)
tfidf_vectorizer.fit(corpus)
vectors = tfidf_vectorizer.transform(cleaned_texts)

## Clustering

In [12]:
%%time
graph_cluster = GraphCluster()
corr_threshold = 0.6
group_to_ids = graph_cluster.cluster(vectors, min_corr=corr_threshold)

CPU times: user 18 s, sys: 521 ms, total: 18.5 s
Wall time: 18.5 s


In [13]:
id_to_groups = [-1 for _ in range(df.shape[0])]
for g_id, article_ids in group_to_ids.items():
    for a_id in article_ids:
        id_to_groups[a_id] = g_id
df['group_id'] = id_to_groups

In [14]:
df.group_id.value_counts()

1       290
2        66
3        61
4        60
5        56
       ... 
4847      1
749       1
2796      1
4843      1
2047      1
Name: group_id, Length: 5051, dtype: int64

In [15]:
def explore_group(df, group_id, column, size=5):
    rows = df[df.group_id == group_id][column].values[:size]
    for r in rows:
        print(r)
        print("-" * 30)

In [16]:
explore_group(df, group_id=1, column=text_column)

Ukraine soldiers capture a Russian commander, His confession shocks the World (Video) 
https://t.co/JpXc05NSCK
The Russian
The Taliban#putinisawarcriminal
#Ukraineunderattack
#freeukraine
#StopRussianAggresion
Cyprus#nuclearwar
#PutinWarCriminal
Belarus
#AbolishNato
------------------------------
Ukraine soldiers capture a Russian commander, His confession shocks the World (Video) 
https://t.co/JpXc05NSCK

Ukrainian 
Putin 
#Worldwar3 
#WWIII 
NATO 
Taiwan 
America 
China 
Trump 
Biden
#nuclearwar
#PutinWarCriminal
Belarus
#AbolishNato 
NATO Alliance 
Slovenia
------------------------------
Ukraine soldiers capture a Russian commander, His confession shocks the World (Video) 
https://t.co/pnRodR3ZXR

Ukrainian 
Putin 
#Worldwar3 
#WWIII 
NATO 
Taiwan 
America 
China 
Trump 
Biden
#nuclearwar
#PutinWarCriminal
Belarus
#AbolishNato 
NATO Alliance 
Slovenia
------------------------------
Ukraine soldiers capture a Russian commander, His confession shocks the World (Video) 
https://t.co/v7

In [17]:
explore_group(df, group_id=2, column=text_column,)

@manni_1986 @unapologeticAnk He is the trudeau of Ukraine. Bloody wokes
------------------------------
@TheRevAl Nobody, not even Ukraine is buying this theatre!!!! https://t.co/5R5F3Hkv5b
------------------------------
@SpiroAgnewGhost He extorted Ukraine, which resulted in his first impeachment.
------------------------------
@Honeybee0427 @justinamash You do though in Ukraine?
------------------------------
Ukraine 🇺🇦
------------------------------


In [18]:
explore_group(df, group_id=3, column=text_column,)

@taken9000 @NewZi00484428 @VaushV Fuck you and fuck ukraine, that country was made by Russia and they are all Russians on cocaine from neo nazi propaganda that you subscribe to.
------------------------------
@CBSNews Well, fuck you due to Ukraine.
------------------------------
@DeereIsGod @colethemanyt fuck ukraine
------------------------------
@colethemanyt fuck ukraine
------------------------------
@noahmrcd19 @colethemanyt fuck ukraine
------------------------------


In [19]:
explore_group(df, group_id=4, column=text_column,)

I stand with Ukraine 🇺🇦
------------------------------
@Reuters I stand with Ukraine ✊
------------------------------
@Reuters I stand with Ukraine 🇺🇦✊
------------------------------
@Reuters I stand with Ukraine 🇺🇦✊
------------------------------
@ZelenskyyUa @EmmanuelMacron @OlafScholz I stand with Ukraine 🇺🇦✊
------------------------------


In [20]:
explore_group(df, group_id=5, column=text_column,)

@MiddleEastMnt If you support Ukraine But not Palestine Iraq Syria Afghanistan Somalia Libya Kashmir and Iraq.
If you condemn Russia But not Israel and US.
Then congratulations mate, you are on the highest tier of hypocrisy. 

#UkraineRussiaWar #RussiaUkraineCrisis
#Ukraine #RussianUkrainianWar https://t.co/T6MBxxokT5
------------------------------
@Quicktake If you support Ukraine But not Palestine Iraq Syria Afghanistan Somalia Libya Kashmir and Iraq.
If you condemn Russia But not Israel and US.
Then congratulations mate, you are on the highest tier of hypocrisy. 

#UkraineRussiaWar #RussiaUkraineCrisis
#Ukraine #Russia https://t.co/Pi4icUoOM7
------------------------------
@Reuters If you support Ukraine But not Palestine Iraq Syria Afghanistan Somalia Libya Kashmir and Iraq.
If you condemn Russia But not Israel and US.
Then congratulations mate, you are on the highest tier of hypocrisy. 

#UkraineRussiaWar #RussiaUkraineCrisis
#Ukraine #Russia https://t.co/EhDpOBKMqb
--------------

## Output the result

In [25]:
df.to_csv("outputs/tweets_with_group_ids.csv", index=False)