# Dataset

Choose a dataset to cluster

In [1]:
DATASET = 'dataset70000.csv'

In [2]:
%load_ext autoreload
%autoreload 2

import os
from pathlib import Path
from multiprocessing import cpu_count
import time

import pandas as pd
import numpy as np
import torch
from sklearn.metrics import adjusted_rand_score

from constants import SEED
from utils import load_dataset, save_dataset
from utils import model_path, format_list, format_counter, extract_name
from utils import Timer
from evaluation import match_cluster_labels, compute_metrics, metrics_to_df, EvaluationResults

# pd.set_option("display.precision", 3)
pd.set_option('display.float_format', lambda x: '%.3f' % x)
pd.set_option('display.max_colwidth', None)
os.environ["TOKENIZERS_PARALLELISM"] = "false"
Path('models').mkdir(parents=True, exist_ok=True)  # for saving models

rng = np.random.default_rng(seed=SEED)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
N_CPU = cpu_count()
er = EvaluationResults(extract_name(DATASET), create_new=False)
timer = Timer()

def get_pipeline_label(clustering_label):
    return text_embedding_label + '+' + ((reduction_label + '+') if reduction_label != '' else '') + clustering_label

print(f'cpu: {N_CPU}')

cpu: 8


In [3]:
dataset = load_dataset(DATASET)
print(f'dataset contains {dataset.shape[0]} samples and {dataset.shape[1]} features')

dataset contains 70406 samples and 3 features


Topic distribution

In [4]:
dataset.groupby('topic').nunique()[['text']]

Unnamed: 0_level_0,text
topic,Unnamed: 1_level_1
airline support,8468
australian elections,10000
chatgpt,10000
climate change,10000
covid19,10000
fifa world cup,6854
self-driving cars,4497
stock market crash,10000
weather,587


Explore random tweets

In [5]:
dataset[['text', 'topic']].sample(5)
# dataset[dataset['topic'] == 'stock market crash'][['text', 'topic']].sample(5)

Unnamed: 0,text,topic
11734,"The unintended consequences of driverless cars may be bigger cities, less fuel. Cool story in @qz | http://t.co/IUqfpLO5sg | via @mims #IoT",self-driving cars
5918,Nigerian government ratifies two global treaties on climate change - Official https://t.co/0VTZYoXgxA,climate change
52338,So what we have to learn about & #Crypto?\n\nImho:\nPoints 4 and 5 demonstrate the importance of cryptocurrencies and crypto assets in today's global economy.,chatgpt
28835,Can someone explain me that offside call?,fifa world cup
70319,Market Makers made trillions in 2021 from bulls buying calls that expired OTM \n\nLet’s not make this mistake again. \n\n#stocks $bbig $amc $gme,stock market crash


In [6]:
topics = dataset['topic']
topics_unique, clusters_true = np.unique(topics, return_inverse=True)
n_tweets = len(dataset['text'])
n_topics = len(topics_unique)

# Preprocessing

Prepare regex, tokenizer and lemmatizer

In [7]:
import itertools
from collections import Counter

import regex as re
import spacy
from spacy.tokenizer import _get_regex_pattern
from sklearn.feature_extraction.text import CountVectorizer
import emoji

LOWERCASE_ALPHABET = {'a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z'}
ALLOWED_CHARS = LOWERCASE_ALPHABET.union(set('#@%'))

def filter_string(s):
    return ''.join(c for c in s if c in ALLOWED_CHARS or emoji.is_emoji(c))
    
nlp = spacy.load('en_core_web_md', exclude=['toc2vec', 'parser', 'ner'])
re_token_match = _get_regex_pattern(nlp.Defaults.token_match)

re_token_match = f'({re_token_match}|#\S+|@\S+)'  # match hashtags, mentions as one token
nlp.tokenizer.token_match = re.compile(re_token_match).match

re_link = "(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
re_link = re.compile('({re_link}|http\S*)')
re_number = re.compile('\d+(?:[\./]\d+)?')

Tokenize/preprocess tweets

In [8]:
MIN_DF = 3
MAX_DF = 0.5
MOST = 10

tweets = dataset['text'].tolist()
tweets_tokenized = [None] * n_tweets

timer.start()
for i, tweet in enumerate(nlp.pipe(dataset['text'], n_process=-1, batch_size=1024)):
    tokens = [t.lemma_.lower() for t in tweet if not t.is_stop]
    tokens = [re.sub(re_link, '%link', t) for t in tokens]
    tokens = [re.sub(re_number, '%number', t) for t in tokens]
    tokens = [filter_string(t) for t in tokens]
    tokens = [t for t in tokens if len(t) > 1]
    tweets_tokenized[i] = tokens
print(f'elapsed time: {timer.pause():.0f}s')

cv = CountVectorizer(
    min_df=MIN_DF,
    max_df=MAX_DF,
    lowercase=False, 
    tokenizer=lambda x: x,
    token_pattern=None,
)
cv.fit(tweets_tokenized)
vocabulary = cv.vocabulary_
tweets_tokenized = [[t for t in ts if t in vocabulary] for ts in tweets_tokenized]

counter = Counter(itertools.chain.from_iterable(tweets_tokenized))
print(f'corpus size: {sum(counter.values())}')
print(f'vocabulary size: {len(counter.keys())}')
print(f'most common tokens: {format_counter(counter.most_common(MOST))}')
print(f'least common tokens: {format_counter(counter.most_common()[-MOST:])}')

elapsed time: 45s
corpus size: 956267
vocabulary size: 16264
most common tokens: %number:23132, %link:13585, change:9987, climate:8428, people:6317, chat:5527, gpt:5504, like:5360, time:5227, market:4980
least common tokens: #votethemallout%number:3, #utility:3, #dpro:3, doge:3, @ethernitychain:3, @lindyli:3, traders:3, #nftcollector:3, @cryptoworld%number:3, #yieldfarming:3


Was the preprocessing step done correctly? Check specific tweets manually

In [9]:
# id = 7007
id = rng.integers(0, n_tweets)
print(dataset.loc[id, 'text'] + '\n')
for t in tweets_tokenized[id]:
    print('\'' + t + '\'', end=' ')
print(f'\n\ntweet id: {id}')

Artificial Intelligence has everybody talking, thanks to #AI tools like #ChatGPT which have shown massive user growth. To understand how the rise of AI could be captured by ETF investors, our latest podcast invited a very special guest: ChatGPT. https://t.co/uJxykBbDh3

'artificial' 'intelligence' 'everybody' 'talk' 'thank' '#ai' 'tool' 'like' '#chatgpt' 'show' 'massive' 'user' 'growth' 'understand' 'rise' 'ai' 'capture' 'etf' 'investor' 'late' 'podcast' 'invite' 'special' 'guest' 'chatgpt' '%link' 

tweet id: 59889


# End-to-end Approaches

### Latent Dirichlet Allocation

In [10]:
import gensim
import gensim.corpora as corpora
from gensim.models.ldamodel import LdaModel
label='lda'
embeddings = None

timer.start(0)
id2word = corpora.Dictionary(tweets_tokenized)
corpus = [id2word.doc2bow(tweet_tokens) for tweet_tokens in tweets_tokenized]
lda = LdaModel(
    corpus, n_topics, id2word,
    passes=10, 
    # iterations=100,
    # alpha='auto',
    random_state=SEED,
)
clusters = np.argmax(gensim.matutils.corpus2csc(lda.get_document_topics(corpus)).T.toarray(), axis=-1)
print(f'elapsed time: {timer.pause(0):.0f}s')
print(f'ari: {adjusted_rand_score(clusters_true, clusters)}')
print(f'cluster distribution: {np.unique(clusters, return_counts=True)[1]}')

elapsed time: 86s
ari: 0.23699867930695967
cluster distribution: [12270  3995 14213  5638  2021  7598 11042  3758  9871]


# Text Embeddings

### TF-IDF

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
text_embedding_label = 'tfidf'

timer.start(0)
tfidf = TfidfVectorizer(
    min_df=3,
    max_df=0.5,
    sublinear_tf=True,
    tokenizer=lambda x: x, lowercase=False,  # use custom tokenization (and pre-processing)
)
embedds_raw = tfidf.fit_transform(tweets_tokenized)
print(f'elapsed time: {timer.pause(0):.0f}s')

print(f'embedding size: {embedds_raw.shape[1]}')
print(f'most common stop words (they have low DF): {format_counter(Counter({t: counter.get(t, 0) for t in tfidf.stop_words_}).most_common(10))}')

elapsed time: 0s
embedding size: 16264
most common stop words (they have low DF): 


### Word2Vec

In [14]:
from gensim.models.word2vec import Word2Vec
train_model = True
text_embedding_label = 'word2vec'

timer.start(0)
if train_model:
    model = Word2Vec(
        tweets_tokenized, sg=1, hs=1, negative=0,
        vector_size=300,
        epochs=10,
        alpha=0.03, 
        # window=8,
        # min_count=10,
        seed=SEED, workers=N_CPU,
    )
else:
    model = Word2Vec.load(model_path(text_embedding_label + '.model'))

# compute mean embedding vector for every tweet
zero_vec = np.zeros(len(model.wv[0]), dtype=np.float32)
embedds_raw = np.array([model.wv.get_mean_vector(ts, pre_normalize=True, post_normalize=True) if len(ts) > 0 else zero_vec for ts in tweets_tokenized])
print(f'elapsed time: {timer.pause(0):.0f}s')

text_embedding_label += str(embedds_raw.shape[1])
print(model)

elapsed time: 70s
Word2Vec<vocab=11620, vector_size=300, alpha=0.03>


In [128]:
model: model.save(model_path(text_embedding_label + '.model'))

### FastText

In [26]:
from gensim.models.fasttext import FastText
text_embedding_label = 'fasttext'

timer.start(0)
model = FastText(
    tweets_tokenized, sg=1, hs=1, negative=0,
    vector_size=100,
    min_n=3,
    alpha=0.03,  # 0.025
    # window=8,
    # min_count=10,
    seed=SEED, workers=N_CPU,
)
zero_vec = np.zeros(len(model.wv[0]), dtype=np.float32)
embedds_raw = np.array([model.wv.get_mean_vector(ts, pre_normalize=True, post_normalize=True) if len(ts) > 0 else zero_vec for ts in tweets_tokenized])
print(f'elapsed time: {timer.pause(0):.0f}s')

text_embedding_label += str(embedds_raw.shape[1])
print(model)

elapsed time: 27s
FastText<vocab=11620, vector_size=100, alpha=0.03>


### Doc2Vec

In [20]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
text_embedding_label = 'doc2vec'

timer.start(0)
model = Doc2Vec(
    [TaggedDocument(doc, [i]) for i, doc in enumerate(tweets_tokenized)], dm=1,
    vector_size=10,
    epochs=100,
    alpha=0.03,
    # hs=1, negative=0,
    # window=8,
    # min_count=10,
    seed=SEED, workers=N_CPU,
)
embedds_raw = np.array([model.dv[i] for i in range(n_tweets)])
print(f'elapsed time: {timer.pause(0):.0f}s')

text_embedding_label += str(embedds_raw.shape[1])
print(model)

elapsed time: 686s
Doc2Vec<dm/m,d10,n5,w5,mc5,s0.001,t8>


### Universal Sentence Encoder

In [147]:
import tensorflow_hub as hub 
text_embedding_label = 'use'
model = hub.load('https://tfhub.dev/google/universal-sentence-encoder/4')

timer.start(0)
embedds_raw = model(tweets).numpy()
print(f'elapsed time: {timer.pause(0):.0f}s')

print(f'embedding size: {embedds_raw.shape[1]}')
norms = np.linalg.norm(embedds_raw, axis=1); print(f'avg/min/max embedding norm: {np.mean(norms):.2f}/{np.min(norms):.2f}/{np.max(norms):.2f}')

elapsed time: 6s
embedding size: 512
avg/min/max embedding norm: 1.00/1.00/1.00


### Distil-RoBERTa

In [25]:
from sentence_transformers import SentenceTransformer
text_embedding_label = 'distilroberta'
model = SentenceTransformer('all-distilroberta-v1') 

timer.start(0)
embedds_raw = model.encode(tweets, show_progress_bar=True)
print(f'elapsed time: {timer.pause(0):.0f}s')

print(f'embedding size: {embedds_raw.shape[1]}')
norms = np.linalg.norm(embedds_raw, axis=1); print(f'avg/min/max embedding norm: {np.mean(norms):.2f}/{np.min(norms):.2f}/{np.max(norms):.2f}')

Downloading (…)".gitattributes";:   0%|          | 0.00/737 [00:00<?, ?B/s]

Downloading (…)ooling/config.json";:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)"README.md";:   0%|          | 0.00/10.3k [00:00<?, ?B/s]

Downloading (…)"config.json";:   0%|          | 0.00/653 [00:00<?, ?B/s]

Downloading (…)_transformers.json";:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)"data_config.json";:   0%|          | 0.00/15.7k [00:00<?, ?B/s]

Downloading (…)"merges.txt";:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/329M [00:00<?, ?B/s]

Downloading (…)e_bert_config.json";:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)al_tokens_map.json";:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)"tokenizer.json";:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading (…)enizer_config.json";:   0%|          | 0.00/333 [00:00<?, ?B/s]

Downloading (…)"train_script.py";:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

Downloading (…)"vocab.json";:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)"modules.json";:   0%|          | 0.00/349 [00:00<?, ?B/s]

Batches:   0%|          | 0/2201 [00:00<?, ?it/s]

elapsed time: 57s
embedding size: 768
avg/min/max embedding norm: 1.00/1.00/1.00


### MPNet

In [10]:
from sentence_transformers import SentenceTransformer
text_embedding_label = 'mpnet'
model = SentenceTransformer('all-mpnet-base-v2') 

timer.start(0)
embedds_raw = model.encode(tweets, show_progress_bar=True)
print(f'elapsed time: {timer.pause(0):.0f}s')

print(f'embedding size: {embedds_raw.shape[1]}')
norms = np.linalg.norm(embedds_raw, axis=1); print(f'avg/min/max embedding norm: {np.mean(norms):.2f}/{np.min(norms):.2f}/{np.max(norms):.2f}')

Downloading (…)".gitattributes";:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)ooling/config.json";:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)"README.md";:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)"config.json";:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading (…)_transformers.json";:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)"data_config.json";:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading (…)e_bert_config.json";:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)al_tokens_map.json";:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)"tokenizer.json";:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)enizer_config.json";:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading (…)"train_script.py";:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

Downloading (…)"vocab.txt";:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)"modules.json";:   0%|          | 0.00/349 [00:00<?, ?B/s]

Batches:   0%|          | 0/2201 [00:00<?, ?it/s]

elapsed time: 110s
embedding size: 768
avg/min/max embedding norm: 1.00/1.00/1.00


# Dimensionality Reduction

### No reduction

In [21]:
reduction_label = ''
timer.set(1, 0)
embedds = embedds_raw if isinstance(embedds_raw, np.ndarray) else embedds_raw.toarray()
print(f'embedding size: {embedds.shape[1]}')

embedding size: 10


### Truncated SVD

In [179]:
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
reduction_label = 'svd'

timer.start(1)
pipe = make_pipeline(
    TruncatedSVD(n_components=10, random_state=SEED), 
    Normalizer(copy=False),
)
embedds = pipe.fit_transform(embedds_raw)
print(f'elapsed time: {timer.pause(1):.0f}s')

reduction_label += str(embedds.shape[1])
print(f'embedding size: {embedds.shape[1]}')
norms = np.linalg.norm(embedds, axis=1); print(f'avg/min/max embedding norm: {np.mean(norms):.2f}/{np.min(norms):.2f}/{np.max(norms):.2f}')
print(f'explained variance of the svd step: {pipe[0].explained_variance_ratio_.sum() * 100:.1f}%')

elapsed time: 1s
embedding size: 10
avg/min/max embedding norm: 1.00/0.00/1.00
explained variance of the svd step: 4.5%


### UMAP

In [14]:
import umap
reduction_label = 'umap'

timer.start(1)
embedds = umap.UMAP(
    n_components=5,
    n_neighbors=15,
    # metric='cosine',  # correlation
    # n_epochs=1000,
    # verbose=True,
).fit_transform(embedds_raw)
print(f'elapsed time: {timer.pause(1):.0f}s')

reduction_label += str(embedds.shape[1])
print(f'embedding size: {embedds.shape[1]}')
norms = np.linalg.norm(embedds, axis=1); print(f'avg/min/max embedding norm: {np.mean(norms):.2f}/{np.min(norms):.2f}/{np.max(norms):.2f}')

elapsed time: 88s
embedding size: 5
avg/min/max embedding norm: 4.25/1.69/7.49


# Clustering

### K-Means

In [24]:
from sklearn.cluster import MiniBatchKMeans
trials = 5
label = get_pipeline_label('kmeans')

timer.start(2)
best_clusters, best_loss = None, None
for i in range(trials):
    model = MiniBatchKMeans(
        n_clusters=n_topics,
        batch_size=2048,  # as a rule of thumb: 256 * number of cores
        random_state=SEED+i,
    )
    clusters = model.fit_predict(embedds)
    if best_loss is None or model.inertia_ < best_loss: best_clusters, best_loss = clusters, model.inertia_
clusters = best_clusters
print(f'elapsed time: {timer.pause(2):.0f}s, total running time: {timer.get_total_time():.0f}s')

print(f'{label} ari: {adjusted_rand_score(clusters_true, clusters)}')
print(f'cluster distribution: {np.unique(clusters, return_counts=True)[1]}')

elapsed time: 23s, total running time: 753s
doc2vec10+umap5+kmeans ari: 0.18639334153342788
cluster distribution: [6814 8104 7806 7795 7774 7119 8577 7200 9217]


### CLARA

In [15]:
from sklearn_extra.cluster import CLARA
trials = 5
label = get_pipeline_label('clara')

timer.start(2)
best_clusters, best_loss = None, None
for i in range(trials):
    model = CLARA(
        n_clusters=n_topics,
        n_sampling=300,
        # max_iter=1000,
        # n_sampling_iter=5,
        random_state=SEED+i,
    )
    clusters = model.fit_predict(embedds)
    if best_loss is None or model.inertia_ < best_loss: best_clusters, best_loss = clusters, model.inertia_
clusters = best_clusters
print(f'elapsed time: {timer.pause(2):.0f}s, total running time: {timer.get_total_time():.0f}s')

print(f'{label} ari: {adjusted_rand_score(clusters_true, clusters)}')
print(f'cluster distribution: {np.unique(clusters, return_counts=True)[1]}')


elapsed time: 2s, total running time: 91s
tfidf+umap5+clara ari: 0.8093381255331071
cluster distribution: [ 3524  8148  9454  8831  9408  6875  9292  4682 10192]


### HDBSCAN

In [16]:
from hdbscan import flat
label = get_pipeline_label('hdbscan')

timer.start(2)
hdbscan = flat.HDBSCAN_flat(
    embedds, n_topics,
    min_cluster_size=200,
    # min_samples=50,
) 
clusters = hdbscan.labels_
n_noise = np.sum(clusters == -1)
# remove noise assignments by assigning the nearest sample class
noise_mask = clusters == -1 
min_dist_idxs = np.argsort(np.linalg.norm(np.expand_dims(embedds[noise_mask], 1) - np.expand_dims(embedds, 0), axis=-1), axis=-1)
for i, noise_idx in enumerate(np.where(noise_mask)[0]):
    for min_dist_idx in min_dist_idxs[i]:
        if clusters[min_dist_idx] != -1:
            clusters[noise_idx] = clusters[min_dist_idx]
            break
print(f'elapsed time: {timer.pause(2):.0f}s, total running time: {timer.get_total_time():.0f}s')

print(f'{label} ari: {adjusted_rand_score(clusters_true, clusters)}')
print(f'cluster distribution: {np.unique(clusters, return_counts=True)[1]}')
print(f'noise samples detected: {n_noise}')
print(f'cluster persistences (how stable it is): {format_list(hdbscan.cluster_persistence_)}')

elapsed time: 110s, total running time: 198s
tfidf+umap5+hdbscan ari: 0.8314262674981449
cluster distribution: [ 4685  7200  8940  9417 10011  9434  8845   581 11293]
noise samples detected: 9842
cluster persistences (how stable it is): ['0.32', '0.24', '0.21', '0.21', '0.17', '0.16', '0.14', '0.12', '0.09']


### Gaussian Mixture Models

In [None]:
from sklearn.mixture import GaussianMixture 
update_random_state = 0
label = get_pipeline_label('gmm')

timer.start(2)
clusters = GaussianMixture(
    n_components=n_topics,
    max_iter=200,
    init_params='k-means++',
    verbose=True,
    random_state=SEED+update_random_state,
).fit_predict(embedds)
print(f'elapsed time: {timer.pause(2):.0f}s, total running time: {timer.get_total_time():.0f}s')

print(f'{label} ari: {adjusted_rand_score(clusters_true, clusters)}')
print(f'cluster distribution: {np.unique(clusters, return_counts=True)[1]}')

# Saving

Evaluation (quite expensive)

In [17]:
clusters = match_cluster_labels(clusters_true, clusters)
metrics_ = compute_metrics(clusters_true, clusters, X=embedds)
print(f'cluster distribution: {np.unique(clusters, return_counts=True)[1]}')
metrics_to_df(metrics_, label=label)

cluster distribution: [ 8940  8845  9417 11293  9434  7200  4685 10011   581]


Unnamed: 0,ari,ami,acc,f1,rec,pre,mrec,mpre,ss,vm,h,c
tfidf+umap5+hdbscan,0.831,0.816,0.925,0.917,0.92,0.917,0.825,0.833,0.574,0.816,0.816,0.815


Insert new data

In [19]:
er.insert(label, (*metrics_, timer.get_total_time()), embedds, clusters, condition_fun=lambda row, df: row[0] > df.loc[label, 'ari'])

# er.df.loc[er.df.index.str.startswith('word2vec')].sort_values('ari', ascending=False)
er.df.sort_values('ari', ascending=False)

Unnamed: 0,ari,ami,acc,f1,rec,pre,mrec,mpre,ss,vm,h,c,time
distilroberta+umap5+hdbscan,0.914,0.898,0.963,0.958,0.959,0.958,0.899,0.901,0.775,0.898,0.899,0.898,132.498
distilroberta+umap5+gmm,0.908,0.893,0.958,0.937,0.957,0.925,0.906,0.605,0.762,0.893,0.895,0.89,111.622
word2vec300+umap5+kmeans,0.903,0.885,0.958,0.952,0.951,0.953,0.877,0.886,0.795,0.885,0.885,0.886,133.661
word2vec300+umap5+hdbscan,0.903,0.885,0.957,0.952,0.951,0.953,0.877,0.888,0.793,0.885,0.885,0.885,145.977
word2vec100+umap5+kmeans,0.896,0.877,0.954,0.949,0.944,0.954,0.833,0.921,0.786,0.877,0.876,0.878,103.724
word2vec100+umap5+gmm,0.896,0.877,0.954,0.948,0.944,0.954,0.831,0.922,0.786,0.877,0.876,0.878,104.215
mpnet+umap5+hdbscan,0.893,0.881,0.946,0.852,0.852,0.852,0.003,0.003,0.748,0.881,0.882,0.881,185.922
word2vec50+umap5+kmeans,0.882,0.863,0.948,0.938,0.934,0.944,0.789,0.872,0.759,0.863,0.862,0.863,77.976
mpnet+umap5+kmeans,0.876,0.873,0.926,0.874,0.936,0.877,0.739,0.165,0.748,0.873,0.887,0.86,171.468
mpnet+umap5+clara,0.875,0.873,0.925,0.872,0.935,0.877,0.726,0.16,0.743,0.873,0.887,0.859,165.597


Remove values based on either key or condition

In [None]:
# er.drop(key='fasttext+umap5+kmeans')
er.drop(condition_fun=lambda row: row['ari'] < 0.8)

In [11]:
er.df

Unnamed: 0,ari,ami,acc,f1,rec,pre,mrec,mpre,ss,vm,h,c,time
distilroberta+umap10+clara,0.871,0.875,0.912,0.836,0.826,0.854,0.002,0.0,0.758,0.875,0.887,0.863,114.126
distilroberta+umap10+kmeans,0.862,0.872,0.896,0.826,0.813,0.849,0.0,0.0,0.77,0.872,0.886,0.858,114.287
distilroberta+umap5+clara,0.863,0.872,0.901,0.829,0.817,0.85,0.0,0.0,0.766,0.872,0.885,0.859,111.662
distilroberta+umap5+gmm,0.908,0.893,0.958,0.937,0.957,0.925,0.906,0.605,0.762,0.893,0.895,0.89,111.622
distilroberta+umap5+hdbscan,0.914,0.898,0.963,0.958,0.959,0.958,0.899,0.901,0.775,0.898,0.899,0.898,132.498
distilroberta+umap5+kmeans,0.862,0.872,0.897,0.826,0.813,0.85,0.002,0.0,0.765,0.872,0.886,0.858,110.941
doc2vec100+umap5+kmeans,0.034,0.062,0.235,0.216,0.211,0.242,0.002,0.001,0.238,0.062,0.062,0.062,142.802
fasttext100+umap5+kmeans,0.825,0.828,0.876,0.812,0.796,0.839,0.019,0.002,0.677,0.828,0.843,0.814,79.005
lda,0.237,0.295,0.482,0.438,0.479,0.492,0.11,0.039,,0.296,0.294,0.297,83.631
mpnet+svd50+kmeans,0.835,0.836,0.895,0.828,0.818,0.85,0.072,0.01,0.206,0.836,0.85,0.823,113.398


# Specific Algorithms Analysis

HBDSCAN can help to detect noise tweets. Assignment Probability is a measure of how confident HDBSCAN is of the cluster's assignment

In [53]:
n = 10
indices = np.argsort(hdbscan.probabilities_)[:n]
pd.DataFrame({'text': dataset['text'][indices], 'probability': hdbscan.probabilities_[indices]})

Unnamed: 0,text,probability
3938,This is what happens when you have a fucking baniya become the broadcaster of a major sports event.\n#JioCinema #FIFAWorldCup,0.05
3952,The #Indian advertisement of the world cup doesn't even feature the North East part of India where the MAJORITY of people are football fans.\nStop this kind of discrimination!\n\n#WorldCup #WorldCup2022,0.06
3594,Can you watch World Cup matches without having to put up with a commentary? #FIFAWorldCup,0.07
4205,What sin did we committed that we have to rely on @JioCinema for the World Cup?\n\n#JioCinema,0.07
4312,Wow neither my Comcast nor Miami local channels has the #WorldCup2022 on??? I am now streaming it from Twitter on my unlogged in smart TV web browser!! Picture is awesome!! Thank you @elonmusk,0.11
3840,@JioCinema Can you bloody acknowledge that this is being looked into and will be fixed ffs??????? #WorldCup2022 #JioCinema #scam,0.11
3907,"Just to rub salt in the wounds of anyone gagging for half a lager at the #QATECU game, an advert for @Budweiser comes up on the side of the pitch!😂",0.14
4175,The world's biggest sporting event and @reliancejio doesn't have the infrastructure capacity to broadcast the uninterrupted without any glitches. What a shame for India. I suppose we are too far away from a truly digital experience. #CronyCapitalism #FIFAWorldCup,0.16
4449,Please let me know how to watch #WorldCup2022 for free without #cabletv.,0.17
4961,"While we express our anger and sadness on Twitter, I reflect that the rest of the country doesn’t listen to us and elects these horrible ppl back. So how do we get the message out if Twitter doesn’t help the cause?",0.19


Top outliers

In [51]:
n = 10
indices = np.argsort(hdbscan.outlier_scores_)[::-1][:n]
pd.DataFrame({'text': dataset['text'][indices], 'outlier score': hdbscan.outlier_scores_[indices]})

Unnamed: 0,text,outlier score
3938,This is what happens when you have a fucking baniya become the broadcaster of a major sports event.\n#JioCinema #FIFAWorldCup,0.95
3952,The #Indian advertisement of the world cup doesn't even feature the North East part of India where the MAJORITY of people are football fans.\nStop this kind of discrimination!\n\n#WorldCup #WorldCup2022,0.94
3594,Can you watch World Cup matches without having to put up with a commentary? #FIFAWorldCup,0.93
4205,What sin did we committed that we have to rely on @JioCinema for the World Cup?\n\n#JioCinema,0.93
4312,Wow neither my Comcast nor Miami local channels has the #WorldCup2022 on??? I am now streaming it from Twitter on my unlogged in smart TV web browser!! Picture is awesome!! Thank you @elonmusk,0.89
6419,The following link is information on how consumers can report fraudulent products. This is imperative with the uprising of fake COVID-19 medical products. https://t.co/B2CCWBuAmQ\r\r\n\r\r\nNew Yorkers can contact the Consumer Complaint Coordinator at 866-446-9055 (toll-free).,0.89
3840,@JioCinema Can you bloody acknowledge that this is being looked into and will be fixed ffs??????? #WorldCup2022 #JioCinema #scam,0.89
6196,Many are facing financial uncertainty at this time. The Consumer Financial Protection Bureau site lists key resources and steps to help protect yourself financially from the impact of COVID-19. https://t.co/9ZeF36sW5v https://t.co/erzNZedkJF,0.89
6262,We welcome today s proposed measures from to support consumer credit customers through the outbreak Read our full response,0.88
6502,".@NCLC4consumers has been doing a great job putting together a list of all the major consumer protections, and their publication ""Surviving Debt"" is now online for free. You can find everything at the link, but wanted to highlight some major ones: https://t.co/1whYfIal70",0.88
