# Sparse reproduction of clustering method

In [44]:
import csv
import math
import networkx as nx
from operator import itemgetter
from datetime import datetime, timedelta
from random import sample, choice
from statistics import mean, median_low
from collections import Counter, defaultdict
from tqdm.notebook import tqdm
from fog.tokenizers import WordTokenizer
from fog.metrics import sparse_normalize, sparse_dot_product
from fog.evaluation import best_matching
from twitwi.constants import TWEET_DATETIME_FORMAT
from stop_words import STOP_WORDS_FR
from typing import List, Dict
from ebbe import distinct, grouped

In [2]:
%load_ext Cython

## Constants and helpers

In [3]:
ONE_DAY = timedelta(days=1)

def parse_date(created_at):
    return datetime.strptime(created_at, TWEET_DATETIME_FORMAT)

## Reading tweets file

In [4]:
with open('../data/event2018.tsv') as f:
    TWEETS = sorted(
        distinct(csv.DictReader(f, delimiter='\t'), key=itemgetter('id')),
        key=itemgetter('id')
    )

In [5]:
# Adding dates & parsing stuff
for i, tweet in enumerate(TWEETS):
    tweet['index'] = i
    tweet['event'] = int(tweet['event'])
    tweet['date'] = parse_date(tweet['created_at'])
    tweet['timestamp'] = tweet['date'].timestamp()
    tweet['label'] = int(tweet['label'].split('.')[0]) if tweet['label'] else None

In [6]:
for k, v in TWEETS[0].items():
    print(k, v)

id 1018722125941755905
label_day 0.0
event 20180716001
text #Rennes - La sortie de prison de Djamel Beghal [Vidéo exclusive] via @letelegramme https://t.co/tbOthY1Ren
text+quote+reply #Rennes - La sortie de prison de Djamel Beghal [Vidéo exclusive] via @letelegramme https://t.co/tbOthY1Ren  
image 
url_image 
user1 True
user2 True
user3 True
created_at Mon Jul 16 05:00:56 +0000 2018
label 0
index 0
date 2018-07-16 05:00:56
timestamp 1531710056.0


In [7]:
print('Total number of tweets:', len(TWEETS))
print('Total number of events:', len(set(t['event'] for t in TWEETS)))
print('Total number of labels:', len(set(t['label'] for t in TWEETS if t['label'] is not None)))
print('Total number of tweets not labeled', sum(1 if t['label'] is None else 0 for t in TWEETS))

Total number of tweets: 137757
Total number of events: 327
Total number of labels: 257
Total number of tweets not labeled 41961


In [8]:
TRUTH = defaultdict(list)

for tweet in TWEETS:
    if tweet['label'] is None:
        continue
        
    TRUTH[tweet['label']].append(tweet['id'])

TRUTH = list(TRUTH.values())

In [111]:
def print_cluster_stats(clusters):
    lens = [len(cluster) for cluster in clusters]
    
    print('Number of clusters:', len(clusters))
    print('Number of non-singleton clusters:', sum(l > 1 for l in lens))
    print('Max number of tweets in clusters:', max(lens))
    print('Min number of tweets in clusters:', min(lens))
    print('Mean number of tweets in clusters:', mean(lens))
    print('Median number of tweets in clusters:', median_low(lens))
    print('Median number of tweets in non-singleton clusters', median_low(l for l in lens if l > 1))

In [10]:
print_cluster_stats(TRUTH)

Number of clusters: 257
Max number of tweets in clusters: 18020
Min number of tweets in clusters: 2
Mean number of tweets in clusters: 372.74708171206225
Median number of tweets in clusters: 76
Median number of tweets in non-singleton clusters 76


## Tokenization & Vectorization

*NOTE: It might be useful to convert tokens to incremental ids to speed up hash computations*

In [11]:
tokenizer = WordTokenizer(
    keep=['word'],
    lower=True,
    unidecode=True,
    split_hashtags=True,
    stoplist=STOP_WORDS_FR + [t + "'" for t in STOP_WORDS_FR] + [t + "’" for t in STOP_WORDS_FR],
    reduce_words=True,
    decode_html_entities=True
)

In [12]:
sample_to_tokenize = sample(TWEETS, 5)

for tweet in sample_to_tokenize:
    print(tweet['text'])
    print(list(tokenizer(tweet['text'])))
    print()

MISE A JOUR : samedi 12:00 🔵FIN DE TOUTES LES #VIGILANCES JAUNES ET DE L'ÉPISODE DE #POLLUTION DANS L'#OISE 🔵  👉Toutes les mesures, notamment de réduction de limitation de vitesse sur autoroutes, sont levées. Le département n'est plus en vigilance jaune #canicule et #orages. https://t.co/5bJXSwNtXg
[('word', 'mise'), ('word', 'samedi'), ('word', 'fin'), ('word', 'vigilances'), ('word', 'jaunes'), ('word', 'episode'), ('word', 'pollution'), ('word', 'oise'), ('word', 'mesures'), ('word', 'notamment'), ('word', 'reduction'), ('word', 'limitation'), ('word', 'vitesse'), ('word', 'autoroutes'), ('word', 'levees'), ('word', 'departement'), ('word', 'vigilance'), ('word', 'jaune'), ('word', 'canicule'), ('word', 'orages')]

Ils se foutent bien de votre gueule dans cette vidéo ! C'est qui le con maintenant ? Pas le pauvre ado que vous avez sermoné le 18 juin. Je ne mentionne pas non plus les 250 000 euros que les contribuables français ont dû payer pour la piscine de votre villa à Nice
[('wor

In [13]:
DOCUMENT_FREQUENCIES = Counter()

for tweet in tqdm(TWEETS):
    tweet['tokens'] = set(token for _, token in tokenizer(tweet['text']))
    for token in tweet['tokens']:
        DOCUMENT_FREQUENCIES[token] += 1

  0%|          | 0/137757 [00:00<?, ?it/s]

In [14]:
print('Size of vocabulary:', len(DOCUMENT_FREQUENCIES))

Size of vocabulary: 84083


In [15]:
N = len(TWEETS)
TOKEN_IDS = {}
INVERSE_DOCUMENT_FREQUENCIES = {}

for i, (token, df) in enumerate(DOCUMENT_FREQUENCIES.items()):
    if df < 10:
        continue
    TOKEN_IDS[token] = i
    INVERSE_DOCUMENT_FREQUENCIES[token] = 1 + math.log((N + 1) / (df + 1))

In [16]:
print('Size of vocabulary after df trimming:', len(INVERSE_DOCUMENT_FREQUENCIES))

Size of vocabulary after df trimming: 14225


In [17]:
VECTORS: List[Dict[int, float]] = []

for i, tweet in tqdm(enumerate(TWEETS), total=len(TWEETS)):
    vector = {}

    for token in tweet['tokens']:
        idf = INVERSE_DOCUMENT_FREQUENCIES.get(token)
        
        if idf is None:
            continue
        
        vector[TOKEN_IDS[token]] = idf
        
    # TODO: I need to make fog's sparse_normalize mutating
    vector = sparse_normalize(vector)
    VECTORS.append(vector)
    TWEETS[i]['vector'] = vector

  0%|          | 0/137757 [00:00<?, ?it/s]

In [18]:
VECTORS[254]

{1053: 0.2819053429277208,
 614: 0.22127831177409393,
 1054: 0.2778883710559953,
 1055: 0.3270360552825462,
 1056: 0.34877074198506636,
 535: 0.19202726765211034,
 1057: 0.2771703452182757,
 1058: 0.3420290070110478,
 381: 0.2766408301732054,
 1059: 0.30280096548883645,
 1060: 0.26412400970698136,
 314: 0.3115893902695856}

In [19]:
sum(bool(v) for v in VECTORS) / len(VECTORS)

0.9971253729393061

## Clustering

In [20]:
from fog.clustering import pairwise_connected_components
from fog.metrics import sparse_dot_product

In [None]:
it = pairwise_connected_components(
    enumerate(VECTORS),
    key=itemgetter(1),
    similarity=sparse_dot_product,
    radius=0.5,
    processes=8
)
CLUSTERS = list(it)

In [38]:
CLUSTERS_WITH_IDS = [[TWEETS[i]['id'] for i, _ in cluster] for cluster in CLUSTERS]

In [23]:
def clustering(tweets, threshold):
    index = defaultdict(list)
    
    for i, t in enumerate(tweets):
        for dim in t['vector'].keys():
            index[dim].append(i)

    for t1 in tqdm(tweets):
        candidates = set()
        
        upper_bound = t1['date'] + ONE_DAY
        lower_bound = t1['date'] - ONE_DAY
        
        for dim in t1['vector'].keys():
            for j in index[dim]:
                if i == j:
                    continue
                candidates.add(j)

        for j in candidates:
            t2 = tweets[j]
            
            if t2['date'] > upper_bound or t2['date'] < lower_bound:
                continue
            
            cosine = 1.0 - sparse_dot_product(t1['vector'], t2['vector'])
            
            if cosine < threshold:
                yield (t1, t2, cosine)

In [None]:
EDGES = []
for edge in clustering(TWEETS, 1.0):
    EDGES.append(edge)

  0%|          | 0/137757 [00:00<?, ?it/s]

In [211]:
from heapq import nsmallest
NN = []

for t, neighbors in grouped(EDGES, key=lambda x: x[0]['id']).items():
    nn = nsmallest(4, neighbors, key=itemgetter(2))
    # argmin = min(neighbors, key=itemgetter(2))
    NN.extend(nn)

In [126]:
def components(tweets, edges, threshold=None):
    graph = nx.Graph()
    
    for t in tweets:
        graph.add_node(t['id'])
        
    for t1, t2, sim in edges:
        if threshold is not None and sim > threshold:
            continue
        
        graph.add_edge(t1['id'], t2['id'], weight=sim)

    return list(nx.connected_components(graph))

In [212]:
CLUSTERS = components(TWEETS, NN, 0.9)

## Evaluation

In [114]:
print_cluster_stats(TRUTH)

Number of clusters: 257
Number of non-singleton clusters: 257
Max number of tweets in clusters: 18020
Min number of tweets in clusters: 2
Mean number of tweets in clusters: 372.74708171206225
Median number of tweets in clusters: 76
Median number of tweets in non-singleton clusters 76


In [202]:
print_cluster_stats(CLUSTERS)

Number of clusters: 527
Number of non-singleton clusters: 90
Max number of tweets in clusters: 136428
Min number of tweets in clusters: 1
Mean number of tweets in clusters: 261.3984819734345
Median number of tweets in clusters: 1
Median number of tweets in non-singleton clusters 8


In [213]:
best_matching(TRUTH, CLUSTERS, allow_additional_items=True)

(0.9973974150368606, 0.045342924640892, 0.06421678454179315)

## Sanity tests

In [83]:
TWEETS_INDEX = {t['id']: t for t in TWEETS}

In [75]:
for truth_cluster in TRUTH:
    cluster = [VECTORS[TWEETS_INDEX[_id]['index']] for _id in truth_cluster]
    
    if len(cluster) > 500:
        cluster = sample(cluster, 500)
    
    sims = []
    
    for i in range(len(cluster)):
        for j in range(i + 1, len(cluster)):
            cosine = sparse_dot_product(cluster[i], cluster[j])
            sims.append(cosine)
                
    print(len(cluster), mean(sims), median_low(sims), max(sims), min(sims))

338 0.2482923207688562 0.22311742135519452 0.9999999999999998 0.0
498 0.14553582934925588 0.1178202371680357 1.0000000000000002 0.0
348 0.08121931789241378 0.050033895394794775 1.0000000000000002 0.0
46 0.20173756479342148 0.19212502544914917 0.8350795372708426 0.0
96 0.20178262439774716 0.159056006987777 0.91723619507997 0.0
27 0.31859335499024116 0.27037391681640144 0.9060832997705063 0.06487108314069774
66 0.23090455833899923 0.20066249276015782 0.9083864120872838 0.0
113 0.22697600417509983 0.16028052184324693 0.9999999999999999 0.0
87 0.1688540882947266 0.16612080055586717 0.7779136091362886 0.0
256 0.06820298711254288 0.03413953680591835 0.8234710461889135 0.0
500 0.3656163953762494 0.42649208288714197 1.0 0.0
33 0.1936048974030048 0.14520766113768058 1.0000000000000002 0.0
500 0.12494262801085648 0.11908082643968844 0.9999999999999998 0.0
37 0.23005634197114913 0.18792678478351627 0.7686010483006819 0.0
37 0.46812192731724994 0.4628327749491855 0.8278337840345131 0.1627263567567

In [138]:
VECTORS[0]

{0: 0.3724233412992261,
 1: 0.30285878002097777,
 2: 0.26504828453270446,
 3: 0.36502759751995767,
 4: 0.48437121633298197,
 5: 0.4393103267425311,
 6: 0.37211911498911404}