# Sparse reproduction of clustering method

In [54]:
import csv
import math
import networkx as nx
from operator import itemgetter
from datetime import datetime, timedelta
from random import sample, choice
from statistics import mean, median_low
from collections import Counter, defaultdict
from tqdm.notebook import tqdm
from fog.tokenizers import WordTokenizer
from fog.metrics import sparse_normalize, sparse_dot_product
from fog.evaluation import best_matching, clusters_to_labels
from twitwi.constants import TWEET_DATETIME_FORMAT
from stop_words import STOP_WORDS_FR
from typing import List, Dict
from ebbe import distinct, partitioned_items
from sklearn.metrics import fowlkes_mallows_score, v_measure_score, homogeneity_score, completeness_score

In [2]:
%load_ext Cython

## Constants and helpers

In [3]:
ONE_DAY = timedelta(days=1)

def parse_date(created_at):
    return datetime.strptime(created_at, TWEET_DATETIME_FORMAT)

## Reading tweets file

In [4]:
with open('../data/event2018.tsv') as f:
    TWEETS = sorted(
        distinct(csv.DictReader(f, delimiter='\t'), key=itemgetter('id')),
        key=itemgetter('id')
    )

In [5]:
# Adding dates & parsing stuff
for i, tweet in enumerate(TWEETS):
    tweet['index'] = i
    tweet['event'] = int(tweet['event'])
    tweet['date'] = parse_date(tweet['created_at'])
    tweet['timestamp'] = tweet['date'].timestamp()
    tweet['label'] = int(tweet['label'].split('.')[0]) if tweet['label'] else None

In [30]:
TWEETS_INDEX = {t['id']: t for t in TWEETS}

In [6]:
days = Counter(t['date'].isoformat()[:10] for t in TWEETS)
mean(days.values())

6261.681818181818

In [45]:
for k, v in TWEETS[0].items():
    print(k, v)

id 1018722125941755905
label_day 0.0
event 20180716001
text #Rennes - La sortie de prison de Djamel Beghal [Vidéo exclusive] via @letelegramme https://t.co/tbOthY1Ren
text+quote+reply #Rennes - La sortie de prison de Djamel Beghal [Vidéo exclusive] via @letelegramme https://t.co/tbOthY1Ren  
image 
url_image 
user1 True
user2 True
user3 True
created_at Mon Jul 16 05:00:56 +0000 2018
label 0
index 0
date 2018-07-16 05:00:56
timestamp 1531710056.0
tokens {'prison', 'beghal', 'sortie', 'rennes', 'exclusive', 'djamel', 'video'}
vector {0: 0.30285878002097777, 1: 0.37211911498911404, 2: 0.36502759751995767, 3: 0.4393103267425311, 4: 0.48437121633298197, 5: 0.3724233412992261, 6: 0.26504828453270446}


In [8]:
print('Total number of tweets:', len(TWEETS))
print('Total number of events:', len(set(t['event'] for t in TWEETS)))
print('Total number of labels:', len(set(t['label'] for t in TWEETS if t['label'] is not None)))
print('Total number of tweets not labeled', sum(1 if t['label'] is None else 0 for t in TWEETS))

Total number of tweets: 137757
Total number of events: 327
Total number of labels: 257
Total number of tweets not labeled 41961


In [9]:
TRUTH = partitioned_items((t['label'], t['id']) for t in TWEETS if t['label'] is not None)

In [10]:
def print_cluster_stats(clusters):
    lens = [len(cluster) for cluster in clusters]
    
    print('Number of clusters:', len(clusters))
    print('Number of non-singleton clusters:', sum(l > 1 for l in lens))
    print('Max number of tweets in clusters:', max(lens))
    print('Min number of tweets in clusters:', min(lens))
    print('Mean number of tweets in clusters:', mean(lens))
    print('Median number of tweets in clusters:', median_low(lens))
    print('Median number of tweets in non-singleton clusters', median_low(l for l in lens if l > 1))

In [11]:
print_cluster_stats(TRUTH)

Number of clusters: 257
Number of non-singleton clusters: 257
Max number of tweets in clusters: 18020
Min number of tweets in clusters: 2
Mean number of tweets in clusters: 372.74708171206225
Median number of tweets in clusters: 76
Median number of tweets in non-singleton clusters 76


## Tokenization & Vectorization

*NOTE: It might be useful to convert tokens to incremental ids to speed up hash computations*

In [12]:
tokenizer = WordTokenizer(
    keep=['word'],
    lower=True,
    unidecode=True,
    split_hashtags=True,
    stoplist=STOP_WORDS_FR + [t + "'" for t in STOP_WORDS_FR] + [t + "’" for t in STOP_WORDS_FR],
    reduce_words=True,
    decode_html_entities=True
)

In [13]:
sample_to_tokenize = sample(TWEETS, 5)

for tweet in sample_to_tokenize:
    print(tweet['text'])
    print(list(tokenizer(tweet['text'])))
    print()

#PoseTonGaulois #MandelaDay #Mandela100 #AIF2018 #lesnapoleons Quels emoji etes-vous? https://t.co/9uNNZxfxzS
[('word', 'pose'), ('word', 'gaulois'), ('word', 'mandela'), ('word', 'day'), ('word', 'mandela'), ('word', 'aif'), ('word', 'lesnapoleons'), ('word', 'quels'), ('word', 'emoji')]

allez faites peter vos plus belles photos de #LunarEclipse pour ceux qui ont pas eu la chance de la voir
[('word', 'allez'), ('word', 'peter'), ('word', 'belles'), ('word', 'photos'), ('word', 'lunar'), ('word', 'eclipse'), ('word', 'chance')]

Il s’est aperçu que Abdegrosnour était carrément à chier je pense... on ne pourra malheureusement jamais le vendre... faites qu’un club fasse une proposition !! #TeamOM #SCPOM
[('word', 'apercu'), ('word', 'abdegrosnour'), ('word', 'carrement'), ('word', 'chier'), ('word', 'pourra'), ('word', 'malheureusement'), ('word', 'vendre'), ('word', 'club'), ('word', 'fasse'), ('word', 'proposition'), ('word', 'team'), ('word', 'om'), ('word', 'scpom')]

Suite au tweet

In [14]:
DOCUMENT_FREQUENCIES = Counter()

for tweet in tqdm(TWEETS):
    tweet['tokens'] = set(token for _, token in tokenizer(tweet['text']))
    for token in tweet['tokens']:
        DOCUMENT_FREQUENCIES[token] += 1

  0%|          | 0/137757 [00:00<?, ?it/s]

In [15]:
print('Size of vocabulary:', len(DOCUMENT_FREQUENCIES))

Size of vocabulary: 84217


In [16]:
N = len(TWEETS)
TOKEN_IDS = {}
INVERSE_DOCUMENT_FREQUENCIES = {}

for i, (token, df) in enumerate(DOCUMENT_FREQUENCIES.items()):
    if df < 10:
        continue
    TOKEN_IDS[token] = i
    INVERSE_DOCUMENT_FREQUENCIES[token] = 1 + math.log((N + 1) / (df + 1))

In [17]:
print('Size of vocabulary after df trimming:', len(INVERSE_DOCUMENT_FREQUENCIES))

Size of vocabulary after df trimming: 14221


In [18]:
VECTORS: List[Dict[int, float]] = []

for i, tweet in tqdm(enumerate(TWEETS), total=len(TWEETS)):
    vector = {}

    for token in tweet['tokens']:
        idf = INVERSE_DOCUMENT_FREQUENCIES.get(token)
        
        if idf is None:
            continue
        
        vector[TOKEN_IDS[token]] = idf
        
    # TODO: I need to make fog's sparse_normalize mutating
    vector = sparse_normalize(vector)
    VECTORS.append(vector)
    TWEETS[i]['vector'] = vector

  0%|          | 0/137757 [00:00<?, ?it/s]

In [103]:
VECTORS[254]

{1053: 0.3420290070110478,
 1054: 0.3270360552825462,
 534: 0.19202726765211034,
 1055: 0.26412400970698136,
 314: 0.3115893902695856,
 1056: 0.30280096548883645,
 1057: 0.34877074198506636,
 1058: 0.2771703452182757,
 606: 0.22127831177409393,
 1059: 0.2819053429277208,
 1060: 0.2778883710559953,
 375: 0.2766408301732054}

In [20]:
sum(bool(v) for v in VECTORS) / len(VECTORS)

0.997118113780062

## Clustering

In [81]:
%%cython
from collections import deque, defaultdict

def dot_product(A: dict, B: dict):
    
    # Swapping so we iterate over the smallest set
    if len(A) > len(B):
        A, B = B, A

    cdef float product = 0.0

    for k, w1 in A.items():
        w2 = B.get(k)

        if w2 is not None:
            product += w1 * w2

    return 1.0 - product

def clustering(tweets, threshold=0.7):
    best_candidate = None
    cdef float best_distance
    cdef int w = 6262
    cdef int thread_id = -1
    
    threads = {}
    
    T = deque()
    I = defaultdict(deque)
    
    for t1 in tweets:
        
        best_candidate = None
        best_distance = 2.0
        
        C = set()
        
        for dim in t1['vector'].keys():
            for c in I[dim]:
                C.add(c)
            I[dim].append(t1['index'])
        
        for c in C:
            t2 = tweets[c]
            d = dot_product(t1['vector'], t2['vector'])
            
            if d > threshold:
                continue
            
            if d < best_distance:
                best_distance = d
                best_candidate = t2

        if best_candidate is not None:
            threads[t1['index']] = threads[best_candidate['index']]
        else:
            thread_id += 1
            threads[t1['index']] = thread_id
            
        yield (t1['index'], threads[t1['index']])

        T.append(t1)

        if len(T) > w:
            t3 = T.popleft()
            
            for dim in t3['vector'].keys():
                I[dim].popleft()

In [93]:
threads = []

for i, thread_id in tqdm(clustering(TWEETS, 0.7), total=len(TWEETS)):
    threads.append((i, thread_id))

  0%|          | 0/137757 [00:00<?, ?it/s]

In [94]:
CLUSTERS = partitioned_items((thread_id, TWEETS[i]['id']) for i, thread_id in threads)

## Evaluation

In [95]:
truth_ids = set()
truth_labels = {}
truth_order = {}

c = 0
for i, cluster in enumerate(TRUTH):
    for _id in cluster:
        truth_ids.add(_id)
        truth_labels[_id] = i
        truth_order[_id] = c
        c += 1

predicted_labels = {}
for i, cluster in enumerate(CLUSTERS):
    for _id in cluster:
        if _id not in truth_labels:
            continue
            
        predicted_labels[_id] = i

truth_labels = [v for _, v in sorted(truth_labels.items(), key=lambda t: truth_order[t[0]])]
predicted_labels = [v for _, v in sorted(predicted_labels.items(), key=lambda t: truth_order[t[0]])]

In [88]:
print_cluster_stats(TRUTH)

Number of clusters: 257
Number of non-singleton clusters: 257
Max number of tweets in clusters: 18020
Min number of tweets in clusters: 2
Mean number of tweets in clusters: 372.74708171206225
Median number of tweets in clusters: 76
Median number of tweets in non-singleton clusters 76


In [96]:
print_cluster_stats(CLUSTERS)

Number of clusters: 29324
Number of non-singleton clusters: 4844
Max number of tweets in clusters: 3482
Min number of tweets in clusters: 1
Mean number of tweets in clusters: 4.697756104214977
Median number of tweets in clusters: 1
Median number of tweets in non-singleton clusters 3


In [97]:
# Best matching macro average
best_matching(TRUTH, CLUSTERS, allow_additional_items=True)

(0.7999131481454262, 0.7044670083171446, 0.6860234200601311)

In [98]:
# Best matching micro average
best_matching(TRUTH, CLUSTERS, allow_additional_items=True, micro=True)

(0.4515286039188244, 0.43107227859200803, 0.441063379830389)

In [99]:
# Fowlkes-Mallows score
fowlkes_mallows_score(truth_labels, predicted_labels)

0.23379028402172444

In [100]:
# Homogeneity score
homogeneity_score(truth_labels, predicted_labels)

0.892807009260605

In [101]:
# Completeness score
completeness_score(truth_labels, predicted_labels)

0.545779616570369

In [102]:
# v-measure
v_measure_score(truth_labels, predicted_labels)

0.6774369487887107