# Sparse reproduction of clustering method

In [1]:
import csv
import math
import networkx as nx
from operator import itemgetter
from datetime import datetime, timedelta
from random import sample, choice
from statistics import mean, median_low
from collections import Counter, defaultdict
from tqdm.notebook import tqdm
from fog.tokenizers import WordTokenizer
from fog.metrics import sparse_normalize, sparse_dot_product
from fog.evaluation import best_matching
from twitwi.constants import TWEET_DATETIME_FORMAT
from stop_words import STOP_WORDS_FR
from typing import List, Dict

In [2]:
%load_ext Cython

## Constants and helpers

In [3]:
ONE_DAY = timedelta(days=1)

def parse_date(created_at):
    return datetime.strptime(created_at, TWEET_DATETIME_FORMAT)

## Reading tweets file

In [4]:
with open('../data/event2018.tsv') as f:
    ALL_TWEETS = list(csv.DictReader(f, delimiter='\t'))
    
# Keeping tweets only once (to avoid fuzzy clusters present in the data)
already_seen = set()
TWEETS = []

for tweet in ALL_TWEETS:
    if tweet['id'] in already_seen:
        continue
    
    already_seen.add(tweet['id'])
    TWEETS.append(tweet)

In [5]:
# Adding dates & parsing stuff
for tweet in TWEETS:
    tweet['event'] = int(tweet['event'])
    tweet['date'] = parse_date(tweet['created_at'])
    tweet['timestamp'] = tweet['date'].timestamp()
    tweet['label'] = int(tweet['label'].split('.')[0]) if tweet['label'] else None

In [6]:
# Making suuuuuuuure the tweets are sorted by date
TWEETS = sorted(TWEETS, key=itemgetter('date'))

In [7]:
for k, v in TWEETS[0].items():
    print(k, v)

id 1018722125941755905
label_day 0.0
event 20180716001
text #Rennes - La sortie de prison de Djamel Beghal [Vidéo exclusive] via @letelegramme https://t.co/tbOthY1Ren
text+quote+reply #Rennes - La sortie de prison de Djamel Beghal [Vidéo exclusive] via @letelegramme https://t.co/tbOthY1Ren  
image 
url_image 
user1 True
user2 True
user3 True
created_at Mon Jul 16 05:00:56 +0000 2018
label 0
date 2018-07-16 05:00:56
timestamp 1531710056.0


In [8]:
print('Total number of tweets:', len(TWEETS))
print('Total number of events:', len(set(t['event'] for t in TWEETS)))
print('Total number of labels:', len(set(t['label'] for t in TWEETS if t['label'] is not None)))
print('Total number of tweets not labeled', sum(1 if t['label'] is None else 0 for t in TWEETS))

Total number of tweets: 137757
Total number of events: 327
Total number of labels: 257
Total number of tweets not labeled 41961


In [9]:
TRUTH = defaultdict(list)

for tweet in TWEETS:
    if tweet['label'] is None:
        continue
        
    TRUTH[tweet['label']].append(tweet['id'])

TRUTH = list(TRUTH.values())

In [10]:
def print_cluster_stats(clusters):
    lens = [len(cluster) for cluster in clusters]
    
    print('Number of clusters:', len(clusters))
    print('Max number of tweets in clusters:', max(lens))
    print('Min number of tweets in clusters:', min(lens))
    print('Mean number of tweets in clusters:', mean(lens))
    print('Median number of tweets in clusters:', median_low(lens))

In [11]:
print_cluster_stats(TRUTH)

Number of clusters: 257
Max number of tweets in clusters: 18020
Min number of tweets in clusters: 2
Mean number of tweets in clusters: 372.74708171206225
Median number of tweets in clusters: 76


## Tokenization & Vectorization

*NOTE: It might be useful to convert tokens to incremental ids to speed up hash computations*

In [29]:
tokenizer = WordTokenizer(
    keep=['word'],
    lower=True,
    unidecode=True,
    split_hashtags=True,
    stoplist=STOP_WORDS_FR + [t + "'" for t in STOP_WORDS_FR],
    reduce_words=True,
    decode_html_entities=True
)

In [13]:
sample_to_tokenize = sample(TWEETS, 5)

for tweet in sample_to_tokenize:
    print(tweet['text'])
    print(list(tokenizer(tweet['text'])))
    print()

vos fils de pute de flics de france pense m'avoir baiser le cul en me menacant de garde à vue ..... vous allez voir de quel bois je me chauffe tout bientot bandes de frouzes de merde
[('word', 'fils'), ('word', 'pute'), ('word', 'flics'), ('word', 'france'), ('word', 'baiser'), ('word', 'cul'), ('word', 'menacant'), ('word', 'garde'), ('word', 'vue'), ('word', 'allez'), ('word', 'bois'), ('word', 'chauffe'), ('word', 'bientot'), ('word', 'bandes'), ('word', 'frouzes')]

streamers de que man? de creative destructions atr? xd
[('word', 'streamers'), ('word', 'man'), ('word', 'creative'), ('word', 'destructions'), ('word', 'atr')]

Syndicats et patronat vigilants face à Macron. "Je pense que le président a bien reçu ce message et devrait donner des consignes en la matière", a estimé François @fhommeril @CFECGC  https://t.co/AiBUnxtokI https://t.co/1Zswn0UF8p
[('word', 'Syndicats'), ('word', 'patronat'), ('word', 'vigilants'), ('word', 'face'), ('word', 'Macron'), ('word', 'Je'), ('word', 

In [14]:
DOCUMENT_FREQUENCIES = Counter()

for tweet in tqdm(TWEETS):
    tweet['tokens'] = set(token for _, token in tokenizer(tweet['text']))
    for token in tweet['tokens']:
        DOCUMENT_FREQUENCIES[token] += 1

  0%|          | 0/137757 [00:00<?, ?it/s]

In [15]:
print('Size of vocabulary:', len(DOCUMENT_FREQUENCIES))

Size of vocabulary: 108207


In [16]:
N = len(DOCUMENT_FREQUENCIES)
TOKEN_IDS = {}
INVERSE_DOCUMENT_FREQUENCIES = {}

for i, (token, df) in enumerate(DOCUMENT_FREQUENCIES.items()):
    if df < 10:
        continue
    TOKEN_IDS[token] = i
    INVERSE_DOCUMENT_FREQUENCIES[token] = 1 + math.log((N + 1) / (df + 1))

In [17]:
print('Size of vocabulary after df trimming:', len(INVERSE_DOCUMENT_FREQUENCIES))

Size of vocabulary after df trimming: 16297


In [18]:
VECTORS: List[Dict[int, float]] = []

for tweet in tqdm(TWEETS):
    vector = {}

    for token in tweet['tokens']:
        idf = INVERSE_DOCUMENT_FREQUENCIES.get(token)
        
        if idf is None:
            continue
        
        vector[TOKEN_IDS[token]] = INVERSE_DOCUMENT_FREQUENCIES[token]
        
    # TODO: I need to make fog's sparse_normalize mutating
    vector = sparse_normalize(vector)
    VECTORS.append(vector)

  0%|          | 0/137757 [00:00<?, ?it/s]

In [19]:
VECTORS[254]

{1248: 0.26918549228293104,
 1249: 0.270466630540033,
 1250: 0.2561301581725042,
 1251: 0.3202004743619945,
 711: 0.2137285740646607,
 1252: 0.3418403035460835,
 134: 0.12313308227424535,
 1253: 0.34448841616785586,
 1254: 0.2754416237831299,
 476: 0.2774097575331397,
 652: 0.22291535683130764,
 1255: 0.3007046170218344,
 373: 0.31136548424777866}

In [20]:
sum(1 if v else 0 for v in VECTORS) / len(VECTORS)

0.9985409089919206

## Clustering

In [21]:
%%cython
import cython
from datetime import timedelta

@cython.boundscheck(False)
@cython.wraparound(False)
def sparse_dot_product(A: dict, B: dict):
    
    # Swapping so we iterate over the smallest set
    if len(A) > len(B):
        A, B = B, A

    cdef float product  = 0.0

    for k, w1 in A.items():
        w2 = B.get(k)

        if w2 is not None:
            product += w1 * w2

    return product

@cython.boundscheck(False)
@cython.wraparound(False)
def clustering(vectors: list, tweets: list):
    cdef float best_metric
    cdef int best_candidate
    cdef float d
    
    cdef float one_day = timedelta(days=1).total_seconds()
    cdef float date_bound
    
    for i, A in enumerate(vectors):
        if not A:
            continue
        
        tweet_a = tweets[i]

        date_bound = tweet_a['timestamp'] + one_day
        best_metric = -1.0
        best_candidate = -1
        d = 0.0

        for j in range(i + 1, len(vectors)):
            tweet_b = tweets[j]

            if tweet_b['timestamp'] > date_bound:
                break

            B = vectors[j]
            
            if not B:
                continue

            d = sparse_dot_product(A, B)

            if d == 0.0:
                continue

            if best_candidate < 0 or d > best_metric:
                best_metric = d
                best_candidate = j
        
        if best_candidate != -1:
            yield (i, best_candidate, best_metric)

In [22]:
EDGES = []

for edge in tqdm(clustering(VECTORS, TWEETS), unit='tweet', total=len(VECTORS)):
    EDGES.append(edge)

  0%|          | 0/137757 [00:00<?, ?tweet/s]

In [23]:
similar_pair = choice(EDGES)
first_tweet = TWEETS[similar_pair[0]]
second_tweet = TWEETS[similar_pair[1]]
print('Similar tweets (similarity: %f):\n' % similar_pair[2])
print(first_tweet['id'], '-', first_tweet['text'])
print('---')
print(second_tweet['id'], '-', second_tweet['text'])

Similar tweets (similarity: 0.353326):

1022388927439822849 - 🔴🇫🇷Et boum 💥 Alexis #Kohler se cogne de la transparence de la vie publique 😂 #Ouille #NouveauMonde - AlloOoooO @bayrou #Benalla
---
1022392749306376192 - "Alexis Kohler"il est bégue des fois...😂😂😂😂😂😂..lui non plus ne sait pas..


In [24]:
def components(t):
    graph = nx.Graph()

    for i, j, sim in EDGES:
        i_id = TWEETS[i]['id']
        j_id = TWEETS[j]['id']

        graph.add_node(i_id)
        graph.add_node(j_id)

        if sim < t:
            continue

        graph.add_edge(i_id, j_id)

    return list(nx.connected_components(graph))

## Evaluation

In [25]:
print_cluster_stats(TRUTH)

Number of clusters: 257
Max number of tweets in clusters: 18020
Min number of tweets in clusters: 2
Mean number of tweets in clusters: 372.74708171206225
Median number of tweets in clusters: 76


In [28]:
for t in (0.01, 0.05, 0.1, 0.15, 0.2, 0.7, 0.8):
    print('t =', t)
    clusters = components(t)
    print_cluster_stats(clusters)
    print(best_matching(TRUTH, clusters, allow_additional_items=True))
    print()

t = 0.01
Number of clusters: 72
Max number of tweets in clusters: 126106
Min number of tweets in clusters: 2
Mean number of tweets in clusters: 1909.8472222222222
Median number of tweets in clusters: 5
(0.7107764725114613, 0.11542567606886275, 0.09902370757178032)

t = 0.05
Number of clusters: 76
Max number of tweets in clusters: 125653
Min number of tweets in clusters: 1
Mean number of tweets in clusters: 1809.328947368421
Median number of tweets in clusters: 4
(0.71389557945334, 0.11034301385644733, 0.09542584218889595)

t = 0.1
Number of clusters: 115
Max number of tweets in clusters: 107880
Min number of tweets in clusters: 1
Mean number of tweets in clusters: 1195.7304347826087
Median number of tweets in clusters: 6
(0.7152188167886431, 0.18150114976751108, 0.15053642812963527)

t = 0.15
Number of clusters: 1244
Max number of tweets in clusters: 57828
Min number of tweets in clusters: 1
Mean number of tweets in clusters: 110.53778135048232
Median number of tweets in clusters: 1
(0