# Sparse reproduction of clustering method

In [8]:
import csv
import math
import networkx as nx
from operator import itemgetter
from datetime import datetime, timedelta
from random import sample, choice
from statistics import mean, median_low
from collections import Counter, defaultdict
from tqdm.notebook import tqdm
from fog.tokenizers import WordTokenizer
from fog.metrics import sparse_normalize, sparse_dot_product
from fog.evaluation import best_matching
from twitwi.constants import TWEET_DATETIME_FORMAT
from stop_words import STOP_WORDS_FR
from typing import List, Dict
from ebbe import distinct, partitioned_items

In [2]:
%load_ext Cython

## Constants and helpers

In [3]:
ONE_DAY = timedelta(days=1)

def parse_date(created_at):
    return datetime.strptime(created_at, TWEET_DATETIME_FORMAT)

## Reading tweets file

In [4]:
with open('../data/event2018.tsv') as f:
    TWEETS = sorted(
        distinct(csv.DictReader(f, delimiter='\t'), key=itemgetter('id')),
        key=itemgetter('id')
    )

In [5]:
# Adding dates & parsing stuff
for i, tweet in enumerate(TWEETS):
    tweet['index'] = i
    tweet['event'] = int(tweet['event'])
    tweet['date'] = parse_date(tweet['created_at'])
    tweet['timestamp'] = tweet['date'].timestamp()
    tweet['label'] = int(tweet['label'].split('.')[0]) if tweet['label'] else None

In [50]:
days = Counter(t['date'].isoformat()[:10] for t in TWEETS)
mean(days.values())

6261.681818181818

In [6]:
for k, v in TWEETS[0].items():
    print(k, v)

id 1018722125941755905
label_day 0.0
event 20180716001
text #Rennes - La sortie de prison de Djamel Beghal [Vidéo exclusive] via @letelegramme https://t.co/tbOthY1Ren
text+quote+reply #Rennes - La sortie de prison de Djamel Beghal [Vidéo exclusive] via @letelegramme https://t.co/tbOthY1Ren  
image 
url_image 
user1 True
user2 True
user3 True
created_at Mon Jul 16 05:00:56 +0000 2018
label 0
index 0
date 2018-07-16 05:00:56
timestamp 1531710056.0


In [7]:
print('Total number of tweets:', len(TWEETS))
print('Total number of events:', len(set(t['event'] for t in TWEETS)))
print('Total number of labels:', len(set(t['label'] for t in TWEETS if t['label'] is not None)))
print('Total number of tweets not labeled', sum(1 if t['label'] is None else 0 for t in TWEETS))

Total number of tweets: 137757
Total number of events: 327
Total number of labels: 257
Total number of tweets not labeled 41961


In [9]:
TRUTH = partitioned_items((t['label'], t['id']) for t in TWEETS if t['label'] is not None)

In [11]:
def print_cluster_stats(clusters):
    lens = [len(cluster) for cluster in clusters]
    
    print('Number of clusters:', len(clusters))
    print('Number of non-singleton clusters:', sum(l > 1 for l in lens))
    print('Max number of tweets in clusters:', max(lens))
    print('Min number of tweets in clusters:', min(lens))
    print('Mean number of tweets in clusters:', mean(lens))
    print('Median number of tweets in clusters:', median_low(lens))
    print('Median number of tweets in non-singleton clusters', median_low(l for l in lens if l > 1))

In [12]:
print_cluster_stats(TRUTH)

Number of clusters: 257
Number of non-singleton clusters: 257
Max number of tweets in clusters: 18020
Min number of tweets in clusters: 2
Mean number of tweets in clusters: 372.74708171206225
Median number of tweets in clusters: 76
Median number of tweets in non-singleton clusters 76


## Tokenization & Vectorization

*NOTE: It might be useful to convert tokens to incremental ids to speed up hash computations*

In [13]:
tokenizer = WordTokenizer(
    keep=['word'],
    lower=True,
    unidecode=True,
    split_hashtags=True,
    stoplist=STOP_WORDS_FR + [t + "'" for t in STOP_WORDS_FR] + [t + "’" for t in STOP_WORDS_FR],
    reduce_words=True,
    decode_html_entities=True
)

In [14]:
sample_to_tokenize = sample(TWEETS, 5)

for tweet in sample_to_tokenize:
    print(tweet['text'])
    print(list(tokenizer(tweet['text'])))
    print()

Syrie : une mission humanitaire pour réchauffer les relations franco-russes https://t.co/LypUJ2bg4w via @FRANCE24
[('word', 'syrie'), ('word', 'mission'), ('word', 'humanitaire'), ('word', 'rechauffer'), ('word', 'relations'), ('word', 'franco-russes')]

Nantes. Quelques incidents hier soir : certains ont fini la nuit en garde à vue https://t.co/uHDVWWpw09 (PresseOcéan) https://t.co/ou66FbErDu
[('word', 'nantes'), ('word', 'incidents'), ('word', 'hier'), ('word', 'certains'), ('word', 'fini'), ('word', 'nuit'), ('word', 'garde'), ('word', 'vue'), ('word', 'presseocean')]

Schiappa et la défense des droits des femmes à géométrie variable.... https://t.co/vUD7a5h8PY
[('word', 'schiappa'), ('word', 'defense'), ('word', 'droits'), ('word', 'femmes'), ('word', 'geometrie'), ('word', 'variable')]

Félix Auger-Aliassime se mesurera à Pouille au 1er tour; Murray déclare forfait à Toronto. @CoupeRogers  https://t.co/e3JWNvUWMM
[('word', 'felix'), ('word', 'auger-aliassime'), ('word', 'mesurera'

In [15]:
DOCUMENT_FREQUENCIES = Counter()

for tweet in tqdm(TWEETS):
    tweet['tokens'] = set(token for _, token in tokenizer(tweet['text']))
    for token in tweet['tokens']:
        DOCUMENT_FREQUENCIES[token] += 1

  0%|          | 0/137757 [00:00<?, ?it/s]

In [16]:
print('Size of vocabulary:', len(DOCUMENT_FREQUENCIES))

Size of vocabulary: 84083


In [17]:
N = len(TWEETS)
TOKEN_IDS = {}
INVERSE_DOCUMENT_FREQUENCIES = {}

for i, (token, df) in enumerate(DOCUMENT_FREQUENCIES.items()):
    if df < 10:
        continue
    TOKEN_IDS[token] = i
    INVERSE_DOCUMENT_FREQUENCIES[token] = 1 + math.log((N + 1) / (df + 1))

In [18]:
print('Size of vocabulary after df trimming:', len(INVERSE_DOCUMENT_FREQUENCIES))

Size of vocabulary after df trimming: 14225


In [19]:
VECTORS: List[Dict[int, float]] = []

for i, tweet in tqdm(enumerate(TWEETS), total=len(TWEETS)):
    vector = {}

    for token in tweet['tokens']:
        idf = INVERSE_DOCUMENT_FREQUENCIES.get(token)
        
        if idf is None:
            continue
        
        vector[TOKEN_IDS[token]] = idf
        
    # TODO: I need to make fog's sparse_normalize mutating
    vector = sparse_normalize(vector)
    VECTORS.append(vector)
    TWEETS[i]['vector'] = vector

  0%|          | 0/137757 [00:00<?, ?it/s]

In [22]:
VECTORS[254]

{316: 0.31158939026958554,
 1053: 0.30280096548883645,
 1054: 0.26412400970698136,
 1055: 0.34877074198506636,
 1056: 0.2778883710559953,
 546: 0.1920272676521103,
 1057: 0.28190534292772074,
 611: 0.2212783117740939,
 1058: 0.34202900701104777,
 1059: 0.27717034521827566,
 376: 0.27664083017320534,
 1060: 0.32703605528254615}

In [23]:
sum(bool(v) for v in VECTORS) / len(VECTORS)

0.9971253729393061

## Clustering

In [54]:
%%cython
from collections import deque

def dot_product(A: dict, B: dict):
    
    # Swapping so we iterate over the smallest set
    if len(A) > len(B):
        A, B = B, A

    cdef float product = 0.0

    for k, w1 in A.items():
        w2 = B.get(k)

        if w2 is not None:
            product += w1 * w2

    return 1.0 - product

def clustering(tweets, threshold=0.7):
    best_candidate = None
    cdef float best_distance
    cdef int w = 6262
    cdef int thread_id = -1
    
    threads = {}
    
    T = deque()
    
    for t1 in tweets:
        
        best_candidate = None
        best_distance = 2.0
        
        for t2 in T:
            d = dot_product(t1['vector'], t2['vector'])
            
            if d > threshold:
                continue
            
            if d < best_distance:
                best_distance = d
                best_candidate = t2
                
        if best_candidate is not None:
            threads[t1['index']] = threads[best_candidate['index']]
        else:
            thread_id += 1
            threads[t1['index']] = thread_id
            
        yield (t1['index'], threads[t1['index']])
            
        T.append(t1)
        
        if len(T) > w:
            T.popleft()

In [117]:
threads = []

for i, thread_id in tqdm(clustering(TWEETS, 0.84), total=len(TWEETS)):
    threads.append((i, thread_id))

  0%|          | 0/137757 [00:00<?, ?it/s]

In [129]:
for i in partitioned_items((t_id, i) for i, t_id in threads_085)[0]:
    print(TWEETS[i]['text'])
    print()

#Rennes - La sortie de prison de Djamel Beghal [Vidéo exclusive] via @letelegramme https://t.co/tbOthY1Ren

Le terroriste Djamel Beghal  [mentor de Kouachi et Coulibaly, auteurs des attentats de Charlie Hebdo et de l'Hyper Cacher] sort de prison 👉 Décision intolérable!  À quant une justice d'exception pour ces terroristes ? https://t.co/DJHGpv8Uui via @Le_Figaro

Le terroriste Djamel Beghal sort de prison. https://t.co/HR7HJS6W0r

Djamel Beghal, mentor des djihadistes de Charlie Hebdo, sort de prison ce lundi https://t.co/6FOgfdAktB https://t.co/ZrEzsSk93D

Djamel Beghal, mentor des djihadistes de Charlie Hebdo, sort de prison ce lundi - Ouest-France https://t.co/uHqiGzIseg

Rennes.  La sortie de prison de Djamel Beghal [Vidéo exclusive] https://t.co/cT6i0xq2va via @LeTelegramme

2 ans de remise de peine. RT @Le_Figaro: Le terroriste Djamel Beghal sort de prison  #Société https://t.co/G5vtT5YP0J

L’islamiste Djamel Beghal sort de prison, des incertitudes demeurent sur son sort https://

In [68]:
threads_07 = list(threads)

In [73]:
threads_075 = list(threads)

In [78]:
threads_08 = list(threads)

In [93]:
threads_083 = list(threads)

In [120]:
threads_084 = list(threads)

In [88]:
threads_085 = list(threads)

In [102]:
threads_087 = list(threads)

In [83]:
threads_09 = list(threads)

In [135]:
CLUSTERS = partitioned_items((thread_id, TWEETS[i]['id']) for i, thread_id in threads_085)

## Evaluation

In [114]:
print_cluster_stats(TRUTH)

Number of clusters: 257
Number of non-singleton clusters: 257
Max number of tweets in clusters: 18020
Min number of tweets in clusters: 2
Mean number of tweets in clusters: 372.74708171206225
Median number of tweets in clusters: 76
Median number of tweets in non-singleton clusters 76


In [136]:
print_cluster_stats(CLUSTERS)

Number of clusters: 1120
Number of non-singleton clusters: 258
Max number of tweets in clusters: 34773
Min number of tweets in clusters: 1
Mean number of tweets in clusters: 122.99732142857142
Median number of tweets in clusters: 1
Median number of tweets in non-singleton clusters 4


In [140]:
best_matching(TRUTH, CLUSTERS, allow_additional_items=True, macro=False)

(0.342529959497265, 0.019441560965436172, 0.03679470186793692)