# Sparse reproduction of clustering method

In [1]:
import csv
import math
import networkx as nx
from operator import itemgetter
from datetime import datetime, timedelta
from random import sample, choice
from statistics import mean, median_low
from collections import Counter, defaultdict
from tqdm.notebook import tqdm
from fog.tokenizers import WordTokenizer
from fog.metrics import sparse_normalize, sparse_dot_product
from fog.evaluation import best_matching
from twitwi.constants import TWEET_DATETIME_FORMAT
from stop_words import STOP_WORDS_FR
from typing import List, Dict

## Constants and helpers

In [2]:
ONE_DAY = timedelta(days=1)

def parse_date(created_at):
    return datetime.strptime(created_at, TWEET_DATETIME_FORMAT)

## Reading tweets file

In [3]:
with open('../data/event2018.tsv') as f:
    ALL_TWEETS = list(csv.DictReader(f, delimiter='\t'))
    
# Keeping tweets only once (to avoid fuzzy clusters present in the data)
already_seen = set()
TWEETS = []

for tweet in ALL_TWEETS:
    if tweet['id'] in already_seen:
        continue
    
    already_seen.add(tweet['id'])
    TWEETS.append(tweet)

In [4]:
# Adding dates & parsing stuff
for tweet in TWEETS:
    tweet['event'] = int(tweet['event'])
    tweet['date'] = parse_date(tweet['created_at'])
    tweet['label'] = int(tweet['label'].split('.')[0]) if tweet['label'] else None

In [5]:
# Making suuuuuuuure the tweets are sorted by date
TWEETS = sorted(TWEETS, key=itemgetter('date'))

In [6]:
for k, v in TWEETS[0].items():
    print(k, v)

id 1018722125941755905
label_day 0.0
event 20180716001
text #Rennes - La sortie de prison de Djamel Beghal [Vidéo exclusive] via @letelegramme https://t.co/tbOthY1Ren
text+quote+reply #Rennes - La sortie de prison de Djamel Beghal [Vidéo exclusive] via @letelegramme https://t.co/tbOthY1Ren  
image 
url_image 
user1 True
user2 True
user3 True
created_at Mon Jul 16 05:00:56 +0000 2018
label 0
date 2018-07-16 05:00:56


In [7]:
print('Total number of tweets:', len(TWEETS))
print('Total number of events:', len(set(t['event'] for t in TWEETS)))
print('Total number of labels:', len(set(t['label'] for t in TWEETS if t['label'] is not None)))
print('Total number of tweets not labeled', sum(1 if t['label'] is None else 0 for t in TWEETS))

Total number of tweets: 137757
Total number of events: 327
Total number of labels: 257
Total number of tweets not labeled 41961


In [8]:
TRUTH = defaultdict(list)

for tweet in TWEETS:
    if tweet['label'] is None:
        continue
        
    TRUTH[tweet['label']].append(tweet['id'])

TRUTH = list(TRUTH.values())

In [9]:
def print_cluster_stats(clusters):
    lens = [len(cluster) for cluster in clusters]
    
    print('Number of clusters:', len(clusters))
    print('Max number of tweets in labeled clusters:', max(lens))
    print('Min number of tweets in labeled clusters:', min(lens))
    print('Mean number of tweets in labeled clusters:', mean(lens))
    print('Median number of tweets in labeled clusters:', median_low(lens))

In [10]:
print_cluster_stats(TRUTH)

Number of clusters: 257
Max number of tweets in labeled clusters: 18020
Min number of tweets in labeled clusters: 2
Mean number of tweets in labeled clusters: 372.74708171206225
Median number of tweets in labeled clusters: 76


## Tokenization & Vectorization

*NOTE: It might be useful to convert tokens to incremental ids to speed up hash computations*

In [11]:
tokenizer = WordTokenizer(
    keep=['word'],
    lower=True,
    unidecode=True,
    split_hashtags=True,
    stoplist=STOP_WORDS_FR,
    reduce_words=True,
    decode_html_entities=True,
    min_word_length=3
)

In [12]:
sample_to_tokenize = sample(TWEETS, 5)

for tweet in sample_to_tokenize:
    print(tweet['text+quote+reply'])
    print(list(tokenizer(tweet['text+quote+reply'])))
    print()

Et bien sur je n’oublie pas #NgoloKante #AdilRami #HugoLloris #thomasLemar #stevenNzonzi  et le bosse #didierdeschamps ainsi que tout le staff @equipedefrance  @SteveMandanda @FlorianThauvin @AntoGriezmann @benmendy23 @raphaelvarane @samumtiti @_OlivierGiroud_ @BenPavard28 @paulpogba @LucasHernandez @NabilFekir @kimpembe_3 @CorentinTolisso @MATUIDIBlaise @KMbappe @DjibrilSidibeS3 @Dembouz @AreolaOfficiel  merci pour tout ce bonheur !
[('word', 'oublie'), ('word', 'ngolo'), ('word', 'kante'), ('word', 'adil'), ('word', 'rami'), ('word', 'hugo'), ('word', 'lloris'), ('word', 'thomas'), ('word', 'lemar'), ('word', 'steven'), ('word', 'nzonzi'), ('word', 'bosse'), ('word', 'didierdeschamps'), ('word', 'ainsi'), ('word', 'staff'), ('word', 'bonheur')]

Grèce. À cause de sa politique migratoire appuyée par l’Union européenne qui bloque des milliers d’enfants demandeurs d’asile dans les îles de la mer Égée, la Grèce prive  ces enfants de leur droit à l’éducation, a déclaré Human Rights Watch 

In [13]:
DOCUMENT_FREQUENCIES = Counter()

for tweet in tqdm(TWEETS):
    tweet['tokens'] = [token for _, token in tokenizer(tweet['text+quote+reply'])]
    for token in tweet['tokens']:
        DOCUMENT_FREQUENCIES[token] += 1

  0%|          | 0/137757 [00:00<?, ?it/s]

In [14]:
print('Size of vocabulary:', len(DOCUMENT_FREQUENCIES))

Size of vocabulary: 94789


In [15]:
N = len(DOCUMENT_FREQUENCIES)
TOKEN_IDS = {}
INVERSE_DOCUMENT_FREQUENCIES = {}

for i, (token, df) in enumerate(DOCUMENT_FREQUENCIES.items()):
    TOKEN_IDS[token] = i
    INVERSE_DOCUMENT_FREQUENCIES[token] = math.log(N / df)

In [16]:
VECTORS: List[Dict[int, float]] = []

for tweet in tqdm(TWEETS):
    vector = {}
    
    # TF is 1 as dimensions will be idempotently overwritten
    for token in tweet['tokens']:
        vector[TOKEN_IDS[token]] = INVERSE_DOCUMENT_FREQUENCIES[token]
        
    # TODO: I need to make fog's sparse_normalize mutating
    vector = sparse_normalize(vector)
    VECTORS.append(vector)

  0%|          | 0/137757 [00:00<?, ?it/s]

In [42]:
VECTORS[254]

{1173: 0.3392219177138808,
 1174: 0.27395917266349423,
 433: 0.27801079648645943,
 371: 0.32618577518271835,
 611: 0.16106866655730037,
 1175: 0.27360774231947405,
 229: 0.07852205627881487,
 1176: 0.30416522042764255,
 1177: 0.3421793749637862,
 1178: 0.3496124408813681,
 1179: 0.2846150201930425,
 1180: 0.25663325400546066,
 102: 0.20129399510952986}

## Clustering

In [18]:
def clustering():
    edges = []

    for i, A in enumerate(VECTORS):
        tweet_a = TWEETS[i]

        date_bound = tweet_a['date'] + ONE_DAY
        best_metric = None
        best_candidate = None

        for j in range(i + 1, len(VECTORS)):
            tweet_b = TWEETS[j]

            if tweet_b['date'] > date_bound:
                break

            B = VECTORS[j]

            d = sparse_dot_product(A, B)

            if d == 0.0:
                continue

            if best_candidate is None or d > best_metric:
                best_metric = d
                best_candidate = j

        yield (i, best_candidate, best_metric)

EDGES = []

for edge in tqdm(clustering(), unit='tweet', total=len(VECTORS)):
    if edge[1] is None:
        continue
    EDGES.append(edge)

  0%|          | 0/137757 [00:00<?, ?tweet/s]

In [41]:
similar_pair = choice(EDGES)
first_tweet = TWEETS[similar_pair[0]]
second_tweet = TWEETS[similar_pair[1]]
print('Similar tweets (similarity: %f):\n' % similar_pair[2])
print(first_tweet['id'], '-', first_tweet['text+quote+reply'])
print('---')
print(second_tweet['id'], '-', second_tweet['text+quote+reply'])

Similar tweets (similarity: 0.327407):

1025321398603005953 - Hey je suis mort j'ai rêvé que je me faisais friendzone par une meuf que mon cerveau a totalement inventé. Vis ma vie d'éternel friendzoné.  
---
1025443385526693888 - @roronoazon (être tt le monde c sur fait ) wsh tu m’a friendzone ??  


In [34]:
GRAPH = nx.Graph()

for i, j, sim in EDGES:
    if sim < 0.7:
        continue
        
    GRAPH.add_edge(TWEETS[i]['id'], TWEETS[j]['id'])

CLUSTERS = list(nx.connected_components(GRAPH))

## Evaluation

In [35]:
print_cluster_stats(TRUTH)

Number of clusters: 257
Max number of tweets in labeled clusters: 18020
Min number of tweets in labeled clusters: 2
Mean number of tweets in labeled clusters: 372.74708171206225
Median number of tweets in labeled clusters: 76


In [36]:
print_cluster_stats(CLUSTERS)

Number of clusters: 10020
Max number of tweets in labeled clusters: 1026
Min number of tweets in labeled clusters: 2
Mean number of tweets in labeled clusters: 3.9781437125748504
Median number of tweets in labeled clusters: 2


In [37]:
best_matching(TRUTH, CLUSTERS, allow_additional_items=True)

(0.9930137048385659, 0.02034559447955233, 0.03223855967971992)