# Sparse reproduction of clustering method

In [1]:
import csv
import math
import networkx as nx
from operator import itemgetter
from datetime import datetime, timedelta
from random import sample, choice
from statistics import mean, median_low
from collections import Counter, defaultdict
from tqdm.notebook import tqdm
from fog.tokenizers import WordTokenizer
from fog.metrics import sparse_normalize, sparse_dot_product
from fog.evaluation import best_matching
from twitwi.constants import TWEET_DATETIME_FORMAT
from stop_words import STOP_WORDS_FR
from typing import List, Dict

In [2]:
%load_ext Cython

## Constants and helpers

In [3]:
ONE_DAY = timedelta(days=1)

def parse_date(created_at):
    return datetime.strptime(created_at, TWEET_DATETIME_FORMAT)

## Reading tweets file

In [4]:
with open('../data/event2018.tsv') as f:
    ALL_TWEETS = list(csv.DictReader(f, delimiter='\t'))
    
# Keeping tweets only once (to avoid fuzzy clusters present in the data)
already_seen = set()
TWEETS = []

for tweet in ALL_TWEETS:
    if tweet['id'] in already_seen:
        continue
    
    already_seen.add(tweet['id'])
    TWEETS.append(tweet)

In [5]:
# Adding dates & parsing stuff
for tweet in TWEETS:
    tweet['event'] = int(tweet['event'])
    tweet['date'] = parse_date(tweet['created_at'])
    tweet['timestamp'] = tweet['date'].timestamp()
    tweet['label'] = int(tweet['label'].split('.')[0]) if tweet['label'] else None

In [6]:
# Making suuuuuuuure the tweets are sorted by date
TWEETS = sorted(TWEETS, key=itemgetter('date'))

In [7]:
for k, v in TWEETS[0].items():
    print(k, v)

id 1018722125941755905
label_day 0.0
event 20180716001
text #Rennes - La sortie de prison de Djamel Beghal [Vidéo exclusive] via @letelegramme https://t.co/tbOthY1Ren
text+quote+reply #Rennes - La sortie de prison de Djamel Beghal [Vidéo exclusive] via @letelegramme https://t.co/tbOthY1Ren  
image 
url_image 
user1 True
user2 True
user3 True
created_at Mon Jul 16 05:00:56 +0000 2018
label 0
date 2018-07-16 05:00:56
timestamp 1531710056.0


In [8]:
print('Total number of tweets:', len(TWEETS))
print('Total number of events:', len(set(t['event'] for t in TWEETS)))
print('Total number of labels:', len(set(t['label'] for t in TWEETS if t['label'] is not None)))
print('Total number of tweets not labeled', sum(1 if t['label'] is None else 0 for t in TWEETS))

Total number of tweets: 137757
Total number of events: 327
Total number of labels: 257
Total number of tweets not labeled 41961


In [9]:
TRUTH = defaultdict(list)

for tweet in TWEETS:
    if tweet['label'] is None:
        continue
        
    TRUTH[tweet['label']].append(tweet['id'])

TRUTH = list(TRUTH.values())

In [10]:
def print_cluster_stats(clusters):
    lens = [len(cluster) for cluster in clusters]
    
    print('Number of clusters:', len(clusters))
    print('Max number of tweets in labeled clusters:', max(lens))
    print('Min number of tweets in labeled clusters:', min(lens))
    print('Mean number of tweets in labeled clusters:', mean(lens))
    print('Median number of tweets in labeled clusters:', median_low(lens))

In [11]:
print_cluster_stats(TRUTH)

Number of clusters: 257
Max number of tweets in labeled clusters: 18020
Min number of tweets in labeled clusters: 2
Mean number of tweets in labeled clusters: 372.74708171206225
Median number of tweets in labeled clusters: 76


## Tokenization & Vectorization

*NOTE: It might be useful to convert tokens to incremental ids to speed up hash computations*

In [12]:
tokenizer = WordTokenizer(
    keep=['word'],
    lower=True,
    unidecode=True,
    split_hashtags=True,
    stoplist=STOP_WORDS_FR + [t + "'" for t in STOP_WORDS_FR],
    reduce_words=True,
    decode_html_entities=True
)

In [13]:
sample_to_tokenize = sample(TWEETS, 5)

for tweet in sample_to_tokenize:
    print(tweet['text+quote+reply'])
    print(list(tokenizer(tweet['text+quote+reply'])))
    print()

Macron reçoit patronat et syndicats pour déminer le terrain social https://t.co/NJ6AT76xHn via @LCI  
[('word', 'macron'), ('word', 'recoit'), ('word', 'patronat'), ('word', 'syndicats'), ('word', 'deminer'), ('word', 'terrain'), ('word', 'social')]

@oceane_barbier9 Panique agoisse crise de tétanie au milieu d'un carrefour  @deborah_lsr Ques qui c’est passez ? 😱
[('word', 'panique'), ('word', 'agoisse'), ('word', 'crise'), ('word', 'tetanie'), ('word', 'milieu'), ('word', 'carrefour'), ('word', 'ques'), ('word', 'passez')]

Que les féministes religieuses qui hurlent au racisme en France se penchent sur la question plutôt que de chouiner au parlement car la France refusela burqa... #CCIF #Lallab et autres Diallo et indigestes de la république #CulCulClan    https://t.co/YLxliy5mJ2  
[('word', 'feministes'), ('word', 'religieuses'), ('word', 'hurlent'), ('word', 'racisme'), ('word', 'france'), ('word', 'penchent'), ('word', 'question'), ('word', 'plutot'), ('word', 'chouiner'), ('word',

In [14]:
DOCUMENT_FREQUENCIES = Counter()

for tweet in tqdm(TWEETS):
    tweet['tokens'] = [token for _, token in tokenizer(tweet['text+quote+reply'])]
    for token in tweet['tokens']:
        DOCUMENT_FREQUENCIES[token] += 1

  0%|          | 0/137757 [00:00<?, ?it/s]

In [15]:
print('Size of vocabulary:', len(DOCUMENT_FREQUENCIES))

Size of vocabulary: 95635


In [16]:
N = len(DOCUMENT_FREQUENCIES)
TOKEN_IDS = {}
INVERSE_DOCUMENT_FREQUENCIES = {}

for i, (token, df) in enumerate(DOCUMENT_FREQUENCIES.items()):
    TOKEN_IDS[token] = i
    INVERSE_DOCUMENT_FREQUENCIES[token] = 1 + math.log((N + 1) / (df + 1))

In [17]:
VECTORS: List[Dict[int, float]] = []

for tweet in tqdm(TWEETS):
    vector = {}
    
    # TF is 1 as dimensions will be idempotently overwritten
    for token in tweet['tokens']:
        vector[TOKEN_IDS[token]] = INVERSE_DOCUMENT_FREQUENCIES[token]
        
    # TODO: I need to make fog's sparse_normalize mutating
    vector = sparse_normalize(vector)
    VECTORS.append(vector)

  0%|          | 0/137757 [00:00<?, ?it/s]

In [18]:
VECTORS[254]

{1184: 0.3335362851343286,
 1185: 0.2775319260089816,
 435: 0.28102368503877057,
 372: 0.32240638755340006,
 614: 0.17995422843611114,
 1186: 0.27722900180324805,
 1187: 0.303528466173047,
 1188: 0.336055150888726,
 1189: 0.34237413480618273,
 1190: 0.2867125870028366,
 1191: 0.26258832347667904,
 105: 0.21478027123325547}

## Clustering

In [19]:
%%cython
import cython
from datetime import timedelta

@cython.boundscheck(False)
@cython.wraparound(False)
def sparse_dot_product(A: dict, B: dict):
    
    # Swapping so we iterate over the smallest set
    if len(A) > len(B):
        A, B = B, A

    cdef float product  = 0.0

    for k, w1 in A.items():
        w2 = B.get(k)

        if w2 is not None:
            product += w1 * w2

    return product

@cython.boundscheck(False)
@cython.wraparound(False)
def clustering(vectors: list, tweets: list):
    cdef float best_metric
    cdef int best_candidate
    cdef float d
    
    cdef float one_day = timedelta(days=1).total_seconds()
    cdef float date_bound
    
    for i, A in enumerate(vectors):
        tweet_a = tweets[i]

        date_bound = tweet_a['timestamp'] + one_day
        best_metric = -1.0
        best_candidate = -1
        d = 0.0

        for j in range(i + 1, len(vectors)):
            tweet_b = tweets[j]

            if tweet_b['timestamp'] > date_bound:
                break

            B = vectors[j]

            d = sparse_dot_product(A, B)

            if d == 0.0:
                continue

            if best_candidate < 0 or d > best_metric:
                best_metric = d
                best_candidate = j
        
        if best_candidate != -1:
            yield (i, best_candidate, best_metric)

In [20]:
EDGES = []

for edge in tqdm(clustering(VECTORS, TWEETS), unit='tweet', total=len(VECTORS)):
    if edge[1] is None:
        continue
    EDGES.append(edge)

  0%|          | 0/137757 [00:00<?, ?tweet/s]

In [21]:
similar_pair = choice(EDGES)
first_tweet = TWEETS[similar_pair[0]]
second_tweet = TWEETS[similar_pair[1]]
print('Similar tweets (similarity: %f):\n' % similar_pair[2])
print(first_tweet['id'], '-', first_tweet['text+quote+reply'])
print('---')
print(second_tweet['id'], '-', second_tweet['text+quote+reply'])

Similar tweets (similarity: 0.296550):

1026538888649748480 - Pq qd je suis sur Twitter h24 y a r et la je quitte Twitter genre 5h ET BAM! On apprend que NORMAN DRAGUE DES MINEURS PR BZ Quel monde  
---
1026585127017697280 - comment ca Norman il baise des mineurs mdr mais ????  


In [71]:
GRAPH = nx.Graph()

for i, j, sim in EDGES:
    i_id = TWEETS[i]['id']
    j_id = TWEETS[j]['id']
    
    GRAPH.add_node(i_id)
    GRAPH.add_node(j_id)
    
    if sim < 0.08:
        continue
        
    GRAPH.add_edge(i_id, j_id)

CLUSTERS = list(nx.connected_components(GRAPH))

## Evaluation

In [30]:
print_cluster_stats(TRUTH)

Number of clusters: 257
Max number of tweets in labeled clusters: 18020
Min number of tweets in labeled clusters: 2
Mean number of tweets in labeled clusters: 372.74708171206225
Median number of tweets in labeled clusters: 76


In [73]:
print_cluster_stats(CLUSTERS)

Number of clusters: 242
Max number of tweets in labeled clusters: 63893
Min number of tweets in labeled clusters: 1
Mean number of tweets in labeled clusters: 568.400826446281
Median number of tweets in labeled clusters: 1


In [72]:
best_matching(TRUTH, CLUSTERS, allow_additional_items=True)

(0.8545864213920271, 0.12889976232091693, 0.12059140549742423)