# 0. Import libraries, set variables

In [3]:
from pymongo import MongoClient
from web.mongo_remote_password import user, password, ip
from pprint import pprint
from collections import Counter, defaultdict
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm
import pandas as pd
from unionfind import UnionFind
import logging

from gensim.models import KeyedVectors
from pathlib import Path
import json
import spacy
from gen_model import gen_model, load_data

from pathlib import Path
from bson.objectid import ObjectId
from sklearn import metrics
from collections import namedtuple


%matplotlib inline

#client = MongoClient(f'mongodb://{user}:{password}@{ip}:27017')
#db = client.twitter_news

client = MongoClient(f'mongodb://localhost:27017')
db = client.twitter_news_remote

topic_labeling_results_path = Path('topic_labeling_results/')

# 1. Generate mappings

Mappings:

1. `repr_id => class`
2. `tweet_id => repr_id`
3. `tweet_id => class`

In [4]:
with_3 = list(db.representatives.find({'topic.2': {"$exists": True}}))
with_2 = list(db.representatives.find({'topic.1': {"$exists": True}}))
with_1 = list(db.representatives.find({'topic.0': {"$exists": True}}))

libya = ObjectId('5b171725da870923dcb0478f')
pistorius = ObjectId('5b171726da870923dcb04790')
nepal = ObjectId('5b171726da870923dcb04791')

In [5]:
with_1[:1]

[{'_id': ObjectId('5b171726da870923dcb04970'),
  'event': ObjectId('5b171725da870923dcb0478f'),
  'predef_topic': '5b184122da870950572be266',
  'ranking': 47,
  'topic': [{'info': {'skipped': True},
    'added_timestamp': datetime.datetime(2018, 8, 24, 18, 11, 39, 144000),
    'user_name': 'user_15'},
   {'info': {'custom_topic': 'Initial information of the attack'},
    'added_timestamp': datetime.datetime(2018, 9, 10, 18, 53, 4, 860000),
    'user_name': 'user_2'},
   {'info': {'non_relevant': True},
    'added_timestamp': datetime.datetime(2018, 9, 26, 21, 36, 26, 895000),
    'user_name': 'user_10'}]}]

## Export user labels

In [4]:
libya_path = topic_labeling_results_path / Path('libya.tsv')
pisto_path = topic_labeling_results_path / Path('pistorius.tsv')
nepal_path = topic_labeling_results_path / Path('nepal.tsv')
 
with libya_path.open('w') as f_l, pisto_path.open('w') as f_p, nepal_path.open('w') as f_n:
    f_l.write(f'rep_id\teval\tuser\tdate\ttype\tlabel\n')
    f_p.write(f'rep_id\teval\tuser\tdate\ttype\tlabel\n')
    f_n.write(f'rep_id\teval\tuser\tdate\ttype\tlabel\n')
    
    for rep in with_1:
        if rep['event'] == libya:
            f = f_l
        elif rep['event'] == pistorius:
            f = f_p
        elif rep['event'] == nepal:
            f = f_n
        else:
            print("this should not have happened")
        
        topic_ = rep.get('topic')
        rep_id = rep['_id']

        for i, t in enumerate(topic_):
            info = t['info']
            eval_no = i + 1
            user_name = t['user_name']
            date = t['added_timestamp'].strftime('%Y-%m-%d %H:%M:%S')
            for key, val in info.items():
                if key == 'topics':
                    for t_id in val:
                        f.write(f"{rep_id}\t{eval_no}\t{user_name}\t{date}\ttopic\t{str(t_id)}\n")
                elif key == 'custom_topic':
                    f.write(f'{rep_id}\t{eval_no}\t{user_name}\t{date}\tcustom\t"{val}"\n')
                elif key == 'skipped':
                    f.write(f"{rep_id}\t{eval_no}\t{user_name}\t{date}\tskipped\tskipped\n")
                elif key == 'non_relevant':
                    f.write(f"{rep_id}\t{eval_no}\t{user_name}\t{date}\tnon_relevant\tnon_relevant\n")

## Read from R, and clean labels

In [6]:
labels_libya_path = topic_labeling_results_path / Path('labels_libya.tsv')
labels_pisto_path = topic_labeling_results_path / Path('labels_pistorius.tsv')
labels_nepal_path = topic_labeling_results_path / Path('labels_nepal.tsv')

libya_rep_label = defaultdict(list)
pisto_rep_label = defaultdict(list)
nepal_rep_label = defaultdict(list)

with labels_libya_path.open() as f_l, labels_pisto_path.open() as f_p, labels_nepal_path.open() as f_n:
    for line in f_l:
        tokens = line.split('\t')
        rep_id = tokens[0]
        label = tokens[-1][:-1]
        libya_rep_label[rep_id].append(label)
        
    for line in f_p:
        tokens = line.split('\t')
        rep_id = tokens[0]
        label = tokens[-1][:-1]
        pisto_rep_label[rep_id].append(label)
        
    for line in f_n:
        tokens = line.split('\t')
        rep_id = tokens[0]
        label = tokens[-1][:-1]
        nepal_rep_label[rep_id].append(label)

In [7]:
libya_rep_label

defaultdict(list,
            {'5b171726da870923dcb04a9a': ['5b18412bda870950572be267',
              '5b184131da870950572be268',
              '5b18412bda870950572be267',
              '5b184131da870950572be268',
              '5b18415bda870950572be26a',
              '5b18412bda870950572be267',
              '5b184131da870950572be268'],
             '5b171726da870923dcb04bf9': ['5b18415bda870950572be26a',
              'Attackers are gunmen',
              'non_relevant'],
             '5b171726da870923dcb04bfd': ['5b18413eda870950572be269',
              '5b18415bda870950572be26a'],
             '5b171726da870923dcb04c0a': ['5b18412bda870950572be267',
              '5b184131da870950572be268',
              '5b18412bda870950572be267',
              '5b184131da870950572be268'],
             '5b171726da870923dcb04c2a': ['5b18412bda870950572be267',
              '5b18412bda870950572be267',
              '5b184131da870950572be268',
              '5b18412bda870950572be267',
              

## a) Impute labels based on majority (`rep_topic`)

In [8]:
label_counts = {
    'libya': Counter(),
    'pistorius': Counter(),
    'nepal': Counter()
}

for rep_id, labels in libya_rep_label.items():
    for label in labels:
        label_counts['libya'].update({label: 1})
        
for rep_id, labels in pisto_rep_label.items():
    for label in labels:
        label_counts['pistorius'].update({label: 1})
        
for rep_id, labels in nepal_rep_label.items():
    for label in labels:
        label_counts['nepal'].update({label: 1})
        
for event_name, counts in label_counts.items():
    print(event_name)
    pprint(counts.most_common(10))
    print()

libya
[('5b18412bda870950572be267', 412),
 ('5b184131da870950572be268', 282),
 ('5b18410fda870950572be265', 179),
 ('5b18415bda870950572be26a', 123),
 ('5b184122da870950572be266', 58),
 ('Attackers are gunmen', 28),
 ('non_relevant', 25),
 ('5b18413eda870950572be269', 16),
 ('skipped', 14),
 ('Related to official interviews', 6)]

pistorius
[('5b19ae8bda870974f0f58bc1', 40),
 ('5b19ae8dda870974f0f58bc2', 39),
 ('5b19ae78da870974f0f58bbe', 36),
 ('5b19ae7cda870974f0f58bbf', 25),
 ('5b19ae1ada870974f0f58bbc', 24),
 ('5b19ae98da870974f0f58bc4', 13),
 ('5b19aea1da870974f0f58bc5', 11),
 ('non_relevant', 6),
 ('5b19ae93da870974f0f58bc3', 5),
 ('5b19ae13da870974f0f58bbb', 5)]

nepal
[('5b19af02da87097532fd0483', 89),
 ('5b19af0cda87097532fd0484', 82),
 ('5b19aefeda87097532fd0482', 62),
 ('5b19af21da87097532fd0488', 33),
 ('5b19af1bda87097532fd0487', 26),
 ('5b19af15da87097532fd0486', 26),
 ('5b19af32da87097532fd048a', 22),
 ('5b19af29da87097532fd0489', 14),
 ('non_relevant', 11),
 ('skipped',

In [9]:
rep_topic = {
    'libya': dict(),
    'pistorius': dict(),
    'nepal': dict()
}

# libya
topics_ranked = [t for t, _ in label_counts['libya'].most_common() if t not in ('skipped', 'non_relevant')]
for rep_id, labels in libya_rep_label.items():
    for topic_r in topics_ranked:
        topic_found = False
        
        for label in labels:
            if label == topic_r:
                rep_topic['libya'][rep_id] = label
                topic_found = True
                break
        
        if topic_found:
            break
            
# pistorius
topics_ranked = [t for t, _ in label_counts['pistorius'].most_common() if t not in ('skipped', 'non_relevant')]
for rep_id, labels in pisto_rep_label.items():
    for topic_r in topics_ranked:
        topic_found = False
        
        for label in labels:
            if label == topic_r:
                rep_topic['pistorius'][rep_id] = label
                topic_found = True
                break
        
        if topic_found:
            break
            
# nepal
topics_ranked = [t for t, _ in label_counts['nepal'].most_common() if t not in ('skipped', 'non_relevant')]
for rep_id, labels in nepal_rep_label.items():
    for topic_r in topics_ranked:
        topic_found = False
        
        for label in labels:
            if label == topic_r:
                rep_topic['nepal'][rep_id] = label
                topic_found = True
                break
        
        if topic_found:
            break            

In [10]:
for ev, rt in rep_topic.items():
    print(ev, len(rt))

libya 291
pistorius 93
nepal 133


## Mapping tweet_id => rep_id (`tweets_rep`)

In [11]:
#client = MongoClient(f'mongodb://localhost:27017')
#db = client.twitter_news_remote

all_tweets = list(db.tweets.find())

tweets_rep = dict()

for t in tqdm(all_tweets):
    tweets_rep[t['tweet_id']] = str(t['representative'])

100%|██████████| 642251/642251 [00:00<00:00, 883931.68it/s]


In [12]:
tweets_rep

{560029596029292544: '5b171726da870923dcb04792',
 560025454057619456: '5b171726da870923dcb04792',
 560029586122354688: '5b171726da870923dcb04792',
 560029575078375424: '5b171726da870923dcb04792',
 560029491360436224: '5b171726da870923dcb04792',
 560029466676961280: '5b171726da870923dcb04792',
 560029444988223488: '5b171726da870923dcb04792',
 560029390998757376: '5b171726da870923dcb04792',
 560029198728048640: '5b171726da870923dcb04792',
 560029166880296961: '5b171726da870923dcb04792',
 560029115496280065: '5b171726da870923dcb04792',
 560029057291522050: '5b171726da870923dcb04792',
 560028825615364096: '5b171726da870923dcb04792',
 560028825380474880: '5b171726da870923dcb04792',
 560028759630569472: '5b171726da870923dcb04792',
 560028692454572032: '5b171726da870923dcb04792',
 560028685680803840: '5b171726da870923dcb04792',
 560028623433134081: '5b171726da870923dcb04792',
 560028494424317952: '5b171726da870923dcb04792',
 560028468751380480: '5b171726da870923dcb04792',
 560028325314580480:

### ground truth: (`tweet_topic`) (MAJORITY)

In [13]:
tweet_topic = dict()

for tweet_id, repr_id in tweets_rep.items():
    for ev, rep_topic_ in rep_topic.items():
        topic_ = rep_topic_.get(repr_id)
        if topic_:
            tweet_topic[tweet_id] = topic_

In [14]:
tweet_topic

{560029502903156736: '5b18412bda870950572be267',
 560022451749666816: '5b18412bda870950572be267',
 560026425277689856: '5b18412bda870950572be267',
 560026218192723969: '5b18412bda870950572be267',
 560025013471170560: '5b18412bda870950572be267',
 560024824840716289: '5b18412bda870950572be267',
 560024268281761793: '5b18412bda870950572be267',
 560024226057703425: '5b18412bda870950572be267',
 560024222740004864: '5b18412bda870950572be267',
 560024144717545472: '5b18412bda870950572be267',
 560023999733043201: '5b18412bda870950572be267',
 560023926844452864: '5b18412bda870950572be267',
 560023734762102784: '5b18412bda870950572be267',
 560023571670790144: '5b18412bda870950572be267',
 560023568860610560: '5b18412bda870950572be267',
 560023512346533888: '5b18412bda870950572be267',
 560023456277069824: '5b18412bda870950572be267',
 560023325183709185: '5b18412bda870950572be267',
 560023098381312001: '5b18412bda870950572be267',
 560022989647798272: '5b18412bda870950572be267',
 560022985785221120:

In [15]:
with (topic_labeling_results_path / Path('tweet_topic.tsv')).open('w') as f:
    for tweet_id, topic_id in tweet_topic.items():
        f.write(f'{tweet_id}\t{topic_id}\n')

In [16]:
topic_labeling_results_path

PosixPath('topic_labeling_results')

# b) impute labels based on connected components

In [68]:
rep_pairs = {
    'libya': [],
    'pistorius': [],
    'nepal': []
}

for rep, labels in libya_rep_label.items():
    for l in labels:
        if l == 'non_relevant' or l == 'skipped':
            continue
        rep_pairs['libya'].append((rep, l))
        
for rep, labels in pisto_rep_label.items():
    for l in labels:
        if l == 'non_relevant' or l == 'skipped':
            continue
        rep_pairs['pistorius'].append((rep, l))

for rep, labels in nepal_rep_label.items():
    for l in labels:
        if l == 'non_relevant' or l == 'skipped':
            continue
        rep_pairs['nepal'].append((rep, l))


In [75]:
uf_events = {
    'libya': UnionFind(),
    'pistorius': UnionFind(),
    'nepal': UnionFind()
}

for event, pairs in rep_pairs.items():
    for u, v in pairs:
        uf_events[event].union(u, v)
    
    print(event, len(uf_events[event].components()), 'components')
    for c in uf_events[event].components():
        print(len(c))
    print()

libya 3 components
316
2
3

pistorius 4 components
76
3
11
17

nepal 1 components
145




# Gen model

In [15]:
nlp = spacy.load('en', disable=["tagger", "parser", "ner"])
we = KeyedVectors.load_word2vec_format('/home/mquezada/anchor-text-twitter/data/ft_alltweets_model.vec')

2018-10-30 12:51:16,997 : loading projection weights from /home/mquezada/anchor-text-twitter/data/ft_alltweets_model.vec
2018-10-30 12:52:16,206 : loaded (1076139, 100) matrix from /home/mquezada/anchor-text-twitter/data/ft_alltweets_model.vec


In [16]:
def gen_model2(ename):
    uf, event_data = gen_model(ename)
    return {
        'uf': uf,
        'event_data': event_data
    }

# event_name: (uf, event_data)
models = {
    'libya': gen_model2('libya_hotel_tweets.tsv'),
    'pistorius': gen_model2('oscar_pistorius_tweets.tsv'),
    'nepal': gen_model2('nepal_tweets.tsv')
}

2018-10-30 12:52:39,671 : load and clean dataset: libya_hotel_tweets.tsv
2018-10-30 12:52:39,956 : tweets processed: 26331, ignored: 2309, missing urls: 6341
2018-10-30 12:52:39,956 : create pairs (t, u) or (t, t') for each tweet t and url u or replied/retweeted tweet t'
2018-10-30 12:52:39,974 : total pairs: 20108, retweets: 11323, quotes: 0, replies: 289 (missing: 0)
2018-10-30 12:52:39,974 : applying union-find
2018-10-30 12:52:40,101 : total components: 2747
2018-10-30 12:52:40,102 : load and clean dataset: oscar_pistorius_tweets.tsv
2018-10-30 12:52:40,617 : tweets processed: 112260, ignored: 955, missing urls: 21807
2018-10-30 12:52:40,617 : create pairs (t, u) or (t, t') for each tweet t and url u or replied/retweeted tweet t'
2018-10-30 12:52:40,687 : total pairs: 66804, retweets: 26179, quotes: 0, replies: 1553 (missing: 0)
2018-10-30 12:52:40,688 : applying union-find
2018-10-30 12:52:41,343 : total components: 8676
2018-10-30 12:52:41,346 : load and clean dataset: nepal_twee

## Convert components into (urls,...) => (ids,...) (`docs[event_name][key] => [ids...]`)

In [189]:
event_names = ('libya', 'pistorius', 'nepal')

docs = {
    'libya': dict(),
    'pistorius': dict(),
    'nepal': dict()
}

url_indices = { 
    'libya': dict(),
    'pistorius': dict(),
    'nepal': dict()
}


for event_name in event_names:
    for component in models[event_name]['uf'].components():
        comp_key = []
        comp_ids = []
        for elem in component:
            if elem.startswith('http'):
                comp_key.append(elem)
            else:
                t = models[event_name]['event_data'].get(elem)
                if not t:
                    print("err")
                comp_ids.append(t.tweet_id)
        
        # component does not have url
        if not comp_key:
            comp_key.append(np.random.choice(comp_ids))
        
        comp_key = tuple(comp_key)
        key = hash(comp_key)
        
        docs[event_name][key] = comp_ids
        url_indices[event_name][key] = comp_key

## Generate vectors using fasttext

In [190]:
vecs = {
    'libya': dict(),
    'pistorius': dict(),
    'nepal': dict()
}

# for each event
for event_name, docs_event in tqdm(docs.items(), total=len(docs)):
    # for each component in this event
    for key, tweet_ids in tqdm(docs_event.items(), total=len(docs_event)):
        vec = []
        texts = [models[event_name]['event_data'][twid].text for twid in tweet_ids]
        
        for tokens in nlp.pipe(texts, n_threads=-1):
            for token in tokens:
                if not token.like_url and token.lower_ in we:
                    v = we[token.lower_]
                    vec.append(v)
        
        if vec:
            avg_vec = np.array(vec).mean(axis=0)
            vecs[event_name][key] = avg_vec
        else:
            print("no vec")
            

  0%|          | 0/3 [00:00<?, ?it/s]
  0%|          | 0/2747 [00:00<?, ?it/s][A
  0%|          | 3/2747 [00:01<15:21,  2.98it/s][A
  1%|          | 19/2747 [00:01<02:49, 16.13it/s][A
  5%|▍         | 129/2747 [00:01<00:26, 98.52it/s][A
  8%|▊         | 229/2747 [00:01<00:15, 162.43it/s][A
 16%|█▌        | 434/2747 [00:01<00:08, 286.39it/s][A
 21%|██▏       | 585/2747 [00:01<00:05, 362.02it/s][A
 25%|██▌       | 695/2747 [00:01<00:05, 383.58it/s][A
 29%|██▊       | 789/2747 [00:01<00:04, 411.21it/s][A
 32%|███▏      | 881/2747 [00:02<00:04, 436.38it/s][A
 35%|███▌      | 973/2747 [00:02<00:03, 459.04it/s][A
 40%|████      | 1102/2747 [00:02<00:03, 496.45it/s][A
 46%|████▌     | 1256/2747 [00:02<00:02, 541.44it/s][A
 52%|█████▏    | 1437/2747 [00:02<00:02, 593.00it/s][A
 59%|█████▉    | 1619/2747 [00:02<00:01, 641.57it/s][A
 64%|██████▍   | 1769/2747 [00:02<00:01, 670.96it/s][A
 70%|██████▉   | 1914/2747 [00:02<00:01, 698.16it/s][A
 76%|███████▌  | 2076/2747 [00:02<00:0

no vec



 24%|██▍       | 2084/8676 [00:08<00:27, 243.47it/s][A
 25%|██▌       | 2210/8676 [00:08<00:25, 255.20it/s][A
 27%|██▋       | 2378/8676 [00:08<00:23, 271.46it/s][A
 29%|██▉       | 2511/8676 [00:08<00:21, 283.28it/s][A
 30%|███       | 2642/8676 [00:08<00:20, 293.67it/s][A
 32%|███▏      | 2764/8676 [00:09<00:19, 303.35it/s][A
 33%|███▎      | 2895/8676 [00:09<00:18, 314.10it/s][A
 35%|███▍      | 3015/8676 [00:09<00:17, 323.00it/s][A
 36%|███▌      | 3129/8676 [00:09<00:16, 330.64it/s][A
 37%|███▋      | 3240/8676 [00:09<00:16, 338.77it/s][A
 39%|███▉      | 3373/8676 [00:09<00:15, 349.01it/s][A
 41%|████      | 3514/8676 [00:09<00:14, 359.88it/s][A
 42%|████▏     | 3637/8676 [00:09<00:13, 365.23it/s][A
 44%|████▍     | 3813/8676 [00:10<00:12, 379.03it/s][A
 45%|████▌     | 3939/8676 [00:10<00:12, 386.82it/s][A
 47%|████▋     | 4058/8676 [00:10<00:11, 393.66it/s][A
 48%|████▊     | 4196/8676 [00:10<00:11, 403.13it/s][A
 50%|█████     | 4345/8676 [00:10<00:10, 413.46

no vec



 12%|█▏        | 2109/17718 [00:34<04:13, 61.64it/s][A
 12%|█▏        | 2162/17718 [00:34<04:09, 62.31it/s][A
 13%|█▎        | 2231/17718 [00:34<04:01, 64.12it/s][A
 13%|█▎        | 2331/17718 [00:34<03:50, 66.80it/s][A
 14%|█▎        | 2413/17718 [00:34<03:41, 68.95it/s][A
 14%|█▍        | 2483/17718 [00:35<03:35, 70.71it/s][A
 14%|█▍        | 2550/17718 [00:35<03:29, 72.40it/s][A
 15%|█▍        | 2616/17718 [00:35<03:24, 73.82it/s][A
 15%|█▌        | 2685/17718 [00:35<03:18, 75.54it/s][A
 16%|█▌        | 2764/17718 [00:35<03:12, 77.55it/s][A
 16%|█▌        | 2842/17718 [00:35<03:07, 79.51it/s][A

no vec



 17%|█▋        | 2957/17718 [00:35<02:58, 82.48it/s][A
 17%|█▋        | 3038/17718 [00:35<02:53, 84.48it/s][A
 18%|█▊        | 3116/17718 [00:36<02:49, 86.25it/s][A
 18%|█▊        | 3185/17718 [00:36<02:45, 87.63it/s][A
 18%|█▊        | 3247/17718 [00:36<02:42, 89.08it/s][A
 19%|█▉        | 3340/17718 [00:36<02:37, 91.23it/s][A
 19%|█▉        | 3399/17718 [00:36<02:34, 92.53it/s][A
 20%|█▉        | 3496/17718 [00:36<02:29, 94.92it/s][A
 20%|██        | 3615/17718 [00:36<02:24, 97.87it/s][A
 21%|██        | 3727/17718 [00:37<02:19, 100.63it/s][A
 22%|██▏       | 3818/17718 [00:37<02:15, 102.65it/s][A
 22%|██▏       | 3899/17718 [00:37<02:12, 104.46it/s][A
 23%|██▎       | 3995/17718 [00:37<02:08, 106.74it/s][A
 23%|██▎       | 4133/17718 [00:37<02:03, 109.82it/s][A
 24%|██▍       | 4240/17718 [00:37<01:59, 112.36it/s][A
 24%|██▍       | 4326/17718 [00:37<01:57, 114.23it/s][A
 25%|██▌       | 4445/17718 [00:37<01:53, 117.03it/s][A
 26%|██▌       | 4533/17718 [00:38<01:5

In [191]:
for event_name, vec_info in vecs.items():
    with open(f'data_local_events/{event_name}_vectors.tsv', 'w') as f:
        for key, vec in vec_info.items():
            values = "\t".join([str(v) for v in vec])
            f.write(f"{key}\t{values}\n")

# Gen raw vectors

In [19]:
vecs_raw = {
    'libya': dict(),
    'pistorius': dict(),
    'nepal': dict()
}

# for each event
for event_name, docs_event in tqdm(docs.items(), total=len(docs)):
    
    # for each component in this event
    for _, tweet_ids in tqdm(docs_event.items(), total=len(docs_event)):
        
        # for each tweet in this component
        for twid in tweet_ids:
            vec = []
            text = models[event_name]['event_data'][twid].text
            tokens = nlp(text)

            # for each token in the text of this tweet
            for token in tokens:
                if not token.like_url and token.lower_ in we:
                    v = we[token.lower_]
                    vec.append(v)

            # if there is info in the vec, compute avg vec
            if vec:
                avg_vec = np.array(vec).mean(axis=0)
                vecs_raw[event_name][twid] = avg_vec

  0%|          | 0/3 [00:00<?, ?it/s]
  0%|          | 0/2747 [00:00<?, ?it/s][A
  0%|          | 3/2747 [00:01<21:11,  2.16it/s][A
  1%|          | 19/2747 [00:01<03:52, 11.73it/s][A
  4%|▍         | 118/2747 [00:01<00:38, 68.60it/s][A
  5%|▌         | 150/2747 [00:01<00:31, 82.00it/s][A
  8%|▊         | 229/2747 [00:01<00:21, 116.86it/s][A
 15%|█▌        | 417/2747 [00:02<00:11, 202.30it/s][A
 19%|█▉        | 532/2747 [00:02<00:08, 246.14it/s][A
 23%|██▎       | 625/2747 [00:02<00:07, 268.24it/s][A
 26%|██▌       | 707/2747 [00:02<00:07, 283.26it/s][A
 28%|██▊       | 779/2747 [00:02<00:06, 298.44it/s][A
 31%|███       | 849/2747 [00:02<00:06, 311.83it/s][A
 34%|███▍      | 928/2747 [00:02<00:05, 328.68it/s][A
 37%|███▋      | 1025/2747 [00:02<00:04, 349.83it/s][A
 41%|████▏     | 1137/2747 [00:03<00:04, 375.13it/s][A
 45%|████▌     | 1239/2747 [00:03<00:03, 395.69it/s][A
 50%|█████     | 1385/2747 [00:03<00:03, 428.40it/s][A
 56%|█████▌    | 1532/2747 [00:03<00:02, 

 72%|███████▏  | 12761/17718 [01:03<00:24, 200.90it/s][A
 72%|███████▏  | 12837/17718 [01:03<00:24, 201.72it/s][A
 73%|███████▎  | 12960/17718 [01:03<00:23, 203.33it/s][A
 74%|███████▎  | 13051/17718 [01:03<00:22, 204.31it/s][A
 74%|███████▍  | 13132/17718 [01:04<00:22, 205.03it/s][A
 75%|███████▍  | 13226/17718 [01:04<00:21, 206.17it/s][A
 75%|███████▌  | 13307/17718 [01:04<00:21, 207.11it/s][A
 76%|███████▌  | 13410/17718 [01:04<00:20, 208.38it/s][A
 76%|███████▋  | 13538/17718 [01:04<00:19, 210.05it/s][A
 77%|███████▋  | 13679/17718 [01:04<00:19, 211.72it/s][A
 78%|███████▊  | 13775/17718 [01:04<00:18, 212.60it/s][A
 78%|███████▊  | 13903/17718 [01:04<00:17, 214.24it/s][A
 79%|███████▉  | 13999/17718 [01:05<00:17, 215.24it/s][A
 80%|███████▉  | 14095/17718 [01:05<00:16, 216.32it/s][A
 80%|████████  | 14181/17718 [01:05<00:16, 217.30it/s][A
 81%|████████  | 14266/17718 [01:05<00:15, 218.18it/s][A
 81%|████████  | 14377/17718 [01:05<00:15, 219.53it/s][A
 82%|████████▏

In [20]:
for event_name, vec_info in vecs_raw.items():
    with open(f'data_local_events/{event_name}_raw-vectors.tsv', 'w') as f:
        for key, vec in vec_info.items():
            key = str(key)
            values = "\t".join([str(v) for v in vec])
            f.write(f"{key}\t{values}\n")

# \**perform clustering in R*\*

## purity measure

In [50]:
def purity_score(clusters: dict, classes: dict):
    """
    - clusters: cluster_id => {tweet_id}
    - classes: class_id => {tweet_id}
    
    computes purity score using partial labelled data
    
    P = 1/N * sum_k ( max_j |c_j \cap w_k| )
        
    N = total elements in clusters
    
    c_j = class j
    
    w_k = cluster k    
    """
    
    N = sum(len(w) for w in classes.values())
    p = 0
    
    for cluster, clu_twids in clusters.items():
        #N += len(clu_twids)
        cluster_twids = set(clu_twids)
        
        max_int = 0
        for cls, cla_twids in classes.items():
            class_twids = set(cla_twids)
            
            if len(cluster_twids & class_twids) > max_int:
                max_int = len(cluster_twids & class_twids)
                
        p += max_int
        
    return p / N

In [53]:
##### test purity

test_clusters = {
    1: {1, 2, 3, 4},
    2: {5, 6, 7},
    3: {8, 9}
}

test_classes = {
    'a': {1, 8, 9},
    'b': {4},
    'c': {6, 7}
}

assert purity_score(test_clusters, test_classes) == 5/6

test_clusters = {
    1: {1,2,3,4,5,6,7,8,9},
    2: {10, 11},
    3: {12, 13, 14},
    4: {15}
}

test_classes = {
    1: {5, 11, 13, 15}
}

assert purity_score(test_clusters, test_classes) == 1

## NMI

In [103]:
def mutual_info(clusters, classes):
    N = sum(len(w) for w in clusters.values())
    N_c = sum(len(w) for w in classes.values())
    mi = 0
    for w_k in clusters.values():
        for c_j in classes.values():
            len_match = len(set(w_k) & set(c_j))
            if len_match == 0: 
                continue
            mi += (len_match / N_c) * np.log2(N * len_match / (len(w_k) * len(c_j)))
    return mi

def entropy(clusters):
    H = 0
    N = sum(len(w) for w in clusters.values())
    for w_k in clusters.values():
        H += -len(w_k) / N * np.log2(len(w_k) / N)
    return H

def nmi_score(clusters, classes):
    return 2 * mutual_info(clusters, classes) / (entropy(clusters) + entropy(classes))

In [104]:
test_clusters = {
    1: {1, 2, 3, 4},
    2: {5, 6, 7},
    3: {8, 9}
}
test_classes = {
    'a': {1, 8, 9},
    'b': {4},
    'c': {6, 7}
}
nmi = nmi_score(test_clusters, test_classes)
print(nmi)


test_clusters = {
    1: {1,2,3,4,5,6,7,8,9},
    2: {10, 11},
    3: {12, 13, 14},
    4: {15}
}
test_classes = {
    1: {5, 11, 13, 15}
}

nmi = nmi_score(test_clusters, test_classes)
print(nmi)


test_clusters = {
    1: {1, 2, 3, 10, 13, 16, 17, 18},
    2: {4, 5, 6, 11, 14},
    3: {7, 8, 9, 12, 15}
}

test_classes = {
    1: {1, 4, 7},
    2: {2, 5, 8},
    3: {3, 6, 9}
}

nmi = nmi_score(test_clusters, test_classes)
print(entropy_score(test_clusters, test_classes))
print(nmi)

test_clusters = {
    1: {1, 2, 3, 7, 8},
    2: {4, 5, 6}
}

test_classes = {
    1: {1, 2, 3},
    2: {4, 5, 6}
}

nmi = nmi_score(test_clusters, test_classes)
print(entropy_score(test_clusters, test_classes))
print(nmi)  # > 1

0.7910356201363014
0.6022974147370946
1.5849625007211563
0.02363680355155979
-0.0
1.070954251337716


# Entropy

In [105]:
def entropy_score(clusters, classes):
    N_c = sum(len(w) for w in classes.values())
    n_ck = defaultdict(int)  # elems in cluster k with label
    values_with_label = set([e for cls in classes.values() for e in cls])
    
    for k, w_k in clusters.items():
        for e in set(w_k):
            if e in values_with_label:
                n_ck[k] += 1
    
    H = 0
    for k, w_k in clusters.items():
        for j, c_j in classes.items():
            m_ij = len(set(w_k) & set(c_j))
            if m_ij == 0:
                continue
            lg = np.log2(m_ij / n_ck[k])
            H += m_ij * lg
    return - H / N_c
        
        

In [106]:
test_class = {
    1: {1, 2, 3},
    2: {4, 5, 6},
    3: {7, 8}
}

test_cluster = {
    1: {1, 2, 3, 9, 10, 11},
    2: {4, 5, 6, 12, 13},
    3: {7, 8, 14}
}

assert entropy_score(test_cluster, test_class) == 0


test_class = {
    1: {1, 2, 3},
    2: {4, 5, 6},
    3: {7, 8, 9}
}

test_cluster = {
    1: {1, 4, 10},
    2: {5, 7, 11},
    3: {2, 8, 12},
    4: {3, 6, 9}
}

print(entropy_score(test_cluster, test_class))
-1/9 * (3 * np.log2(1/3) + 6 * np.log2(1/2))

1.1949875002403854


1.1949875002403856

# compute purity on clusterings @ model (deprecated?)

In [107]:
results_model = []

clustering_files = Path('data_local_events/clustering_results/').glob('*.tsv')

for cl_file in clustering_files:
    # e.g. libya_kmeans_11.tsv
    event_name, method_name, k_clusters = cl_file.name.split('_')
    k_clusters = int(k_clusters.split('.')[0])   # e.g. "11.tsv"
    _cluster_data = dict()
    
    ##### read cluster file
    with cl_file.open() as f:
        for line in f:
            tokens = line.split('\t')
            key, cluster_id = tokens[0].split(',,,'), int(tokens[1][:-1])
            _cluster_data[tuple(key)] = cluster_id
            
    ##### create dict for clusters
    twid_cluster = defaultdict(list)  # clusters
    for key, ids in docs[event_name].items():
        cluster_id = _cluster_data.get(key)  # there is no vector bc there was no data about that url
        if not cluster_id:
            continue
        for _id in ids:
            twid_cluster[cluster_id].append(int(_id))

    #### create dict for classes
    topic_twid_ev = defaultdict(set)  # classes for this event
    for twid in models[event_name]['event_data']:
        tweet_id = int(twid)
        topic_id = tweet_topic.get(tweet_id)
        if topic_id:
            topic_twid_ev[topic_id].add(tweet_id)

    #### compute measures
    purity = purity_score(twid_cluster, topic_twid_ev)
    nmi = nmi_score(twid_cluster, topic_twid_ev)
    entropy_sc = entropy_score(twid_cluster, topic_twid_ev)
    
    #### save results
    results_model.append((event_name, method_name, k_clusters, purity, nmi, entropy_sc))

### res for raw vectors (deprecated?)

In [108]:
raw_files = {
    'libya': Path('data_local_events/no_model/libya_raw-vectors.tsv'), 
    'pistorius': Path('data_local_events/no_model/pistorius_raw-vectors.tsv'), 
    'nepal': Path('data_local_events/no_model/nepal_raw-vectors.tsv')
}

results_raw = []
clustering_files = list(Path('data_local_events/clustering_results/no_model/').glob('*_raw-*.tsv'))

for cl_file in tqdm(clustering_files):
    # e.g. libya_kmeans_11.tsv
    event_name, method_name, k_clusters = cl_file.name.split('_')
    k_clusters = int(k_clusters.split('.')[0])   # e.g. "11.tsv"
        
    twid_cluster = defaultdict(list)  # clusters
    
    ##### read cluster file
    with cl_file.open() as f, raw_files[event_name].open() as g:
        for line_f, line_g in zip(f, g):
            tokens = line_f.split('\t')
            tokens_g = line_g.split('\t')
            
            key = tokens_g[0]
            cluster_id = int(tokens[1][:-1])
            twid_cluster[cluster_id].append(int(key))
            
    #### create dict for classes
    topic_twid_ev = defaultdict(set)  # classes for this event
    for twid in models[event_name]['event_data']:
        tweet_id = int(twid)
        topic_id = tweet_topic.get(tweet_id)
        if topic_id:
            topic_twid_ev[topic_id].add(tweet_id)

    #### compute measures
    purity = purity_score(twid_cluster, topic_twid_ev)
    nmi = nmi_score(twid_cluster, topic_twid_ev)
    entropy_sc = entropy_score(twid_cluster, topic_twid_ev)
    
    #### save results
    results_raw.append((event_name, method_name, k_clusters, purity, nmi, entropy_sc))

100%|██████████| 66/66 [00:42<00:00,  1.55it/s]


In [44]:
results_raw

[('libya',
  'raw-hc-euclidean-ward',
  12,
  0.016184709410899212,
  0.00997547852641578),
 ('pistorius', 'raw-kmeans', 5, 0.0005996055226824457, 0.00028307237788953487),
 ('pistorius', 'raw-hc-euclidean-ward', 4, 0.00035, -0.000477529341918328),
 ('libya', 'raw-kmeans', 7, 0.014939731763906967, 0.009507655324541231),
 ('nepal', 'raw-kmeans', 4, 0.0009223133074033847, 0.0008267978210112593),
 ('pistorius', 'raw-hc-euclidean-ward', 2, 0.0003, -0.00070376286356459),
 ('nepal', 'raw-kmeans', 9, 0.0011225523807212247, 0.001199585897615439),
 ('nepal', 'raw-hc-euclidean-ward', 12, 0.00025, -0.00027367567174501473),
 ('nepal', 'raw-kmeans', 10, 0.001113450604661323, 0.0011037844960540646),
 ('libya',
  'raw-hc-euclidean-ward',
  10,
  0.016184709410899212,
  0.010576026619555914),
 ('pistorius', 'raw-hc-euclidean-ward', 12, 0.000475, -7.144047534692579e-06),
 ('nepal', 'raw-hc-euclidean-ward', 10, 0.000225, -0.00033603107678420506),
 ('libya', 'raw-kmeans', 11, 0.016863788127440438, 0.00850

In [109]:
df_res = pd.DataFrame.from_records(results_model + results_raw).sort_values(by=[0, 1, 2])
df_res

Unnamed: 0,0,1,2,3,4,5
7,libya,hc-euclidean-ward,2,0.070450,-0.082809,0.515097
56,libya,hc-euclidean-ward,3,0.070450,-0.070095,0.515097
16,libya,hc-euclidean-ward,4,0.070450,-0.067798,0.515097
31,libya,hc-euclidean-ward,5,0.114481,-0.007137,0.426162
49,libya,hc-euclidean-ward,6,0.114481,-0.006744,0.426162
24,libya,hc-euclidean-ward,7,0.133072,0.021609,0.362794
52,libya,hc-euclidean-ward,8,0.133072,0.021138,0.362794
39,libya,hc-euclidean-ward,9,0.133072,0.020617,0.362794
47,libya,hc-euclidean-ward,10,0.137965,0.031472,0.345456
63,libya,hc-euclidean-ward,11,0.137965,0.036472,0.343328


In [110]:
df_res.to_csv('data_local_events/clustering_results/results.csv')

# compute measures on data restricted to labeled data

In [297]:
def entropy2(labels_true, labels_pred):
    m_i = Counter(labels_pred)  # no. of elements in cluster i
    m_ij = defaultdict(lambda: defaultdict(int))  # no. of elements of class j in cluster i
    p_ij = defaultdict(dict)  # proba of elem of cluster i belongs to class j
    e_i = defaultdict(float)  # entropy of cluster i
    e = 0
    m = sum(m_i.values())
    
    for pred, true in zip(labels_pred, labels_true):
        m_ij[pred][true] += 1
    
    for pred in set(labels_pred):
        for true in set(labels_true):
            p_ij[pred][true] = m_ij[pred][true] / m_i[pred]
        
    for pred in set(labels_pred):
        for true in set(labels_true):
            if p_ij[pred][true] == 0:
                continue
            e_i[pred] -= p_ij[pred][true] * np.log2(p_ij[pred][true])
    
    for pred in set(labels_pred):
        e += m_i[pred] * e_i[pred] / m
        
    return e
            

def purity2(labels_true, labels_pred):
    m_i = Counter(labels_pred)  # no. of elements in cluster i
    m_ij = defaultdict(lambda: defaultdict(int))  # no. of elements of class j in cluster i
    p_ij = defaultdict(dict)  # proba of elem of cluster i belongs to class j
    p_i = defaultdict(float)  # purity of cluster i
    p = 0
    m = sum(m_i.values())
    
    for pred, true in zip(labels_pred, labels_true):
        m_ij[pred][true] += 1
    
    for pred in set(labels_pred):
        for true in set(labels_true):
            p_ij[pred][true] = m_ij[pred][true] / m_i[pred]

    for pred in set(labels_pred):
        p_i[pred] = max(p_ij[pred][true] for true in set(labels_true))
    
    p = sum(p_i[pred] * m_i[pred] for pred in set(labels_pred)) / m
    return p

In [302]:
labels_true = [1, 2, 3, 4, 5, 6, 7]
labels_pred = [0, 1, 2, 3, 4, 5, 6]

entropy2(labels_true, labels_pred)
purity2(labels_true, labels_pred)

1.0

In [293]:
Counter(labels_pred)

Counter({0: 7})

In [317]:
Result = namedtuple('Result', 'event_name method_name model k_clusters labels_size adjusted_rand_index adjusted_mi nmi homogeneity completeness v_measure fm_score entropy purity')

results_model = []

clustering_files = sorted(Path('data_local_events/clustering_results/').glob('*.tsv'))

for cl_file in clustering_files:
    # e.g. libya_kmeans_11.tsv
    event_name, method_name, k_clusters = cl_file.name.split('_')
    k_clusters = int(k_clusters.split('.')[0])   # e.g. "11.tsv"
    _cluster_data = dict()
    
    ##### read cluster file
    with cl_file.open() as f:
        for line in f:
            tokens = line.split('\t')
            key, cluster_id = int(tokens[0]), int(tokens[1][:-1])  # hash(urls)\tcluster_id\n
            _cluster_data[key] = cluster_id

    ###### labels_true_d
    """
    las componentes sacadas de uf no consideran componentes sin url
    event_data tiene a todos los tweets
    en la evaluacion hubo tweets sin URL etiquetados
    por lo tanto, este loop saca más tweets etiquetados que restringirlos a solo los del clustering 
    (1000 vs 400 en libya)
    
    for tweet_id_str in models[event_name]['event_data']:
        tweet_id = int(tweet_id_str)
        if tweet_id in tweet_topic:
            labels_true_d[tweet_id] = tweet_topic[tweet_id]
    """
    
    ###### labels
    labels_true_d = dict()
    labels_pred_d = dict()
    for key, tweet_ids_str in docs[event_name].items():
        cluster_id = _cluster_data.get(key)
        if not cluster_id:
            continue
        for twid_str in tweet_ids_str:
            tweet_id = int(twid_str)
            if tweet_id in tweet_topic:
                labels_pred_d[tweet_id] = cluster_id
                labels_true_d[tweet_id] = tweet_topic[tweet_id]
                
    labels_true = []
    labels_pred = []
    for tweet_id, topic_id in labels_true_d.items():
        labels_true.append(topic_id)
        labels_pred.append(labels_pred_d[tweet_id])
        
    ari = metrics.adjusted_rand_score(labels_true, labels_pred)
    ami = metrics.adjusted_mutual_info_score(labels_true, labels_pred)
    nmi = metrics.normalized_mutual_info_score(labels_true, labels_pred)
    hom = metrics.homogeneity_score(labels_true, labels_pred)
    com = metrics.completeness_score(labels_true, labels_pred)
    v_m = metrics.v_measure_score(labels_true, labels_pred)
    f_m = metrics.fowlkes_mallows_score(labels_true, labels_pred)
    ent = entropy2(labels_true, labels_pred)
    pur = purity2(labels_true, labels_pred)
    
    #'event_name method_name k_clusters adjusted_rand_index adjusted_mi nmi homogeneity completeness v_measure fm_score'
    res = Result(
        event_name=event_name,
        method_name=method_name,
        model="model",
        k_clusters=k_clusters,
        labels_size=len(labels_true),
        adjusted_rand_index=ari,
        adjusted_mi=ami,
        nmi=nmi,
        homogeneity=hom,
        completeness=com,
        v_measure=v_m,
        fm_score=f_m,
        entropy=ent,
        purity=pur
    )

    results_model.append(res)

In [238]:
for tid in tweet_topic:
    tid = str(tid)
    if tid in models['libya']['event_data']:
        t = models['libya']['event_data'][tid]
        print(t.expanded_urls)

{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{0: 'http://edition.cnn.com/2015/01/27/middleeast/libya-corinthia-hotel-attack/index.html'}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{0: 'http://www.bbc.co.uk/news/world-africa-31001094'}
{0: 'http://www.bbc.co.uk/news/world-africa

In [323]:
################# RAW

results_raw = []
clustering_files = list(Path('data_local_events/clustering_results/no_model/').glob('*.tsv'))

for cl_file in tqdm(clustering_files):
    # e.g. libya_kmeans_11.tsv
    event_name, method_name, k_clusters = cl_file.name.split('_')
    method_name = '-'.join(method_name.split('-')[1:])
    
    k_clusters = int(k_clusters.split('.')[0])   # e.g. "11.tsv"
    
    labels_pred_d = dict()
    labels_true_d = dict()
    with cl_file.open() as f:
        for line in f:
            tokens = line.split('\t')
            tweet_id, cluster_id = int(tokens[0]), int(tokens[1][:-1])
            if tweet_id in tweet_topic:
                labels_pred_d[tweet_id] = cluster_id
                labels_true_d[tweet_id] = tweet_topic[tweet_id]
    
    labels_true = []
    labels_pred = []
    for tweet_id, topic_id in labels_true_d.items():
        labels_true.append(topic_id)
        labels_pred.append(labels_pred_d[tweet_id])

    ari = metrics.adjusted_rand_score(labels_true, labels_pred)
    ami = metrics.adjusted_mutual_info_score(labels_true, labels_pred)
    nmi = metrics.normalized_mutual_info_score(labels_true, labels_pred)
    hom = metrics.homogeneity_score(labels_true, labels_pred)
    com = metrics.completeness_score(labels_true, labels_pred)
    v_m = metrics.v_measure_score(labels_true, labels_pred)
    f_m = metrics.fowlkes_mallows_score(labels_true, labels_pred)
    ent = entropy2(labels_true, labels_pred)
    pur = purity2(labels_true, labels_pred)
    
    #'event_name method_name k_clusters adjusted_rand_index adjusted_mi nmi homogeneity completeness v_measure fm_score'
    res = Result(
        event_name=event_name,
        method_name=method_name,
        model="baseline",
        k_clusters=k_clusters,
        labels_size=len(labels_true),
        adjusted_rand_index=ari,
        adjusted_mi=ami,
        nmi=nmi,
        homogeneity=hom,
        completeness=com,
        v_measure=v_m,
        fm_score=f_m,
        entropy=ent,
        purity=pur
    )

    results_raw.append(res)

100%|██████████| 66/66 [00:02<00:00, 32.66it/s]


In [330]:
df = pd.DataFrame.from_records(results_raw + results_model, columns=Result._fields)
df.to_csv('/home/mquezada/news-model-git/news-model/data_local_events/clustering_results/results.csv', index=False)

In [329]:
df

Unnamed: 0,event_name,method_name,model,k_clusters,labels_size,adjusted_rand_index,adjusted_mi,nmi,homogeneity,completeness,v_measure,fm_score,entropy,purity
0,libya,hc-euclidean-ward,baseline,12,483,0.141648,0.343187,0.378428,0.360146,0.397638,0.377965,0.379478,1.454196,0.592133
1,pistorius,kmeans,baseline,5,56,0.215724,0.291890,0.473205,0.400672,0.558870,0.466730,0.373253,1.726743,0.500000
2,pistorius,hc-euclidean-ward,baseline,4,56,0.280626,0.342512,0.547862,0.414613,0.723934,0.527255,0.465003,1.686575,0.517857
3,libya,kmeans,baseline,7,483,0.452592,0.446485,0.497683,0.455783,0.543434,0.495764,0.598191,1.236841,0.687371
4,nepal,kmeans,baseline,4,45,0.742925,0.698187,0.798194,0.726694,0.876729,0.794692,0.820283,0.552616,0.844444
5,pistorius,hc-euclidean-ward,baseline,2,56,0.148371,0.182682,0.372794,0.218000,0.637502,0.324898,0.402747,2.253044,0.321429
6,nepal,kmeans,baseline,9,45,0.865494,0.788568,0.880615,0.939697,0.825247,0.878761,0.898690,0.121930,0.977778
7,nepal,hc-euclidean-ward,baseline,12,45,0.865487,0.768323,0.862767,0.924204,0.805415,0.860730,0.899118,0.153258,0.955556
8,nepal,kmeans,baseline,10,45,0.865494,0.788568,0.880615,0.939697,0.825247,0.878761,0.898690,0.121930,0.977778
9,libya,hc-euclidean-ward,baseline,10,483,0.181497,0.336665,0.386857,0.351164,0.426177,0.385051,0.415540,1.474608,0.592133


# Analysis of components

Compute evaluation measures on the largest component of each event

In [343]:
results_component = []

# for each event
for event_name, _docs in docs.items():
    largest_component = max([(component_id, len(component)) for component_id, component in _docs.items()], key=lambda c: c[1])
    
    component_id = largest_component[0]
    component = _docs[component_id]
    
    labels_pred_d = dict()
    labels_true_d = dict()
    for twid in component:
        tweet_id = int(twid)
        if tweet_id in tweet_topic:
            labels_pred_d[tweet_id] = 0  # only 1 cluster/component
            labels_true_d[tweet_id] = tweet_topic[tweet_id]
        
    labels_true = []
    labels_pred = []
    for tweet_id, topic_id in labels_true_d.items():
        labels_true.append(topic_id)
        labels_pred.append(labels_pred_d[tweet_id])
        
    ari = metrics.adjusted_rand_score(labels_true, labels_pred)
    ami = metrics.adjusted_mutual_info_score(labels_true, labels_pred)
    nmi = metrics.normalized_mutual_info_score(labels_true, labels_pred)
    hom = metrics.homogeneity_score(labels_true, labels_pred)
    com = metrics.completeness_score(labels_true, labels_pred)
    v_m = metrics.v_measure_score(labels_true, labels_pred)
    f_m = metrics.fowlkes_mallows_score(labels_true, labels_pred)
    ent = entropy2(labels_true, labels_pred)
    pur = purity2(labels_true, labels_pred)
    
    #'event_name method_name k_clusters adjusted_rand_index adjusted_mi nmi homogeneity completeness v_measure fm_score'
    res = Result(
        event_name=event_name,
        method_name="largest component",
        model=f"size={len(component)}",
        k_clusters=1,
        labels_size=len(labels_true),
        adjusted_rand_index=ari,
        adjusted_mi=ami,
        nmi=nmi,
        homogeneity=hom,
        completeness=com,
        v_measure=v_m,
        fm_score=f_m,
        entropy=ent,
        purity=pur
    )
    results_component.append(res)

In [344]:
df = pd.DataFrame.from_records(results_component, columns=Result._fields)
df

Unnamed: 0,event_name,method_name,model,k_clusters,labels_size,adjusted_rand_index,adjusted_mi,nmi,homogeneity,completeness,v_measure,fm_score,entropy,purity
0,libya,largest component,size=6351,1,118,0.0,3.738743e-16,3e-06,1.945201e-16,1.0,3.890402e-16,0.530812,2.058544,0.415254
1,pistorius,largest component,size=23373,1,19,0.0,-4.440684e-16,-3e-06,-2.208717e-16,1.0,-4.417434e-16,0.524265,1.812945,0.421053
2,nepal,largest component,size=182888,1,54,0.0,0.0,0.0,0.0,1.0,0.0,0.573098,1.899054,0.407407


# Model v2

Consider a tweet with more than 1 url as different tweets (diff. ids)

In [415]:
#### TODO: append _{i} to tweet ids in replies and retweets


logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.INFO)

def gen_model_uniq_urls(dataset_name):
    event_data, missing_urls_amount = load_data(dataset_name)

    ##########
    # create set of tweet_ids
    # for a given tweet t:
    # if t does not have urls: add a tweet_id {t.id}_0
    # for each url_i in t: add a tweet_id {t.id}_{i}
    # for each url_i in t: add a tweet_id {t.reply_id}_{i}
    ##########
    tweet_ids = set()
    logging.info("create list of tweet_ids")
    for tweet_id, tweet in tqdm(event_data.items(), total=len(event_data)):
        if not tweet.expanded_urls:
            tweet_ids.add(f'{tweet_id}_0')
        else:       
            for i, url in enumerate(tweet.expanded_urls.values()):
                tweet_ids.add(f'{tweet_id}_{i}')
                
        if tweet.reply_id != 'NULL':
            if tweet.reply_id in event_data:
                for i, url in enumerate(tweet.expanded_urls.values()):
                    tweet_ids.add(f'{tweet.reply_id}_{i}')
                    
    ##########
    # for each tweet_id in the set of tweet_ids
    # add a pair
    ##########
    logging.info("create pairs (t, u) or (t, t') for each tweet t and url u or replied/retweeted tweet t'")
    replies_amount = 0
    retweets_amount = 0
    quotes_amount = 0
    missing_replies_amount = 0
    pairs = []
    
    for tweet_id in tweet_ids:
        frags = tweet_id.split('_')
        o_tweet_id = frags[0]
        i = int(frags[1])
        
        tweet = event_data[o_tweet_id]
        
        url = tweet.expanded_urls.get(i)
        if url:
            pairs.append((tweet_id, url))
        
        # retweets ARE considered, due to be exact text copies of the retweeted tweet
        if tweet.retweet_id != 'NULL':
            retweets_amount += 1
        if tweet.quote_id != 'NULL':
            quotes_amount += 1
        if tweet.reply_id != 'NULL':
            replies_amount += 1

            if tweet.reply_id in event_data:
                pairs.append((tweet_id, f'{tweet.reply_id}_{i}'))
            else:
                missing_replies_amount += 1
                
    logging.info(f'total pairs: {len(pairs)}, retweets: {retweets_amount}, quotes: {quotes_amount}, replies: {replies_amount} '
                 f'(missing: {missing_replies_amount}, missing urls: {missing_urls_amount})')

    ##########

    """
        all keys must be the same time (in this case, strings);
        unionfind will vectorize operations and will cast everything in the array to the same type,
        so if there are integers and strings, it will cast everything to string and comparisons will fail
        when calling uf.components().
    """

    logging.info('applying union-find')
    uf = UnionFind()
    for u, v in pairs:
        uf.union(u, v)
    logging.info(f'total components: {len(uf.components())}')

    return {
        'uf': uf, 
        'event_data': event_data
    }


# event_name: (uf, event_data)
models_uniq_url = {
    'libya': gen_model_uniq_urls('libya_hotel_tweets.tsv'),
    'pistorius': gen_model_uniq_urls('oscar_pistorius_tweets.tsv'),
    'nepal': gen_model_uniq_urls('nepal_tweets.tsv')
}

2018-11-19 11:20:00,353 : load and clean dataset: libya_hotel_tweets.tsv
2018-11-19 11:20:00,511 : tweets processed: 26331, ignored: 2309, missing urls: 6341
2018-11-19 11:20:00,511 : create list of tweet_ids
100%|██████████| 26331/26331 [00:00<00:00, 952292.50it/s]
2018-11-19 11:20:00,541 : create pairs (t, u) or (t, t') for each tweet t and url u or replied/retweeted tweet t'
2018-11-19 11:20:00,593 : total pairs: 20127, retweets: 13389, quotes: 0, replies: 313 (missing: 131, missing urls: 6341)
2018-11-19 11:20:00,594 : applying union-find
2018-11-19 11:20:00,756 : total components: 3399
2018-11-19 11:20:00,759 : load and clean dataset: oscar_pistorius_tweets.tsv
2018-11-19 11:20:02,657 : tweets processed: 112260, ignored: 955, missing urls: 21807
2018-11-19 11:20:02,658 : create list of tweet_ids
100%|██████████| 112260/112260 [00:00<00:00, 936434.88it/s]
2018-11-19 11:20:02,780 : create pairs (t, u) or (t, t') for each tweet t and url u or replied/retweeted tweet t'
2018-11-19 11:

In [416]:
event_names = ('libya', 'pistorius', 'nepal')

docs2 = {
    'libya': dict(),
    'pistorius': dict(),
    'nepal': dict()
}

url_indices2 = { 
    'libya': dict(),
    'pistorius': dict(),
    'nepal': dict()
}


for event_name in event_names:
    for component in models_uniq_url[event_name]['uf'].components():
        comp_key = []
        comp_ids = []
        for elem in component:
            if elem.startswith('http'):
                comp_key.append(elem)
            else:
                elem_0 = elem.split('_')[0]
                t = models_uniq_url[event_name]['event_data'].get(elem_0)
                if not t:
                    print("err")
                comp_ids.append(t.tweet_id)
        
        # component does not have url
        if not comp_key:
            comp_key.append(np.random.choice(comp_ids))
        
        comp_key = tuple(comp_key)
        key = hash(comp_key)
        
        docs2[event_name][key] = comp_ids
        url_indices2[event_name][key] = comp_key
        
        
vecs2 = {
    'libya': dict(),
    'pistorius': dict(),
    'nepal': dict()
}

# for each event
for event_name, docs_event in tqdm(docs2.items(), total=len(docs)):
    # for each component in this event
    for key, tweet_ids in tqdm(docs_event.items(), total=len(docs_event)):
        vec = []
        texts = [models_uniq_url[event_name]['event_data'][twid].text for twid in tweet_ids]
        
        for tokens in nlp.pipe(texts, n_threads=-1):
            for token in tokens:
                if not token.like_url and token.lower_ in we:
                    v = we[token.lower_]
                    vec.append(v)
        
        if vec:
            avg_vec = np.array(vec).mean(axis=0)
            vecs2[event_name][key] = avg_vec
        else:
            print("no vec")

            
for event_name, vec_info in vecs2.items():
    with open(f'data_local_events/{event_name}_vectors2.tsv', 'w') as f:
        for key, vec in vec_info.items():
            values = "\t".join([str(v) for v in vec])
            f.write(f"{key}\t{values}\n")

  0%|          | 0/3 [00:00<?, ?it/s]
  0%|          | 0/3399 [00:00<?, ?it/s][A
  0%|          | 5/3399 [00:00<01:58, 28.69it/s][A
  0%|          | 15/3399 [00:00<01:54, 29.65it/s][A
  1%|          | 17/3399 [00:00<02:05, 26.99it/s][A
  1%|          | 20/3399 [00:00<02:14, 25.17it/s][A
  1%|          | 22/3399 [00:00<02:24, 23.31it/s][A
  1%|          | 28/3399 [00:01<02:10, 25.75it/s][A
  1%|▏         | 49/3399 [00:01<01:21, 41.15it/s][A
  2%|▏         | 57/3399 [00:01<01:18, 42.82it/s][A
  2%|▏         | 83/3399 [00:01<00:57, 57.84it/s][A
  4%|▎         | 123/3399 [00:01<00:40, 80.13it/s][A
  5%|▍         | 154/3399 [00:01<00:34, 94.18it/s][A
  6%|▌         | 197/3399 [00:01<00:28, 112.76it/s][A
  7%|▋         | 226/3399 [00:01<00:26, 121.31it/s][A
  7%|▋         | 254/3399 [00:02<00:25, 123.88it/s][A
  8%|▊         | 278/3399 [00:02<00:24, 128.84it/s][A
 10%|█         | 344/3399 [00:02<00:20, 152.35it/s][A
 11%|█         | 380/3399 [00:02<00:18, 160.72it/s][A
 13%

no vec



 79%|███████▉  | 7653/9640 [00:16<00:04, 454.26it/s][A
 82%|████████▏ | 7866/9640 [00:16<00:03, 464.14it/s][A
 84%|████████▍ | 8088/9640 [00:17<00:03, 474.43it/s][A
 86%|████████▌ | 8311/9640 [00:17<00:02, 484.65it/s][A
 89%|████████▊ | 8541/9640 [00:17<00:02, 495.17it/s][A
 91%|█████████ | 8795/9640 [00:17<00:01, 506.93it/s][A
 94%|█████████▎| 9022/9640 [00:17<00:01, 516.84it/s][A

no vec



 96%|█████████▌| 9251/9640 [00:17<00:00, 526.93it/s][A
 98%|█████████▊| 9476/9640 [00:17<00:00, 536.48it/s][A
 67%|██████▋   | 2/3 [00:22<00:11, 11.25s/it]72it/s][A
  0%|          | 0/22914 [00:00<?, ?it/s][A
  0%|          | 5/22914 [00:00<08:10, 46.74it/s][A
  0%|          | 8/22914 [00:00<19:38, 19.44it/s][A
  0%|          | 10/22914 [00:01<48:37,  7.85it/s][A
  0%|          | 12/22914 [00:03<1:51:42,  3.42it/s][A
  0%|          | 13/22914 [00:03<1:50:43,  3.45it/s][A
  0%|          | 14/22914 [00:11<5:09:09,  1.23it/s][A
  0%|          | 16/22914 [00:11<4:34:09,  1.39it/s][A
  0%|          | 18/22914 [00:11<4:13:12,  1.51it/s][A
  0%|          | 22/22914 [00:12<3:33:31,  1.79it/s][A
  0%|          | 24/22914 [00:12<3:20:11,  1.91it/s][A
  0%|          | 25/22914 [00:12<3:16:24,  1.94it/s][A
  0%|          | 29/22914 [00:22<4:54:07,  1.30it/s][A
  0%|          | 30/22914 [00:22<4:45:48,  1.33it/s][A
  0%|          | 32/22914 [00:22<4:31:21,  1.41it/s][A
  0%|     

no vec



 18%|█▊        | 4035/22914 [01:05<05:08, 61.26it/s][A
 18%|█▊        | 4086/22914 [01:05<05:03, 61.94it/s][A
 18%|█▊        | 4136/22914 [01:06<04:59, 62.60it/s][A
 18%|█▊        | 4198/22914 [01:06<04:55, 63.44it/s][A
 19%|█▊        | 4251/22914 [01:06<04:51, 64.13it/s][A
 19%|█▉        | 4302/22914 [01:06<04:47, 64.79it/s][A
 19%|█▉        | 4369/22914 [01:06<04:42, 65.69it/s][A
 19%|█▉        | 4423/22914 [01:06<04:38, 66.37it/s][A
 20%|█▉        | 4495/22914 [01:06<04:33, 67.35it/s][A
 20%|█▉        | 4551/22914 [01:06<04:29, 68.06it/s][A
 20%|██        | 4611/22914 [01:06<04:25, 68.83it/s][A
 20%|██        | 4663/22914 [01:07<04:22, 69.48it/s][A
 21%|██        | 4719/22914 [01:07<04:19, 70.21it/s][A
 21%|██        | 4770/22914 [01:07<04:16, 70.84it/s][A

no vec



 21%|██        | 4819/22914 [01:07<04:13, 71.45it/s][A
 21%|██        | 4866/22914 [01:07<04:10, 72.03it/s][A
 21%|██▏       | 4913/22914 [01:07<04:07, 72.61it/s][A
 22%|██▏       | 4961/22914 [01:07<04:05, 73.21it/s][A
 22%|██▏       | 5016/22914 [01:07<04:02, 73.89it/s][A
 22%|██▏       | 5071/22914 [01:07<03:59, 74.59it/s][A
 22%|██▏       | 5125/22914 [01:08<03:56, 75.27it/s][A
 23%|██▎       | 5219/22914 [01:08<03:51, 76.52it/s][A
 23%|██▎       | 5279/22914 [01:08<03:48, 77.27it/s][A
 23%|██▎       | 5337/22914 [01:08<03:45, 78.00it/s][A
 24%|██▎       | 5395/22914 [01:08<03:42, 78.68it/s][A
 24%|██▍       | 5447/22914 [01:08<03:40, 79.31it/s][A
 24%|██▍       | 5508/22914 [01:08<03:37, 80.08it/s][A
 24%|██▍       | 5576/22914 [01:08<03:34, 80.95it/s][A
 25%|██▍       | 5656/22914 [01:08<03:30, 81.99it/s][A
 25%|██▍       | 5720/22914 [01:09<03:27, 82.76it/s][A
 25%|██▌       | 5780/22914 [01:09<03:25, 83.47it/s][A
 25%|██▌       | 5836/22914 [01:09<03:23, 84.13

# Model v3: without single tweets

In [421]:
#### TODO: append _{i} to tweet ids in replies and retweets


logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.INFO)

def gen_model_uniq_urls_no_single(dataset_name):
    event_data, missing_urls_amount = load_data(dataset_name)

    ##########
    # create set of tweet_ids
    # for a given tweet t:
    # if t does not have urls: add a tweet_id {t.id}_0
    # for each url_i in t: add a tweet_id {t.id}_{i}
    # for each url_i in t: add a tweet_id {t.reply_id}_{i}
    ##########
    tweet_ids = set()
    logging.info("create list of tweet_ids")
    for tweet_id, tweet in tqdm(event_data.items(), total=len(event_data)):
        if not tweet.expanded_urls:
            #tweet_ids.add(f'{tweet_id}_0')
            continue
        else:       
            for i, url in enumerate(tweet.expanded_urls.values()):
                tweet_ids.add(f'{tweet_id}_{i}')
                
        if tweet.reply_id != 'NULL':
            if tweet.reply_id in event_data:
                for i, url in enumerate(tweet.expanded_urls.values()):
                    tweet_ids.add(f'{tweet.reply_id}_{i}')
                    
    ##########
    # for each tweet_id in the set of tweet_ids
    # add a pair
    ##########
    logging.info("create pairs (t, u) or (t, t') for each tweet t and url u or replied/retweeted tweet t'")
    replies_amount = 0
    retweets_amount = 0
    quotes_amount = 0
    missing_replies_amount = 0
    pairs = []
    
    for tweet_id in tweet_ids:
        frags = tweet_id.split('_')
        o_tweet_id = frags[0]
        i = int(frags[1])
        
        tweet = event_data[o_tweet_id]
        
        url = tweet.expanded_urls.get(i)
        if url:
            pairs.append((tweet_id, url))
        else:
            continue
        
        # retweets ARE considered, due to be exact text copies of the retweeted tweet
        if tweet.retweet_id != 'NULL':
            retweets_amount += 1
        if tweet.quote_id != 'NULL':
            quotes_amount += 1
        if tweet.reply_id != 'NULL':
            replies_amount += 1

            if tweet.reply_id in event_data:
                pairs.append((tweet_id, f'{tweet.reply_id}_{i}'))
            else:
                missing_replies_amount += 1
                
    logging.info(f'total pairs: {len(pairs)}, retweets: {retweets_amount}, quotes: {quotes_amount}, replies: {replies_amount} '
                 f'(missing: {missing_replies_amount}, missing urls: {missing_urls_amount})')

    ##########

    """
        all keys must be the same time (in this case, strings);
        unionfind will vectorize operations and will cast everything in the array to the same type,
        so if there are integers and strings, it will cast everything to string and comparisons will fail
        when calling uf.components().
    """

    logging.info('applying union-find')
    uf = UnionFind()
    for u, v in pairs:
        uf.union(u, v)
    logging.info(f'total components: {len(uf.components())}')

    return {
        'uf': uf, 
        'event_data': event_data
    }


# event_name: (uf, event_data)
models_uniq_url_no_single = {
    'libya': gen_model_uniq_urls_no_single('libya_hotel_tweets.tsv'),
    'pistorius': gen_model_uniq_urls_no_single('oscar_pistorius_tweets.tsv'),
    'nepal': gen_model_uniq_urls_no_single('nepal_tweets.tsv')
}

2018-11-19 11:34:01,800 : load and clean dataset: libya_hotel_tweets.tsv
2018-11-19 11:34:01,942 : tweets processed: 26331, ignored: 2309, missing urls: 6341
2018-11-19 11:34:01,943 : create list of tweet_ids
100%|██████████| 26331/26331 [00:00<00:00, 868227.06it/s]
2018-11-19 11:34:01,975 : create pairs (t, u) or (t, t') for each tweet t and url u or replied/retweeted tweet t'
2018-11-19 11:34:02,031 : total pairs: 20034, retweets: 8315, quotes: 0, replies: 113 (missing: 24, missing urls: 6341)
2018-11-19 11:34:02,032 : applying union-find
2018-11-19 11:34:02,193 : total components: 3375
2018-11-19 11:34:02,195 : load and clean dataset: oscar_pistorius_tweets.tsv
2018-11-19 11:34:02,697 : tweets processed: 112260, ignored: 955, missing urls: 21807
2018-11-19 11:34:02,698 : create list of tweet_ids
100%|██████████| 112260/112260 [00:00<00:00, 996131.78it/s]
2018-11-19 11:34:02,812 : create pairs (t, u) or (t, t') for each tweet t and url u or replied/retweeted tweet t'
2018-11-19 11:34

In [422]:
event_names = ('libya', 'pistorius', 'nepal')

docs3 = {
    'libya': dict(),
    'pistorius': dict(),
    'nepal': dict()
}

url_indices3 = { 
    'libya': dict(),
    'pistorius': dict(),
    'nepal': dict()
}


for event_name in event_names:
    for component in models_uniq_url_no_single[event_name]['uf'].components():
        comp_key = []
        comp_ids = []
        for elem in component:
            if elem.startswith('http'):
                comp_key.append(elem)
            else:
                elem_0 = elem.split('_')[0]
                t = models_uniq_url_no_single[event_name]['event_data'].get(elem_0)
                if not t:
                    print("err")
                comp_ids.append(t.tweet_id)
        
        # component does not have url
        if not comp_key:
            comp_key.append(np.random.choice(comp_ids))
        
        comp_key = tuple(comp_key)
        key = hash(comp_key)
        
        docs3[event_name][key] = comp_ids
        url_indices3[event_name][key] = comp_key
        
        
vecs3 = {
    'libya': dict(),
    'pistorius': dict(),
    'nepal': dict()
}

# for each event
for event_name, docs_event in tqdm(docs3.items(), total=len(docs)):
    # for each component in this event
    for key, tweet_ids in tqdm(docs_event.items(), total=len(docs_event)):
        vec = []
        texts = [models_uniq_url_no_single[event_name]['event_data'][twid].text for twid in tweet_ids]
        
        for tokens in nlp.pipe(texts, n_threads=-1):
            for token in tokens:
                if not token.like_url and token.lower_ in we:
                    v = we[token.lower_]
                    vec.append(v)
        
        if vec:
            avg_vec = np.array(vec).mean(axis=0)
            vecs3[event_name][key] = avg_vec
        else:
            print("no vec")

            
for event_name, vec_info in vecs3.items():
    with open(f'data_local_events/{event_name}_vectors3.tsv', 'w') as f:
        for key, vec in vec_info.items():
            values = "\t".join([str(v) for v in vec])
            f.write(f"{key}\t{values}\n")

  0%|          | 0/3 [00:00<?, ?it/s]
  0%|          | 0/3375 [00:00<?, ?it/s][A
  0%|          | 6/3375 [00:00<01:27, 38.33it/s][A
  0%|          | 16/3375 [00:00<01:36, 34.79it/s][A
  1%|          | 18/3375 [00:00<01:46, 31.56it/s][A
  1%|          | 25/3375 [00:00<01:30, 37.19it/s][A
  1%|          | 29/3375 [00:00<01:30, 36.99it/s][A
  1%|          | 34/3375 [00:00<01:28, 37.66it/s][A
  2%|▏         | 55/3375 [00:01<01:00, 54.83it/s][A
  2%|▏         | 64/3375 [00:01<00:59, 56.00it/s][A
  3%|▎         | 90/3375 [00:01<00:46, 71.21it/s][A
  4%|▍         | 141/3375 [00:01<00:31, 102.85it/s][A
  6%|▌         | 186/3375 [00:01<00:25, 125.79it/s][A
  6%|▋         | 215/3375 [00:01<00:23, 135.47it/s][A
  7%|▋         | 243/3375 [00:01<00:22, 139.81it/s][A
  8%|▊         | 271/3375 [00:01<00:21, 147.20it/s][A
 10%|█         | 344/3375 [00:02<00:17, 171.32it/s][A
 12%|█▏        | 389/3375 [00:02<00:16, 184.54it/s][A
 14%|█▍        | 465/3375 [00:02<00:13, 210.00it/s][A
 1

no vec



 77%|███████▋  | 7225/9329 [00:15<00:04, 460.44it/s][A
 80%|███████▉  | 7453/9329 [00:15<00:03, 471.93it/s][A
 82%|████████▏ | 7672/9329 [00:15<00:03, 482.75it/s][A
 85%|████████▍ | 7899/9329 [00:15<00:02, 493.92it/s][A
 87%|████████▋ | 8129/9329 [00:16<00:02, 505.11it/s][A
 90%|████████▉ | 8391/9329 [00:16<00:01, 518.14it/s][A
 92%|█████████▏| 8623/9329 [00:16<00:01, 529.02it/s][A

no vec



 95%|█████████▌| 8873/9329 [00:16<00:00, 541.02it/s][A
 98%|█████████▊| 9108/9329 [00:16<00:00, 551.89it/s][A
 67%|██████▋   | 2/3 [00:20<00:10, 10.40s/it]61it/s][A
  0%|          | 0/22493 [00:00<?, ?it/s][A
  0%|          | 7/22493 [00:00<05:26, 68.85it/s][A
  0%|          | 10/22493 [00:01<46:31,  8.05it/s][A
  0%|          | 13/22493 [00:03<1:40:42,  3.72it/s][A
  0%|          | 15/22493 [00:03<1:33:46,  3.99it/s][A
  0%|          | 17/22493 [00:11<4:16:12,  1.46it/s][A
  0%|          | 19/22493 [00:11<3:51:44,  1.62it/s][A
  0%|          | 20/22493 [00:12<3:47:24,  1.65it/s][A
  0%|          | 25/22493 [00:12<3:07:31,  2.00it/s][A
  0%|          | 27/22493 [00:12<2:57:39,  2.11it/s][A
  0%|          | 29/22493 [00:13<2:48:54,  2.22it/s][A
  0%|          | 32/22493 [00:23<4:30:50,  1.38it/s][A
  0%|          | 33/22493 [00:23<4:23:55,  1.42it/s][A
  0%|          | 35/22493 [00:23<4:11:58,  1.49it/s][A
  0%|          | 39/22493 [00:23<3:48:44,  1.64it/s][A
  0%|  

no vec



 18%|█▊        | 3965/22493 [01:05<05:07, 60.33it/s][A
 18%|█▊        | 4012/22493 [01:05<05:03, 60.95it/s][A
 18%|█▊        | 4075/22493 [01:05<04:57, 61.81it/s][A
 18%|█▊        | 4126/22493 [01:06<04:53, 62.49it/s][A
 19%|█▊        | 4185/22493 [01:06<04:49, 63.28it/s][A
 19%|█▉        | 4237/22493 [01:06<04:45, 63.91it/s][A
 19%|█▉        | 4299/22493 [01:06<04:41, 64.75it/s][A
 19%|█▉        | 4350/22493 [01:06<04:37, 65.40it/s][A
 20%|█▉        | 4433/22493 [01:06<04:31, 66.54it/s][A
 20%|█▉        | 4495/22493 [01:06<04:27, 67.37it/s][A
 20%|██        | 4554/22493 [01:06<04:23, 68.14it/s][A
 20%|██        | 4611/22493 [01:06<04:19, 68.88it/s][A
 21%|██        | 4681/22493 [01:07<04:15, 69.82it/s][A

no vec



 21%|██        | 4742/22493 [01:07<04:11, 70.59it/s][A
 21%|██▏       | 4798/22493 [01:07<04:08, 71.27it/s][A
 22%|██▏       | 4849/22493 [01:07<04:05, 71.89it/s][A
 22%|██▏       | 4908/22493 [01:07<04:02, 72.64it/s][A
 22%|██▏       | 4957/22493 [01:07<03:59, 73.24it/s][A
 22%|██▏       | 5004/22493 [01:07<03:56, 73.82it/s][A
 23%|██▎       | 5075/22493 [01:07<03:52, 74.76it/s][A
 23%|██▎       | 5144/22493 [01:07<03:49, 75.66it/s][A
 23%|██▎       | 5217/22493 [01:08<03:45, 76.61it/s][A
 23%|██▎       | 5279/22493 [01:08<03:42, 77.38it/s][A
 24%|██▎       | 5337/22493 [01:08<03:39, 78.09it/s][A
 24%|██▍       | 5392/22493 [01:08<03:37, 78.77it/s][A
 24%|██▍       | 5449/22493 [01:08<03:34, 79.48it/s][A
 25%|██▍       | 5527/22493 [01:08<03:30, 80.49it/s][A
 25%|██▍       | 5591/22493 [01:08<03:27, 81.30it/s][A
 25%|██▌       | 5651/22493 [01:08<03:25, 82.05it/s][A
 25%|██▌       | 5710/22493 [01:08<03:22, 82.76it/s][A
 26%|██▌       | 5767/22493 [01:09<03:20, 83.42

# Model v4: without single tweets, wo replies

In [429]:
#### TODO: append _{i} to tweet ids in replies and retweets


logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.INFO)

def gen_model_uniq_urls_no_single_no_reply(dataset_name):
    event_data, missing_urls_amount = load_data(dataset_name)

    ##########
    # create set of tweet_ids
    # for a given tweet t:
    # if t does not have urls: add a tweet_id {t.id}_0
    # for each url_i in t: add a tweet_id {t.id}_{i}
    # for each url_i in t: add a tweet_id {t.reply_id}_{i}
    ##########
    tweet_ids = set()
    logging.info("create list of tweet_ids")
    for tweet_id, tweet in tqdm(event_data.items(), total=len(event_data)):
        if not tweet.expanded_urls:
            #tweet_ids.add(f'{tweet_id}_0')
            continue
        else:       
            for i, url in enumerate(tweet.expanded_urls.values()):
                tweet_ids.add(f'{tweet_id}_{i}')
                
        #if tweet.reply_id != 'NULL':
        #    if tweet.reply_id in event_data:
        #        for i, url in enumerate(tweet.expanded_urls.values()):
        #            tweet_ids.add(f'{tweet.reply_id}_{i}')
                    
    ##########
    # for each tweet_id in the set of tweet_ids
    # add a pair
    ##########
    logging.info("create pairs (t, u) or (t, t') for each tweet t and url u or replied/retweeted tweet t'")
    replies_amount = 0
    retweets_amount = 0
    quotes_amount = 0
    missing_replies_amount = 0
    pairs = []
    
    for tweet_id in tweet_ids:
        frags = tweet_id.split('_')
        o_tweet_id = frags[0]
        i = int(frags[1])
        
        tweet = event_data[o_tweet_id]
        
        url = tweet.expanded_urls.get(i)
        if url:
            pairs.append((tweet_id, url))
        else:
            continue
        
        # retweets ARE considered, due to be exact text copies of the retweeted tweet
        if tweet.retweet_id != 'NULL':
            retweets_amount += 1
        if tweet.quote_id != 'NULL':
            quotes_amount += 1
        if tweet.reply_id != 'NULL':
            replies_amount += 1

            #if tweet.reply_id in event_data:
            #    pairs.append((tweet_id, f'{tweet.reply_id}_{i}'))
            #else:
            #    missing_replies_amount += 1
                
    logging.info(f'total pairs: {len(pairs)}, retweets: {retweets_amount}, quotes: {quotes_amount}, replies: {replies_amount} '
                 f'(missing: {missing_replies_amount}, missing urls: {missing_urls_amount})')

    ##########

    """
        all keys must be the same time (in this case, strings);
        unionfind will vectorize operations and will cast everything in the array to the same type,
        so if there are integers and strings, it will cast everything to string and comparisons will fail
        when calling uf.components().
    """

    logging.info('applying union-find')
    uf = UnionFind()
    for u, v in pairs:
        uf.union(u, v)
    logging.info(f'total components: {len(uf.components())}')

    return {
        'uf': uf, 
        'event_data': event_data
    }


# event_name: (uf, event_data)
models_uniq_url_no_single_no_reply = {
    'libya': gen_model_uniq_urls_no_single_no_reply('libya_hotel_tweets.tsv'),
    'pistorius': gen_model_uniq_urls_no_single_no_reply('oscar_pistorius_tweets.tsv'),
    'nepal': gen_model_uniq_urls_no_single_no_reply('nepal_tweets.tsv')
}

2018-11-19 11:56:01,548 : load and clean dataset: libya_hotel_tweets.tsv
2018-11-19 11:56:01,684 : tweets processed: 26331, ignored: 2309, missing urls: 6341
2018-11-19 11:56:01,684 : create list of tweet_ids
100%|██████████| 26331/26331 [00:00<00:00, 1001198.63it/s]
2018-11-19 11:56:01,712 : create pairs (t, u) or (t, t') for each tweet t and url u or replied/retweeted tweet t'
2018-11-19 11:56:01,761 : total pairs: 19945, retweets: 8315, quotes: 0, replies: 113 (missing: 0, missing urls: 6341)
2018-11-19 11:56:01,762 : applying union-find
2018-11-19 11:56:01,917 : total components: 3385
2018-11-19 11:56:01,919 : load and clean dataset: oscar_pistorius_tweets.tsv
2018-11-19 11:56:02,426 : tweets processed: 112260, ignored: 955, missing urls: 21807
2018-11-19 11:56:02,426 : create list of tweet_ids
100%|██████████| 112260/112260 [00:00<00:00, 1068446.38it/s]
2018-11-19 11:56:02,533 : create pairs (t, u) or (t, t') for each tweet t and url u or replied/retweeted tweet t'
2018-11-19 11:5

In [422]:
event_names = ('libya', 'pistorius', 'nepal')

docs3 = {
    'libya': dict(),
    'pistorius': dict(),
    'nepal': dict()
}

url_indices3 = { 
    'libya': dict(),
    'pistorius': dict(),
    'nepal': dict()
}


for event_name in event_names:
    for component in models_uniq_url_no_single[event_name]['uf'].components():
        comp_key = []
        comp_ids = []
        for elem in component:
            if elem.startswith('http'):
                comp_key.append(elem)
            else:
                elem_0 = elem.split('_')[0]
                t = models_uniq_url_no_single[event_name]['event_data'].get(elem_0)
                if not t:
                    print("err")
                comp_ids.append(t.tweet_id)
        
        # component does not have url
        if not comp_key:
            comp_key.append(np.random.choice(comp_ids))
        
        comp_key = tuple(comp_key)
        key = hash(comp_key)
        
        docs3[event_name][key] = comp_ids
        url_indices3[event_name][key] = comp_key
        
        
vecs3 = {
    'libya': dict(),
    'pistorius': dict(),
    'nepal': dict()
}

# for each event
for event_name, docs_event in tqdm(docs3.items(), total=len(docs)):
    # for each component in this event
    for key, tweet_ids in tqdm(docs_event.items(), total=len(docs_event)):
        vec = []
        texts = [models_uniq_url_no_single[event_name]['event_data'][twid].text for twid in tweet_ids]
        
        for tokens in nlp.pipe(texts, n_threads=-1):
            for token in tokens:
                if not token.like_url and token.lower_ in we:
                    v = we[token.lower_]
                    vec.append(v)
        
        if vec:
            avg_vec = np.array(vec).mean(axis=0)
            vecs3[event_name][key] = avg_vec
        else:
            print("no vec")

            
for event_name, vec_info in vecs3.items():
    with open(f'data_local_events/{event_name}_vectors3.tsv', 'w') as f:
        for key, vec in vec_info.items():
            values = "\t".join([str(v) for v in vec])
            f.write(f"{key}\t{values}\n")

  0%|          | 0/3 [00:00<?, ?it/s]
  0%|          | 0/3375 [00:00<?, ?it/s][A
  0%|          | 6/3375 [00:00<01:27, 38.33it/s][A
  0%|          | 16/3375 [00:00<01:36, 34.79it/s][A
  1%|          | 18/3375 [00:00<01:46, 31.56it/s][A
  1%|          | 25/3375 [00:00<01:30, 37.19it/s][A
  1%|          | 29/3375 [00:00<01:30, 36.99it/s][A
  1%|          | 34/3375 [00:00<01:28, 37.66it/s][A
  2%|▏         | 55/3375 [00:01<01:00, 54.83it/s][A
  2%|▏         | 64/3375 [00:01<00:59, 56.00it/s][A
  3%|▎         | 90/3375 [00:01<00:46, 71.21it/s][A
  4%|▍         | 141/3375 [00:01<00:31, 102.85it/s][A
  6%|▌         | 186/3375 [00:01<00:25, 125.79it/s][A
  6%|▋         | 215/3375 [00:01<00:23, 135.47it/s][A
  7%|▋         | 243/3375 [00:01<00:22, 139.81it/s][A
  8%|▊         | 271/3375 [00:01<00:21, 147.20it/s][A
 10%|█         | 344/3375 [00:02<00:17, 171.32it/s][A
 12%|█▏        | 389/3375 [00:02<00:16, 184.54it/s][A
 14%|█▍        | 465/3375 [00:02<00:13, 210.00it/s][A
 1

no vec



 77%|███████▋  | 7225/9329 [00:15<00:04, 460.44it/s][A
 80%|███████▉  | 7453/9329 [00:15<00:03, 471.93it/s][A
 82%|████████▏ | 7672/9329 [00:15<00:03, 482.75it/s][A
 85%|████████▍ | 7899/9329 [00:15<00:02, 493.92it/s][A
 87%|████████▋ | 8129/9329 [00:16<00:02, 505.11it/s][A
 90%|████████▉ | 8391/9329 [00:16<00:01, 518.14it/s][A
 92%|█████████▏| 8623/9329 [00:16<00:01, 529.02it/s][A

no vec



 95%|█████████▌| 8873/9329 [00:16<00:00, 541.02it/s][A
 98%|█████████▊| 9108/9329 [00:16<00:00, 551.89it/s][A
 67%|██████▋   | 2/3 [00:20<00:10, 10.40s/it]61it/s][A
  0%|          | 0/22493 [00:00<?, ?it/s][A
  0%|          | 7/22493 [00:00<05:26, 68.85it/s][A
  0%|          | 10/22493 [00:01<46:31,  8.05it/s][A
  0%|          | 13/22493 [00:03<1:40:42,  3.72it/s][A
  0%|          | 15/22493 [00:03<1:33:46,  3.99it/s][A
  0%|          | 17/22493 [00:11<4:16:12,  1.46it/s][A
  0%|          | 19/22493 [00:11<3:51:44,  1.62it/s][A
  0%|          | 20/22493 [00:12<3:47:24,  1.65it/s][A
  0%|          | 25/22493 [00:12<3:07:31,  2.00it/s][A
  0%|          | 27/22493 [00:12<2:57:39,  2.11it/s][A
  0%|          | 29/22493 [00:13<2:48:54,  2.22it/s][A
  0%|          | 32/22493 [00:23<4:30:50,  1.38it/s][A
  0%|          | 33/22493 [00:23<4:23:55,  1.42it/s][A
  0%|          | 35/22493 [00:23<4:11:58,  1.49it/s][A
  0%|          | 39/22493 [00:23<3:48:44,  1.64it/s][A
  0%|  

no vec



 18%|█▊        | 3965/22493 [01:05<05:07, 60.33it/s][A
 18%|█▊        | 4012/22493 [01:05<05:03, 60.95it/s][A
 18%|█▊        | 4075/22493 [01:05<04:57, 61.81it/s][A
 18%|█▊        | 4126/22493 [01:06<04:53, 62.49it/s][A
 19%|█▊        | 4185/22493 [01:06<04:49, 63.28it/s][A
 19%|█▉        | 4237/22493 [01:06<04:45, 63.91it/s][A
 19%|█▉        | 4299/22493 [01:06<04:41, 64.75it/s][A
 19%|█▉        | 4350/22493 [01:06<04:37, 65.40it/s][A
 20%|█▉        | 4433/22493 [01:06<04:31, 66.54it/s][A
 20%|█▉        | 4495/22493 [01:06<04:27, 67.37it/s][A
 20%|██        | 4554/22493 [01:06<04:23, 68.14it/s][A
 20%|██        | 4611/22493 [01:06<04:19, 68.88it/s][A
 21%|██        | 4681/22493 [01:07<04:15, 69.82it/s][A

no vec



 21%|██        | 4742/22493 [01:07<04:11, 70.59it/s][A
 21%|██▏       | 4798/22493 [01:07<04:08, 71.27it/s][A
 22%|██▏       | 4849/22493 [01:07<04:05, 71.89it/s][A
 22%|██▏       | 4908/22493 [01:07<04:02, 72.64it/s][A
 22%|██▏       | 4957/22493 [01:07<03:59, 73.24it/s][A
 22%|██▏       | 5004/22493 [01:07<03:56, 73.82it/s][A
 23%|██▎       | 5075/22493 [01:07<03:52, 74.76it/s][A
 23%|██▎       | 5144/22493 [01:07<03:49, 75.66it/s][A
 23%|██▎       | 5217/22493 [01:08<03:45, 76.61it/s][A
 23%|██▎       | 5279/22493 [01:08<03:42, 77.38it/s][A
 24%|██▎       | 5337/22493 [01:08<03:39, 78.09it/s][A
 24%|██▍       | 5392/22493 [01:08<03:37, 78.77it/s][A
 24%|██▍       | 5449/22493 [01:08<03:34, 79.48it/s][A
 25%|██▍       | 5527/22493 [01:08<03:30, 80.49it/s][A
 25%|██▍       | 5591/22493 [01:08<03:27, 81.30it/s][A
 25%|██▌       | 5651/22493 [01:08<03:25, 82.05it/s][A
 25%|██▌       | 5710/22493 [01:08<03:22, 82.76it/s][A
 26%|██▌       | 5767/22493 [01:09<03:20, 83.42

# Validation

## original model

In [371]:
client = MongoClient(f'mongodb://localhost:27017')
db = client.twitter_news_remote

diff_topics_in_url = defaultdict(set)
for url_id, tweet_ids_str in docs['libya'].items():
    for twid in tweet_ids_str:
        tweet_id = int(twid)
        if tweet_id in tweet_topic:
            topic_id = tweet_topic[tweet_id]
            diff_topics_in_url[url_id].add(topic_id)

for url_id, topics in diff_topics_in_url.items():
    if len(topics) > 1:
        print(len(url_indices['libya'][url_id]), len(docs['libya'][url_id]))
        if len(url_indices['libya'][url_id]) == 1:
            print(url_indices['libya'][url_id][0])
        for topic in topics:
            try:
                topic_m = db.topics.find_one({'_id': ObjectId(topic)})
                print(topic_m['topic_name'])
            except:
                print(topic)
        print()

119 6351
Confrontation with security forces
Context about attack
Hostages are taken
Car bomb explodes
Report of the attack
ISIS adjudicates attack
Report on the amount of casualties

22 766
Report on the amount of casualties
Hostages are taken
Car bomb explodes

1 5
http://twtly.com/so5/
Car bomb explodes
Report on the amount of casualties

3 39
Confrontation with security forces
Car bomb explodes

6 293
Car bomb explodes
Hostages are taken
Report on the amount of casualties

1 14
http://www.tastysuperfoods.com/
Report on the amount of casualties
Car bomb explodes

1 27
http://www.bostonglobe.com/news/world/2015/01/27/gunmen-storm-luxury-hotel-libya-take-hostage/QbhnAl6hKdPFUpdaJFFliK/story.html
Hostages are taken
Report on the amount of casualties

3 16
ISIS adjudicates attack
Hostages are taken



## model v2

In [417]:
client = MongoClient(f'mongodb://localhost:27017')
db = client.twitter_news_remote

for event_name in event_names:
    print(event_name)
    print("#" * 20)
    print()
    diff_topics_in_url = defaultdict(set)
    for url_id, tweet_ids_str in docs2[event_name].items():
        for twid in tweet_ids_str:
            tweet_id = int(twid)
            if tweet_id in tweet_topic:
                topic_id = tweet_topic[tweet_id]
                diff_topics_in_url[url_id].add(topic_id)

    for url_id, topics in diff_topics_in_url.items():
        if len(topics) > 1:
            print("urls:", len(url_indices2[event_name][url_id]), "tweets:", len(docs2[event_name][url_id]))

            #if len(url_indices2['libya'][url_id]) == 1:
            #    print(url_indices2['libya'][url_id][0])

            for u in url_indices2[event_name][url_id]:
                print(u)

            for topic in topics:
                try:
                    topic_m = db.topics.find_one({'_id': ObjectId(topic)})
                    print(topic_m['topic_name'])
                except:
                    print(topic)
            print()
    print()
    print()

libya
####################

urls: 1 tweets: 1057
http://edition.cnn.com/2015/01/27/middleeast/libya-corinthia-hotel-attack/index.html
Report of the attack
Context about attack
Confrontation with security forces
Report on the amount of casualties

urls: 1 tweets: 118
https://twitter.com/account/suspended
ISIS adjudicates attack
Hostages are taken

urls: 1 tweets: 1801
http://www.bbc.co.uk/news/world-africa-31001094
ISIS adjudicates attack
Confrontation with security forces
Report on the amount of casualties

urls: 1 tweets: 414
https://www.rt.com/news/226603-libya-tripoli-gunmen-seige/
Car bomb explodes
Report on the amount of casualties

urls: 10 tweets: 736
https://twitter.com/NewsOnTheMin/status/560053596830855168/photo/1
http://www.alwasat.ly/ar/mobile/article
https://twitter.com/charliewinter/status/560009425818484736/photo/1
https://sputniknews.com/middleeast/201501271017397901/
http://www.middleeasteye.net/news/gunmen-kill-guards-and-take-hostages-corinthia-hotel-libyas-tripoli-7

# analysis of largest component

In [418]:
results_component2 = []

# for each event
for event_name, _docs in docs2.items():
    largest_component = max([(component_id, len(component)) for component_id, component in _docs.items()], key=lambda c: c[1])
    
    component_id = largest_component[0]
    component = _docs[component_id]
    
    labels_pred_d = dict()
    labels_true_d = dict()
    for twid in component:
        tweet_id = int(twid)
        if tweet_id in tweet_topic:
            labels_pred_d[tweet_id] = 0  # only 1 cluster/component
            labels_true_d[tweet_id] = tweet_topic[tweet_id]
        
    labels_true = []
    labels_pred = []
    for tweet_id, topic_id in labels_true_d.items():
        labels_true.append(topic_id)
        labels_pred.append(labels_pred_d[tweet_id])
        
    ari = metrics.adjusted_rand_score(labels_true, labels_pred)
    ami = metrics.adjusted_mutual_info_score(labels_true, labels_pred)
    nmi = metrics.normalized_mutual_info_score(labels_true, labels_pred)
    hom = metrics.homogeneity_score(labels_true, labels_pred)
    com = metrics.completeness_score(labels_true, labels_pred)
    v_m = metrics.v_measure_score(labels_true, labels_pred)
    f_m = metrics.fowlkes_mallows_score(labels_true, labels_pred)
    ent = entropy2(labels_true, labels_pred)
    pur = purity2(labels_true, labels_pred)
    
    #'event_name method_name k_clusters adjusted_rand_index adjusted_mi nmi homogeneity completeness v_measure fm_score'
    res = Result(
        event_name=event_name,
        method_name="largest component",
        model=f"size={len(component)}",
        k_clusters=1,
        labels_size=len(labels_true),
        adjusted_rand_index=ari,
        adjusted_mi=ami,
        nmi=nmi,
        homogeneity=hom,
        completeness=com,
        v_measure=v_m,
        fm_score=f_m,
        entropy=ent,
        purity=pur
    )
    results_component2.append(res)
    
df = pd.DataFrame.from_records(results_component2, columns=Result._fields)
df

Unnamed: 0,event_name,method_name,model,k_clusters,labels_size,adjusted_rand_index,adjusted_mi,nmi,homogeneity,completeness,v_measure,fm_score,entropy,purity
0,libya,largest component,size=1801,1,6,0.0,-4.692243e-16,-1e-06,-1.279703e-16,1.0,-2.559405e-16,0.632456,1.251629,0.666667
1,pistorius,largest component,size=2673,1,12,0.0,2.45193e-16,2e-06,2.45193e-16,1.0,4.903861e-16,0.685344,0.979869,0.583333
2,nepal,largest component,size=55263,1,1,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0


## model v3

In [425]:
client = MongoClient(f'mongodb://localhost:27017')
db = client.twitter_news_remote

for event_name in event_names:
    print(event_name)
    print("#" * 20)
    print()
    diff_topics_in_url = defaultdict(set)
    for url_id, tweet_ids_str in docs3[event_name].items():
        for twid in tweet_ids_str:
            tweet_id = int(twid)
            if tweet_id in tweet_topic:
                topic_id = tweet_topic[tweet_id]
                diff_topics_in_url[url_id].add(topic_id)

    for url_id, topics in diff_topics_in_url.items():
        if len(topics) > 1:
            print("urls:", len(url_indices3[event_name][url_id]), "tweets:", len(docs3[event_name][url_id]))

            #if len(url_indices2['libya'][url_id]) == 1:
            #    print(url_indices2['libya'][url_id][0])

            for u in url_indices3[event_name][url_id]:
                print(u)

            for topic in topics:
                try:
                    topic_m = db.topics.find_one({'_id': ObjectId(topic)})
                    print(topic_m['topic_name'])
                except:
                    print(topic)
            print()
    print()
    print()

libya
####################

urls: 1 tweets: 1054
http://edition.cnn.com/2015/01/27/middleeast/libya-corinthia-hotel-attack/index.html
Report of the attack
Context about attack
Confrontation with security forces
Report on the amount of casualties

urls: 1 tweets: 118
https://twitter.com/account/suspended
ISIS adjudicates attack
Hostages are taken

urls: 1 tweets: 1800
http://www.bbc.co.uk/news/world-africa-31001094
ISIS adjudicates attack
Confrontation with security forces
Report on the amount of casualties

urls: 1 tweets: 412
https://www.rt.com/news/226603-libya-tripoli-gunmen-seige/
Car bomb explodes
Report on the amount of casualties

urls: 3 tweets: 332
https://www.wsj.com/articles/car-bomb-explodes-outside-luxury-hotel-in-libyas-capital-city-of-tripoli-1422351120
https://twitter.com/NewsOnTheMin/status/560015797469581312/photo/1
https://twitter.com/NewsOnTheMin/status/559994589906739200/photo/1
Report on the amount of casualties
Car bomb explodes

urls: 2 tweets: 396
https://www.l

# analysis of largest component v3

In [426]:
results_component3 = []

# for each event
for event_name, _docs in docs3.items():
    largest_component = max([(component_id, len(component)) for component_id, component in _docs.items()], key=lambda c: c[1])
    
    component_id = largest_component[0]
    component = _docs[component_id]
    
    labels_pred_d = dict()
    labels_true_d = dict()
    for twid in component:
        tweet_id = int(twid)
        if tweet_id in tweet_topic:
            labels_pred_d[tweet_id] = 0  # only 1 cluster/component
            labels_true_d[tweet_id] = tweet_topic[tweet_id]
        
    labels_true = []
    labels_pred = []
    for tweet_id, topic_id in labels_true_d.items():
        labels_true.append(topic_id)
        labels_pred.append(labels_pred_d[tweet_id])
        
    ari = metrics.adjusted_rand_score(labels_true, labels_pred)
    ami = metrics.adjusted_mutual_info_score(labels_true, labels_pred)
    nmi = metrics.normalized_mutual_info_score(labels_true, labels_pred)
    hom = metrics.homogeneity_score(labels_true, labels_pred)
    com = metrics.completeness_score(labels_true, labels_pred)
    v_m = metrics.v_measure_score(labels_true, labels_pred)
    f_m = metrics.fowlkes_mallows_score(labels_true, labels_pred)
    ent = entropy2(labels_true, labels_pred)
    pur = purity2(labels_true, labels_pred)
    
    #'event_name method_name k_clusters adjusted_rand_index adjusted_mi nmi homogeneity completeness v_measure fm_score'
    res = Result(
        event_name=event_name,
        method_name="largest component",
        model=f"size={len(component)}",
        k_clusters=1,
        labels_size=len(labels_true),
        adjusted_rand_index=ari,
        adjusted_mi=ami,
        nmi=nmi,
        homogeneity=hom,
        completeness=com,
        v_measure=v_m,
        fm_score=f_m,
        entropy=ent,
        purity=pur
    )
    results_component3.append(res)
    
df = pd.DataFrame.from_records(results_component3, columns=Result._fields)
df

Unnamed: 0,event_name,method_name,model,k_clusters,labels_size,adjusted_rand_index,adjusted_mi,nmi,homogeneity,completeness,v_measure,fm_score,entropy,purity
0,libya,largest component,size=1800,1,6,0.0,-4.692243e-16,-1e-06,-1.279703e-16,1.0,-2.559405e-16,0.632456,1.251629,0.666667
1,pistorius,largest component,size=2673,1,12,0.0,2.45193e-16,2e-06,2.45193e-16,1.0,4.903861e-16,0.685344,0.979869,0.583333
2,nepal,largest component,size=55171,1,1,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0


In [427]:
Result = namedtuple('Result', 'event_name method_name model k_clusters labels_size adjusted_rand_index adjusted_mi nmi homogeneity completeness v_measure fm_score entropy purity')

results_model = []

clustering_files = sorted(Path('data_local_events/clustering_results/').glob('*.tsv'))

for cl_file in clustering_files:
    # e.g. libya_kmeans_11.tsv
    event_name, method_name, k_clusters = cl_file.name.split('_')
    k_clusters = int(k_clusters.split('.')[0])   # e.g. "11.tsv"
    _cluster_data = dict()
    
    ##### read cluster file
    with cl_file.open() as f:
        for line in f:
            tokens = line.split('\t')
            key, cluster_id = int(tokens[0]), int(tokens[1][:-1])  # hash(urls)\tcluster_id\n
            _cluster_data[key] = cluster_id

    ###### labels_true_d
    """
    las componentes sacadas de uf no consideran componentes sin url
    event_data tiene a todos los tweets
    en la evaluacion hubo tweets sin URL etiquetados
    por lo tanto, este loop saca más tweets etiquetados que restringirlos a solo los del clustering 
    (1000 vs 400 en libya)
    
    for tweet_id_str in models[event_name]['event_data']:
        tweet_id = int(tweet_id_str)
        if tweet_id in tweet_topic:
            labels_true_d[tweet_id] = tweet_topic[tweet_id]
    """
    
    ###### labels
    labels_true_d = dict()
    labels_pred_d = dict()
    for key, tweet_ids_str in docs[event_name].items():
        cluster_id = _cluster_data.get(key)
        if not cluster_id:
            continue
        for twid_str in tweet_ids_str:
            tweet_id = int(twid_str)
            if tweet_id in tweet_topic:
                labels_pred_d[tweet_id] = cluster_id
                labels_true_d[tweet_id] = tweet_topic[tweet_id]
                
    labels_true = []
    labels_pred = []
    for tweet_id, topic_id in labels_true_d.items():
        labels_true.append(topic_id)
        labels_pred.append(labels_pred_d[tweet_id])
        
    ari = metrics.adjusted_rand_score(labels_true, labels_pred)
    ami = metrics.adjusted_mutual_info_score(labels_true, labels_pred)
    nmi = metrics.normalized_mutual_info_score(labels_true, labels_pred)
    hom = metrics.homogeneity_score(labels_true, labels_pred)
    com = metrics.completeness_score(labels_true, labels_pred)
    v_m = metrics.v_measure_score(labels_true, labels_pred)
    f_m = metrics.fowlkes_mallows_score(labels_true, labels_pred)
    ent = entropy2(labels_true, labels_pred)
    pur = purity2(labels_true, labels_pred)
    
    #'event_name method_name k_clusters adjusted_rand_index adjusted_mi nmi homogeneity completeness v_measure fm_score'
    res = Result(
        event_name=event_name,
        method_name=method_name,
        model="model",
        k_clusters=k_clusters,
        labels_size=len(labels_true),
        adjusted_rand_index=ari,
        adjusted_mi=ami,
        nmi=nmi,
        homogeneity=hom,
        completeness=com,
        v_measure=v_m,
        fm_score=f_m,
        entropy=ent,
        purity=pur
    )

    results_model.append(res)

In [428]:
df = pd.DataFrame.from_records(results_raw + results_model, columns=Result._fields)
df.to_csv('/home/mquezada/news-model-git/news-model/data_local_events/clustering_results/results.csv', index=False)
df

Unnamed: 0,event_name,method_name,model,k_clusters,labels_size,adjusted_rand_index,adjusted_mi,nmi,homogeneity,completeness,v_measure,fm_score,entropy,purity
0,libya,hc-euclidean-ward,baseline,12,483,0.141648,0.343187,0.378428,0.360146,0.397638,0.377965,0.379478,1.454196,0.592133
1,pistorius,kmeans,baseline,5,56,0.215724,0.291890,0.473205,0.400672,0.558870,0.466730,0.373253,1.726743,0.500000
2,pistorius,hc-euclidean-ward,baseline,4,56,0.280626,0.342512,0.547862,0.414613,0.723934,0.527255,0.465003,1.686575,0.517857
3,libya,kmeans,baseline,7,483,0.452592,0.446485,0.497683,0.455783,0.543434,0.495764,0.598191,1.236841,0.687371
4,nepal,kmeans,baseline,4,45,0.742925,0.698187,0.798194,0.726694,0.876729,0.794692,0.820283,0.552616,0.844444
5,pistorius,hc-euclidean-ward,baseline,2,56,0.148371,0.182682,0.372794,0.218000,0.637502,0.324898,0.402747,2.253044,0.321429
6,nepal,kmeans,baseline,9,45,0.865494,0.788568,0.880615,0.939697,0.825247,0.878761,0.898690,0.121930,0.977778
7,nepal,hc-euclidean-ward,baseline,12,45,0.865487,0.768323,0.862767,0.924204,0.805415,0.860730,0.899118,0.153258,0.955556
8,nepal,kmeans,baseline,10,45,0.865494,0.788568,0.880615,0.939697,0.825247,0.878761,0.898690,0.121930,0.977778
9,libya,hc-euclidean-ward,baseline,10,483,0.181497,0.336665,0.386857,0.351164,0.426177,0.385051,0.415540,1.474608,0.592133


## model v3: without single tweets

In [408]:
client = MongoClient(f'mongodb://localhost:27017')
db = client.twitter_news_remote

for event_name in event_names:
    print(event_name)
    print("#" * 20)
    print()
    diff_topics_in_url = defaultdict(set)
    for url_id, tweet_ids_str in docs2[event_name].items():
        for twid in tweet_ids_str:
            tweet_id = int(twid)
            if tweet_id in tweet_topic:
                topic_id = tweet_topic[tweet_id]
                diff_topics_in_url[url_id].add(topic_id)

    for url_id, topics in diff_topics_in_url.items():
        if len(topics) > 1:
            print("urls:", len(url_indices2[event_name][url_id]), "tweets:", len(docs2[event_name][url_id]))

            #if len(url_indices2['libya'][url_id]) == 1:
            #    print(url_indices2['libya'][url_id][0])

            for u in url_indices2[event_name][url_id]:
                print(u)

            for topic in topics:
                try:
                    topic_m = db.topics.find_one({'_id': ObjectId(topic)})
                    print(topic_m['topic_name'])
                except:
                    print(topic)
            print()
    print()
    print()

libya
####################

urls: 1 tweets: 1057
http://edition.cnn.com/2015/01/27/middleeast/libya-corinthia-hotel-attack/index.html
Report of the attack
Context about attack
Confrontation with security forces
Report on the amount of casualties

urls: 1 tweets: 118
https://twitter.com/account/suspended
ISIS adjudicates attack
Hostages are taken

urls: 1 tweets: 1801
http://www.bbc.co.uk/news/world-africa-31001094
ISIS adjudicates attack
Confrontation with security forces
Report on the amount of casualties

urls: 1 tweets: 414
https://www.rt.com/news/226603-libya-tripoli-gunmen-seige/
Car bomb explodes
Report on the amount of casualties

urls: 10 tweets: 736
https://twitter.com/NewsOnTheMin/status/560053596830855168/photo/1
http://www.alwasat.ly/ar/mobile/article
https://twitter.com/charliewinter/status/560009425818484736/photo/1
https://sputniknews.com/middleeast/201501271017397901/
http://www.middleeasteye.net/news/gunmen-kill-guards-and-take-hostages-corinthia-hotel-libyas-tripoli-7

# analysis of largest component

In [405]:
results_component2 = []

# for each event
for event_name, _docs in docs2.items():
    largest_component = max([(component_id, len(component)) for component_id, component in _docs.items()], key=lambda c: c[1])
    
    component_id = largest_component[0]
    component = _docs[component_id]
    
    labels_pred_d = dict()
    labels_true_d = dict()
    for twid in component:
        tweet_id = int(twid)
        if tweet_id in tweet_topic:
            labels_pred_d[tweet_id] = 0  # only 1 cluster/component
            labels_true_d[tweet_id] = tweet_topic[tweet_id]
        
    labels_true = []
    labels_pred = []
    for tweet_id, topic_id in labels_true_d.items():
        labels_true.append(topic_id)
        labels_pred.append(labels_pred_d[tweet_id])
        
    ari = metrics.adjusted_rand_score(labels_true, labels_pred)
    ami = metrics.adjusted_mutual_info_score(labels_true, labels_pred)
    nmi = metrics.normalized_mutual_info_score(labels_true, labels_pred)
    hom = metrics.homogeneity_score(labels_true, labels_pred)
    com = metrics.completeness_score(labels_true, labels_pred)
    v_m = metrics.v_measure_score(labels_true, labels_pred)
    f_m = metrics.fowlkes_mallows_score(labels_true, labels_pred)
    ent = entropy2(labels_true, labels_pred)
    pur = purity2(labels_true, labels_pred)
    
    #'event_name method_name k_clusters adjusted_rand_index adjusted_mi nmi homogeneity completeness v_measure fm_score'
    res = Result(
        event_name=event_name,
        method_name="largest component",
        model=f"size={len(component)}",
        k_clusters=1,
        labels_size=len(labels_true),
        adjusted_rand_index=ari,
        adjusted_mi=ami,
        nmi=nmi,
        homogeneity=hom,
        completeness=com,
        v_measure=v_m,
        fm_score=f_m,
        entropy=ent,
        purity=pur
    )
    results_component2.append(res)
    
df = pd.DataFrame.from_records(results_component2, columns=Result._fields)
df

Unnamed: 0,event_name,method_name,model,k_clusters,labels_size,adjusted_rand_index,adjusted_mi,nmi,homogeneity,completeness,v_measure,fm_score,entropy,purity
0,libya,largest component,size=1801,1,6,0.0,-4.692243e-16,-1e-06,-1.279703e-16,1.0,-2.559405e-16,0.632456,1.251629,0.666667
1,pistorius,largest component,size=2673,1,12,0.0,2.45193e-16,2e-06,2.45193e-16,1.0,4.903861e-16,0.685344,0.979869,0.583333
2,nepal,largest component,size=55263,1,1,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0


In [409]:
Result = namedtuple('Result', 'event_name method_name model k_clusters labels_size adjusted_rand_index adjusted_mi nmi homogeneity completeness v_measure fm_score entropy purity')

results_model = []

clustering_files = sorted(Path('data_local_events/clustering_results/').glob('*.tsv'))

for cl_file in clustering_files:
    # e.g. libya_kmeans_11.tsv
    event_name, method_name, k_clusters = cl_file.name.split('_')
    k_clusters = int(k_clusters.split('.')[0])   # e.g. "11.tsv"
    _cluster_data = dict()
    
    ##### read cluster file
    with cl_file.open() as f:
        for line in f:
            tokens = line.split('\t')
            key, cluster_id = int(tokens[0]), int(tokens[1][:-1])  # hash(urls)\tcluster_id\n
            _cluster_data[key] = cluster_id

    ###### labels_true_d
    """
    las componentes sacadas de uf no consideran componentes sin url
    event_data tiene a todos los tweets
    en la evaluacion hubo tweets sin URL etiquetados
    por lo tanto, este loop saca más tweets etiquetados que restringirlos a solo los del clustering 
    (1000 vs 400 en libya)
    
    for tweet_id_str in models[event_name]['event_data']:
        tweet_id = int(tweet_id_str)
        if tweet_id in tweet_topic:
            labels_true_d[tweet_id] = tweet_topic[tweet_id]
    """
    
    ###### labels
    labels_true_d = dict()
    labels_pred_d = dict()
    for key, tweet_ids_str in docs[event_name].items():
        cluster_id = _cluster_data.get(key)
        if not cluster_id:
            continue
        for twid_str in tweet_ids_str:
            tweet_id = int(twid_str)
            if tweet_id in tweet_topic:
                labels_pred_d[tweet_id] = cluster_id
                labels_true_d[tweet_id] = tweet_topic[tweet_id]
                
    labels_true = []
    labels_pred = []
    for tweet_id, topic_id in labels_true_d.items():
        labels_true.append(topic_id)
        labels_pred.append(labels_pred_d[tweet_id])
        
    ari = metrics.adjusted_rand_score(labels_true, labels_pred)
    ami = metrics.adjusted_mutual_info_score(labels_true, labels_pred)
    nmi = metrics.normalized_mutual_info_score(labels_true, labels_pred)
    hom = metrics.homogeneity_score(labels_true, labels_pred)
    com = metrics.completeness_score(labels_true, labels_pred)
    v_m = metrics.v_measure_score(labels_true, labels_pred)
    f_m = metrics.fowlkes_mallows_score(labels_true, labels_pred)
    ent = entropy2(labels_true, labels_pred)
    pur = purity2(labels_true, labels_pred)
    
    #'event_name method_name k_clusters adjusted_rand_index adjusted_mi nmi homogeneity completeness v_measure fm_score'
    res = Result(
        event_name=event_name,
        method_name=method_name,
        model="model",
        k_clusters=k_clusters,
        labels_size=len(labels_true),
        adjusted_rand_index=ari,
        adjusted_mi=ami,
        nmi=nmi,
        homogeneity=hom,
        completeness=com,
        v_measure=v_m,
        fm_score=f_m,
        entropy=ent,
        purity=pur
    )

    results_model.append(res)

In [411]:
df = pd.DataFrame.from_records(results_raw + results_model, columns=Result._fields)
df.to_csv('/home/mquezada/news-model-git/news-model/data_local_events/clustering_results/results.csv', index=False)
df

Unnamed: 0,event_name,method_name,model,k_clusters,labels_size,adjusted_rand_index,adjusted_mi,nmi,homogeneity,completeness,v_measure,fm_score,entropy,purity
0,libya,hc-euclidean-ward,baseline,12,483,0.141648,0.343187,0.378428,0.360146,0.397638,0.377965,0.379478,1.454196,0.592133
1,pistorius,kmeans,baseline,5,56,0.215724,0.291890,0.473205,0.400672,0.558870,0.466730,0.373253,1.726743,0.500000
2,pistorius,hc-euclidean-ward,baseline,4,56,0.280626,0.342512,0.547862,0.414613,0.723934,0.527255,0.465003,1.686575,0.517857
3,libya,kmeans,baseline,7,483,0.452592,0.446485,0.497683,0.455783,0.543434,0.495764,0.598191,1.236841,0.687371
4,nepal,kmeans,baseline,4,45,0.742925,0.698187,0.798194,0.726694,0.876729,0.794692,0.820283,0.552616,0.844444
5,pistorius,hc-euclidean-ward,baseline,2,56,0.148371,0.182682,0.372794,0.218000,0.637502,0.324898,0.402747,2.253044,0.321429
6,nepal,kmeans,baseline,9,45,0.865494,0.788568,0.880615,0.939697,0.825247,0.878761,0.898690,0.121930,0.977778
7,nepal,hc-euclidean-ward,baseline,12,45,0.865487,0.768323,0.862767,0.924204,0.805415,0.860730,0.899118,0.153258,0.955556
8,nepal,kmeans,baseline,10,45,0.865494,0.788568,0.880615,0.939697,0.825247,0.878761,0.898690,0.121930,0.977778
9,libya,hc-euclidean-ward,baseline,10,483,0.181497,0.336665,0.386857,0.351164,0.426177,0.385051,0.415540,1.474608,0.592133


In [None]:
tweet_topic