In [10]:
from pymongo import MongoClient
from web.mongo_remote_password import user, password, ip
from pprint import pprint
from collections import Counter, defaultdict
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm
import pandas as pd
from unionfind import UnionFind
import logging

from gensim.models import KeyedVectors
from pathlib import Path
import json
import spacy
from gen_model import gen_model, load_data

from pathlib import Path
from bson.objectid import ObjectId
from sklearn import metrics
from collections import namedtuple

# Configure the root logger: INFO level, every record prefixed with its timestamp.
logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.INFO)

%matplotlib inline

# MongoDB connection, currently disabled (kept commented out for reference).
#client = MongoClient(f'mongodb://{user}:{password}@{ip}:27017')
#db = client.twitter_news

# Relative directory with the topic-labeling result files.
topic_labeling_results_path = Path('topic_labeling_results/')

2018-11-19 16:24:50,356 : loading short urls
2018-11-19 16:24:58,057 : loaded 1532034 urls
2018-11-19 16:24:58,058 : loading expanded urls
2018-11-19 16:25:02,955 : loaded 2214565 urls
2018-11-19 16:25:02,956 : cleaning url residual info


In [11]:
# spaCy English pipeline with the tagger, parser and NER components switched off.
nlp = spacy.load(
    'en',
    disable=["tagger", "parser", "ner"],
)

# Word vectors read from a word2vec-format file (absolute, machine-specific path).
we = KeyedVectors.load_word2vec_format(
    '/home/mquezada/anchor-text-twitter/data/ft_alltweets_model.vec'
)

2018-11-19 16:25:45,332 : loading projection weights from /home/mquezada/anchor-text-twitter/data/ft_alltweets_model.vec
2018-11-19 16:26:48,987 : loaded (1076139, 100) matrix from /home/mquezada/anchor-text-twitter/data/ft_alltweets_model.vec


# Model

In [10]:
def gen_model(dataset_name, ignore_wo_url=False, ignore_replies=False):
    """Build a union-find model that links the tweets of one dataset.

    Every tweet is expanded into one node per URL it carries, named
    ``{tweet_id}_{i}`` where ``i`` is the position of the URL; a tweet without
    URLs becomes the single node ``{tweet_id}_0``. Each node is then joined
    with the URL it points to and, for replies whose target is present in the
    dataset, with the node ``{reply_id}_{i}`` of the replied tweet.

    Args:
        dataset_name: dataset identifier, forwarded unchanged to ``load_data``.
        ignore_wo_url: when True, tweets without URLs get no node of their own.
        ignore_replies: when True, reply relations add neither nodes nor pairs.

    Returns:
        dict with keys ``'uf'`` (the populated ``UnionFind``) and
        ``'event_data'`` (the first value returned by ``load_data``).
    """
    event_data, missing_urls_amount = load_data(dataset_name)

    ##########
    # create set of tweet_ids
    # for a given tweet t:
    # if t does not have urls: add a tweet_id {t.id}_0
    # for each url_i in t: add a tweet_id {t.id}_{i}
    # for each url_i in t: add a tweet_id {t.reply_id}_{i}
    ##########
    tweet_ids = set()
    logging.info("create list of tweet_ids")

    for tweet_id, tweet in tqdm(event_data.items(), total=len(event_data)):
        if not tweet.expanded_urls:
            if ignore_wo_url:
                # no node was created for this tweet, so its reply target is skipped too
                continue
            tweet_ids.add(f'{tweet_id}_0')
        else:
            for i in range(len(tweet.expanded_urls)):
                tweet_ids.add(f'{tweet_id}_{i}')

        replied_is_known = tweet.reply_id != 'NULL' and tweet.reply_id in event_data
        if replied_is_known and not ignore_replies:
            # one node of the replied tweet per URL of the replying tweet
            for i in range(len(tweet.expanded_urls)):
                tweet_ids.add(f'{tweet.reply_id}_{i}')

    ##########
    # for each tweet_id in the set of tweet_ids
    # add a pair
    ##########
    logging.info("create pairs (t, u) or (t, t') for each tweet t and url u or replied/retweeted tweet t'")
    replies_amount = 0
    retweets_amount = 0
    quotes_amount = 0
    missing_replies_amount = 0
    pairs = []

    for tweet_id in tweet_ids:
        # split on the LAST underscore only: it is the one separating the
        # original tweet id from the URL index appended above
        o_tweet_id, index = tweet_id.rsplit('_', 1)
        i = int(index)

        tweet = event_data[o_tweet_id]

        url = tweet.expanded_urls.get(i)
        if url:
            pairs.append((tweet_id, url))

        # retweets ARE considered, due to be exact text copies of the retweeted tweet
        # (the counters below are per node and only feed the log line)
        if tweet.retweet_id != 'NULL':
            retweets_amount += 1
        if tweet.quote_id != 'NULL':
            quotes_amount += 1
        if tweet.reply_id != 'NULL':
            replies_amount += 1

            if tweet.reply_id in event_data:
                if not ignore_replies:
                    pairs.append((tweet_id, f'{tweet.reply_id}_{i}'))
            else:
                missing_replies_amount += 1

    logging.info(f'total pairs: {len(pairs)}, retweets: {retweets_amount}, quotes: {quotes_amount}, replies: {replies_amount} '
                 f'(missing: {missing_replies_amount}, missing urls: {missing_urls_amount})')

    ##########
    # all keys must be of the same type (in this case, strings);
    # unionfind will vectorize operations and will cast everything in the array
    # to the same type, so if there are integers and strings, it will cast
    # everything to string and comparisons will fail when calling
    # uf.components().
    ##########
    logging.info('applying union-find')
    uf = UnionFind()
    for u, v in pairs:
        uf.union(u, v)
    logging.info(f'total components: {len(uf.components())}')
    logging.info('\n')

    return {
        'uf': uf,
        'event_data': event_data
    }


# event_name -> dict returned by gen_model (keys 'uf' and 'event_data').
# One model per (variant, dataset) combination; variants are the outer loop so
# that keys are inserted as: all plain models, then *_no_url, *_no_rep and
# finally *_no_url_no_rep.
models = {
    f'{event}{suffix}': gen_model(filename, **options)
    for suffix, options in (
        ('', {}),
        ('_no_url', {'ignore_wo_url': True}),
        ('_no_rep', {'ignore_replies': True}),
        ('_no_url_no_rep', {'ignore_wo_url': True, 'ignore_replies': True}),
    )
    for event, filename in (
        ('libya', 'libya_hotel_tweets.tsv'),
        ('pistorius', 'oscar_pistorius_tweets.tsv'),
        ('nepal', 'nepal_tweets.tsv'),
    )
}

2018-11-19 12:20:31,020 : load and clean dataset: libya_hotel_tweets.tsv
2018-11-19 12:20:31,165 : tweets processed: 26331, ignored: 2309, missing urls: 6341
2018-11-19 12:20:31,166 : create list of tweet_ids
100%|██████████| 26331/26331 [00:00<00:00, 799445.65it/s]
2018-11-19 12:20:31,201 : create pairs (t, u) or (t, t') for each tweet t and url u or replied/retweeted tweet t'
2018-11-19 12:20:31,255 : total pairs: 20127, retweets: 13389, quotes: 0, replies: 313 (missing: 131, missing urls: 6341)
2018-11-19 12:20:31,255 : applying union-find
2018-11-19 12:20:31,411 : total components: 3399
2018-11-19 12:20:31,411 : 

2018-11-19 12:20:31,413 : load and clean dataset: oscar_pistorius_tweets.tsv
2018-11-19 12:20:31,894 : tweets processed: 112260, ignored: 955, missing urls: 21807
2018-11-19 12:20:31,894 : create list of tweet_ids
100%|██████████| 112260/112260 [00:00<00:00, 938171.98it/s]
2018-11-19 12:20:32,016 : create pairs (t, u) or (t, t') for each tweet t and url u or replied/retwe

In [12]:
# For every model, show the sizes of its 15 largest union-find components.
for _ename, _model in models.items():
    sizes = sorted(map(len, _model['uf'].components()), reverse=True)
    print(f'{_ename}\n{sizes[:15]}\n')

libya
[1802, 1058, 746, 653, 415, 398, 385, 237, 236, 187, 186, 148, 145, 140, 125]

pistorius
[2674, 1003, 873, 766, 748, 720, 692, 619, 490, 464, 423, 408, 405, 405, 399]

nepal
[55309, 45063, 12120, 4439, 3885, 3161, 3067, 2640, 2607, 2482, 2304, 2299, 2246, 2170, 2138]

libya_no_url
[1801, 1055, 664, 653, 415, 398, 385, 237, 236, 187, 186, 148, 145, 139, 125]

pistorius_no_url
[2674, 998, 873, 766, 748, 720, 690, 618, 490, 464, 423, 408, 405, 405, 399]

nepal_no_url
[55249, 45023, 12116, 4439, 3867, 3159, 3063, 2639, 2605, 2482, 2304, 2297, 2246, 2170, 2136]

libya_no_rep
[1801, 1055, 653, 413, 385, 375, 273, 237, 236, 187, 186, 148, 145, 139, 127]

pistorius_no_rep
[2674, 998, 832, 766, 748, 720, 690, 618, 490, 464, 423, 408, 405, 405, 399]

nepal_no_rep
[22269, 14115, 11147, 7947, 6699, 6335, 4688, 4439, 3994, 3936, 3134, 3102, 3061, 2869, 2718]

libya_no_url_no_rep
[1801, 1055, 653, 413, 385, 375, 273, 237, 236, 187, 186, 148, 145, 139, 127]

pistorius_no_url_no_rep
[2674, 998, 

In [14]:
# Map tweet id -> topic id, read from a two-column, tab-separated file.
tweet_topic = dict()
with open('topic_labeling_results/tweet_topic.tsv') as f:
    for line in f:
        # Drop only the line terminator; slicing off the last character would
        # truncate the topic id when the final line has no trailing newline.
        line = line.rstrip('\n')
        if not line:
            continue  # tolerate blank lines (e.g. a trailing empty line)
        tweet_id, topic_id = line.split('\t')
        tweet_topic[tweet_id] = topic_id

len(tweet_topic)

1654

# Generate docs, compute vectors

In [None]:
docs = {}

for _ename, _model in models.items():
    logging.info(_ename)
    
    component_key = []
    component_values = []
    
    for component in _model['uf'].components():
        for element in component:
            if element.startswith('http:'):
                component_key.append(element)
            else:
                tweet = _model['event_data'][element]
                component_values.append(tweet.tweet_id)
                
    if not component_key:
        

In [2]:
from word2vec_twitter.word2vecReader import Word2Vec

# Load a word2vec model stored in binary format from the working directory
# (presumably trained on Twitter data, going by the reader's package name — TODO confirm).
w2v = Word2Vec.load_word2vec_format('word2vec_twitter_model.bin', binary=True)

In [22]:
# Query term looked up in both embedding models.
inp = 'libya'

# Neighbours according to `w2v`; each result unpacks into three fields, of
# which only the first two are printed as a pair.
for t, s, _ in w2v.most_similar(inp):
    print((t, s))
    
# Neighbours according to `we`; as the last expression of the cell, the
# returned value is displayed as the cell output.
we.most_similar(inp)

('iran', 0.6382494568824768)
('afganistan', 0.6247891187667847)
('iraq', 0.6207890510559082)
('bosnia', 0.6153123378753662)
('egypt', 0.6109287738800049)
('waziristan', 0.6071416139602661)
('kosovo', 0.6057484745979309)
('algeria', 0.604259729385376)
('rwanda', 0.6036934852600098)
('Sirte', 0.6035150289535522)


[('#libya', 0.8835678696632385),
 ('inlibya', 0.8677065372467041),
 ('liby', 0.8656959533691406),
 ('nlibya', 0.8640224933624268),
 ('-libya', 0.8595587611198425),
 ('libyan', 0.8530803918838501),
 ('libya\\', 0.8520067930221558),
 ('libyas', 0.8493448495864868),
 ('libyavoic', 0.8482226133346558),
 ('libyalibyas', 0.8432244062423706)]

2018-11-19 16:27:05,245 : precomputing L2-norms of word weight vectors


[('pistorius-', 0.9676041007041931),
 ('pistoriuss', 0.9662495851516724),
 ('-pistorius', 0.9560325145721436),
 ('#pistorius', 0.9548238515853882),
 ('pistoriu', 0.954453706741333),
 ('sca.#pistorius', 0.9541462659835815),
 ('pistori', 0.9444199204444885),
 ('#pistoriuss', 0.9405568838119507),
 ('oscarpistorius', 0.9321249723434448),
 ('cnnpistorius', 0.9316268563270569)]