In [57]:
from pymongo import MongoClient
from bson.objectid import ObjectId
from collections import OrderedDict
import random
import logging
from datetime import datetime
from functools import lru_cache
from paths import data_path
from pathlib import Path
from gensim.models import KeyedVectors
from tqdm import tqdm
from pprint import pprint
import spacy
from nltk.corpus import stopwords
import pyemd
import numpy as np
from collections import Counter
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.feature_extraction.text import TfidfVectorizer

In [8]:
# Timestamped INFO-level logging; `_info` is a short alias for `logging.info`.
logging.basicConfig(level=logging.INFO, format='%(asctime)s : %(message)s')
_info = logging.info

# Local MongoDB server; every query goes through the `twitter_news` database.
client = MongoClient('mongodb://localhost:27017')
db = client['twitter_news']

counters = {}

# Also used as the LRU cache size of the per-event getters.
total_events = 3

@lru_cache(maxsize=total_events)
def get_representatives(event_id):
    """Return, as a list, the `representatives` documents of one event.

    The query matches documents whose `event` field equals
    `ObjectId(event_id)`.  Results are memoised per `event_id`.
    """
    _info("getting representatives")
    cursor = db.representatives.find({'event': ObjectId(event_id)})
    return [representative for representative in cursor]


@lru_cache(maxsize=total_events)
def get_topics(event_id):
    """Return the topics of an event, separated from its "Non relevant" topic.

    Parameters
    ----------
    event_id
        Identifier of the event; it is wrapped in `ObjectId` for the query.

    Returns
    -------
    (topics, comodin)
        `topics` is the list of topic documents without the first one named
        "Non relevant"; `comodin` is that document, or `None` when the event
        has no such topic (previously this case raised `UnboundLocalError`).

    The result is memoised, so repeated calls hand back the same list object;
    callers should not mutate it.
    """
    _info("getting topics")
    comodin = None
    topics = []
    for t in db.topics.find({'event': ObjectId(event_id)}):
        # Only the first "Non relevant" topic is pulled out; any further one
        # stays in the regular list, as before.
        if comodin is None and t['topic_name'] == "Non relevant":
            comodin = t
        else:
            topics.append(t)
    return topics, comodin


@lru_cache(maxsize=1)
def get_events():
    """Return every document of the `events` collection as a list (memoised)."""
    _info("getting events")
    return [event for event in db.events.find()]


@lru_cache(maxsize=1)
def get_tweets(a=None):
    """Return every document of the `tweets` collection as a list (memoised).

    `a` is not read by the body; it only takes part in the cache key.
    """
    _info('getting all tweets')
    return [tweet for tweet in db.tweets.find()]


# Four embedding files are loaded by this notebook.  With the previous
# maxsize of 3, loading them in a fixed order evicted the oldest entry each
# time, so re-running the loading cell never hit the cache.
@lru_cache(maxsize=4)
def get_vectors(path):
    """Load word vectors stored in word2vec text format, memoised by `path`.

    Parameters
    ----------
    path : str
        Location of the vectors file (must be hashable to act as cache key).

    Returns
    -------
    Whatever `KeyedVectors.load_word2vec_format(path)` returns.
    """
    # The message says "fasttext" for every file, including the word2vec ones.
    _info(f"loading fasttext vectors from {path}")
    return KeyedVectors.load_word2vec_format(path)

In [9]:
# Only tokenisation and the dependency parse (for `token.head`) are used by
# this notebook.  The spaCy 1.x switches `tagger=False, entity=False,
# matcher=False` are not recognised by the 2.x loader, so components are
# turned off through `disable` instead.
nlp = spacy.load('en_core_web_sm', disable=['tagger', 'ner'])

def hashtag_pipe(doc):
    """Pipeline component that merges each '#' token into a longer span.

    For every token whose text is '#', the character span starting at that
    token and extending `len(token.head.text) + 1` characters is passed to
    `doc.merge`.  After a successful merge (non-None result) the scan restarts
    from the beginning; the function returns once a full pass merges nothing.

    NOTE(review): the span length comes from the token's syntactic head,
    presumably on the assumption that the head is the word following '#'.
    Confirm this against real parses.
    """
    while True:
        merged_any = False
        for token in doc:
            if token.text != '#' or token.head is None:
                continue
            span_start = token.idx
            span_end = span_start + len(token.head.text) + 1
            if doc.merge(span_start, span_end) is not None:
                # The token sequence changed under us: rescan from the top.
                merged_any = True
                break
        if not merged_any:
            return doc

# Register the hashtag merger as a pipeline component (no position argument is given).
nlp.add_pipe(hashtag_pipe)

In [10]:
# All four embedding files sit in the same directory.  The earlier form
# `data_path / Path('/home/...')` resolved to exactly these locations: joining
# onto an absolute path discards the left operand, so `data_path` never
# influenced the result.
# NOTE(review): if the files are meant to live under `data_path`, switch this
# to `vectors_dir = data_path` after confirming where `data_path` points.
vectors_dir = Path('/home/mquezada/anchor-text-twitter/data')

path = vectors_dir / 'ft_alltweets_model.vec'
ft_comp = get_vectors(path.as_posix())

path = vectors_dir / 'ft_all_tweets_1line1tweet.vec'
ft_all = get_vectors(path.as_posix())

path = vectors_dir / 'w2v_all_tweets.txt'
w2v_all = get_vectors(path.as_posix())

path = vectors_dir / 'w2v_all_tweets_1_line_1_component.txt'
w2v_comp = get_vectors(path.as_posix())

2018-06-20 15:46:20,163 : loading fasttext vectors from /home/mquezada/anchor-text-twitter/data/ft_alltweets_model.vec
2018-06-20 15:46:20,164 : loading projection weights from /home/mquezada/anchor-text-twitter/data/ft_alltweets_model.vec
2018-06-20 15:47:22,853 : loaded (1076139, 100) matrix from /home/mquezada/anchor-text-twitter/data/ft_alltweets_model.vec
2018-06-20 15:47:22,854 : loading fasttext vectors from /home/mquezada/anchor-text-twitter/data/ft_all_tweets_1line1tweet.vec
2018-06-20 15:47:22,854 : loading projection weights from /home/mquezada/anchor-text-twitter/data/ft_all_tweets_1line1tweet.vec
2018-06-20 15:48:29,089 : loaded (1157600, 100) matrix from /home/mquezada/anchor-text-twitter/data/ft_all_tweets_1line1tweet.vec
2018-06-20 15:48:29,090 : loading fasttext vectors from /home/mquezada/anchor-text-twitter/data/w2v_all_tweets.txt
2018-06-20 15:48:29,090 : loading projection weights from /home/mquezada/anchor-text-twitter/data/w2v_all_tweets.txt
2018-06-20 15:51:27,5

In [41]:
# Sanity check: similarity of one word pair under each embedding model.
w = 'iphone'
w2 = 'earthquake'

labelled_models = [
    ("ft_comp", ft_comp),
    ("ft_all", ft_all),
    ("w2v_comp", w2v_comp),
    ("w2v_all", w2v_all),
]

for position, (label, model) in enumerate(labelled_models):
    if position:
        print()  # blank line between consecutive models
    print(label)
    pprint(model.similarity(w, w2))


ft_comp
0.33911799442295903

ft_all
0.3300593981594927

w2v_comp
0.09744087254883523

w2v_all
0.1679580041090767


---

Obtener los tweets del evento y crear mapa de representantes-tweets

In [42]:
# Load everything through the memoised getters and work on the first event.
all_tweets = get_tweets()
events = get_events()
event = events[0]
event_id = event['_id']

representatives = get_representatives(event_id)
topics, _ = get_topics(event_id)

# Index tweets by their `representative` value; when several tweets share
# one, the last one seen wins.
rep_tweet = {}
for t in tqdm(all_tweets):
    rep_tweet[t['representative']] = t

# Ids of the representatives that belong to this event.
rep_set = {r['_id'] for r in representatives}

2018-06-20 16:47:51,169 : getting all tweets
2018-06-20 16:47:57,164 : getting events
2018-06-20 16:47:57,165 : getting representatives
2018-06-20 16:47:57,269 : getting topics
100%|██████████| 642251/642251 [00:00<00:00, 1273338.48it/s]


In [48]:
# Association between representatives and tweets: keep the indexed tweets
# whose representative id belongs to this event.

tweets_this_event = [rep_tweet[rep_id] for rep_id in rep_tweet if rep_id in rep_set]
len(tweets_this_event)

14233

Listar los tópicos:

In [49]:
# One topic name per line.
for topic in topics:
    topic_label = topic['topic_name']
    print(topic_label)

Car bomb explodes
ISIS adjudicates attack
Report on the amount of casualties
Hostages are taken
Report on the number of attackers
Confrontation with security forces


Generar los tokens de cada tweet en `tweets_tokens`

In [68]:
# Build the token set of every tweet in the event.  Each tweet becomes a
# frozenset of lowercased tokens, so tweets with identical token sets collapse
# into a single entry of `tweets_tokens`.
#
# NOTE(review): `vectors` is not assigned in any cell visible in this
# notebook; it must be bound to one of the loaded models (e.g. `ft_all`)
# before this cell can run on a fresh kernel. Confirm which one was intended.

# Read the stopword list once; a set gives constant-time membership tests
# instead of re-reading and scanning the list for every token.
english_stopwords = frozenset(stopwords.words('english'))
tweet_texts = (t['text'] for t in tweets_this_event)

tweets_tokens = set()
for doc in tqdm(nlp.pipe(tweet_texts, n_threads=8, batch_size=1024),
                total=len(tweets_this_event)):

    tokens = frozenset(token.lower_
                       for token in doc
                       if token.lower_ not in english_stopwords and token.lower_ in vectors)

    # Tweets left without any known, non-stopword token are skipped.
    if tokens:
        tweets_tokens.add(tokens)

100%|██████████| 14233/14233 [00:46<00:00, 308.91it/s]


In [97]:
def _tokens_similarity(model, tokens_a, tokens_b):
    """Similarity of two token collections under `model`.

    Tokens missing from `model` are dropped first.  When either side ends up
    empty there is nothing to compare, and `model.n_similarity` would fail on
    an empty list, so 0.0 is returned instead.
    """
    ta = [t for t in tokens_a if t in model]
    tb = [t for t in tokens_b if t in model]
    if not ta or not tb:
        return 0.0
    return model.n_similarity(ta, tb)


def sim1(tokens_a, tokens_b):
    """Similarity of two token collections using the `ft_all` vectors."""
    return _tokens_similarity(ft_all, tokens_a, tokens_b)


def sim2(tokens_a, tokens_b):
    """Similarity of two token collections using the `ft_comp` vectors."""
    return _tokens_similarity(ft_comp, tokens_a, tokens_b)


def sim3(tokens_a, tokens_b):
    """Similarity of two token collections using the `w2v_all` vectors."""
    return _tokens_similarity(w2v_all, tokens_a, tokens_b)


def sim4(tokens_a, tokens_b):
    """Similarity of two token collections using the `w2v_comp` vectors."""
    return _tokens_similarity(w2v_comp, tokens_a, tokens_b)

def mmr(docs, q, lambda_, sim):
    """Greedy Maximal Marginal Relevance selection of `docs` for query `q`.

    At each step the not-yet-selected document with the highest score

        lambda_ * sim(doc, q) - (1 - lambda_) * max(sim(doc, s) for s in selected)

    is picked; the second term is 0 while nothing has been selected.

    Parameters
    ----------
    docs : iterable of hashable documents
        A set is used as is; any other iterable is converted to a set first
        (duplicates collapse).
    q : query object, passed as the second argument of `sim`.
    lambda_ : float
        Weight of relevance to the query versus redundancy with earlier picks.
    sim : callable(a, b) -> number

    Yields
    ------
    OrderedDict
        The same dict object after every pick, mapping document -> 0-based
        selection rank; its last item is the newest pick.

    Relevance to `q` is computed once per document, and the redundancy term is
    updated incrementally with the newest pick only, instead of being
    recomputed against every selected document on each round.
    """
    if not isinstance(docs, (set, frozenset)):
        docs = set(docs)

    selected = OrderedDict()
    relevance = {}    # doc -> sim(doc, q)
    redundancy = {}   # doc -> highest similarity to any selected doc so far
    newest = None

    def mmr_score(x):
        return lambda_ * relevance[x] - (1 - lambda_) * redundancy.get(x, 0)

    while len(selected) < len(docs):
        remaining = docs - set(selected)
        for x in remaining:
            if x not in relevance:
                relevance[x] = sim(x, q)
            if newest is not None:
                s = sim(x, newest)
                # Not floored at 0: similarities may be negative.
                redundancy[x] = max(redundancy[x], s) if x in redundancy else s
        newest = argmax(remaining, mmr_score)
        selected[newest] = len(selected)
        yield selected


def argmax(keys, f):
    """Return the element of `keys` for which `f` is largest."""
    return max(keys, key=f)

In [105]:
lambda_ = 0.5
top_k = 10  # number of MMR picks printed per model

# (label, similarity function) pairs, in the order they are reported.
labelled_sims = [
    ("ft_all", sim1),
    ("ft_comp", sim2),
    ("w2v_all", sim3),
    ("w2v_comp", sim4),
]

for topic in topics:
    # The query is the lowercased, whitespace-split topic name.
    query = {s.lower() for s in topic['topic_name'].split()}

    print(query)
    print()

    for position, (label, sim) in enumerate(labelled_sims):
        # Every header but the first is preceded by a blank line.
        print(label if position == 0 else "\n" + label)
        # zip() stops after `top_k` picks without asking the generator for more.
        for i, doc in zip(range(top_k), mmr(tweets_tokens, query, lambda_, sim)):
            # `doc` is the running OrderedDict; its last key is the newest pick.
            print(' '.join(next(reversed(doc))))

    print()
    print()

{'car', 'bomb', 'explodes'}

ft_all
park corinthia car bomb explodes
woooooah
via ignorant shit
#cars luxury sleep $ 85 tesla model
3 kills hotel car bomb outside
hotel tripoli car dlvr bomb outside explodes
#tripoli middle east hotel car bomb outside explodes
libya isis car claims bomb
libya witnesses hotel car bomb outside explodes capital
hotel tripoli car bomb outside explodes

ft_comp
park corinthia car bomb explodes
libya witnesses hotel car bomb outside explodes capital
#cars luxury sleep $ 85 tesla model
via ignorant shit
post hotel new tripoli car bomb outside explodes
hotel tripoli car dlvr bomb outside explodes
#tripoli middle east hotel car bomb outside explodes
hotel tripoli car bomb outside explodes
3 kills hotel car bomb outside
libya isis car claims bomb

w2v_all
park corinthia car bomb explodes
rt inspiration
libya isis car claims bomb
libya witnesses hotel car bomb outside explodes capital
upgrade limited suite sweet moment deal package
building weirdest ever
newsworl

barack thanks hussein
amit dear rates check tariff
abu attacked name operative anas intelligence al libi
enough brave
menace going global
peace religion
blue read tile perfection signature find quest extraordinary teal
rt
profile company community created owler

w2v_all
reporting #tripoli rt 's news hotel local important multiple corinthia also gunmen hostage
's
visited last family year one 're
essential trump card 's tell comfort us sign
purse 's people world friendly rated hotels
near story posts project huge opening may renovation 2015
baccarat finally 's hotel going open york new
via #hoteldeparis monaco 's pictures
pleasure 's two reading en today route stories
named number diamond aaa luxury hotel us ranking amp retaining 5 2 congrats

w2v_comp
27/1/15 reporting number confirmed figures news hotel held important corinthia time al gunmen naba
's
case another workplace isolated violence
near cross bed house glazed luxury 6 detached home large #travel cuffley holiday double w waltha

In [88]:
# Most recent (tokens, rank) pair of `doc`, the OrderedDict left over from the MMR loop.
next(reversed(doc.items()))

(frozenset({'#kabari', 'coming'}), 9)

El problema con lo siguiente son los tópicos: las palabras de cada tópico determinan los tweets que se van a obtener con la búsqueda.

In [45]:
# Plan:
#   - turn the topics into word vectors
#   - turn the event's (representative) tweets into vectors
#       * lowercase
#       * tokenize
#       * drop urls
#   - compute the similarity between each topic and the tweet
#
# NOTE(review): `vectors` is not assigned in any cell visible in this
# notebook; bind it to one of the loaded models before running this cell.

# Read the stopword list once instead of once per token.
english_stopwords = frozenset(stopwords.words('english'))


def _content_tokens(doc):
    """Lowercased tokens of `doc` that are in `vectors` and are not stopwords."""
    return [token.lower_ for token in doc
            if token.lower_ not in english_stopwords and token.lower_ in vectors]


rnd_tweet = random.choice(tweets_this_event)
text = nlp(rnd_tweet['text'])

tweet_tokens = _content_tokens(text)

topic_tokens = [_content_tokens(topic)
                for topic in nlp.pipe([t['topic_name'] for t in topics])]

print(tweet_tokens)
print()
for tokens in topic_tokens:
    print(tokens)
    print(vectors.n_similarity(tweet_tokens, tokens))
    print()

['gunmen', 'storm', 'luxury', 'hotel', 'libya', 'capital', 'least', '3', 'killed', 'hotel', 'popular', 'foreigners', 'c', '#muhamadjabal']

['car', 'bomb', 'explodes']
0.6755222282719202

['isis', 'adjudicates', 'attack']
0.6617886776347398

['report', 'amount', 'casualties']
0.6782251905755576

['hostages', 'taken']
0.7047494552877931

['report', 'number', 'attackers']
0.7330240186949215

['confrontation', 'security', 'forces']
0.6800127367584079



Idea: hacer clustering de las palabras del vocabulario y muestrear tweets a partir de los clusters:

100%|██████████| 14233/14233 [01:48<00:00, 130.97it/s]


In [9]:
def _identity(tokens):
    """Return the input unchanged (the tweets are already token collections)."""
    return tokens


# Each document is a ready-made collection of tokens, so both the
# preprocessing and the tokenising hooks just pass it through.
vectorizer = TfidfVectorizer(preprocessor=_identity, tokenizer=_identity)

m = vectorizer.fit_transform(tweets_tokens)
m2 = m.T  # represent words instead of documents

In [10]:
all_words = np.array(vectorizer.get_feature_names())

nc = 20
small_cluster_size = 20  # only clusters with fewer words than this are printed

# KMeans starts from random centroids; a fixed seed keeps the clusters (and
# the listing printed below) reproducible from one run to the next.
km = KMeans(n_clusters=nc, n_jobs=-1, n_init=30, max_iter=500, random_state=0)
km.fit(m2)

# Cluster sizes, largest first.
print(Counter(km.labels_).most_common(nc))
print()

for i in range(nc):
    words = all_words[km.labels_ == i].tolist()
    if len(words) < small_cluster_size:
        pprint(words)
        print()

[(3, 7784), (6, 6), (4, 5), (13, 5), (12, 5), (14, 5), (1, 5), (2, 4), (17, 4), (15, 4), (19, 3), (18, 2), (16, 2), (0, 2), (9, 1), (8, 1), (11, 1), (7, 1), (10, 1), (5, 1)]

['pic', 'trans']

['kill', 'killed', 'news', 'says', 'security']

['3', 'guards', 'hostages', 'take']

["'s", 'bomb', 'car', 'explodes', 'outside']

['tripoli']

['foreigner', 'linked', 'militant', 'popular', 'possibly', 'top']

['gunmen']

['capital']

['attack']

['libyan']

['design']

['corinthia', 'luxurious', 'shot', 'tuesday', 'way']

['affiliate', 'assault', 'behind', 'islamic', 'state']

['dead', 'eight', 'least', 'official', 'storm']

['exclusive', 'offers', 'special', 'visa']

['group', 'isis']

['businessweek', 'ei', 'killing', 'militants']

['5', 'foreigners']

['hotel', 'libya', 'luxury']



Agglomerative clustering doesn't work well in this case:

In [11]:
# Complete-linkage clustering of the word vectors under cosine affinity.
agg = AgglomerativeClustering(
    n_clusters=20,
    affinity="cosine",
    linkage="complete",
    memory="/tmp",
)

# The estimator is fitted on a dense copy of the sparse word matrix.
word_matrix_dense = m2.toarray()
agg.fit(word_matrix_dense)

AgglomerativeClustering(affinity='cosine', compute_full_tree='auto',
            connectivity=None, linkage='complete', memory='/tmp',
            n_clusters=20, pooling_func=<function mean at 0x7f6711ed8ae8>)

In [12]:
# Print the members of every cluster holding fewer than 50 words.
for cluster_id in range(20):
    members = all_words[agg.labels_ == cluster_id].tolist()
    if len(members) >= 50:
        continue
    pprint(members)
    print()

['#vofnafrica', 'dictator', 'moam', 'ouster', '|']

['gulbenkian', 'km', 'lisbon', 'located', 'mi', 'rios', 'sete', 'within', 'zoo']

['#yugvani', 'claim', 'zee']

['appears', 'reveal', '~10:00']

['art', 'dolder', 'incomparable', 'zurich']

['african', 'news24', '|via']

['blaze', '~via']

['#architecture', 'zermatt']

['girls', 'menara', 'ze']

['zzzzzz']

['~inhabitat']

['ali', 'd.', 'david', 'kirkpatrick', 'suliman', 'zway']

['bbcnews', '|thedailypr']

['#eventsus', '#ticket', '12,500', 'nfl', 'xlix', 'zone']

['bachelor', 'gadgets', '|the']

['happens', 'zokmed']

['baca', 'lt;-', 'selengkapnya', '|news|=']

['zealand']

['||']



In [13]:
# 100 nearest neighbours of 'santiago' in the embedding space.
# NOTE(review): `vectors` is not assigned in any cell visible in this notebook
# (the captured output below is dated earlier than the loading cell's log), so
# this relies on leftover kernel state. Bind `vectors` explicitly before use.
vectors.similar_by_word('santiago', topn=100)

2018-06-15 12:32:58,021 : precomputing L2-norms of word weight vectors


[('saintiago', 0.8565769791603088),
 ('#santiago', 0.841245174407959),
 ('santia', 0.8093695640563965),
 ('nacional', 0.7871439456939697),
 ('#ripsantiago', 0.7835432291030884),
 ('cristobal', 0.7817075252532959),
 ('chilea', 0.7602672576904297),
 ('santo', 0.7583289742469788),
 ('pestanosantiago', 0.7576349377632141),
 ('santiam', 0.7572433948516846),
 ('chilean', 0.7534645795822144),
 ('chileans', 0.7500560283660889),
 ('vicente', 0.7482686042785645),
 ('neuquen', 0.7467107176780701),
 ('pacheco', 0.7450253963470459),
 ('centro', 0.7428617477416992),
 ('penitencia', 0.7422252297401428),
 ('#antoniosantiago', 0.7407231330871582),
 ('monterrey', 0.7407140731811523),
 ('#erupcionvillarrica', 0.7407123446464539),
 ('agencia', 0.7366681694984436),
 ('#santiagobernabeu', 0.7362847328186035),
 ('libertadores', 0.7348600625991821),
 ('camino', 0.733985185623169),
 ('santos', 0.7329348921775818),
 ('ensenada', 0.732906699180603),
 ('chileing', 0.7327314615249634),
 ('strano', 0.73221921920776