In [1]:
from pymongo import MongoClient
from bson.objectid import ObjectId
from collections import OrderedDict
import random
import logging
from datetime import datetime
from functools import lru_cache
from paths import data_path
from pathlib import Path
from gensim.models import KeyedVectors
from tqdm import tqdm
from pprint import pprint
import spacy
from nltk.corpus import stopwords
import pyemd
import numpy as np
from collections import Counter
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Emit INFO-level log records, each prefixed with its timestamp.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(message)s',
)
_info = logging.info  # short alias used by the loader functions

# Client for the local MongoDB server and handle to its `twitter_news` database.
client = MongoClient('mongodb://localhost:27017')
db = client['twitter_news']

counters = {}

# Cache size for the per-event loaders (get_representatives / get_topics).
total_events = 3

@lru_cache(maxsize=total_events)
def get_representatives(event_id):
    """Return the `representatives` documents whose `event` field matches `event_id`.

    The query result is materialised into a list. Because of lru_cache, repeated
    calls with the same `event_id` return the cached list without querying again.
    """
    _info("getting representatives")
    query = {'event': ObjectId(event_id)}
    return list(db.representatives.find(query))


@lru_cache(maxsize=total_events)
def get_topics(event_id):
    """Load the topics of an event, separating out the "Non relevant" one.

    Parameters:
        event_id: id of the event; it is wrapped in ObjectId for the query.

    Returns:
        A tuple (topics, comodin). `topics` is the list of topic documents of the
        event without the first one named "Non relevant"; `comodin` is that
        removed document, or None when the event has no such topic.

    The result is memoised by lru_cache, so callers share the same list object.
    """
    _info("getting topics")
    topics = []
    # Start from None so that an event without a "Non relevant" topic no longer
    # fails with UnboundLocalError at the return statement.
    comodin = None
    for t in db.topics.find({'event': ObjectId(event_id)}):
        # Only the first "Non relevant" topic is pulled out; any further one
        # stays in the list.
        if comodin is None and t['topic_name'] == "Non relevant":
            comodin = t
        else:
            topics.append(t)
    return topics, comodin


@lru_cache(maxsize=1)
def get_events():
    """Return all documents of the `events` collection as a list.

    The list is cached by lru_cache, so the database is queried only on the
    first call.
    """
    _info("getting events")
    return list(db.events.find())


@lru_cache(maxsize=1)
def get_tweets(a=None):
    """Return all documents of the `tweets` collection as a list.

    `a` is not used by the body; it only takes part in the lru_cache key.
    """
    _info('getting all tweets')
    cursor = db.tweets.find()
    return list(cursor)


@lru_cache(maxsize=3)
def get_vectors(path):
    """Load word vectors from `path` with KeyedVectors.load_word2vec_format.

    The loaded model is cached per path by lru_cache (up to three paths), so
    loading the same file again returns the cached model.
    """
    _info(f"loading fasttext vectors from {path}")
    return KeyedVectors.load_word2vec_format(path)

In [3]:
# Load spaCy's small English model.
# NOTE(review): the tagger/entity/matcher flags look like spaCy 1.x-style switches;
# confirm that the installed spaCy version honours them.
nlp = spacy.load('en_core_web_sm', tagger=False, entity=False, matcher=False)

def hashtag_pipe(doc):
    """Pipeline component that merges each '#' token with the text that follows it.

    For every token whose text is exactly '#', a character span is built that
    starts at the token and is one character longer than the text of its head,
    and `doc.merge` is asked to merge it (presumably the head is the hashtag
    word right after the '#'). After each successful merge the scan restarts
    from the beginning, because the document's tokens have changed. The function
    stops when a full pass produces no merge, and returns the same `doc`.
    """
    merged_any = True
    while merged_any:
        merged_any = False
        for token in doc:
            if token.text != '#' or token.head is None:
                continue
            span_start = token.idx
            span_end = span_start + len(token.head.text) + 1
            if doc.merge(span_start, span_end) is not None:
                # The token sequence changed: abandon this pass and rescan.
                merged_any = True
                break
    return doc

# Register hashtag_pipe as a component of the `nlp` pipeline.
nlp.add_pipe(hashtag_pipe)

In [4]:
# Load the word vectors used throughout the notebook as `ft_comp`.
# NOTE(review): the right-hand operand is an absolute path, so if `data_path` is a
# pathlib.Path the `/` join discards it and the hard-coded location is used as-is.
path = data_path / Path('/home/mquezada/anchor-text-twitter/data/ft_alltweets_model.vec')
ft_comp = get_vectors(path.as_posix())

# Alternative models (ft_all, w2v_all, w2v_comp), currently disabled. The
# similarity-comparison cell references these names, so it needs them loaded.
#path = data_path / Path('/home/mquezada/anchor-text-twitter/data/ft_all_tweets_1line1tweet.vec')
#ft_all = get_vectors(path.as_posix())

#path = data_path / Path('/home/mquezada/anchor-text-twitter/data/w2v_all_tweets.txt')
#w2v_all = get_vectors(path.as_posix())

#path = data_path / Path('/home/mquezada/anchor-text-twitter/data/w2v_all_tweets_1_line_1_component.txt')
#w2v_comp = get_vectors(path.as_posix())

2018-06-20 18:07:15,638 : loading fasttext vectors from /home/mquezada/anchor-text-twitter/data/ft_alltweets_model.vec
2018-06-20 18:07:15,641 : loading projection weights from /home/mquezada/anchor-text-twitter/data/ft_alltweets_model.vec
2018-06-20 18:08:18,772 : loaded (1076139, 100) matrix from /home/mquezada/anchor-text-twitter/data/ft_alltweets_model.vec


In [41]:
# Compare the similarity of two words under each embedding model.
w = 'iphone'
w2 = 'earthquake'

# Models are looked up by name so the cell also runs on a fresh kernel where
# some of them were never loaded (their loading code may be commented out).
# Previously a missing model aborted the whole cell with NameError.
model_names = ("ft_comp", "ft_all", "w2v_comp", "w2v_all")

for position, model_name in enumerate(model_names):
    if position:
        print()  # blank line between consecutive models
    print(model_name)
    model = globals().get(model_name)
    if model is None:
        print("(not loaded)")
    else:
        pprint(model.similarity(w, w2))


ft_comp
0.33911799442295903

ft_all
0.3300593981594927

w2v_comp
0.09744087254883523

w2v_all
0.1679580041090767


In [6]:
# Fetch the list of events (cached by get_events) and display it.
events = get_events()
events

2018-06-20 18:08:30,645 : getting events


[{'_id': ObjectId('5b171725da870923dcb0478f'),
  'event_name': 'libya_hotel_tweets.tsv',
  'description': 'In January 2015, the Corinthia Hotel in Tripoli was attacked by men affiliated with the Islamic State of Iraq and the Levant (ISIL). The hotel was popular with foreign officials and government workers; it had previously housed the Libyan Prime Minister.',
  'human_name': '2015 Corinthia Hotel attack'},
 {'_id': ObjectId('5b171726da870923dcb04790'),
  'event_name': 'oscar_pistorius_tweets.tsv',
  'description': 'The trial of Oscar Pistorius for the murder of Reeva Steenkamp and several gun-related charges (The State vs Oscar Pistorius) in the High Court of South Africa in Pretoria opened on 3 March 2014.',
  'human_name': 'Trial of Oscar Pistorius'},
 {'_id': ObjectId('5b171726da870923dcb04791'),
  'event_name': 'nepal_tweets.tsv',
  'description': 'The April 2015 Nepal earthquake (also known as the Gorkha earthquake)[5][8] killed nearly 9,000 people and injured nearly 22,000. It o

---

Obtener los tweets del evento y crear mapa de representantes-tweets

In [7]:
# Load every tweet and select the event to analyse (index 2 of the events list).
all_tweets = get_tweets()
events = get_events()
event = events[2]

# Representatives and topics of the selected event; the second value returned by
# get_topics (the "Non relevant" topic) is discarded.
representatives = get_representatives(event['_id'])
topics, _ = get_topics(event['_id'])

# Map representative id -> tweet. When several tweets share a representative,
# only the last one seen is kept because later assignments overwrite earlier ones.
rep_tweet = dict()
for t in tqdm(all_tweets):
    rep_tweet[t['representative']] = t

# Ids of the representatives returned for the selected event.
rep_set = set([r['_id'] for r in representatives])

2018-06-20 18:08:37,981 : getting all tweets
2018-06-20 18:08:43,184 : getting representatives
2018-06-20 18:08:43,580 : getting topics
100%|██████████| 642251/642251 [00:00<00:00, 1182423.46it/s]


In [8]:
# Association between representatives and tweets: keep the tweets whose
# representative id is in rep_set, then show how many there are.

tweets_this_event = [t for r, t in rep_tweet.items() if r in rep_set]
len(tweets_this_event)

144837

Listar tópicos:

In [9]:
# Print the name of every topic in `topics`, one per line.
for topic in topics:
    print(topic['topic_name'])

Avalanche in Mount Everest
Death tolls
Reports on the magnitude of the earthquake
Rescue of people
Ways to help
International aid
Destruction of historical buildings
Humanitarian crisis
Destruction of buildings
Replicas of the earthquake


Generar tokens por cada tweet en `tweets_tokens`

In [11]:
# Build one token set per tweet of the event.

# Materialise the stopword list once as a set. Calling stopwords.words('english')
# inside the comprehension rebuilt the whole list for every single token.
english_stopwords = set(stopwords.words('english'))

tweets_tokens = set()
tweet_texts = (t['text'] for t in tweets_this_event)
for doc in tqdm(nlp.pipe(tweet_texts, n_threads=8, batch_size=1024),
                total=len(tweets_this_event)):

    # Keep lower-cased tokens that are not stopwords and are known to ft_comp.
    tokens = frozenset(token.lower_
                       for token in doc
                       if token.lower_ not in english_stopwords and token.lower_ in ft_comp)

    # Tweets left without tokens are skipped; identical token sets collapse
    # into one entry because tweets_tokens is a set.
    if tokens:
        tweets_tokens.add(tokens)

100%|██████████| 144837/144837 [07:42<00:00, 313.39it/s]


In [16]:
# Display the tweet currently bound to `t`.
# NOTE(review): `t` is not assigned in this cell — presumably left over from an
# earlier loop over the tweets; confirm before relying on it.
t

{'_id': ObjectId('5b1717abda870923dcbdc47c'),
 'tweet_id': 592008412004798464,
 'text': 'Nepal was hit with a 7.9 earthquake. Over 1000 people have lost their lives. If you can, please consider donating http://t.co/MAUEatbNRs',
 'created_at': datetime.datetime(2015, 4, 25, 16, 52, 59),
 'retweet_id': None,
 'reply_id': None,
 'short_urls': ['http://t.co/MAUEatbNRs'],
 'expanded_urls': ['https://www.globalgiving.org/projects/nepal-earthquake-relief-fund/'],
 'representative': ObjectId('5b1717abda870923dcbdc47b')}

In [13]:
def sim2(tokens_a, tokens_b):
    """Similarity between two token collections according to the `ft_comp` model.

    Both arguments are handed unchanged to `ft_comp.n_similarity`, whose result
    is returned.
    """
    similarity = ft_comp.n_similarity(tokens_a, tokens_b)
    return similarity

def mmr(docs, q, lambda_, sim):
    """Greedy Maximal Marginal Relevance ordering of `docs` for the query `q`.

    At every step the not-yet-selected document x that maximises

        lambda_ * sim(x, q) - (1 - lambda_) * max(sim(x, y) for y in selected)

    is picked (the max term is 0 while nothing has been selected).

    Parameters:
        docs: iterable of hashable documents (typically a set).
        q: the query, passed as second argument to `sim`.
        lambda_: weight of query relevance against redundancy.
        sim: callable sim(a, b) returning a number; expected to be deterministic.

    Yields:
        After each pick, the same OrderedDict mapping every selected document to
        its rank (0-based, insertion order = selection order). The object is
        shared between iterations, so treat it as read-only.

    Query relevance is computed once per document and the redundancy term is
    kept as a running maximum, so each step costs one `sim` call per remaining
    document instead of one call per (remaining, selected) pair.
    """
    selected = OrderedDict()
    remaining = set(docs)
    # sim(x, q) never changes between steps, so it is cached on first use.
    relevance = {}
    # Highest similarity of each remaining document to any selected document;
    # a document is absent until at least one document has been selected.
    redundancy = {}

    def mmr_score(x):
        if x not in relevance:
            relevance[x] = sim(x, q)
        return lambda_ * relevance[x] - (1 - lambda_) * redundancy.get(x, 0)

    while remaining:
        next_selected = argmax(remaining, mmr_score)
        selected[next_selected] = len(selected)
        yield selected
        # Bookkeeping happens after the yield so that a caller who stops early
        # does not pay for similarities it will never use.
        remaining.discard(next_selected)
        redundancy.pop(next_selected, None)
        for x in remaining:
            similarity = sim(x, next_selected)
            if x not in redundancy or similarity > redundancy[x]:
                redundancy[x] = similarity


def argmax(keys, f):
    """Return the element of `keys` with the largest f(element)."""
    return max(keys, key=f)

In [15]:
lambda_ = 0.7

for topic in topics:
    # The query is the set of lower-cased, whitespace-separated words of the topic name.
    query = {word.lower() for word in topic['topic_name'].split()}

    print(query)
    print()

    # Print the first 10 picks; the most recent pick is the last key of the
    # OrderedDict yielded by mmr.
    for i, doc in enumerate(mmr(tweets_tokens, query, lambda_, sim2), start=1):
        print(' '.join(next(reversed(doc))))
        if i == 10:
            break

    print()
    print()

{'in', 'avalanche', 'mount', 'everest'}

avalanche unleashed everest news mount earthquake
avalanche mount everest nepal
avalanche british climber everest help mount appeals
reports climbers everest mount nepalearthquake
avalanche climber aftermath everest region describes mount watch
avalanche deadly triggers everest #nepalearthquake mount via
climbing hit avalanche mount everest would like earthquake
climbing avalanche nepal quake everest survives triggered man mount wisconsin
avalanche killed everest 8 mount earthquake
climbers goes nepal everest heart mount


{'death', 'tolls'}

nepal toll death sad rising
1,000 nepal tolls 's heartbreaking death
death rises toll
story updated toll reflect increased death
continues nepal toll rise death
continues nepal toll horrible people increase death thoughts
1,200 goes nepal toll people heart exceeds death
goes toll nepal heart death rising
nepal toll exponentially death rising
alarming nepal toll rate hour death sad rising


{'reports', 'on',

In [88]:
# Display the last (key, value) pair of `doc`.
# NOTE(review): `doc` is not assigned in this cell — presumably the OrderedDict
# left over from the MMR loop; confirm before relying on it.
list(doc.items())[-1]

(frozenset({'#kabari', 'coming'}), 9)

El problema con lo siguiente son los tópicos: las palabras definen los tweets que se van a obtener con la búsqueda

In [45]:
# Plan:
# - convert the topics into word vectors
# - convert the (representative) tweets of the event into vectors
#   - apply lowercasing
#   - apply tokenization
#   - delete URLs
# - compute the similarity between topic and tweet

# NOTE(review): `vectors` is not defined in any cell shown here — presumably a
# loaded KeyedVectors model (e.g. ft_comp); confirm before running on a fresh kernel.

# Materialise the stopword list once as a set instead of rebuilding it for
# every token that is checked.
english_stopwords = set(stopwords.words('english'))

# Tokens of one randomly chosen tweet: lower-cased, not stopwords, known to `vectors`.
rnd_tweet = random.choice(tweets_this_event)
text = nlp(rnd_tweet['text'])

tweet_tokens = [token.lower_
                for token in text
                if token.lower_ not in english_stopwords and token.lower_ in vectors]

# Same filtering applied to the name of every topic.
topic_tokens = []
for topic_doc in nlp.pipe([t['topic_name'] for t in topics]):
    topic_tokens.append([token.lower_ for token in topic_doc
                         if token.lower_ in vectors and token.lower_ not in english_stopwords])

# Print the tweet tokens, then each topic's tokens with its similarity to the tweet.
print(tweet_tokens)
print()
for tokens in topic_tokens:
    print(tokens)
    print(vectors.n_similarity(tweet_tokens, tokens))
    print()

['gunmen', 'storm', 'luxury', 'hotel', 'libya', 'capital', 'least', '3', 'killed', 'hotel', 'popular', 'foreigners', 'c', '#muhamadjabal']

['car', 'bomb', 'explodes']
0.6755222282719202

['isis', 'adjudicates', 'attack']
0.6617886776347398

['report', 'amount', 'casualties']
0.6782251905755576

['hostages', 'taken']
0.7047494552877931

['report', 'number', 'attackers']
0.7330240186949215

['confrontation', 'security', 'forces']
0.6800127367584079



Idea: hacer clustering de palabras del vocabulario y samplear tweets a partir de los clusters:

100%|██████████| 14233/14233 [01:48<00:00, 130.97it/s]


In [9]:
# Preprocessing and tokenising are identity functions, so each element of
# tweets_tokens is used as-is as the token collection of one document.
vectorizer = TfidfVectorizer(preprocessor=lambda x: x, tokenizer=lambda x: x)

# TF-IDF matrix of the documents, then its transpose.
m = vectorizer.fit_transform(tweets_tokens)
m2 = m.transpose()  # represent words instead of documents

In [10]:
# Cluster the vocabulary (rows of m2) with k-means and list the small clusters.
all_words = np.array(vectorizer.get_feature_names())

nc = 20
# A fixed random_state makes the cluster assignment reproducible across runs;
# without it every execution of this cell could produce different clusters.
km = KMeans(n_clusters=nc, n_jobs=-1, n_init=30, max_iter=500, random_state=42)
km.fit(m2)

# Cluster sizes, largest first.
print(Counter(km.labels_).most_common(nc))
print()

# Only clusters with fewer than 20 words are printed.
for i in range(nc):
    words = all_words[km.labels_ == i].tolist()
    if len(words) < 20:
        pprint(words)
        print()

[(3, 7784), (6, 6), (4, 5), (13, 5), (12, 5), (14, 5), (1, 5), (2, 4), (17, 4), (15, 4), (19, 3), (18, 2), (16, 2), (0, 2), (9, 1), (8, 1), (11, 1), (7, 1), (10, 1), (5, 1)]

['pic', 'trans']

['kill', 'killed', 'news', 'says', 'security']

['3', 'guards', 'hostages', 'take']

["'s", 'bomb', 'car', 'explodes', 'outside']

['tripoli']

['foreigner', 'linked', 'militant', 'popular', 'possibly', 'top']

['gunmen']

['capital']

['attack']

['libyan']

['design']

['corinthia', 'luxurious', 'shot', 'tuesday', 'way']

['affiliate', 'assault', 'behind', 'islamic', 'state']

['dead', 'eight', 'least', 'official', 'storm']

['exclusive', 'offers', 'special', 'visa']

['group', 'isis']

['businessweek', 'ei', 'killing', 'militants']

['5', 'foreigners']

['hotel', 'libya', 'luxury']



agglomerative clustering doesn't work well in this case:

In [11]:
# Agglomerative clustering of the rows of m2 into 20 clusters, using cosine
# affinity and complete linkage; "/tmp" is passed as the `memory` argument.
agg = AgglomerativeClustering(n_clusters=20, affinity="cosine", memory="/tmp", linkage="complete")

# The matrix is converted to a dense array with toarray() before fitting.
agg.fit(m2.toarray())

AgglomerativeClustering(affinity='cosine', compute_full_tree='auto',
            connectivity=None, linkage='complete', memory='/tmp',
            n_clusters=20, pooling_func=<function mean at 0x7f6711ed8ae8>)

In [12]:
# Print the words of every agglomerative cluster that has fewer than 50 words.
for i in range(20):
    words = all_words[agg.labels_ == i].tolist()
    if len(words) >= 50:
        continue
    pprint(words)
    print()

['#vofnafrica', 'dictator', 'moam', 'ouster', '|']

['gulbenkian', 'km', 'lisbon', 'located', 'mi', 'rios', 'sete', 'within', 'zoo']

['#yugvani', 'claim', 'zee']

['appears', 'reveal', '~10:00']

['art', 'dolder', 'incomparable', 'zurich']

['african', 'news24', '|via']

['blaze', '~via']

['#architecture', 'zermatt']

['girls', 'menara', 'ze']

['zzzzzz']

['~inhabitat']

['ali', 'd.', 'david', 'kirkpatrick', 'suliman', 'zway']

['bbcnews', '|thedailypr']

['#eventsus', '#ticket', '12,500', 'nfl', 'xlix', 'zone']

['bachelor', 'gadgets', '|the']

['happens', 'zokmed']

['baca', 'lt;-', 'selengkapnya', '|news|=']

['zealand']

['||']



In [13]:
# List the 100 words most similar to 'santiago' according to `vectors`.
# NOTE(review): `vectors` is not defined in any cell shown here; confirm which
# model it refers to before running on a fresh kernel.
vectors.similar_by_word('santiago', topn=100)

2018-06-15 12:32:58,021 : precomputing L2-norms of word weight vectors


[('saintiago', 0.8565769791603088),
 ('#santiago', 0.841245174407959),
 ('santia', 0.8093695640563965),
 ('nacional', 0.7871439456939697),
 ('#ripsantiago', 0.7835432291030884),
 ('cristobal', 0.7817075252532959),
 ('chilea', 0.7602672576904297),
 ('santo', 0.7583289742469788),
 ('pestanosantiago', 0.7576349377632141),
 ('santiam', 0.7572433948516846),
 ('chilean', 0.7534645795822144),
 ('chileans', 0.7500560283660889),
 ('vicente', 0.7482686042785645),
 ('neuquen', 0.7467107176780701),
 ('pacheco', 0.7450253963470459),
 ('centro', 0.7428617477416992),
 ('penitencia', 0.7422252297401428),
 ('#antoniosantiago', 0.7407231330871582),
 ('monterrey', 0.7407140731811523),
 ('#erupcionvillarrica', 0.7407123446464539),
 ('agencia', 0.7366681694984436),
 ('#santiagobernabeu', 0.7362847328186035),
 ('libertadores', 0.7348600625991821),
 ('camino', 0.733985185623169),
 ('santos', 0.7329348921775818),
 ('ensenada', 0.732906699180603),
 ('chileing', 0.7327314615249634),
 ('strano', 0.73221921920776