In [3]:
import logging
from collections import defaultdict
from functools import lru_cache
from pathlib import Path
from numba import jit
import numpy as np

import spacy
from bson.objectid import ObjectId
from gensim.models import KeyedVectors
from nltk.corpus import stopwords
from pymongo import MongoClient
from tqdm import tqdm

logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.INFO)
_info = logging.info

client = MongoClient('mongodb://localhost:27017')
db = client.twitter_news
nlp = spacy.load('en_core_web_sm', tagger=False, entity=False, matcher=False)


def hashtag_pipe(doc):
    merged_hashtag = False
    while True:
        for token_index, token in enumerate(doc):
            if token.text == '#':
                if token.head is not None:
                    start_index = token.idx
                    end_index = start_index + len(token.head.text) + 1
                    if doc.merge(start_index, end_index) is not None:
                        merged_hashtag = True
                        break
        if not merged_hashtag:
            break
        merged_hashtag = False
    return doc


nlp.add_pipe(hashtag_pipe)


total_events = 3


@lru_cache(maxsize=total_events)
def get_representatives(event_id):
    _info("getting representatives")
    representatives = db.representatives.find({'event': ObjectId(event_id)})
    return list(representatives)


@lru_cache(maxsize=total_events)
def get_topics(event_id):
    _info("getting topics")
    topics = list(db.topics.find({'event': ObjectId(event_id)}))
    comodin = None
    for t in topics:
        if t['topic_name'] == "Non relevant":
            comodin = t
            topics.remove(t)
            break
    return topics, comodin


@lru_cache(maxsize=1)
def get_events():
    _info("getting events")
    events = db.events.find()
    return list(events)


@lru_cache(maxsize=1)
def get_tweets(a=None):
    _info('getting all tweets')
    all_tweets = db.tweets.find()
    return list(all_tweets)


@lru_cache(maxsize=3)
def get_vectors(path):
    _info(f"loading fasttext vectors from {path}")
    word_vectors = KeyedVectors.load_word2vec_format(path)
    return word_vectors


@lru_cache(maxsize=2**30)
def sim(tokens_a, tokens_b):
    return ft_comp.n_similarity(tokens_a, tokens_b)


def mmr(docs, query, l):
    def mmr_score(tweet):
        return l * sim(docs[tweet], query) - \
               (1 - l) * max([sim(docs[tweet], docs[y]) for y in set(selected) - {tweet}] or [0])

    L = np.array([[l, 0], [0, l - 1]])

    def score(tweet):
        s1 = sim(docs[tweet], query)
        s2 = np.max(np.array([sim(docs[tweet], docs[y]) for y in set(selected) - {tweet}] or [0]))

        return L.dot(np.array([s1, s2])).sum()

    selected = set()
    while selected != set(docs):
        remaining = list(set(docs) - selected)
        next_selected = max(remaining, key=mmr_score)
        # next_selected = remaining[np.argmax([score(t) for t in remaining])]

        # next_selected = None
        # max_score = 0
        #
        # for _t in remaining:
        #     score = l * sim(docs[_t], query) - \
        #             (1 - l) * max([sim(docs[_t], docs[y]) for y in set(selected) - {_t}] or [0])
        #     if score > max_score:
        #         max_score = score
        #         next_selected = _t

        selected.add(next_selected)
        yield next_selected, ' '.join(list(docs[next_selected]))


@lru_cache(maxsize=total_events)
def process_tweets(event_id):
    all_tweets = get_tweets()
    representatives = get_representatives(event_id)

    _info("processing tweets")

    # rep_tweet: repr_id => tweet
    rep_tweet = dict()
    for t in tqdm(all_tweets):
        rep_tweet[t['representative']] = t

    # repr_ids: {repr_id} // this event
    repr_ids = set([r['_id'] for r in representatives])

    # tweets_this_event: [tweet]
    tweets_this_event = [t for r, t in rep_tweet.items() if r in repr_ids]

    tweets_tokens = dict()
    all_tokens = set()
    for tweet, doc in tqdm(zip(tweets_this_event, nlp.pipe([_t['text'] for _t in tweets_this_event],
                                                           n_threads=8,
                                                           batch_size=1024)),
                           total=len(tweets_this_event)):

        tokens = frozenset([token.lower_
                            for token in doc
                            if token.lower_ not in stopwords.words('english') and token.lower_ in ft_comp])

        if tokens and tokens not in all_tokens:
            tweets_tokens[str(tweet['_id'])] = tokens
            all_tokens.add(tokens)

    return tweets_tokens


path = Path('/home/mquezada/anchor-text-twitter/data/ft_alltweets_model.vec')
ft_comp = get_vectors(path.as_posix())
events = get_events()

topics = list()
for event in events:
    topics.append([frozenset([t.lower()
                              for t in topic['topic_name'].split()])
                   for topic in get_topics(event['_id'])[0]])

2018-06-28 14:44:18,250 : loading fasttext vectors from /home/mquezada/anchor-text-twitter/data/ft_alltweets_model.vec
2018-06-28 14:44:18,251 : loading projection weights from /home/mquezada/anchor-text-twitter/data/ft_alltweets_model.vec
2018-06-28 14:45:18,491 : loaded (1076139, 100) matrix from /home/mquezada/anchor-text-twitter/data/ft_alltweets_model.vec
2018-06-28 14:45:18,491 : getting events
2018-06-28 14:45:18,493 : getting topics
2018-06-28 14:45:18,494 : getting topics
2018-06-28 14:45:18,495 : getting topics


In [26]:
event = events[0]
tweets_tokens = process_tweets(event['_id'])

topics_this_event, _ = get_topics(event['_id'])
topic = topics_this_event[0]
topic_tokens = frozenset([t.lower() for t in topic['topic_name'].split()])
topic_tokens

frozenset({'bomb', 'car', 'explodes'})

In [1]:
event

NameError: name 'event' is not defined

In [27]:
sim.cache_clear()

In [28]:
%%time

for i, obj in enumerate(mmr(tweets_tokens, topic_tokens, 0.6)):
    print(obj)
    if i == 10:
        break

('5b17172ada870923dcb095b6', 'park bomb explodes corinthia car')
('5b17172ada870923dcb09245', 'hotel libya outside bomb capital explodes witnesses car')
('5b171729da870923dcb08977', 'hotel via outside bomb explodes tripoli car')
('5b171729da870923dcb08b23', 'hotel outside bomb explodes tripoli car')
('5b171729da870923dcb08894', '#tripoli hotel outside bomb explodes east car middle')
('5b171729da870923dcb08de7', '#world hotel outside bomb explodes tripoli car')
('5b17172ada870923dcb09414', 'sale hotel outside bomb explodes tripoli car #news')
('5b171729da870923dcb08e25', 'hotel #civil_vision outside bomb explodes tripoli car')
('5b17172ada870923dcb09379', 'post hotel outside bomb explodes car tripoli new')
('5b171729da870923dcb08ac5', 'hotel 3 outside bomb car kills')
('5b171729da870923dcb08c2c', 'latest hotel outside bomb explodes tripoli car #news')
CPU times: user 4.75 s, sys: 3.32 ms, total: 4.76 s
Wall time: 4.75 s


In [5]:
%%time

from scipy.linalg import norm
from numba import jit
import random

event = events[1]
tweets_tokens = process_tweets(event['_id'])

lambda_ = 0.6
_1_lambda_ = 1 - lambda_

n_docs = len(tweets_tokens)
tweet_vectors = np.array([np.mean([ft_comp[token] for token in tokens], axis=0) for _, tokens in tweets_tokens.items()])
norm_vectors = tweet_vectors / norm(tweet_vectors)

@lru_cache(maxsize=2**20)
@jit
def sim_c(doc_x, doc_y):
    return norm_vectors[doc_x].dot(norm_vectors[doc_y].T)

topics_this_event, _ = get_topics(event['_id'])

for topic in topics_this_event:  
    query = topic.get('keywords')
    
    if not query:
        continue

    query = np.mean([ft_comp[token] for token in query if token in ft_comp], axis=0)
    doc_q_sim = (norm_vectors).dot(query) * lambda_

    @jit
    def score(doc_x, selected):
        max_sim = 0
        if selected - {doc_x}:
            for doc_y in selected - {doc_x}:
                sim = sim_c(doc_x, doc_y)
                if sim > max_sim:
                    max_sim = sim

        return doc_q_sim[doc_x] - _1_lambda_ * max_sim

    def mmr2(docs, lim=500):
        selected = set()

        for i in range(lim):
            remaining = set(docs) - selected

            max_score = 0
            next_selected = None
            for r in remaining:
                sc = score(r, frozenset(selected))
                if sc > max_score:
                    max_score = sc
                    next_selected = r

            selected.add(next_selected)
            yield next_selected
            

    #sample = random.sample(list(tweets_tokens), len(tweets_tokens) // 2)
    sample = list(tweets_tokens)
    export_dir = Path('/home/mquezada/tweet_topics/')
    print(topic['topic_name'])

    f_name = f'event_{event["_id"]}-topic_{topic["_id"]}-tweet_ids_sorted_mmr.txt'
    with (export_dir / Path(f_name)).open('w') as f:
        for i in tqdm(mmr2(range(len(sample)))):
            f.write(f'{sample[i]}\n')   


2018-06-29 09:42:10,496 : getting representatives
2018-06-29 09:42:10,732 : processing tweets
100%|██████████| 642251/642251 [00:00<00:00, 1305070.46it/s]
100%|██████████| 82626/82626 [04:34<00:00, 301.35it/s]
0it [00:00, ?it/s]

Oscar Pistorius apologizes


500it [3:35:49, 25.90s/it]
0it [00:00, ?it/s]

Oscar Pistorius vomits on court


296it [1:24:19, 17.09s/it]

KeyboardInterrupt: 

In [55]:
sample

['5b1717aada870923dcbdb0d3',
 '5b171791da870923dcbaef75',
 '5b17179cda870923dcbc39bf',
 '5b17176bda870923dcb60a2c',
 '5b171777da870923dcb7ad83',
 '5b17177ada870923dcb8067f',
 '5b1717a8da870923dcbd75e0',
 '5b171778da870923dcb7b500',
 '5b17179cda870923dcbc3ccd',
 '5b171772da870923dcb6fac3',
 '5b1717a6da870923dcbd522d',
 '5b17177bda870923dcb81298',
 '5b1717aada870923dcbdac05',
 '5b17179eda870923dcbc6a72',
 '5b17176eda870923dcb6861d',
 '5b1717a8da870923dcbd7bb4',
 '5b17176ada870923dcb5f90e',
 '5b17178bda870923dcba3add',
 '5b171764da870923dcb51dcc',
 '5b171770da870923dcb6b9e6',
 '5b17177bda870923dcb8188b',
 '5b171781da870923dcb8d4ac',
 '5b171771da870923dcb6d1d9',
 '5b171775da870923dcb73f0c',
 '5b171787da870923dcb9bf27',
 '5b171796da870923dcbb8831',
 '5b17177dda870923dcb8480b',
 '5b171764da870923dcb51f92',
 '5b171765da870923dcb5306e',
 '5b171793da870923dcbb26da',
 '5b171794da870923dcbb40e8',
 '5b1717abda870923dcbdb7a7',
 '5b171765da870923dcb541c7',
 '5b171795da870923dcbb7743',
 '5b17176dda87

In [49]:
d = {'a': 1, 'b': 2}
random.sample(list(d), 1)
tweets_tokens

{'5b17175bda870923dcb3df5c': frozenset({'#nepal',
            '7.5',
            'amp',
            'buildings',
            'could',
            'delhi',
            'earthquake',
            'far',
            'felt',
            'hits',
            'magnitude',
            'major',
            'quake',
            'roads'}),
 '5b17175bda870923dcb3e017': frozenset({'buildings',
            'cnn',
            'earthquake',
            'hits',
            'ibn',
            'major',
            'nepal',
            'reports',
            'roads'}),
 '5b17175bda870923dcb3e019': frozenset({'7.7',
            'buildings',
            'capital',
            'collapse',
            'nepal',
            'quake',
            'witnesses'}),
 '5b17175bda870923dcb3e068': frozenset({'air',
            'bad',
            'buildings',
            'comming',
            'dust',
            'felt',
            'many',
            'nepal',
            'news',
            'pall',
            'region',


In [34]:
sim_c.cache_info()

CacheInfo(hits=327495, misses=72795, maxsize=1073741824, currsize=72795)

In [35]:
@lru_cache(maxsize=2**30)
def sim_l(tokens_a, tokens_b):
    return ft_comp.n_similarity(tokens_a, tokens_b)


def mmr3(docs, query, l):
    def mmr_score(tweet):
        return l * sim_l(docs[tweet], query) - \
               (1 - l) * max([sim_l(docs[tweet], docs[y]) for y in set(selected) - {tweet}] or [0])


    selected = set()
    while selected != set(docs):
        remaining = list(set(docs) - selected)
        next_selected = max(remaining, key=mmr_score)

        selected.add(next_selected)
        yield next_selected

In [39]:
sim_l.cache_clear()

In [40]:
%%time

for i, obj in enumerate(mmr3(tweets_tokens, topic_tokens, 0.6)):
    print(obj)
    if i == 10:
        break

5b17172ada870923dcb095b6
5b17172ada870923dcb09245
5b171729da870923dcb08977
5b171729da870923dcb08b23
5b171729da870923dcb08894
5b171729da870923dcb08de7
5b17172ada870923dcb09414
5b171729da870923dcb08e25
5b17172ada870923dcb09379
5b171729da870923dcb08ac5
5b171729da870923dcb08c2c
CPU times: user 5.21 s, sys: 0 ns, total: 5.21 s
Wall time: 5.2 s


In [44]:
import ipyparallel as ipp
    
ids = list(tweets_tokens.keys())

for i, tid in enumerate(mmr2(range(len(ids)))):
    print(tweets_tokens[ids[tid]])
    if i == 10:
        break

[0, 1, 2, 3]


            Controller appears to be listening on localhost, but not on this machine.
            If this is true, you should specify Client(...,sshserver='you@mquezada-MS-7A46')
            or instruct your controller to listen on an external IP.


['Hello, World', 'Hello, World', 'Hello, World', 'Hello, World']