In [4]:
from scipy.linalg import norm
from numba import jit
import random
import multiprocessing
import re
from operator import itemgetter

import logging
from collections import defaultdict
from functools import lru_cache
from pathlib import Path
from numba import jit
import numpy as np

import spacy
from bson.objectid import ObjectId
from gensim.models import KeyedVectors
from nltk.corpus import stopwords
from pymongo import MongoClient
from tqdm import tqdm

# Log at INFO level with a timestamp prefix; `_info` is a shorthand used by the
# loader functions below to announce when an expensive fetch actually runs.
logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.INFO)
_info = logging.info

# MongoDB instance on localhost; all collections are read from the
# `twitter_news` database.
client = MongoClient('mongodb://localhost:27017')
db = client.twitter_news
# NOTE(review): the tagger/entity/matcher keyword flags presumably switch off
# pipeline components that are not needed here; whether they are honoured
# depends on the installed spaCy version — confirm.
nlp = spacy.load('en_core_web_sm', tagger=False, entity=False, matcher=False)

def hashtag_pipe(doc):
    """Pipeline component that merges '#' tokens with the text that follows them.

    For every token whose text is exactly '#', the character span starting at
    that token and extending ``len(token.head.text) + 1`` characters is passed
    to ``doc.merge``. After each successful merge the scan restarts from the
    beginning of the document, because merging changes the token sequence.
    The loop ends once a full pass performs no merge.

    Args:
        doc: the document being processed; must provide iteration over tokens
            and a ``merge(start_char, end_char)`` method.

    Returns:
        The same ``doc`` object, modified in place.
    """
    while True:
        merged_this_pass = False
        for tok in doc:
            # Only '#' tokens that have a head are candidates for merging.
            if tok.text != '#' or tok.head is None:
                continue
            span_start = tok.idx
            span_end = span_start + len(tok.head.text) + 1
            if doc.merge(span_start, span_end) is not None:
                # Token sequence changed: abandon this pass and rescan.
                merged_this_pass = True
                break
        if not merged_this_pass:
            return doc


# Register hashtag_pipe as a component of the shared `nlp` pipeline so every
# document it produces goes through the '#' merging step.
nlp.add_pipe(hashtag_pipe)

In [6]:
# Size of the lru_cache used by the per-event loaders below
# (get_representatives, get_topics, process_tweets): one cached result per event.
total_events = 3


@lru_cache(maxsize=total_events)
def get_representatives(event_id):
    """Fetch all ``representatives`` documents that belong to one event.

    Args:
        event_id: value convertible by ``ObjectId``; matched against the
            ``event`` field of the collection.

    Returns:
        A list with every matching document. The list is memoized per
        ``event_id``, so repeated calls return the same list object.
    """
    _info("getting representatives")
    event_filter = {'event': ObjectId(event_id)}
    return list(db.representatives.find(event_filter))


@lru_cache(maxsize=total_events)
def get_topics(event_id):
    """Fetch the topics of one event, separating out the "Non relevant" topic.

    Args:
        event_id: value convertible by ``ObjectId``; matched against the
            ``event`` field of the ``topics`` collection.

    Returns:
        A ``(topics, wildcard)`` tuple: ``topics`` is the list of topic
        documents without the first one named "Non relevant", and ``wildcard``
        is that removed document (``None`` when the event has no such topic).
        The result is memoized per ``event_id``.
    """
    _info("getting topics")
    event_filter = {'event': ObjectId(event_id)}
    topics = list(db.topics.find(event_filter))

    wildcard = None
    for position, candidate in enumerate(topics):
        if candidate['topic_name'] == "Non relevant":
            # Pull the catch-all topic out of the regular list; only the first match counts.
            wildcard = topics.pop(position)
            break
    return topics, wildcard


@lru_cache(maxsize=1)
def get_events():
    """Return every document of the ``events`` collection as a list.

    The query runs once; later calls return the memoized list.
    """
    _info("getting events")
    return list(db.events.find())


@lru_cache(maxsize=1)
def get_tweets(a=None):
    """Return every document of the ``tweets`` collection as a list.

    Args:
        a: not used by the body. It is still part of the cache key, and the
            cache holds a single entry, so calling with a different value
            triggers a new query.

    Returns:
        List of all tweet documents (memoized).
    """
    _info('getting all tweets')
    return list(db.tweets.find())


@lru_cache(maxsize=3)
def get_vectors(path):
    """Load word vectors stored in word2vec format and memoize them by path.

    Args:
        path: location handed to ``KeyedVectors.load_word2vec_format``; it must
            be hashable because it is the cache key.

    Returns:
        The loaded ``KeyedVectors`` object. Up to three distinct paths stay cached.
    """
    _info(f"loading fasttext vectors from {path}")
    return KeyedVectors.load_word2vec_format(path)


@lru_cache(maxsize=2**30)
def sim(tokens_a, tokens_b):
    """Memoized wrapper around ``ft_comp.n_similarity(tokens_a, tokens_b)``.

    Both arguments are used as the ``lru_cache`` key, so they must be hashable
    (e.g. the frozensets built by ``process_tweets``). The key is
    order-sensitive: ``sim(a, b)`` and ``sim(b, a)`` are cached separately.

    NOTE(review): ``ft_comp`` is not assigned in any visible cell; presumably
    it holds the vectors returned by ``get_vectors`` — confirm.
    """
    return ft_comp.n_similarity(tokens_a, tokens_b)


def mmr(docs, query, l):
    """Lazily rank ``docs`` by Maximal Marginal Relevance with respect to ``query``.

    At every step the not-yet-selected document that maximizes

        l * sim(doc, query) - (1 - l) * max(sim(doc, s) for s in selected)

    is picked (the second term is 0 while nothing has been selected yet).
    Ties are resolved in favour of the document that appears first in
    ``docs``, so the ranking is reproducible across runs.

    Args:
        docs: mapping of document id -> hashable iterable of token strings.
        query: hashable token collection the documents are compared against.
        l: trade-off weight; higher values favour relevance to the query,
            lower values favour dissimilarity to what was already selected.

    Yields:
        ``(doc_id, text)`` tuples in ranking order, where ``text`` is the
        document's tokens joined by single spaces. Every document is yielded
        exactly once; an empty ``docs`` yields nothing.
    """
    selected = set()

    def mmr_score(tweet):
        relevance = sim(docs[tweet], query)
        # Highest similarity to anything already picked; 0 for the first pick.
        redundancy = max((sim(docs[tweet], docs[y]) for y in selected), default=0)
        return l * relevance - (1 - l) * redundancy

    # Candidates are kept in the mapping's own order instead of being rebuilt
    # from a set difference on each round: this avoids repeated set
    # construction and makes tie-breaking independent of hash ordering.
    remaining = list(docs)
    while remaining:
        next_selected = max(remaining, key=mmr_score)
        remaining.remove(next_selected)
        selected.add(next_selected)
        yield next_selected, ' '.join(docs[next_selected])


@lru_cache(maxsize=total_events)
def process_tweets(event_id):
    """Tokenize the representative tweets of one event.

    Steps:
      1. Map every representative id to a tweet (when several tweets share a
         representative, the last one in ``get_tweets()`` wins).
      2. Keep the tweets whose representative belongs to ``event_id`` and
         whose ``expanded_urls`` contains no ``None`` entry (an empty
         ``expanded_urls`` passes this check).
      3. Run the tweets' text through ``nlp`` and keep, per tweet, the set of
         lowercase tokens that are not English stopwords and are present in
         ``ft_comp``.
      4. Drop tweets with an empty token set or with a token set identical to
         one already seen.

    Args:
        event_id: event identifier forwarded to ``get_representatives``.

    Returns:
        dict mapping ``str(tweet['_id'])`` to a frozenset of tokens. The
        result is memoized per ``event_id``; callers share the same dict.
    """
    all_tweets = get_tweets()
    representatives = get_representatives(event_id)

    _info("processing tweets")

    # rep_tweet: repr_id => tweet
    rep_tweet = dict()
    for t in tqdm(all_tweets):
        rep_tweet[t['representative']] = t

    # repr_ids: {repr_id} // this event
    repr_ids = {r['_id'] for r in representatives}

    # Tweets of this event whose expanded urls were all resolved (no None entry).
    tweets_this_event = [
        t for r, t in rep_tweet.items()
        if r in repr_ids and all(u is not None for u in t['expanded_urls'])
    ]

    # Build the stopword set once: the original looked the list up again for
    # every single token, which is loop-invariant work plus a linear scan.
    english_stopwords = set(stopwords.words('english'))

    tweets_tokens = dict()
    all_tokens = set()
    texts = [_t['text'] for _t in tweets_this_event]
    for tweet, doc in tqdm(zip(tweets_this_event, nlp.pipe(texts, n_threads=8)),
                           total=len(tweets_this_event)):

        tokens = frozenset(token.lower_
                           for token in doc
                           if token.lower_ not in english_stopwords and token.lower_ in ft_comp)

        # Skip empty token sets and exact duplicates of an earlier tweet.
        if tokens and tokens not in all_tokens:
            tweets_tokens[str(tweet['_id'])] = tokens
            all_tokens.add(tokens)

    return tweets_tokens


def expand_query(topics, topn=25):
    """Expand each topic's keywords with their nearest neighbours in ``ft_comp``.

    Args:
        topics: iterable of topics, each an iterable of keyword strings.
        topn: number of most-similar words requested per keyword
            (defaults to 25, the previously hard-coded value).

    Returns:
        A list with one set per topic, containing the original keywords plus
        the neighbours found for them. Keywords that are not in ``ft_comp``
        are kept in the set but contribute no neighbours, instead of aborting
        the whole expansion with a ``KeyError``.
    """
    topics_this_event = []
    for topic in topics:
        # Materialize once so one-shot iterables are not consumed twice.
        keywords = list(topic)
        expanded = set(keywords)
        for keyword in keywords:
            if keyword not in ft_comp:
                continue  # out-of-vocabulary: nothing to look up
            expanded.update(word for word, _ in ft_comp.most_similar(keyword, topn=topn))
        topics_this_event.append(expanded)
    return topics_this_event

In [8]:
# Machine-specific absolute paths; adjust before running elsewhere.
# NOTE(review): `path` is presumably the argument for get_vectors, and the
# `ft_comp` object used by sim / process_tweets / expand_query is never
# assigned in any visible cell — likely `ft_comp = get_vectors(path)` lived in
# a removed cell; confirm and restore it so Restart & Run All works.
path = Path('/home/mquezada/anchor-text-twitter/data/ft_alltweets_model.vec')
# Directory scanned below for the per-topic `*-tweet_ids_sorted_mmr.txt` files.
data_path = Path('/home/mquezada/tweet_topics/')

In [66]:
# Per-topic id files are named
#   event_<event_id>-topic_<topic_id>-tweet_ids_sorted_mmr.txt
# with one tweet id per line.
files = list(data_path.glob('event_*-topic_*-tweet_ids_sorted_mmr.txt'))

# topic_id => tweet ids, in the order they appear in the file
topics_tweetids = defaultdict(list)

# Index of every tweet by its stringified _id.
all_tweets = get_tweets()
all_tweets_d = {str(t['_id']): t for t in all_tweets}


for f_0 in files:
    print(f_0)
    # Splitting on "_" yields: 'event', '<event_id>-topic', '<topic_id>-tweet', 'ids', 'sorted', 'mmr.txt'
    _, ev, top, _, _, _ = f_0.name.split("_")
    event_id = ev.split("-")[0]
    topic_id = top.split("-")[0]
    print(event_id, topic_id)

    with f_0.open() as f:
        for line in f:
            # Strip only the line terminator: slicing off the last character
            # would truncate the final id of a file without a trailing newline.
            r_id = line.rstrip('\n')
            if r_id:  # ignore blank lines rather than recording an empty id
                topics_tweetids[topic_id].append(r_id)

/home/mquezada/tweet_topics/event_5b171726da870923dcb04791-topic_5b19af21da87097532fd0488-tweet_ids_sorted_mmr.txt
5b171726da870923dcb04791 5b19af21da87097532fd0488
/home/mquezada/tweet_topics/event_5b171725da870923dcb0478f-topic_5b184131da870950572be268-tweet_ids_sorted_mmr.txt
5b171725da870923dcb0478f 5b184131da870950572be268
/home/mquezada/tweet_topics/event_5b171726da870923dcb04790-topic_5b19ae78da870974f0f58bbe-tweet_ids_sorted_mmr.txt
5b171726da870923dcb04790 5b19ae78da870974f0f58bbe
/home/mquezada/tweet_topics/event_5b171726da870923dcb04790-topic_5b19ae8dda870974f0f58bc2-tweet_ids_sorted_mmr.txt
5b171726da870923dcb04790 5b19ae8dda870974f0f58bc2
/home/mquezada/tweet_topics/event_5b171726da870923dcb04791-topic_5b19af32da87097532fd048a-tweet_ids_sorted_mmr.txt
5b171726da870923dcb04791 5b19af32da87097532fd048a
/home/mquezada/tweet_topics/event_5b171726da870923dcb04791-topic_5b19af1bda87097532fd0487-tweet_ids_sorted_mmr.txt
5b171726da870923dcb04791 5b19af1bda87097532fd0487
/home/mque

In [212]:
from collections import Counter


class TopicLabeler:
    """Tracks labeling progress per topic and proposes which tweet to label next.

    Args:
        topics_tweetids: mapping of topic id -> sequence of tweet ids.

    Attributes are plain data:
      * ``topics_tweetids`` — the mapping given at construction.
      * ``labeled`` — Counter of how many tweets were labeled per topic.
      * ``to_label`` — per topic, the positions (indices into
        ``topics_tweetids[topic]``) that are still unlabeled, in order.
    """

    def __init__(self, topics_tweetids):
        self.topics_tweetids = topics_tweetids
        self.labeled = Counter()
        self.to_label = {topic: list(range(len(tweets))) for topic, tweets in self.topics_tweetids.items()}

    def sample(self):
        """Pick the next ``(topic, tweet_index)`` to label.

        A topic is drawn with probability proportional to the fraction of its
        tweets that is still unlabeled, so topics with fewer labels are
        favoured. Within the chosen topic the first pending index is returned.

        Returns:
            ``(topic, tweet_index)``, or ``None`` when nothing is left to
            label (including when there are no topics, or only empty ones).
        """
        keys = list(self.topics_tweetids.keys())

        weights = []
        for key in keys:
            total = len(self.topics_tweetids[key])
            if total:
                # Unlabeled fraction in [0, 1]; clamped so an inconsistent
                # counter can never produce a negative weight.
                weights.append(max(0.0, 1 - self.labeled[key] / total))
            else:
                # A topic without tweets can never be sampled (and must not
                # trigger a division by zero).
                weights.append(0.0)

        if not any(weight > 0 for weight in weights):
            return None  # done!

        # random.choices never selects zero-weight entries, so a fully labeled
        # topic cannot be returned by accident through float rounding.
        topic = random.choices(keys, weights=weights, k=1)[0]
        if not self.to_label[topic]:
            return None  # done!

        tweet = self.to_label[topic][0]
        return topic, tweet

    def label(self, topic_id, tweet_idx):
        """Mark position ``tweet_idx`` of ``topic_id`` as labeled.

        Raises:
            KeyError: if ``topic_id`` is unknown.
            ValueError: if ``tweet_idx`` is not pending for that topic
                (already labeled or out of range).
        """
        self.to_label[topic_id].remove(tweet_idx)
        self.labeled[topic_id] += 1


In [228]:
# Sanity check on toy data: label every position except ('b', 3) and confirm
# that sampling only ever proposes that remaining one.
tl = TopicLabeler({'a': ['x','y','z'], 'b': ['q','w','e','r'], 'c': ['t','y']})
already_labeled = [
    ('a', 0), ('a', 1), ('a', 2),
    ('b', 0), ('b', 1), ('b', 2),  # ('b', 3) is deliberately left unlabeled
    ('c', 0), ('c', 1),
]
for topic_key, position in already_labeled:
    tl.label(topic_key, position)

sampled = Counter(tl.sample() for _ in range(100000))
print(sampled.most_common(10))

[(('b', 3), 100000)]


In [219]:
# Rebind `tl` to a labeler over the topic => tweet-id lists read from disk
# (this replaces the toy instance used for the sanity check).
tl = TopicLabeler(topics_tweetids)

In [223]:
tl.sample()

('5b19af11da87097532fd0485', 0)

In [224]:
# Marks position 0 of this topic as labeled. The topic id was copied by hand
# from the preceding sample() output, which is random — a re-run may propose a
# different topic, so this cell is not reproducible as written.
tl.label('5b19af11da87097532fd0485', 0)

In [225]:
tl.sample()

('5b18412bda870950572be267', 0)

In [226]:
# Marks position 0 of this topic as labeled. The topic id was copied by hand
# from the preceding sample() output, which is random — a re-run may propose a
# different topic, so this cell is not reproducible as written.
tl.label('5b18412bda870950572be267', 0)

In [227]:
tl.sample()

('5b184131da870950572be268', 0)

In [91]:
# NOTE(review): `prop` is not defined in any visible cell (leftover kernel
# state; this cell fails on a fresh kernel). The recorded output equals
# 1 - x for each input, so it presumably mirrors the `1 - fr` weighting in
# TopicLabeler.sample — confirm, then restore the definition or drop this cell.
prop(np.array([1/4, 1/2, 1/3]))

array([0.75      , 0.5       , 0.66666667])