In [2]:
from pymongo import MongoClient
from bson.objectid import ObjectId
import random
import logging
from datetime import datetime
from functools import lru_cache
from paths import data_path
from pathlib import Path
from gensim.models import KeyedVectors
from tqdm import tqdm
from pprint import pprint
import spacy
from nltk.corpus import stopwords
import pyemd
import numpy as np
from collections import Counter
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
# Timestamped INFO-level logging; `_info` is the shorthand the loader
# functions use to announce what they are fetching.
logging.basicConfig(level=logging.INFO, format='%(asctime)s : %(message)s')
_info = logging.info

# Handle to the `twitter_news` database of the local MongoDB server.
client = MongoClient('mongodb://localhost:27017')
db = client['twitter_news']

counters = {}

# Also used as the lru_cache size of the per-event loaders.
total_events = 3

@lru_cache(maxsize=total_events)
def get_representatives(event_id):
    """Return every document of `db.representatives` whose 'event' is `event_id`.

    The result is memoised per `event_id`, so repeated calls with the same id
    hand back the same list object without querying the database again.
    """
    _info("getting representatives")
    query = {'event': ObjectId(event_id)}
    return list(db.representatives.find(query))


@lru_cache(maxsize=total_events)
def get_topics(event_id):
    """Return the topics of an event, split from its "Non relevant" entry.

    Args:
        event_id: identifier of the event; it is wrapped in `ObjectId` for
            the query against `db.topics`.

    Returns:
        A `(topics, comodin)` tuple. `topics` is the list of topic documents
        of the event without the first one named "Non relevant"; `comodin`
        is that removed document, or `None` when the event has no such topic
        (previously this case raised `UnboundLocalError`).

    The result is memoised per `event_id`, so callers share the same list.
    """
    _info("getting topics")
    topics = list(db.topics.find({'event': ObjectId(event_id)}))

    # Default for events that have no "Non relevant" topic.
    comodin = None
    for index, topic in enumerate(topics):
        if topic['topic_name'] == "Non relevant":
            # Pop by position instead of removing from the list while it is
            # being iterated; only the first match is taken out.
            comodin = topics.pop(index)
            break
    return topics, comodin


@lru_cache(maxsize=1)
def get_events():
    """Return all documents of `db.events` as a list (memoised after the first call)."""
    _info("getting events")
    return list(db.events.find())


@lru_cache(maxsize=1)
def get_tweets(a=None):
    """Return all documents of `db.tweets` as a list.

    `a` is not used by the body; it only takes part in the lru_cache key.
    """
    _info('getting all tweets')
    return list(db.tweets.find())


@lru_cache(maxsize=1)
def get_vectors(path):
    """Load word vectors stored in word2vec format at `path`.

    The most recently loaded model is memoised, so asking for the same
    `path` again does not re-read the file.
    """
    _info(f"loading fasttext vectors from {path}")
    return KeyedVectors.load_word2vec_format(path)

In [4]:
# Load the small English spaCy model.
# NOTE(review): whether tagger/entity/matcher=False actually switch those
# components off depends on the installed spaCy version — confirm.
nlp = spacy.load('en_core_web_sm', tagger=False, entity=False, matcher=False)

def hashtag_pipe(doc):
    """Pipeline component that merges each lone '#' token with the text after it.

    The doc is scanned for tokens whose text is exactly '#'. For each one a
    span starting at the token's `idx` and extending by the length of its
    head's text plus one is passed to `doc.merge`. After every successful
    merge the scan restarts from the beginning, because the doc has just
    been modified; the function returns once a full pass merges nothing.

    Returns the same `doc` object it received.
    """
    merged_hashtag = False
    while True:
        for token_index, token in enumerate(doc):
            if token.text == '#':
                if token.head is not None:
                    # presumably the head of '#' is the word that follows it,
                    # so the span covers '#' + that word — TODO confirm
                    start_index = token.idx
                    end_index = start_index + len(token.head.text) + 1
                    if doc.merge(start_index, end_index) is not None:
                        # The doc changed under the iterator: stop this pass
                        # and rescan from the start.
                        merged_hashtag = True
                        break
        if not merged_hashtag:
            # A whole pass without any merge: nothing left to do.
            break
        merged_hashtag = False
    return doc

# Register hashtag_pipe as an additional component of the spaCy pipeline.
nlp.add_pipe(hashtag_pipe)

In [5]:
# Load the word vectors. NOTE: the right-hand operand is an absolute path, so
# `data_path` does not contribute anything to the resulting location.
path = data_path / Path('/home/mquezada/anchor-text-twitter/data/ft_alltweets_model.vec')
vectors = get_vectors(path.as_posix())

# Fetch the full collections once and work on the second stored event.
all_tweets = get_tweets()
events = get_events()
event = events[1]

representatives = get_representatives(event['_id'])
topics, _ = get_topics(event['_id'])

# Index tweets by their 'representative' field; when several tweets share the
# same value, the last one seen is the one that is kept.
rep_tweet = {}
for tweet in tqdm(all_tweets):
    rep_tweet[tweet['representative']] = tweet

# Ids of the representatives that belong to the selected event.
rep_set = {rep['_id'] for rep in representatives}

2018-06-15 12:34:10,280 : loading fasttext vectors from /home/mquezada/anchor-text-twitter/data/ft_alltweets_model.vec
2018-06-15 12:34:10,280 : loading projection weights from /home/mquezada/anchor-text-twitter/data/ft_alltweets_model.vec
2018-06-15 12:35:13,385 : loaded (1076139, 100) matrix from /home/mquezada/anchor-text-twitter/data/ft_alltweets_model.vec
2018-06-15 12:35:13,386 : getting all tweets
2018-06-15 12:35:18,845 : getting events
2018-06-15 12:35:18,847 : getting representatives
2018-06-15 12:35:19,127 : getting topics
100%|██████████| 642251/642251 [00:00<00:00, 1174059.91it/s]


In [6]:
# Association between representatives and tweets: keep the tweets whose
# representative id belongs to the selected event.

tweets_this_event = [
    tweet
    for rep_id, tweet in rep_tweet.items()
    if rep_id in rep_set
]
len(tweets_this_event)

82626

In [7]:
# Show the name of every topic of the selected event, one per line.
for topic in topics:
    topic_name = topic['topic_name']
    print(topic_name)

Oscar Pistorius apologizes
Oscar Pistorius vomits on court
Oscar Pistorius removes his prosthesis
Psychiatric evaluation
Final arguments
Pistorius pledges innocence
Paddy Powers
Witnesses
Police under investigation
Interrogatory
Shooting in a restaurant


El problema con lo siguiente son los tópicos: las palabras definen los tweets que se van a obtener con la búsqueda.

In [8]:
# Plan for this cell:
#   - convert the topics into word vectors
#   - convert the (representative) tweets of the event into vectors:
#       * apply lowercase
#       * apply tokenization
#       * delete URLs (listed in the plan; not performed by the code below)
#   - compute the similarity between each topic and the tweet

# Build the stop-word set once: calling stopwords.words('english') for every
# token re-evaluates it each time and then does a linear lookup in the result.
english_stopwords = set(stopwords.words('english'))

rnd_tweet = random.choice(tweets_this_event)
text = nlp(rnd_tweet['text'])

# Lowercased tokens of the tweet that are in the embedding vocabulary and are
# not stop words.
tweet_tokens = [
    token.lower_ for token in text
    if token.lower_ not in english_stopwords and token.lower_ in vectors
]

# Same filtering applied to every topic name.
topic_tokens = [
    [token.lower_ for token in topic
     if token.lower_ in vectors and token.lower_ not in english_stopwords]
    for topic in nlp.pipe([t['topic_name'] for t in topics])
]

print(tweet_tokens)
print()
for tokens in topic_tokens:
    print(tokens)
    if tweet_tokens and tokens:
        print(vectors.n_similarity(tweet_tokens, tokens))
    else:
        # n_similarity cannot be computed when either token list is empty
        # (e.g. a tweet made only of stop words or out-of-vocabulary words).
        print('n/a (no in-vocabulary tokens)')
    print()

['cnn', 'world', 'neighbor', 'heard', 'lady', 'scream', 'man', 'heard', 'woman', "'s", 'voice', 'night', 'steenkamp', 'died']

['oscar', 'pistorius', 'apologizes']
0.7109102456922951

['oscar', 'pistorius', 'vomits', 'court']
0.771266765169569

['oscar', 'pistorius', 'removes', 'prosthesis']
0.7534386777219071

['psychiatric', 'evaluation']
0.5569321009896898

['final', 'arguments']
0.7287406698576708

['pistorius', 'pledges', 'innocence']
0.7636569244937059

['paddy', 'powers']
0.6548144737134705

['witnesses']
0.6771960242053019

['police', 'investigation']
0.7503976143332373

['interrogatory']
0.6773885430319018

['shooting', 'restaurant']
0.7625576741760687



Idea: hacer clustering de las palabras del vocabulario y muestrear tweets a partir de los clusters:

In [9]:
# Tokenise every tweet of the event into lowercased, in-vocabulary,
# non-stop-word tokens. `tweets_tokens[i]` is the token list of
# `tweets_this_event[i]`.

# Build the stop-word set once instead of calling stopwords.words('english')
# for every single token of every tweet.
english_stopwords = set(stopwords.words('english'))

tweets_tokens = []
# nlp.pipe processes the texts as a stream (same call used for the topic
# names); `total` lets tqdm show progress for the generator.
tweet_docs = nlp.pipe(_tweet['text'] for _tweet in tweets_this_event)
for text in tqdm(tweet_docs, total=len(tweets_this_event)):
    tweet_tokens = [
        token.lower_ for token in text
        if token.lower_ not in english_stopwords and token.lower_ in vectors
    ]
    tweets_tokens.append(tweet_tokens)

100%|██████████| 82626/82626 [11:54<00:00, 115.71it/s]


In [10]:
def _passthrough(tokens):
    """Return the already tokenised tweet unchanged."""
    return tokens


# The tweets are already lists of tokens, so both the preprocessing and the
# tokenising hooks of the vectorizer just hand their input back.
vectorizer = TfidfVectorizer(preprocessor=_passthrough, tokenizer=_passthrough)

m = vectorizer.fit_transform(tweets_tokens)
# Transposed view: one row per word instead of one row per document.
m2 = m.transpose()

In [11]:
# Cluster the vocabulary words (rows of m2) with k-means and print the
# small clusters.
all_words = np.array(vectorizer.get_feature_names())

nc = 20
# Only clusters with fewer words than this are printed in full.
max_words_shown = 20

# random_state pins the centroid initialisation so that re-running the cell
# produces the same clusters.
km = KMeans(n_clusters=nc, n_jobs=-1, n_init=30, max_iter=500, random_state=0)
km.fit(m2)

# Cluster sizes, largest first.
print(Counter(km.labels_).most_common(nc))
print()

for i in range(nc):
    words = all_words[km.labels_ == i].tolist()
    if len(words) < max_words_shown:
        pprint(words)
        print()

[(1, 16831), (13, 6), (14, 4), (8, 4), (10, 3), (7, 2), (12, 2), (2, 2), (19, 2), (3, 1), (17, 1), (11, 1), (5, 1), (4, 1), (6, 1), (18, 1), (0, 1), (15, 1), (9, 1), (16, 1)]

['murder']

['psychiatric', 'tests']

['#newslocker']

['evidence']

['court']

['girlfriend']

['cross', 'examination']

['evaluation', 'judge', 'mental', 'orders']

['prosecutor']

['oscar', 'pistorius', 'trial']

['begins']

['graphic', 'testimony']

['brother', 'car', 'carl', 'condition', 'crash', 'critical']

['arguments', 'begin', 'closing', 'final']

['news']

['vomits']

["'s"]

['live']

['reeva', 'steenkamp']



Agglomerative clustering doesn't work well in this case (the run below had to be interrupted):

In [12]:
# Alternative to k-means: complete-linkage agglomerative clustering with
# cosine affinity, asking for 20 clusters; "/tmp" is passed as the
# estimator's `memory` argument.
agg = AgglomerativeClustering(n_clusters=20, affinity="cosine", memory="/tmp", linkage="complete")

# toarray() turns m2 into a dense array before fitting.
# NOTE(review): presumably the dense words-by-tweets matrix is very large
# here — confirm whether that is why this run had to be interrupted.
agg.fit(m2.toarray())

KeyboardInterrupt: 

In [None]:
# Print the members of every agglomerative cluster that has fewer than 50 words.
for cluster_id in range(20):
    members = all_words[agg.labels_ == cluster_id].tolist()
    if len(members) >= 50:
        continue
    pprint(members)
    print()

In [None]:
# Spot check of the embeddings: the 100 vocabulary entries most similar to 'santiago'.
vectors.similar_by_word('santiago', topn=100)