In [1]:
from gen_model2 import gen_model, load_data

In [52]:
from collections import defaultdict
from pathlib import Path

from tqdm import tqdm_notebook as tqdm

from gensim.models import KeyedVectors
import spacy

from numpy.linalg import norm
from scipy.stats.mstats import gmean
import numpy as np

In [7]:
export_path = Path('2019-01-08-event_data')

export_path / "hola"

PosixPath('2019-01-08-event_data/hola')

In [2]:
models = {
    'libya': gen_model('libya_hotel_tweets.tsv'),
    'pistorius': gen_model('oscar_pistorius_tweets.tsv'),
    'nepal': gen_model('nepal_tweets.tsv')
}

100%|██████████| 26331/26331 [00:00<00:00, 982546.74it/s]
100%|██████████| 112260/112260 [00:00<00:00, 986901.26it/s]
100%|██████████| 503660/503660 [00:00<00:00, 898550.56it/s]


# Event data

## tweet ids, texts, replies, rts

In [14]:
with (export_path / 'event_data.tsv').open('w') as f:
    f.write('event\ttweet_id\tretweet_id\treply_id\ttext\n')
    for event, info in tqdm(models.items()):
        for tweet_id, tweet in info['event_data'].items():
            text = tweet.text
            text = text.replace('"', "'")
            text = text.split()
            text = ' '.join(text)
            text = f'"{text}"'
            
            rt_id = tweet.retweet_id if tweet.retweet_id != "NULL" else "NA"
            rp_id = tweet.reply_id if tweet.reply_id != "NULL" else "NA"
            
            f.write(f'{event}\t{tweet_id}\t{rt_id}\t{rp_id}\t{text}\n')

100%|██████████| 3/3 [00:01<00:00,  1.95it/s]


## urls for each tweet

In [11]:
# example:
models['libya']['event_data']['560028495792050176']

Tweet(tweet_id='560028495792050176', retweet_id='560027393432502272', quote_id='NULL', reply_id='NULL', short_urls=['http://t.co/5TK7TsFd6P', 'http://t.co/jszG3p'], expanded_urls={0: 'https://www.rt.com/news/226603-libya-tripoli-gunmen-seige/', 1: None}, text='#CorinthiaHotel: Suicide bombers &amp; 5 masked gunmen attack hotel, hostages on top floor  http://t.co/5TK7TsFd6P #Lybia http://t.co/jszG3p', created_at='2015-01-27 10:56:12')

In [13]:
with (export_path / 'event_data_urls.tsv').open('w') as f:
    f.write('event\ttweet_id\turl\n')
    for event, info in tqdm(models.items()):
        for tweet_id, tweet in info['event_data'].items():
            expanded_urls = tweet.expanded_urls
            
            for url in expanded_urls.values():
                if url:
                    f.write(f'{event}\t{tweet_id}\t{url}\n')

100%|██████████| 3/3 [00:00<00:00,  6.42it/s]


# Model data

In [16]:
ft_model = KeyedVectors.load_word2vec_format('data_for_model/all_tweets_300.vec')

In [21]:
nlp = spacy.load('en', disable=["tagger", "parser", "ner"])

## Version 1: sum of vectors

In [27]:
vectors_sum = dict()

for event, comp in tqdm(components.items()):
    vectors_sum[event] = dict()
    
    for key, tweet_ids in tqdm(comp.items()):
        vec = np.zeros(300)
        for tweet_id in tweet_ids:
            tweet = models[event]['event_data'][tweet_id]
            text = tweet.text
            for word in [token.lower_ for token in nlp(text) if token.lower_ in ft_model]:
                vec += ft_model[word]
        vec = vec / norm(vec)
        vectors_sum[event][key] = vec

HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

HBox(children=(IntProgress(value=0, max=3399), HTML(value='')))

HBox(children=(IntProgress(value=0, max=9640), HTML(value='')))

  del sys.path[0]


HBox(children=(IntProgress(value=0, max=22915), HTML(value='')))




In [30]:
with (export_path / 'model_vectors_sum.tsv').open('w') as f:
    for event, vecs in tqdm(vectors_sum.items()):
        for key, vec in vecs.items():
            vec_str = '\t'.join(str(val) for val in vec)
            f.write(f'{event}\t{key}\t{vec_str}\n')


HBox(children=(IntProgress(value=0, max=3), HTML(value='')))




In [31]:
del vectors_sum

### ???

Estará bien normalizar los vectores de cada palabra al sumarlos?

```

for word in [token.lower_ for token in nlp(text) if token.lower_ in ft_model]:
                vec += (ft_model[word] / norm(ft_model[word]))        
            vecs_comp.append(vec)
        
        vecs_comp = np.mean(vecs_comp, axis=0)
        vecs_comp = vecs_comp / norm(vecs_comp)```

## Version 2: avg of vectors

In [40]:
vectors_avg = dict()

for event, comp in tqdm(components.items()):
    vectors_avg[event] = dict()
    
    for key, tweet_ids in tqdm(comp.items()):
        vecs_comp = []
        
        for tweet_id in tweet_ids:
            tweet = models[event]['event_data'][tweet_id]
            text = tweet.text
            vec = np.zeros(300)
            for word in [token.lower_ for token in nlp(text) if token.lower_ in ft_model]:
                vec += (ft_model[word] / norm(ft_model[word]))        
            vecs_comp.append(vec)
        
        vecs_comp = np.mean(vecs_comp, axis=0)
        vecs_comp = vecs_comp / norm(vecs_comp)

        vectors_avg[event][key] = vecs_comp

HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

HBox(children=(IntProgress(value=0, max=3399), HTML(value='')))

HBox(children=(IntProgress(value=0, max=9640), HTML(value='')))



HBox(children=(IntProgress(value=0, max=22915), HTML(value='')))




In [41]:
with (export_path / 'model_vectors_avg.tsv').open('w') as f:
    for event, vecs in tqdm(vectors_avg.items()):
        for key, vec in vecs.items():
            vec_str = '\t'.join(str(val) for val in vec)
            f.write(f'{event}\t{key}\t{vec_str}\n')


HBox(children=(IntProgress(value=0, max=3), HTML(value='')))




In [51]:
del vectors_avg

NameError: name 'vectors_avg' is not defined

## Version 3: weighted with tf-idf