In [3]:
import db
import models
import logging
import numpy as np
import pandas as pd
from tqdm import tqdm

from tokenizer import Tokenizer
from sklearn.decomposition import PCA

tokenizer = Tokenizer()

pd.options.display.max_colwidth = 0

logger = logging.getLogger(__name__)
logging.basicConfig(format='%(asctime)s | %(name)s | %(levelname)s : %(message)s', level=logging.INFO)

2017-12-18 12:10:19,221 | tokenizer | INFO : Loading spacy model
2017-12-18 12:10:20,410 | tokenizer | INFO : Model Loaded


In [165]:
event_id = 9
n_tweets = 10

---

# Documentos vs Tweets

Obtener **documentos** del evento `event_id` con solo los representantes:

In [166]:
event = db.get_documents(event_id, full=False)
tweets = np.array(list(map(lambda x: x[0].text, event.values())))

In [192]:
df = pd.DataFrame(np.random.choice(tweets, n_tweets))
df


Unnamed: 0,0
0,Death toll rises to at least 876 in earthquake that hit Nepal http://t.co/HrsU8nRwgv
1,"#NepalQuake death toll tops 1,400, according to Nepal's National Emergency Operation Center. http://t.co/w3DV9S1oqD http://t.co/CXAxG6Zv27"
2,Sending my love to those in Nepal and everyone affected by this earthquake.
3,SCIAF launch Nepal Earthquake Emergency Appeal http://t.co/syoyXSBqdk @HumzaYousaf @NicolaSturgeon http://t.co/L2LBsLd883
4,Terrible #earthquake in #Nepal. Just saved ourselves. Don't know how many killed. Roads are blocked already. http://t.co/qsxITz0n1P
5,"Earthquakes kill scores in Nepal and India; historic Dharahara Tower collapses, Kathmandu airport shut\n#earthquake\nhttp://t.co/HUTZi3jpAQ"
6,Facebook Safety Check connects those affected by devastating Nepal earthquake. http://t.co/lRiHiu0pdv http://t.co/N1gCs5PL1g
7,earthquake in Nepal is about 7.8 M may ALLAH bless them
8,"#ModiMinistry Quake magnitude upgraded to 7.9, only 2km deep http://t.co/dmXQOqp7IM"
9,#PrayForNepal A 7.9 earthquake has killed more than 480 people. Pls join me in praying for #Nepal http://t.co/pHmrRMb5AG


---

Obtener **tweets** del evento `events_id`:

In [168]:
event2 = db.get_tweets(event_id)
tweets = np.array(list(map(lambda t: t.text, event2)))

In [182]:
df = pd.DataFrame(np.random.choice(tweets, n_tweets))
df

Unnamed: 0,0
0,"At least 688 killed in Nepal earthquake, official says. http://t.co/E8Fh03tnSi http://t.co/kunrRmzMV2"
1,Appeal for relief of Nepal Earth quake victims http://t.co/KUZ2tS7hlw #Nepal #Earthquake #SevaBharathi
2,Google has launched a person finder tool to help locate those affected by the earthquake in Nepal. http://t.co/zJTZO9uenm
3,#PrayforKathmandu a man being rescued after a massive #Earthquake in #Nepal #NepalEarthquake all we can do is #Pray http://t.co/VmhYEu2Fe7
4,"7.5 magnitude earthquake hits Nepal, tremors felt in India, Pakistan\nThe quake hit around noon on Saturday and... http://t.co/dmJqEhdUr5"
5,"BBC News - Nepal earthquake: Hundreds die, many feared trapped http://t.co/HnxkT0GbaL"
6,"MORE: Nepal police say at least 1,130 dead in massive earthquake centered outside of Kathmandu: http://t.co/SaRRmdOzgs"
7,GoI did a good job. The arrangements were good too: Anand Rao (Indian rescued from Nepal #earthquake) http://t.co/sQEGrnu6Hh
8,Israel News | At least 450 died in a Nepal earthquake - JerusalemOnline http://t.co/AmxGDYp0ZZ
9,Praying for everyone affected by the earthquake in Nepal


---

# Word Embeddings

## Average embedding

In [193]:
from pathlib import Path
from gensim.models import KeyedVectors

we = Path('/home/mquezada/phd/multimedia-summarization/data/word_embeddings/ft_alltweets_model.vec')
model = KeyedVectors.load_word2vec_format(we.as_posix())

2017-12-18 16:33:28,552 | summa.preprocessing.cleaner | INFO : 'pattern' package not found; tag filters are not available for English
2017-12-18 16:33:28,563 | gensim.models.keyedvectors | INFO : loading projection weights from /home/mquezada/phd/multimedia-summarization/data/word_embeddings/ft_alltweets_model.vec
2017-12-18 16:34:42,727 | gensim.models.keyedvectors | INFO : loaded (1076139, 100) matrix from /home/mquezada/phd/multimedia-summarization/data/word_embeddings/ft_alltweets_model.vec


### DB Tweets

In [246]:
event_id = 8

event = db.get_documents(event_id, full=False)
tweets = np.array(list(map(lambda x: ' '.join(x[0].text.split()), event.values())))
#tweets = np.array(list(event.values()))

In [239]:
selected = np.random.choice(tweets, 100)

In [247]:
pd.DataFrame(tweets).to_csv('texts.tsv', sep='\t', header=False, index=False)

In [251]:
vocabs = []

for tweet_list in tqdm(selected):
    tweet_vocab = set()
    for tweet in tweet_list:
        for token in tokenizer.tokenize(tweet.text):
            tweet_vocab.add(token)
    vocabs.append(tweet_vocab)

  0%|          | 0/61 [00:00<?, ?it/s]


AttributeError: 'str' object has no attribute 'text'

### Tweets escogidos a mano

In [311]:
with open('libya_selected.tsv') as f:
    labels, selected = [], []
    for line in f:
        tokens = line.split()
        labels.append(tokens[0])
        selected.append(' '.join(tokens[1:]))

selected[0]

'Gunmen possibly linked to Islamic State attack hotel popular with foreigners in Libyan capital Tripoli -officials http://t.co/dNG1ykqJi0'

In [312]:
vectors = []
texts = []
for label, tweet in tqdm(zip(labels, selected)):
    vector = []
    for token in tokenizer.tokenize(tweet):
        if token in model:
            vector.append(model[token])
    vector = np.mean(vector, axis=0)
    if not np.isnan(vector).any():
        vectors.append(vector)
        texts.append((tweet, label))


61it [00:00, 151.15it/s]


In [313]:
print(texts[0])

('Gunmen possibly linked to Islamic State attack hotel popular with foreigners in Libyan capital Tripoli -officials http://t.co/dNG1ykqJi0', '1')


In [314]:
pd.DataFrame(texts).to_csv('libya_labels_avg.tsv', sep='\t', header=False, index=False)
pd.DataFrame(vectors).to_csv('libya_vectors_avg.tsv', sep='\t', header=False, index=False)

---

# Discourse Vectors

In [281]:
from collections import Counter

freqs = Counter()
with open('/home/mquezada/phd/multimedia-summarization/data/word_embeddings/wordfrequencies_relative.tsv') as f:
    for line in f:
        word, freq = line.split()
        freqs[word] = float(freq)

In [315]:
# discourse

def discourse(labels, texts, alpha):
    vectors = list()
    final_labels = list()
    final_texts = list()
    
    for label, text in tqdm(zip(labels, texts)):
        tweet_vector = []
        for token in tokenizer.tokenize(text):
            if token in model and token in freqs:
                vector = model[token]
                prob = freqs[token]
                tweet_vector.append((alpha / (alpha + prob)) * vector)

        if tweet_vector:
            vectors.append(np.mean(tweet_vector, axis=0))
            final_labels.append(label)            
            final_texts.append(text)
            
    #pca = PCA(n_components=1)
    #pca.fit(np.array(list(vectors.values())))
    #u = pca.components_
    
    final_vectors = []
    for vector in tqdm(vectors):
        final_vectors.append(vector) # - u.T.dot(u).dot(vector))
        
    return final_labels, final_texts, final_vectors
            

In [316]:
disc_l, disc_t, disc_v = discourse(labels, selected, 0.001)

61it [00:00, 162.11it/s]
100%|██████████| 61/61 [00:00<00:00, 491079.74it/s]


In [304]:
disc_l

['1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '2',
 '2',
 '2',
 '2',
 '2',
 '2',
 '2',
 '2',
 '2',
 '2',
 '2',
 '2',
 '2',
 '2',
 '2',
 '2',
 '2',
 '2',
 '2',
 '2',
 '2',
 '2',
 '2',
 '3',
 '3',
 '3',
 '3',
 '3',
 '3',
 '4',
 '4',
 '4',
 '4',
 '4',
 '4',
 '4',
 '4',
 '4',
 '5',
 '5',
 '5',
 '5',
 '5',
 '5',
 '6']

In [317]:
texts = []
rows = []
for label, text, vector in tqdm(zip(disc_l, disc_t, disc_v)):
    row = []
    for dim in vector:
        row.append(dim)
    rows.append(row)
    texts.append((text, label))
    
vectors = pd.DataFrame(rows)

pd.DataFrame(texts).to_csv('libya_labels_disc.tsv', sep='\t', header=False, index=False)
pd.DataFrame(vectors).to_csv('libya_vectors_disc.tsv', sep='\t', header=False, index=False)

#d.to_csv('sample_100_disc.tsv', sep='\t', header=False, index=False)

61it [00:00, 33928.20it/s]


In [92]:
#sorted(freqs.items(), key=lambda x: x[1])

with open('freqs.tsv', 'w') as f:
    for w, fr in freqs.items():
        f.write(f'{repr(w)}\t{fr}\n')

# Datos completos

## Libya hotel

Todos los tweets

In [323]:
event2 = db.get_tweets(event_id)
tweets = np.array(list(map(lambda t: t.text, event2)))

tweets.shape

(28640,)

Remover duplicados:

In [325]:
texts = [' '.join([token for token in tokenizer.tokenize(t)]) for t in tweets]       

In [327]:
texts_set = set(texts)

In [329]:
print(len(texts))
print(len(texts_set))

28640
6975


In [331]:
vectors = []
final_texts = []
for t in tqdm(texts_set):
    vector = [model[token] for token in t.split() if token in model]
    if vector:
        vectors.append(np.mean(vector, axis=0))
        final_texts.append(t)

100%|██████████| 6975/6975 [00:00<00:00, 24017.23it/s]


In [332]:
print(len(vectors))
print(len(final_texts))

6972
6972


In [333]:
pd.DataFrame(final_texts).to_csv('libya_labels_avg_full.tsv', sep='\t', header=False, index=False)
pd.DataFrame(vectors).to_csv('libya_vectors_avg_full.tsv', sep='\t', header=False, index=False)


discourse:

In [336]:
vectors = list()
final_texts = list()

alpha = 0.001

for text in tqdm(texts_set):
    tweet_vector = []
    for token in text.split():
        if token in model and token in freqs:
            vector = model[token]
            prob = freqs[token]
            tweet_vector.append((alpha / (alpha + prob)) * vector)

    if tweet_vector:
        vectors.append(np.mean(tweet_vector, axis=0))
        final_texts.append(text)

pd.DataFrame(final_texts).to_csv('libya_labels_disc_full.tsv', sep='\t', header=False, index=False)
pd.DataFrame(vectors).to_csv('libya_vectors_disc_full.tsv', sep='\t', header=False, index=False)


100%|██████████| 6975/6975 [00:00<00:00, 17208.63it/s]


avg fasttext con los documentos:

In [342]:
final_texts, vectors = [], []

with open('texts.tsv') as f:
    for line in f:
        vector = [model[token] for token in tokenizer.tokenize(line) if token in model]
        if vector:
            final_texts.append(line)
            vectors.append(np.mean(vector, axis=0))
        

In [343]:
pd.DataFrame(final_texts).to_csv('libya_labels_avg_docs.tsv', sep='\t', header=False, index=False)
pd.DataFrame(vectors).to_csv('libya_vectors_avg_docs.tsv', sep='\t', header=False, index=False)
