In [11]:
from pathlib import Path
import json
import spacy

In [17]:
nlp = spacy.load('en', disable=["tagger", "parser", "ner"])

In [12]:
wd = Path('/home/mquezada/CCRV/CCMR/CCMR_Twitter.txt')

with wd.open() as f:
    data = json.load(f)

In [7]:
# extract urls from texts

urls = []
for obj in data:
    text = obj.get('content')
    if not text:
        print(f"tweet_id {obj['tweet_id']}")
    tokens = nlp(text)
    for token in tokens:
        if token.like_url:
            urls.append(token.text)

In [9]:
len(urls), len(data)

(16455, 15629)

In [11]:
# write urls to expand

tmp = Path('/home/mquezada/news-model-git/news-model/data_ccmr/urls.txt')

with tmp.open('w') as f:
    for url in urls:
        f.write(url + '\n')
        
# expander: 17:37 28-08-2018

In [7]:
## load expanded urls
from pathlib import Path


expanded_urls = dict()
wd = Path('/home/mquezada/news-model-git/news-model/data_ccmr/expanded_urls.tsv')
with wd.open() as f:
    for line in f:
        short, exp, _ = line.split('\t')
        expanded_urls[short] = exp


In [9]:
# missing urls
missing = 0
for s, e in expanded_urls.items():
    if e == 'None':
        missing += 1

print("missing", missing)
print("total", len(expanded_urls))

missing 188
total 14323


In [73]:
data[0]

{'image_id': ['sandyA_fake_46'],
 'timestamp': 'Mon Oct 29 22:34:01 +0000 2012',
 'label': 1,
 'content': '¿Se acuerdan de la película: “El día después de mañana”? Me recuerda a lo que está pasando con el huracán #Sandy. http://t.co/JQQeRPwN',
 'tweet_id': '263046056240115712',
 'event': 'sandy'}

In [152]:
from collections import defaultdict

docs = defaultdict(list)
labels = defaultdict(list)

event = 'sandy'

# extract urls from texts
# docs (high level)

for i, obj in enumerate(data):
    if obj.get('event') != event:
        continue
    text = obj.get('content')
    if not text:
        print(f"tweet_id {obj['tweet_id']}")
    tokens = nlp(text)
    for token in tokens:
        if token.like_url:
            url = token.text
            exp_url = expanded_urls.get(url)
            if exp_url:
                docs[exp_url].append(obj['content'])
                labels[exp_url].append(obj['label'])
            else:
                print(f"tweet_id {obj['tweet_id']} {url}")

In [136]:
len([d for d in data if d['event'] == event])

10222

In [141]:
docs

defaultdict(list,
            {'http://twitpic.com/b8lra6': ['¿Se acuerdan de la película: “El día después de mañana”? Me recuerda a lo que está pasando con el huracán #Sandy. http://t.co/JQQeRPwN'],
             'https://twitter.com/TuiterHits/status/262977282765910016/photo/1': ['@milenagimon: Miren a Sandy en NY!  Tremenda imagen del huracán. Parece el "Día de la Independencia 2" http://t.co/41jUweux REAL! RT.',
              'IMPRESIONANTE: Pensaba q era alguna película de Hollywood, pero es una foto real del huracán #Sandy en Nueva York hoy ! http://t.co/zfy5ORHl',
              'IMPRESIONANTE: Pensaba q era alguna película de Hollywood, pero es una foto real del huracán #Sandy en Nueva York hoy ! http://t.co/V8c3hvRA',
              '@milenagimon:#Sandy en NY!  Tremenda imagen del huracán. Parece el "Día de la Independencia 2" http://t.co/41jUweux SOBRECOGEDOR! RT.'],
             'http://twitpic.com/b8k8vl': ['Buena la foto del Huracán Sandy, me recuerda a la película Día de la 

In [153]:
banned_urls = [None, 'None', 'https://twitter.com/account/suspended', 'http://t.co/']

counts = sorted([len(v) for k, v in docs.items() if k not in banned_urls], reverse=True)
counts[:25]

[94,
 84,
 80,
 61,
 59,
 58,
 57,
 46,
 44,
 43,
 42,
 39,
 38,
 37,
 36,
 36,
 33,
 32,
 31,
 31,
 30,
 30,
 29,
 25,
 25]

In [159]:
for b in banned_urls:
    docs.pop(b, None)
    labels.pop(b, None)

In [32]:
from gensim.models import KeyedVectors

we = KeyedVectors.load_word2vec_format('/home/mquezada/anchor-text-twitter/data/ft_alltweets_model.vec')

In [155]:
import numpy as np

av_docs = list()
vecs = dict()

for j, (doc, texts) in enumerate(docs.items()):    
    doc_vecs = []
    for text in nlp.pipe(texts, n_threads=8):
        text_vec = [we[token.lower_] for token in text if not token.like_url and token.lower_ in we]
        doc_vecs.extend(text_vec)
        
    if len(doc_vecs) > 0:
        avg_vec = np.array(doc_vecs).mean(axis=0)
        vecs[doc] = avg_vec
        av_docs.append(j)
        

In [144]:
from sklearn.cluster import KMeans

km = KMeans(n_clusters=2)
km.fit(list(vecs.values()))

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=2, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [145]:
km.labels_

array([0, 0, 0, ..., 0, 0, 0], dtype=int32)

In [178]:
from collections import Counter
c = Counter(km.labels_)
print(c.most_common())

[(0, 5753), (1, 778)]


In [179]:
av_labels = dict()
for i, (k, v) in enumerate(labels.items()):
    if i in av_docs:
        av_labels[k] = v

print(len(km.labels_))
print(len(av_labels))

6531
6531


In [180]:
av_labels

{'http://twitpic.com/b8lra6': [1],
 'https://twitter.com/TuiterHits/status/262977282765910016/photo/1': [1,
  1,
  1,
  1],
 'http://twitpic.com/b8k8vl': [1, 1, 1, 1, 1],
 'https://www.instagram.com/p/RYKmm9shFg/': [1],
 'https://www.instagram.com/p/RYU9GDLIBg/': [1],
 'http://twitpic.com/b8usaz': [1],
 'https://www.instagram.com/p/RXrE9zkQRC/': [1],
 'https://www.instagram.com/p/RY_FO7seNB/': [1],
 'https://www.instagram.com/p/RYB9uON9LP/': [1],
 'https://www.instagram.com/p/RYGyM9w7VG/': [1],
 'https://www.instagram.com/p/RZHFt6powA/': [1],
 'https://www.instagram.com/p/RY1WQZsEsp/': [1],
 'https://www.instagram.com/p/RYIRgyrrX2/': [1],
 'https://www.instagram.com/p/RYslhrOwON/': [1],
 'https://www.instagram.com/p/RaHR2TK2td/': [1],
 'https://www.instagram.com/p/RYM5r1ydlQ/': [1],
 'https://www.instagram.com/p/RYm9oAjHVE/': [1],
 'https://www.instagram.com/p/RaOhYqBOcp/': [1],
 'https://www.instagram.com/p/RYRJ4tRmpk/': [1],
 'https://www.instagram.com/p/RX_exCRA1E/': [1],
 'https://

In [161]:
cluster_gt = defaultdict(list)

for (doc_url, gt_label), label in zip(av_labels.items(), km.labels_):
    #print(doc_url, gt_label, label)
    cluster_gt[label].extend(gt_label)


In [149]:
import numpy as np
from sklearn import metrics

def purity_score(y_true, y_pred):
    # compute contingency matrix (also called confusion matrix)
    contingency_matrix = metrics.cluster.contingency_matrix(y_true, y_pred)
    # return purity
    return np.sum(np.amax(contingency_matrix, axis=0)) / np.sum(contingency_matrix) 

In [171]:
y_true = []
y_pred = []

for (doc_url, gt_label), label in zip(av_labels.items(), km.labels_):
    y_pred.append(label)
    y_true.append(gt_label[0])

In [170]:
# check if any url has different labels in the corresp tweets

for u, l in av_labels.items():
    if not (sum(l) == 0 or sum(l) == len(l)):
        print(u, l)

In [174]:
purity_score(y_true, y_pred)

0.63589036900934

In [175]:
metrics.cluster.contingency_matrix(y_true, y_pred)

array([[2126,  252],
       [3627,  526]])