### Retrieval tool

`~/CrisisMMD/TweetsRetrievalTool-v2.0`

Two files are in this directory: `tweet_ids.txt` and `twitter.properties`.

Run: `java -classpath TweetsRetrievalTool-2.0.jar qa.org.qcri.tweetsretrievaltool.TweetsRetrievalTool`

Hurricane Irma (line counts of the retrieved tweets and of the requested ids):

```
1418517 tweet2.json
1700788 tweet_ids.txt
```


In [20]:
from pathlib import Path
import url_expander
import spacy
import json
from tqdm import tqdm
import requests
import multiprocessing
import time
import datetime
import asyncio
import aiohttp
from typing import List
import logging
import os
from collections import namedtuple
import re
from unionfind import UnionFind
import numpy as np
from gensim.models import KeyedVectors
import pickle as pkl

# Resolve URLs

In [None]:
# Show the kernel's current working directory (IPython shell escape).
!pwd

In [None]:
# Root directory of the Irma tweet dump; the files written by the cells below
# (URL list, resolved URLs) are created under it as well.
base = Path('/home/mquezada/CrisisMMD/full_tweets/data/tweet_ids_v1/')

In [None]:
# Load the spaCy 'en' pipeline with tagger, parser and NER disabled; the URL
# extraction below only needs tokenization and the per-token `like_url` flag.
nlp = spacy.load('en', disable=["tagger", "parser", "ner"])

In [None]:
%%time

# Gather the full text of every tweet; the dump is read as one JSON object per line.
texts = []

with (base / Path('irma_tweets.json')).open() as tweets_file:
    texts.extend(json.loads(raw)['full_text'] for raw in tqdm(tweets_file))

In [None]:
%%time

# Tokenize every tweet text and keep each token that spaCy flags as URL-like.
urls = []

urls.extend(
    token.text
    for doc in tqdm(nlp.pipe(texts, n_threads=-1, batch_size=10000), total=len(texts))
    for token in doc
    if token.like_url
)

In [None]:
# Persist the extracted URLs, one per line.
urls_path = base / Path('irma_urls.txt')
with urls_path.open('w') as out:
    out.writelines(u + '\n' for u in urls)

In [None]:
#logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', level=logging.ERROR)

def resolve_url(short_url, timeout=10):
    """Follow the redirects of ``short_url`` and record where it ends up.

    Issues a HEAD request with redirects enabled. On success, one
    ``short_url<TAB>final_url<TAB>status_code`` line is appended to
    ``resolved_urls/urls_full_<pid>.txt`` under ``base``. The file name carries
    the process id, so each pool worker appends to its own file.

    Args:
        short_url: URL to resolve.
        timeout: Seconds to wait on the server before giving up. Without a
            timeout, ``requests`` may block indefinitely on an unresponsive
            host, which would stall a pool worker forever.

    Returns:
        ``(short_url, final_url, status_code)`` on success, or
        ``(short_url, None, None)`` when the request or the write failed.
    """
    try:
        response = requests.head(short_url, allow_redirects=True, timeout=timeout)

        out_dir = base / Path('resolved_urls')
        # The directory may not exist yet; creating it is idempotent and safe
        # when several workers race to do it.
        out_dir.mkdir(parents=True, exist_ok=True)

        pid = os.getpid()
        with (out_dir / f'urls_full_{pid}.txt').open('a') as f:
            f.write(f'{short_url}\t{response.url}\t{response.status_code}\n')

        return short_url, response.url, response.status_code

    except Exception as e:
        # Best effort: a failing URL must not abort the whole pool run.
        # The URL is logged together with the error so failures can be traced.
        logging.error('%s: %s', short_url, e)

    return short_url, None, None

In [None]:
%%time 

# Resolve every extracted URL with 20 worker processes. map() blocks until all
# results are in; leaving the `with` block then terminates the workers, so they
# do not linger in the kernel after this cell has finished.
with multiprocessing.Pool(processes=20) as p:
    result = p.map(resolve_url, urls)

# Load URLs and Tweets for Irma

In [2]:
# Merge the per-process result files into a single short URL -> expanded URL map.
resolved_dir = Path('/home/mquezada/CrisisMMD/full_tweets/data/tweet_ids_v1/resolved_urls/')
base_urls = resolved_dir.glob('*.txt')

short_expanded = {}

for resolved_file in base_urls:
    with resolved_file.open() as handle:
        for record in handle:
            # Each record is "short<TAB>expanded<TAB>status"; the status is not needed here.
            short_url, long_url, _status = record.split('\t')
            short_expanded[short_url] = long_url

len(short_expanded)

385055

In [3]:
# Record kept for each tweet: its own id, the ids of the tweet it retweets / quotes /
# replies to, the t.co links found in its text, their resolved targets, the text itself
# and the creation timestamp.
Tweet = namedtuple('Tweet', 'tweet_id retweet_id quote_id reply_id short_urls expanded_urls text created_at')
# Twitter short links. The dot in "t.co" is escaped so that it only matches a literal
# dot; unescaped, it would also accept hosts such as "t-co" or "tXco".
url_re = re.compile(r'(https?://t\.co/[a-zA-Z0-9]+)')
# ASCII-alphanumeric hashtags; a hashtag is matched only up to its first other character.
hashtag_re = re.compile(r'(#[a-zA-Z0-9]+)')

In [4]:
base_tweet = Path('/home/mquezada/CrisisMMD/full_tweets/data/tweet_ids_v1/irma_tweets.json')

# tweet id (as string) -> Tweet record, plus counters reported in the next cell.
event_data = {}
ignored_amount = 0
missing_urls = 0

with base_tweet.open() as tweets_file:
    for raw_line in tqdm(tweets_file, total=1418517):
        status = json.loads(raw_line)
        full_text = status['full_text']

        short_links = url_re.findall(full_text)
        hashtags = hashtag_re.findall(full_text)

        # Tweets with 4 or more hashtags, or 3 or more t.co links, are dropped.
        if len(hashtags) >= 4 or len(short_links) >= 3:
            ignored_amount += 1
            continue

        # Position of the link in the tweet -> resolved URL (None when it was never resolved).
        expanded_map = {
            position: short_expanded.get(link)
            for position, link in enumerate(short_links)
        }
        missing_urls += sum(1 for resolved in expanded_map.values() if not resolved)

        retweeted = status.get('retweeted_status')
        retweet_id = retweeted.get('id') if retweeted else retweeted

        # Ids go through str(), so an absent id is stored as the string 'None'.
        record = Tweet(
            tweet_id=str(status['id']),
            retweet_id=str(retweet_id),
            quote_id=str(status.get('quoted_status_id')),
            reply_id=str(status.get('in_reply_to_status_id')),
            short_urls=short_links,
            expanded_urls=expanded_map,
            text=full_text,
            created_at=status['created_at'],
        )
        event_data[record.tweet_id] = record

100%|██████████| 1418517/1418517 [02:06<00:00, 11212.43it/s]


In [6]:
# Report how many tweets were kept, how many links stayed unresolved and how many tweets were dropped.
for label, count in (
    ('tweets', len(event_data)),
    ('missing urls', missing_urls),
    ('ignored', ignored_amount),
):
    print(label, count)

tweets 1372366
missing urls 52178
ignored 46151


In [7]:
# Configure root logging: INFO level and above, each message prefixed with a timestamp.
logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.INFO)
# Short alias used by gen_model() for its progress messages.
_info = logging.info

def gen_model():
    """Group tweets and URLs into components with a union-find structure.

    Reads the module-level ``event_data`` mapping. Every tweet is paired with
    each of its resolved (non-empty) expanded URLs, and with the tweet it
    replies to when that tweet is also present in ``event_data``. Retweets and
    quotes are only counted for the log line; no pair is added for them here.

    Returns:
        The ``UnionFind`` instance after all pairs have been merged.
    """
    _info("create pairs (t, u) or (t, t') for each tweet t and url u or replied/retweeted tweet t'")
    replies_amount = 0
    retweets_amount = 0
    quotes_amount = 0
    missing_replies_amount = 0
    pairs = []
    for tweet_id, tweet in event_data.items():
        # extend() instead of a list comprehension used only for its side effects.
        pairs.extend((tweet_id, url) for url in tweet.expanded_urls.values() if url)

        # retweets ARE considered, due to be exact text copies of the retweeted tweet
        # Absent ids were stored through str(), hence the comparison with 'None'.
        if tweet.retweet_id != 'None':
            retweets_amount += 1
        if tweet.quote_id != 'None':
            quotes_amount += 1
        if tweet.reply_id != 'None':
            replies_amount += 1
            if tweet.reply_id in event_data:
                pairs.append((tweet_id, tweet.reply_id))
            else:
                missing_replies_amount += 1
    _info(f'total pairs: {len(pairs)}, retweets: {retweets_amount}, quotes: {quotes_amount}, replies: {replies_amount} '
          f'(missing: {missing_replies_amount})')

    ##########

    # All keys must be of the same type (in this case, strings): unionfind
    # vectorizes its operations and casts everything in the array to one type,
    # so if integers and strings are mixed it casts everything to string and
    # comparisons fail when calling uf.components().

    _info('applying union-find')
    uf = UnionFind()
    for u, v in pairs:
        uf.union(u, v)
    _info(f'total components: {len(uf.components())}')

    return uf

# Build the union-find structure over tweets and URLs from the module-level `event_data`.
uf = gen_model()

2018-10-29 16:42:02,420 : create pairs (t, u) or (t, t') for each tweet t and url u or replied/retweeted tweet t'
2018-10-29 16:42:03,417 : total pairs: 819914, retweets: 960528, quotes: 427409, replies: 41154 (missing: 37042)
2018-10-29 16:42:03,417 : applying union-find
2018-10-29 16:43:47,119 : total components: 198912


In [8]:
# Materialize the components of the union-find structure; the cells below iterate over
# them, and over the string elements (tweet ids or URLs) inside each one.
components = uf.components()

In [18]:
# Tokenizer-only spaCy pipeline (tagger, parser and NER are disabled).
nlp = spacy.load('en', disable=["tagger", "parser", "ner"])
# Pre-trained word vectors stored in word2vec text format; used below to embed tweet tokens.
# NOTE(review): the file name suggests a fastText model trained on tweets — confirm provenance.
we = KeyedVectors.load_word2vec_format('/home/mquezada/anchor-text-twitter/data/ft_alltweets_model.vec')

2018-10-29 16:49:46,874 : loading projection weights from /home/mquezada/anchor-text-twitter/data/ft_alltweets_model.vec
2018-10-29 16:50:46,685 : loaded (1076139, 100) matrix from /home/mquezada/anchor-text-twitter/data/ft_alltweets_model.vec


In [15]:
# One "document" per component: key = tuple of the component's URLs, value = its tweet ids.
docs = dict()

for component in tqdm(components):
    comp_key = []  # URLs found in this component
    comp_ids = []  # ids of the tweets found in this component

    for elem in component:
        if elem.startswith('http'):
            comp_key.append(elem)
        elif elem in event_data:
            # event_data is keyed by tweet id, so the element itself is the id.
            comp_ids.append(elem)
        else:
            # Report and skip. Previously the code printed "err" and then still
            # dereferenced the missing tweet, raising AttributeError on None.
            print("err", elem)

    # component does not have url
    if not comp_key:
        if not comp_ids:
            # Nothing usable in this component (every element was skipped above).
            continue
        # Fall back to one of the tweet ids as key. min() is used instead of an
        # unseeded random pick so that reruns produce the same keys.
        comp_key.append(min(comp_ids))

    docs[tuple(comp_key)] = comp_ids

100%|██████████| 198912/198912 [00:00<00:00, 224046.50it/s]


In [19]:
# Document key -> mean word vector over all in-vocabulary, non-URL tokens of its tweets.
vecs = {}

for doc_key, member_ids in tqdm(docs.items(), total=len(docs)):
    member_texts = [event_data[twid].text for twid in member_ids]

    word_vectors = [
        we[token.lower_]
        for parsed in nlp.pipe(member_texts, n_threads=-1)
        for token in parsed
        if not token.like_url and token.lower_ in we
    ]

    # Documents without a single known token get no vector at all.
    if word_vectors:
        vecs[doc_key] = np.array(word_vectors).mean(axis=0)

100%|██████████| 198912/198912 [02:55<00:00, 1135.51it/s]


In [21]:
save_path = Path('/home/mquezada/CrisisMMD/full_tweets/data/tweet_ids_v1/')
vecs_fn = save_path / 'irma_full_ft_vectors_from_model.pkl'
uf_fn = save_path / 'irma_full_uf_from_model.pkl'
event_data_fn = save_path / 'irma_full_event_data.pkl'

# Pickle the document vectors, the union-find structure and the tweet records, in that order.
for target, payload in ((vecs_fn, vecs), (uf_fn, uf), (event_data_fn, event_data)):
    with target.open('wb') as sink:
        pkl.dump(payload, sink)

In [22]:
# Export the vectors as TSV: a quoted key (its parts joined by ",,,") followed by the components.
tsv_path = save_path / Path('irma_full_ft_vectors_from_model.tsv')
with tsv_path.open('w') as out:
    for url_key, vec in vecs.items():
        key = '"' + ",,,".join(url_key) + '"'
        values = "\t".join(map(str, vec))
        out.write(f"{key}\t{values}\n")