In [7]:
import twitter
import yaml
import json
import urllib

import pandas as pd
import numpy as np
from collections import Counter
from pprint import pprint

from IPython.core.display import HTML
from IPython.display import display

credentials = yaml.load(open('./api_cred.yml'))
auth = twitter.oauth.OAuth(credentials['OAUTH_TOKEN'], 
                           credentials['OAUTH_TOKEN_SECRET'],
                           credentials['CONSUMER_KEY'],
                           credentials['CONSUMER_SECRET'])
twitter_api = twitter.Twitter(auth=auth)

In [8]:
num_tweets = 500
max_tweetsallowed = 150
n = num_tweets/max_tweetsallowed
prev_id=''
labels = ['screen_name', 'timestamp', 'friends_count', 'hashtags', 'usermentions', 'retweet_count', 'favorite_count', 'lang', 'text']
df  = pd.DataFrame(columns=labels)
for x in range (0, n):
    search_results = twitter_api.search.tweets(q='#panamapapers', count=max_tweetsallowed, max_id=prev_id)
    statuses = search_results['statuses']
    # TBD: extract into JSON file
    # iterate through statuses and extract information
    for s in statuses: 
        screen_name = s['user']['screen_name'] 
        timestamp = s['created_at']
        friends_count = s['user']['friends_count']
        user_mentions = [user_mention['screen_name'] for user_mention in s['entities']['user_mentions']]
        hashtags = [hashtag['text'] for hashtag in s['entities']['hashtags']]
        words = [w for w in s['text'].split()]
        text = s['text']
        lang = s['lang']
        retweet_count = s['retweet_count']
        favorite_count = s['favorite_count']
        # transform any list data into text
        user_mentions_text = ','.join(user_mentions)
        hashtags_text = ','.join(hashtags)
        words_text = ','.join(words)
        # create a dataframe for each transformed status update
        df = df.append({'screen_name': screen_name, 'timestamp': timestamp, 'friends_count': friends_count,
                        'hashtags':hashtags_text, 'usermentions':user_mentions_text,
                        'retweet_count':retweet_count, 'favorite_count':favorite_count,
                        'lang':lang,'text': text, 'words': words_text},ignore_index=True)                    
    prev_id = statuses[-1]['id']-1
print len(df)
df.head()

300


Unnamed: 0,screen_name,timestamp,friends_count,hashtags,usermentions,retweet_count,favorite_count,lang,text,words
0,LeCourvois,Sat Apr 09 20:19:04 +0000 2016,662,panamapapers,maxkeiser,104,0,en,RT @maxkeiser: Only 14 hours till UK media pum...,"RT,@maxkeiser:,Only,14,hours,till,UK,media,pum..."
1,fadopazo,Sat Apr 09 20:19:04 +0000 2016,1614,PanamaPapers,oxfam_es,16,0,es,RT @oxfam_es: Los países de ALC no recaudan su...,"RT,@oxfam_es:,Los,países,de,ALC,no,recaudan,su..."
2,Maestro_Yodo,Sat Apr 09 20:19:02 +0000 2016,431,panamapapers,,0,0,fr,#panamapapers \nCharlie Hebdo https://t.co/LxN...,"#panamapapers,Charlie,Hebdo,https://t.co/LxNtJ..."
3,4AllSoulKind,Sat Apr 09 20:19:02 +0000 2016,148,panamapapers,dunno_nuffing,9,0,en,RT @dunno_nuffing: OMG Putin did it\nOMG Soros...,"RT,@dunno_nuffing:,OMG,Putin,did,it,OMG,Soros,..."
4,RemyKyd,Sat Apr 09 20:19:01 +0000 2016,675,PanamaPapers,ianbremmer,1026,0,en,RT @ianbremmer: The circle just keeps growing\...,"RT,@ianbremmer:,The,circle,just,keeps,growing,..."


In [9]:
# save file to csv
print df.columns
print len(df)
df.to_csv("data/panamapapers.tsv", sep="\t", encoding='utf-8')
df.to_json("data/panamapapers.json", orient='records')

Index([u'screen_name', u'timestamp', u'friends_count', u'hashtags',
       u'usermentions', u'retweet_count', u'favorite_count', u'lang', u'text',
       u'words'],
      dtype='object')
300


In [11]:
for item,title in [(df.words, "Words"), (df.screen_name, "Screen Names @"), (df.hashtags, "Hashtags #")]:
    c = Counter(item)
    df = pd.DataFrame(c.most_common()[:10]).T 
    df.index = ["Text", "count"]
    print "Top 10 Most Common {0}:".format(title)
    display(df) 

Top 10 Most Common Words:


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
Text,"RT,@Snowden:,#Iceland's,protest,photos,look,li...","RT,@ianbremmer:,The,best,explanation,of,the,we...","RT,@bbcmundo:,#PanamaPapers:,6,formas,en,las,q...","RT,@dunno_nuffing:,OMG,Putin,did,it,OMG,Soros,...","RT,@sigridurtul:,Police,estimates,that,5,500,p...","RT,@YourAnonCentral:,#PanamaLeaks,#PanamaPaper...","RT,@bbcmundo:,Cómo,se,sienten,los,panameños,po...","RT,@RTUKnews:,'Cameron:,We're,all,in,this,toge...","RT,@jeremycorbyn:,I,was,interviewed,on,Sky,abo...","RT,@PeriodismoLibre:,#URGENTE:,Miles,de,britán..."
count,15,7,6,5,5,5,3,3,3,3


Top 10 Most Common Screen Names @:


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
Text,levantateCagonq,MMahmadmm,CresppSandra,JetLifeSec,jeffrey_ventre,7575sd,qennin53,ENDIKA59,mfamairani,NuytsPaul
count,3,3,2,2,2,2,2,2,2,2


Top 10 Most Common Hashtags #:


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
Text,PanamaPapers,panamapapers,"Iceland,PanamaPapers",PanamáPapers,"Iceland,panamapapers","PanamaPapers,WeeklySpread,STPromo","URGENTE,PanamaPapers","PanamaLeaks,PanamaPapers,Refugees,Iceland,Resi...","Cameron,panamapapers,ResignDavidCameron",Panamapapers
count,81,69,16,7,6,6,5,5,3,3


In [15]:
# Hashtags most common inbetween 
query = twitter_api.search.tweets(q='#Refugees', count=5)
query_statuses = query['statuses']
query_hashtags = [ hashtag['text'] for status in query_statuses for hashtag in status['entities']['hashtags'] ]

query_hashtags_set = set([hashtag for hashtag in query_hashtags])
nflstream_hashtags_set = set([hashtag for hashtag in hashtags])
common_hashtags = query_hashtags_set.intersection(nflstream_hashtags_set)
common_hashtags

{u'PanamaPapers'}