# Twitter data part 2

The API basics can be found in [Part 1](./twitter_api_tweet_analysis.ipynb)

<div class=note><b>Copyright and Licensing:</b>


You are free to use or adapt this notebook for any purpose you'd like. However, please respect the [Simplified BSD License](https://github.com/ptwobrussell/Mining-the-Social-Web-2nd-Edition/blob/master/LICENSE.txt) that governs its use.</div>

In [None]:
# %load ../_data/standard_import.txt

%matplotlib inline

import json
import pickle
from collections import Counter

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import twitter

plt.style.use('seaborn-white')

In [None]:
# Load the Twitter API credentials (a dict of key/secret strings) from a local pickle file.
# NOTE(review): pickle.load can execute arbitrary code -- only load files you created yourself.
pickle_file = '../_credentials/twitter_credentials.pkl'
# The context manager closes the file handle even if unpickling fails.
with open(pickle_file, 'rb') as credentials_file:
    Twitter = pickle.load(credentials_file)

#### Authorizing an application to access Twitter account data

In [None]:
# Build the OAuth handler from the four credential entries, in the positional
# order the original call used, then open the API client with it.
auth = twitter.oauth.OAuth(*(Twitter[field] for field in (
    'Access Token',
    'Access Token Secret',
    'Consumer Key',
    'Consumer Secret',
)))

twitter_api = twitter.Twitter(auth=auth)

#### Create Yahoo! Where On Earth ID (WOEID) dictionary

The Yahoo! Where On Earth ID for the entire world is 1.  
[Find your WOE ID](http://woeid.rosselliot.co.nz)

In [None]:
def get_woeid(locations):
    """Look up Yahoo! Where On Earth IDs (WOEIDs) for one or more place names.

    Each name is requested from woeid.rosselliot.co.nz and the text of the
    first ``<td class="woeid">`` cell on the result page is used as the ID.

    Parameters
    ----------
    locations : str or list of str
        A single place name or a list of place names. Anything that is not a
        list is treated as one single location.

    Returns
    -------
    dict
        Maps every location that could be resolved to its integer WOEID.
        Locations whose lookup failed are left out (a warning is emitted for
        each one), so callers must not assume every requested key is present.
    """
    if not isinstance(locations, list):
        locations = [locations]
    if not locations:
        # Nothing to look up, so the network libraries are not needed at all.
        return {}

    # Function-local imports, as in the original cell: the notebook's import
    # cell does not provide these names.
    import warnings

    import requests
    from bs4 import BeautifulSoup

    woeids = {}
    for loc in locations:
        try:
            # The timeout keeps an unresponsive lookup site from hanging the kernel.
            response = requests.get('http://woeid.rosselliot.co.nz/lookup/' + loc,
                                    timeout=10)
            response.raise_for_status()
            results_page = BeautifulSoup(response.content, 'lxml')
            woeids[loc] = int(results_page.find_all('td', {"class": "woeid"})[0].text)
        except Exception as err:
            # Stay best-effort (skip this location) but report the failure, and
            # no longer swallow KeyboardInterrupt/SystemExit as a bare except did.
            warnings.warn('WOEID lookup failed for {!r}: {!r}'.format(loc, err))
    return woeids

In [None]:
# Place names whose WOEIDs are resolved up front.
LOCATIONS = [
    'World',
    'United States',
    'Netherlands',
    'Europe',
    'Amsterdam',
    'Abcoude',
]
# One lookup call for the whole list; the result maps place name -> WOEID.
WOEID = get_woeid(LOCATIONS)

In [None]:
# get_woeid leaves out any location it could not resolve, so read with .get()
# instead of indexing: a failed lookup is then shown rather than raising KeyError.
for loc in LOCATIONS:
    print(loc, WOEID.get(loc, 'not found'))

### Trends by tweet volume

In [None]:
def woe_trends_(location):
    """Return the trends for ``location`` as a DataFrame ordered by tweet volume.

    The place name is resolved with ``get_woeid`` and the trends are requested
    through ``twitter_api.trends.place``. The frame is indexed by trend name
    and holds a single ``volume`` column sorted in descending order.
    """
    woeid_by_name = get_woeid(location)
    response = twitter_api.trends.place(_id=woeid_by_name[location])
    rows = []
    for trend in response[0]['trends']:
        rows.append((trend['name'], trend['tweet_volume']))
    frame = pd.DataFrame(rows, columns=['trend', 'volume'])
    frame = frame.sort_values('volume', ascending=False)
    return frame.set_index('trend')

In [None]:
# Show the first ten rows of the Amsterdam trends, which woe_trends_ returns
# sorted by tweet volume in descending order.
woe_trends_('Amsterdam').head(10)

### Locality of trends

In [None]:
def trending_location(location):
    """Return the set of trend names the trends API reports for ``location``.

    ``location`` is a place name; it is resolved to a WOEID with ``get_woeid``
    before ``twitter_api.trends.place`` is queried.
    """
    woeid_by_name = get_woeid(location)
    response = twitter_api.trends.place(_id=woeid_by_name[location])
    return {trend['name'] for trend in response[0]['trends']}

In [None]:
def trending_shared(locations):
    """Return the trends that the first two places in ``locations`` have in common.

    Parameters
    ----------
    locations : sequence of str
        Place names; only ``locations[0]`` and ``locations[1]`` are used.

    Returns
    -------
    list of str
        Trend names present in both places. The list is sorted so that
        positional access such as ``result[0]`` is repeatable between runs
        (plain set iteration order is not).
    """
    first, second = locations[0], locations[1]
    # Reuse trending_location rather than repeating the lookup/fetch code here.
    return sorted(trending_location(first) & trending_location(second))

In [None]:
def trending_excl(locations):
    """Return the trends found in the first place but not in the second.

    Parameters
    ----------
    locations : sequence of str
        Place names; only ``locations[0]`` and ``locations[1]`` are used.

    Returns
    -------
    list of str
        Trend names reported for ``locations[0]`` that are absent from
        ``locations[1]``. The list is sorted so that positional access such as
        ``result[0]`` is repeatable between runs.
    """
    first, second = locations[0], locations[1]
    # Set difference is the direct form of the original ``(a ^ b) & a``.
    return sorted(trending_location(first) - trending_location(second))

In [None]:
# Collect the trend collections used below under short keys. Entries are
# evaluated top to bottom, so the API calls happen in the listed order.
trends_set = {
    'world': trending_location('World'),
    'nl': trending_location('Netherlands'),
    'amsterdam': trending_location('Amsterdam'),
    'nl&ams': trending_shared(['Netherlands', 'Amsterdam']),
    'ams^nl': trending_excl(['Amsterdam', 'Netherlands']),
}

In [None]:
# Only the last expression of a cell is displayed: the first two lines are
# evaluated but not shown. Indexing with [0] raises IndexError on an empty list.
trends_set['ams^nl']
trends_set['ams^nl'][0]
trends_set['nl&ams'][0]

In [None]:
# 'Germany' is not part of LOCATIONS; trending_shared resolves the WOEIDs it
# needs itself through get_woeid.
trending_shared(['Netherlands', 'Germany'])

### Collecting search results

Set the variable `topic` (sent to the search API as the `q` parameter) to a trending topic,
or anything else for that matter. The example query below
was a trending topic when this content was being developed
and is used throughout the remainder of this chapter.

[api docs](https://dev.twitter.com/docs/api/1.1/get/search/tweets)

In [None]:
def recursive_nodes(x, **kwargs):
    """Collect the items of ``x`` (or one field of each item) into a list.

    Parameters
    ----------
    x : sequence
        The items to walk, e.g. a list of dicts.
    key : optional keyword
        When given, ``item[key]`` is collected for every item instead of the
        item itself.
    y : list, optional keyword
        Accumulator to append to. It is extended in place and returned; a new
        list is created when it is omitted.

    Returns
    -------
    list
        The accumulator holding the collected values in input order.

    Notes
    -----
    The name is kept for compatibility, but the walk is a plain loop. The
    earlier recursive version copied the tail of ``x`` at every step and hit
    Python's recursion limit on inputs of roughly a thousand items; it also
    only recognised ``[]`` as empty, so an empty tuple raised IndexError.
    """
    key = kwargs.get('key', None)
    y = kwargs.get('y', None)

    if y is None:
        y = []

    if key is None:
        y.extend(x)
    else:
        y.extend(item[key] for item in x)
    return y

In [None]:
# Sanity check: without a key the items come back unchanged and in order.
assert recursive_nodes([1,2,3,4]) == [1, 2, 3, 4]

In [None]:
def trending_topic(topic, number=100):
    """Search for tweets matching ``topic`` and return the list of statuses.

    Parameters
    ----------
    topic : str
        The search query, passed to the search endpoint as ``q``.
    number : int, optional
        Passed to the search endpoint as ``count`` (default 100).

    Returns
    -------
    list
        The value stored under ``'statuses'`` in the search response.
    """
    response = twitter_api.search.tweets(q=topic, count=number)
    return response['statuses']

In [None]:
def trending_text(topic, number=100, head=10):
    """Return the tweets found for ``topic`` as a DataFrame sorted by retweets.

    Parameters
    ----------
    topic : str
        The search query handed to ``trending_topic``.
    number : int, optional
        How many tweets to request; forwarded to ``trending_topic``
        (it used to be accepted here but silently ignored).
    head : int, optional
        Accepted for backward compatibility only. It is not applied, so the
        full result is returned; use ``.head()`` on the result to shorten it.

    Returns
    -------
    pandas.DataFrame
        Columns ``name`` (author screen name), ``mentions`` (list of mentioned
        screen names), ``hashtags`` (list of hashtag texts), ``text`` and
        ``retweets``, ordered by ``retweets`` descending.
    """
    rows = [
        (status['user']['screen_name'],
         recursive_nodes(status['entities']['user_mentions'], key='screen_name'),
         recursive_nodes(status['entities']['hashtags'], key='text'),
         status['text'],
         status['retweet_count'])
        for status in trending_topic(topic, number=number)
    ]
    frame = pd.DataFrame(rows, columns=['name', 'mentions', 'hashtags', 'text', 'retweets'])
    return frame.sort_values('retweets', ascending=False)

In [None]:
# Use the first entry of the Amsterdam-but-not-Netherlands trends as the search
# topic (raises IndexError if that list is empty), fetch its tweets, and list
# the fields available on the first status.
topic = trends_set['ams^nl'][0]
statuses = trending_topic(topic)
statuses[0].keys()

In [None]:
# Display the complete first status; the trailing comment shows how to narrow
# the output to a single field instead.
statuses[0]#['geo']

In [None]:
# Build the tweet table for the topic. trending_text accepts `head` but does
# not apply it, so the number of rows shown is set by .head(100) below.
df = trending_text(topic, number=100, head=10)
df.head(100)

In [None]:
# Only the last expression of a cell is displayed, so this shows the hashtags;
# the mentions slice is evaluated but not shown.
df.mentions[:10]
df.hashtags[:10]

### Extracting text, screen names, and hashtags from tweets

In [None]:
# Flatten the tweet texts, the mentioned screen names and the lower-cased
# hashtags out of the raw statuses. The cells further down read these three
# lists, so they have to be defined here rather than left commented out.
status_texts = [status['text'] for status in statuses]

screen_names = [user_mention['screen_name'] for status in statuses
                                            for user_mention in status['entities']['user_mentions']]

hashtags = [hashtag['text'].lower() for status in statuses
                            for hashtag in status['entities']['hashtags']]

# Words to leave out of the frequency count (mostly Dutch function words, plus
# 'rt' and '-'). A set drops the duplicate entries and makes the membership
# test below a constant-time lookup.
stopwords = {'de', 'het', 'een', 'is', 'die', 'dat', 'dit', 'van', 'en', 'rt', 'in', 'er', 'op', 'als', 'aan', 'bij',
             'met', 'niet', 'voor', 'gaat', 'ze', 'je', 'ik', 'wij', 'staan', 'kan', 'dan', 'af', 'zoals', 'laat', 'naar',
             'meer', 'werd', 'geen', 'na', 'heeft', 'komt', 'wel', 'nog', 'over', '-'}

# Compute a collection of all words from all tweets
words = [w.lower() for t in df['text']
           for w in t.split() if w.lower() not in stopwords]
Counter(words).most_common(20)

In [None]:
# Peek at the first five tweet texts.
status_texts[0:5]

In [None]:
# Explore the first 5 items for each collection, pretty-printed as JSON.
# `json` is provided by the notebook's import cell.
for label, items in [('status text: ', status_texts),
                     ('screen names: ', screen_names),
                     ('hashtags: ', hashtags),
                     ('words: ', words)]:
    print(label, json.dumps(items[0:5], indent=1))

### Basic frequency distribution from the words in tweets

In [None]:
from collections import Counter

# `words` is a flat list of strings, so looping over it directly would build a
# Counter of the *characters* of every single word and print one table per
# word. Wrapping it in a list gives one frequency table per collection; add
# further collections to the list to get a table for each of them.
for item in [words]:
    c = Counter(item)
    print('-'*80)
    print(c.most_common()[:10]) # top 10
    

In [None]:
# Tabulate the 30 most frequent words, indexed by word, and show the top rows.
(
    pd.DataFrame(Counter(words).most_common(30), columns=['word', 'count'])
    .set_index('word')
    .head()
)

In [None]:
# Tabulate the 30 most frequent mentioned screen names and show the top rows.
(
    pd.DataFrame(Counter(screen_names).most_common(30), columns=['mentions', 'count'])
    .set_index('mentions')
    .head()
)

In [None]:
# Tabulate the 30 most frequent hashtags and show the top rows.
(
    pd.DataFrame(Counter(hashtags).most_common(30), columns=['hashtags', 'count'])
    .set_index('hashtags')
    .head()
)

### Most popular retweets

In [None]:
# One tuple per status that carries a 'retweeted_status' entry:
# (retweet count, screen name of the retweeted status' user, tweet text with
# each newline replaced by a backslash so it stays on one line).
retweets = [
    (tweet['retweet_count'],
     tweet['retweeted_status']['user']['screen_name'],
     tweet['text'].replace("\n", "\\"))
    for tweet in statuses
    if 'retweeted_status' in tweet
]

In [None]:
# Put the retweet tuples in a table, most retweeted first.
df_retweets = (
    pd.DataFrame(retweets, columns=['retweets', 'screen_name', 'text'])
    .sort_values('retweets', ascending=False)
)

In [None]:
# Show the most retweeted entries (the frame is sorted by 'retweets', descending).
df_retweets.head()

In [None]:
# Show the full text of the most retweeted entry. Positional access is used
# because the index labels depend on the fetched data: a hard-coded label such
# as 16 raises KeyError whenever that row is absent.
df_retweets['text'].iloc[0]