## Import Twitter data using Tweepy

In [40]:
## Import Tweepy to analyze Twitter data with Python
import tweepy
from string import punctuation
## Import the Natural Language Toolkit
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

## Connect to Twitter Data using the Twitter API

In [29]:
# Connect Jupyter to your Crowdmark API key.
# Please place your Crowdmark API key somewhere and link to it by adjusting the route below.
# The API key allows the computer hosting your Jupyter notebook to programmatically access data from Crowdmark.
with open("/home/jcollian/.twitter-keys", 'r') as f:
    Twitter_Keys = f.read()
# apiKey

In [32]:
type(Twitter_Keys)

str

Hack: Define a dictionary with `keys = {...}` using the content from Twitter_Keys. The dictionary will populate your Twitter keys below.

In [33]:
# == OAuth Authentication ==
#
# This mode of authentication is the new preferred way
# of authenticating with Twitter.
# Source: https://github.com/tweepy/tweepy/blob/master/examples/oauth.py

# The consumer keys can be found on your application's Details
# page located at https://dev.twitter.com/apps (under "OAuth settings")

consumer_key=keys[consumer_key]
consumer_secret= keys[consumer_secret]

# The access tokens can be found on your applications's Details
# page located at https://dev.twitter.com/apps (located
# under "Your access token")
access_token=keys[access_token]
access_token_secret=keys[access_token_secret]

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.secure = True
auth.set_access_token(access_token, access_token_secret)

api = tweepy.API(auth)

# If the authentication was successful, you should
# see the name of the account print out
print(api.me().name)

James Colliander


## Select the Twitter Account to Analyze

In [61]:
screenName = 'pimsmath'
tweetsPerRetrieval = 50

# return list of Status object instances
data = api.user_timeline(
    screen_name=screenName,
    count=tweetsPerRetrieval,
    tweet_mode='extended', # avoid truncation
    include_rts=True # include retweets
)

In [62]:
# Collect recent tweets
tweets = [] + data # extend first tweets
lastIndex = tweets[-1].id - 1 # get last tweet index

# recover tweets
while len(data) > 0:
    data = api.user_timeline(
        screen_name=screenName,
        count=tweetsPerRetrieval,
        max_id=lastIndex, # prevent duplicates
        tweet_mode='extended',
        include_rts=True,
    )
    tweets += data
    lastIndex = tweets[-1].id - 1
print('Retrieved %d tweets!' % len(tweets))

Retrieved 1581 tweets!


In [63]:
# isolate mentions, get names
mentions = [m['screen_name']
                for t in tweets
                    for m in t.entities['user_mentions']]
mentions[:5] # slice first five

['colliand', 'UBCStatistics', 'UBC', 'UBCDSI', 'uvic']

In [71]:
# isolate media, get each URL
remedia = [m['media_url']
               for t in tweets
                   if hasattr(t, 'retweeted_status')
                   and 'media' in t.retweeted_status.entities
                       for m in t.retweeted_status.entities['media']]

media = remedia + [m['media_url']
                       for t in tweets
                           if 'media' in t.entities 
                               for m in t.entities['media']]
media[:10]

['http://pbs.twimg.com/media/DYvkYukWAAAnHzS.jpg',
 'http://pbs.twimg.com/media/DYv6eUVX0AAzNv2.jpg',
 'http://pbs.twimg.com/media/DYa1hn2VMAAx855.jpg',
 'http://pbs.twimg.com/media/DX4JltvWsAMQzNN.jpg',
 'http://pbs.twimg.com/media/DXx8nhZVAAA7aDP.jpg',
 'http://pbs.twimg.com/media/DXjpUuMUMAALaox.jpg',
 'http://pbs.twimg.com/media/DXD5dcEVoAAe0Uv.jpg',
 'http://pbs.twimg.com/media/DWaa1OEVoAAeJ1j.jpg',
 'http://pbs.twimg.com/media/DWa3Q-zUMAAF-tF.jpg',
 'http://pbs.twimg.com/media/DWGNfuNX4AA4C6Z.jpg']

In [70]:
# prefer retweet data, get tweet text
text = [t.retweeted_status.full_text
            if hasattr(t, 'retweeted_status')
            else t.full_text
                for t in tweets]
text[:10]

['A poster session showcasing work in @UBCStatistics will take place @UBC next Tuesday. https://t.co/I34ZfHY1Ie FYI @UBCDSI @ubcprez @pimsmath @ubcmath',
 'Today at @uvic, a PIMS-UVic Distinguished Lecture from Leslie Hogben of @IowaStateU &amp; @AIMathematics:\n"The Inverse Eigenvalue Problem of a Graph"\nLecture details here: https://t.co/uskOJS0BKD\n@pimsmath @UVicScience https://t.co/dO4JjH45QE',
 'Congratulations! Robert Langlands, who developed one of the most original insights of 20th-century #mathematics, was named the winner of the 2018 @abel_prize at a ceremony in Norway this morning. https://t.co/ddwAk3kJPj https://t.co/2rV4Z5zy1p',
 'PIMS lecture: Troy Day: tomorrow at 4 p.m. in Robert Schultz Theatre #StatsUmanSeminar https://t.co/8lX0rPjgd6',
 'Learn where Mathematics meets Evolutionary Biology tomorrow at PIMS @umanitoba in a Distinguished Lecture with Troy Day of @queensu \n"The Mathematics of Social Evolution"\nLecture details here: https://t.co/V1d13zKOGA\n@pimsmath @

In [72]:
# tokenize words
tokens = []
for t in text:
    for w in t.replace('\n', '').lower().split(' '): # remove newlines, split on spaces
        try:
            if (
                not (w.startswith('http') or w.startswith('@')) # ignore if URL or mention
                and w not in stopwords.words('english') # ignore stopwords
                and str(w).translate(punctuation) != '' # ignore punctuation
                and w != u''): # ignore empties
                    tokens.append(str(w).translate(punctuation))
        except UnicodeEncodeError: pass # ignore if not ascii
tokens[:10]

['poster',
 'session',
 'showcasing',
 'work',
 'take',
 'place',
 'next',
 'tuesday.',
 'fyi',
 'today']

In [75]:
# get word frequency
freq = dict((t, tokens.count(t)) for t in tokens)
sorted(freq, key=freq.get)[-20:] # slice frequent words

['education',
 'available',
 'workshop',
 'school',
 'event',
 'summer',
 'week',
 'mathematics',
 'here:',
 'seminar',
 '#ubc:',
 'distinguished',
 '-',
 'w/',
 'today',
 'lecture',
 'new',
 '&amp;',
 'math',
 'pims']