# Scraping Twitter with Python

There are a number of different Python libraries we could use, but for this example we will be using Twython. The library contains convenient methods for interacting with Twitter's RESTful API.  
[Twython Documentation](https://twython.readthedocs.io/en/latest/index.html)  
[Twitter API Documentation](https://dev.twitter.com/rest/public)  
  


#### Import packages

In [1]:
from twython import Twython
import json
import pytz
from datetime import datetime

#### Authentication
This logs you in to your app and lets Twitter keep track of you. You cannot use the API without authenticating. One reason Twitter requires authentication is so that they can limit the number of requests you make in a given time period. If you are querying a high volume of data, you will need to account for this limitation.  
Read about it [here](https://dev.twitter.com/rest/public/rate-limiting)  
  
First, you need to create API keys to access the API; you can do this [here](https://apps.twitter.com). Then, you need to copy the app keys you created on Twitter's website into the variables below. 

In [2]:
# Placeholder credentials: replace both strings with the keys of your own
# Twitter app before running the cells below.
APP_KEY = "your_app_key" 
APP_SECRET = "your_secret_key"

In [3]:
# Step 1: build a client from the app key/secret, requesting OAuth version 2.
twitter = Twython(APP_KEY, APP_SECRET,oauth_version=2)
# Step 2: use that client to obtain an access token.
ACCESS_TOKEN = twitter.obtain_access_token()
# Step 3: rebuild the client with the access token instead of the app secret.
# Every later cell talks to the API through this `twitter` object.
twitter = Twython(APP_KEY, access_token=ACCESS_TOKEN)

#### Querying the garden hose

In [4]:
# Search for tweets matching the query "sports"; the response is kept in
# `results` so the following cells can inspect it.
results = twitter.search(q="sports")

In [49]:
results.keys()

[u'search_metadata', u'statuses']

In [5]:
# Copy the tweet dicts out of the search response into their own list.
tweets = list(results["statuses"])

In [6]:
len(tweets)

15

In [7]:
tweets

[{u'contributors': None,
  u'coordinates': None,
  u'created_at': u'Thu Oct 27 21:05:50 +0000 2016',
  u'entities': {u'hashtags': [],
   u'symbols': [],
   u'urls': [{u'display_url': u'vine.co/v/MrTlImjlai1',
     u'expanded_url': u'https://vine.co/v/MrTlImjlai1',
     u'indices': [55, 78],
     u'url': u'https://t.co/VCSA5epUpK'}],
   u'user_mentions': [{u'id': 498177610,
     u'id_str': u'498177610',
     u'indices': [3, 16],
     u'name': u'Best Vines',
     u'screen_name': u'TheFunnyVine'}]},
  u'favorite_count': 0,
  u'favorited': False,
  u'geo': None,
  u'id': 791747760252973057,
  u'id_str': u'791747760252973057',
  u'in_reply_to_screen_name': None,
  u'in_reply_to_status_id': None,
  u'in_reply_to_status_id_str': None,
  u'in_reply_to_user_id': None,
  u'in_reply_to_user_id_str': None,
  u'is_quote_status': False,
  u'lang': u'en',
  u'metadata': {u'iso_language_code': u'en', u'result_type': u'recent'},
  u'place': None,
  u'possibly_sensitive': False,
  u'retweet_count': 893,

In [52]:
tweets[0].keys()

[u'contributors',
 u'truncated',
 u'text',
 u'is_quote_status',
 u'in_reply_to_status_id',
 u'id',
 u'favorite_count',
 u'entities',
 u'retweeted',
 u'coordinates',
 u'source',
 u'in_reply_to_screen_name',
 u'in_reply_to_user_id',
 u'retweet_count',
 u'id_str',
 u'favorited',
 u'user',
 u'geo',
 u'in_reply_to_user_id_str',
 u'possibly_sensitive',
 u'lang',
 u'created_at',
 u'in_reply_to_status_id_str',
 u'place',
 u'extended_entities',
 u'metadata']

In [54]:
tweets[0]

{u'contributors': None,
 u'coordinates': None,
 u'created_at': u'Tue Oct 25 06:17:33 +0000 2016',
 u'entities': {u'hashtags': [{u'indices': [0, 7], u'text': u'Karate'},
   {u'indices': [41, 51], u'text': u'Tokyo2020'}],
  u'media': [{u'display_url': u'pic.twitter.com/BSoKz4P8v6',
    u'expanded_url': u'https://twitter.com/FirstpostSports/status/790799441485713408/photo/1',
    u'id': 790799249441038338L,
    u'id_str': u'790799249441038338',
    u'indices': [97, 120],
    u'media_url': u'http://pbs.twimg.com/media/Cvl7uj-UIAIYJuw.jpg',
    u'media_url_https': u'https://pbs.twimg.com/media/Cvl7uj-UIAIYJuw.jpg',
    u'sizes': {u'large': {u'h': 630, u'resize': u'fit', u'w': 1200},
     u'medium': {u'h': 630, u'resize': u'fit', u'w': 1200},
     u'small': {u'h': 357, u'resize': u'fit', u'w': 680},
     u'thumb': {u'h': 150, u'resize': u'crop', u'w': 150}},
    u'type': u'photo',
    u'url': u'https://t.co/BSoKz4P8v6'}],
  u'symbols': [],
  u'urls': [{u'display_url': u'bit.ly/2faR1vQ',
    

#### Get an account's tweets

In [10]:
accounts = ["espn","foxsports","skepticalsports"]

In [12]:
# Pool the timeline returned for every handle in `accounts` into one flat list.
acounts_tweets = []
for account in accounts:
    results = twitter.get_user_timeline(
        screen_name=account,
        cursor=-1,
        format="json",
    )
    acounts_tweets += results

#### Get all tweets from an account from yesterday

In [13]:
def get_yesterday_tweets(handle, client=None, now=None):
    """Return the tweets `handle` posted yesterday (UTC), newest first.

    The user's timeline is read page by page, walking backwards in time,
    until a tweet older than yesterday is seen or the timeline runs out.

    Args:
        handle: Screen name of the account to read.
        client: Object exposing ``get_user_timeline(**params)``. Defaults to
            the notebook-level ``twitter`` client.
        now: Naive UTC ``datetime`` that defines "today". Defaults to
            ``datetime.utcnow()``.

    Returns:
        A list of tweet dicts whose ``created_at`` falls within yesterday,
        i.e. from 00:00:00 UTC yesterday up to (not including) 00:00:00 UTC
        today.
    """
    from datetime import timedelta

    if client is None:
        client = twitter
    if now is None:
        now = datetime.utcnow()

    today_date = datetime(now.year, now.month, now.day)
    # Subtract a timedelta instead of using `day - 1`, which is invalid on
    # the first day of a month.
    yesterday_date = today_date - timedelta(days=1)

    tweets = []
    max_id = None
    while True:
        params = {"screen_name": handle, "format": "json"}
        if max_id is not None:
            params["max_id"] = max_id
        results = client.get_user_timeline(**params)
        if not results:
            # Timeline exhausted: stop instead of requesting forever.
            return tweets
        for result in results:
            # `created_at` carries a literal +0000 offset, so the parsed value
            # is naive UTC and comparable with the bounds above.
            tweet_date = datetime.strptime(
                result["created_at"], '%a %b %d %H:%M:%S +0000 %Y')
            if tweet_date < yesterday_date:
                return tweets
            if tweet_date < today_date:
                tweets.append(result)
        # Always move past this page, even when it held only tweets from
        # today. `max_id` is inclusive, so step one below the smallest id
        # seen to avoid fetching the same tweet twice.
        max_id = min(result["id"] for result in results) - 1

In [15]:
len(get_yesterday_tweets("skepticalsports"))

TwythonError: Twitter API returned a 503 (Service Unavailable), Over capacity

#### Save tweets as json file

In [39]:
# Write one JSON file per account, named "<account>_<UTC date of this run>.json",
# containing whatever get_yesterday_tweets() returns for that account.
for account in accounts:
    filename = "{0}_{1}.json".format(account, datetime.utcnow().date())
    with open(filename, "w") as outfile:
        json.dump(get_yesterday_tweets(account), outfile)