In [1]:
import tweepy
import json
import time
import datetime
import os

In [3]:
# 180 request / 15 mins window

def _load_properties(path):
    """Parse a ``KEY=VALUE`` properties file into a dict.

    Blank lines, lines whose first non-blank character is ``#`` and lines
    without an ``=`` are skipped. Only the first ``=`` separates key from
    value, so values may themselves contain ``=``. Keys and values are
    stripped of surrounding whitespace.
    """
    parsed = {}
    # `with` guarantees the file handle is closed once parsing is done.
    with open(path) as properties_file:
        for raw_line in properties_file:
            entry = raw_line.strip()
            if not entry or entry.startswith('#') or '=' not in entry:
                continue
            key, value = entry.split('=', 1)
            parsed[key.strip()] = value.strip()
    return parsed


# Credentials are read from a local file so they never appear in the notebook.
properties = _load_properties('elections.properties')

auth = tweepy.OAuthHandler(properties["CONSUMER_KEY"], properties["CONSUMER_SECRET"])
auth.set_access_token(properties["OAUTH_TOKEN"], properties["OAUTH_TOKEN_SECRET"])
api = tweepy.API(auth)

In [25]:
QUERIES_DIR = 'queries/'


def save_tweets(query, tweets_container, directory=None):
    """Write ``tweets_container`` as JSON to a timestamped file.

    The file is created under ``QUERIES_DIR + query + '/'`` or, when
    ``directory`` is given, one level deeper in a sub-directory of that name.
    Missing directories are created. The file name is the current local time
    in ISO format followed by ``.json``.
    """
    timestamp = datetime.datetime.now()

    # Assemble the target directory piece by piece; it always ends with '/'.
    path_parts = [QUERIES_DIR, query, '/']
    if directory:
        path_parts.extend([directory, '/'])
    target_dir = ''.join(path_parts)

    if not os.path.exists(target_dir):
        os.makedirs(target_dir)

    target_file = target_dir + timestamp.isoformat() + '.json'
    with open(target_file, 'w') as out:
        json.dump(tweets_container, out)


def get_tweets(query, limit):
    """Collect up to ``limit`` recent English tweets matching ``query``.

    Results are paged through ``tweepy.Cursor`` over ``api.search`` and
    returned as a dict mapping each tweet's ``id`` to its JSON payload.
    Collection stops as soon as ``limit`` distinct tweets are stored, when the
    search results run out, or at the first API error (which is printed).
    A non-positive ``limit`` returns an empty dict without querying.
    """
    tweets_container = {}

    # Nothing to collect; also avoids issuing a request we would discard.
    if limit <= 0:
        return tweets_container

    # Code adapted from
    # https://github.com/tweepy/tweepy/issues/197
    # (count=100 is probably unnecessary in our case)
    #
    # Cursor params are passed through to the wrapped method (here api.search).
    # Cursor lets us keep retrieving tweets without worrying about page limits.
    #
    # api.search arguments are:
    #
    #     :reference: https://dev.twitter.com/rest/reference/get/search/tweets
    #     :allowed_param:'q', 'lang', 'locale', 'since_id', 'geocode',
    #      'max_id', 'since', 'until', 'result_type', 'count',
    #       'include_entities', 'from', 'to', 'source']
    #
    # To handle the rate limit:
    # http://docs.tweepy.org/en/v3.5.0/code_snippet.html#handling-the-rate-limit-using-cursors

    def limit_handled(cursor, query):
        """Yield items from ``cursor``; yield ``None`` after an API error."""
        while True:
            try:
                item = cursor.next()
            except StopIteration:
                # Normal end of results: finish quietly instead of
                # reporting it as a rate-limit problem.
                return
            # TODO: narrow this to tweepy's rate-limit exception once the
            # right class for the installed version is confirmed.
            except Exception as e:
                print("Rate Limit Error for %s: %s" % (query, e))
                # Could instead wait for the window to reset:
                # time.sleep(15 * 60)
                yield None
            else:
                yield item

    tweepy_cursor = tweepy.Cursor(api.search,
                                  q=query,
                                  count=100,
                                  result_type="recent",
                                  include_entities=True,
                                  lang="en")

    for tweet in limit_handled(tweepy_cursor.items(), query):
        if not tweet:
            break
        # Round-trip through JSON to get a plain, independent copy of the payload.
        tweet_json = json.loads(json.dumps(tweet._json))
        tweets_container[tweet_json['id']] = tweet_json

        # Stop once enough distinct tweets are stored. `>=` (rather than a
        # modulo test) also terminates for non-integer limits.
        size = len(tweets_container)
        if size >= limit:
            print(size)
            break

    return tweets_container

def get_and_save_tweets(since=None, until=None):
    """Run every search term worldwide and once per country, saving each result.

    When both ``since`` and ``until`` are given they are appended to each
    search term as ``since:`` / ``until:`` operators. For every term the
    unrestricted results are saved first, then one result set per entry in
    the place table, each in a sub-directory named after the country.
    """
    search_terms = ["#DonaldTrump OR #HillaryClinton OR Trump OR Clinton", "#Elections2016", "#ElectionDay"]
    # Country name -> id used with the `place:` search operator.
    place_ids = {
        "USA": "96683cc9126741d1", "Canada": "3376992a082d67c7", "France": "f3bfc7dcc928977f", "Denmark": "c29833e68a86e703",
        "Mexico": "25530ba03b7d90c6", "Brazil": "1b107df3ccc0aaa1", "Germany": "fdcd221ac44fa326", "China": "4797714c95971ac1",
        "UK": "6416b8512febefc9", "Russia": "5714382051c06d1e", "Cuba": "ac26bceca6c10474", "Australia": "3f14ce28dc7c4566",
        "Sweden": "82b141af443cb1b8", "India": "b850c1bfd38f30e0", "UAE": "3f63906fc8aa5a7d", "South Africa": "dd9c0d7d7e07eb49",
    }

    # One unrestricted search per term plus one per country.
    searches_per_term = len(place_ids) + 1
    total_searches = len(search_terms) * searches_per_term
    print(total_searches)

    results_per_request = 100
    print(total_searches * results_per_request)

    # Share the 180-requests-per-window budget evenly across all searches.
    per_search_limit = (180 // total_searches) * results_per_request

    for term in search_terms:
        if since and until:
            term += " since:" + since + " until:" + until
        print(term)
        worldwide_tweets = get_tweets(term, limit=per_search_limit)
        save_tweets(term, worldwide_tweets)

        for country, place_id in place_ids.items():
            print("%s %s" % (term, country))
            country_tweets = get_tweets(term + " place:" + place_id, limit=per_search_limit)
            save_tweets(term, country_tweets, directory=country)

In [26]:
# Optional date window: get_and_save_tweets only applies it when BOTH values
# are passed, e.g. get_and_save_tweets(since, until).
# since = "2016-11-1"
# until = "2016-11-2"

# Run every search with no date restriction.
get_and_save_tweets()

51
5100
#DonaldTrump OR #HillaryClinton OR Trump OR Clinton
300
#DonaldTrump OR #HillaryClinton OR Trump OR Clinton Canada
Rate Limit Error for #DonaldTrump OR #HillaryClinton OR Trump OR Clinton place:3376992a082d67c7: Twitter error response: status code = 429
#DonaldTrump OR #HillaryClinton OR Trump OR Clinton Brazil
Rate Limit Error for #DonaldTrump OR #HillaryClinton OR Trump OR Clinton place:1b107df3ccc0aaa1: Twitter error response: status code = 429
#DonaldTrump OR #HillaryClinton OR Trump OR Clinton Australia
Rate Limit Error for #DonaldTrump OR #HillaryClinton OR Trump OR Clinton place:3f14ce28dc7c4566: Twitter error response: status code = 429
#DonaldTrump OR #HillaryClinton OR Trump OR Clinton USA
Rate Limit Error for #DonaldTrump OR #HillaryClinton OR Trump OR Clinton place:96683cc9126741d1: Twitter error response: status code = 429
#DonaldTrump OR #HillaryClinton OR Trump OR Clinton Mexico
Rate Limit Error for #DonaldTrump OR #HillaryClinton OR Trump OR Clinton place:25530b

In [6]:
# Peek at the first two (tweet_id, tweet_json) pairs.
# NOTE(review): `tweets_container` is only assigned inside functions in the
# visible cells, so this relies on a global left over from an earlier
# session -- confirm where it comes from before re-running.
print(list(tweets_container.items())[:2])

[(795576391647444992, {u'contributors': None, u'truncated': False, u'text': u'RT @OnlineMagazin: \U0001f198\u203c\ufe0f\U0001f602\U0001f525 Do not be fooled!!! Not even with free concert and celebrities like Jay-Z and Beyonce... #HillaryClinton can not f\u2026', u'is_quote_status': False, u'in_reply_to_status_id': None, u'id': 795576391647444992, u'favorite_count': 0, u'source': u'<a href="http://twitter.com/download/android" rel="nofollow">Twitter for Android</a>', u'retweeted': False, u'coordinates': None, u'entities': {u'symbols': [], u'user_mentions': [{u'indices': [3, 17], u'screen_name': u'OnlineMagazin', u'id': 113987669, u'name': u'Onlinemagazin', u'id_str': u'113987669'}], u'hashtags': [{u'indices': [114, 129], u'text': u'HillaryClinton'}], u'urls': []}, u'in_reply_to_screen_name': None, u'id_str': u'795576391647444992', u'retweet_count': 31, u'in_reply_to_user_id': None, u'favorited': False, u'retweeted_status': {u'contributors': None, u'truncated': True, u'text': u'\U0001f19

In [30]:
# Get place_id for a selection of countries
api = tweepy.API(auth)

countries = ["USA", "Canada", "France", "Denmark", "Mexico", "Brazil", "Germany", "India", "China", "South Africa", 
             "United Kingdom", "Russia", "Australia", "Cuba", "Sweden",
             "Argentina", "Japan", "Spain", "Italy", "Norway", "Finland", "UAE"] # Last line not taken (only first 15)

# How many times one country lookup is attempted before it is skipped.
MAX_GEO_SEARCH_ATTEMPTS = 3

countries_place_id = []

for country in countries[:15]:
    places = None
    for attempt in range(MAX_GEO_SEARCH_ATTEMPTS):
        try:
            # TODO: request English results (lang=en); place names currently
            # come back localized (e.g. in Danish).
            places = api.geo_search(query=country, granularity="country")
            break
        except Exception as e:
            print("Rate Limit Error: %s" % e)
            # Wait 15 minutes plus a small margin, then retry this same
            # country instead of silently dropping it.
            time.sleep(15 * 60 + 15)

    if places is None:
        # Every attempt failed; move on to the next country.
        continue

    for place in places:
        place_id = place.id
        try:
            country_name = place.name.decode('utf-8')
        except (AttributeError, UnicodeError):
            # Name is already text, or not valid UTF-8: keep it unchanged.
            country_name = place.name

        country_place_id = {"real_name": country, "twitter_name": country_name, "place_id": place_id}
        countries_place_id.append(country_place_id)

with open('places_id.json', 'w') as f:
    json.dump(countries_place_id, f)

In [32]:
# List the name Twitter returned for every place matched above.
for place_record in countries_place_id:
    print(place_record["twitter_name"])

USA
United States Minor Outlying Islands
Canada
Fransk Guyana
Frankrig
French Polynesia
French Southern and Antarctic Lands
Danmark
Mexico
Brasilien
Tyskland
Indien
British Indian Ocean Territory
中华人民共和国
South Africa
South Georgia and the South Sandwich Islands
Central African Republic
Storbritannien
USA
Forenede Arabiske Emirater
United States Minor Outlying Islands
المملكة الأردنية الهاشمية
Saudi Arabien
Hviderusland
Россия
Australia
Cuba
Sverige


In [35]:
# Show the current local time in ISO format -- the same form save_tweets
# uses for its output file names.
current_time = datetime.datetime.now()
print(current_time.isoformat())

2016-11-04T18:15:56.788111
