In [1]:
import tweepy
import json
import time
import datetime
import os

In [2]:
# Rate limit noted by the author: 180 requests per 15-minute window.


def _read_properties(path):
    """Parse a ``KEY=VALUE`` properties file into a dict.

    Blank (or whitespace-only) lines and lines whose first non-blank
    character is ``#`` are skipped. Each remaining line is split on its
    FIRST ``=`` only, so values may themselves contain ``=``. Keys and values
    are stripped of surrounding whitespace.

    Raises ValueError if a non-comment line contains no ``=``.
    """
    parsed = {}
    # The context manager closes the file even if a line fails to parse.
    with open(path) as handle:
        for raw_line in handle:
            line = raw_line.strip()
            if not line or line.startswith('#'):
                continue
            key, value = line.split('=', 1)
            parsed[key.strip()] = value.strip()
    return parsed


# Credentials are kept out of the notebook and read from a local file.
properties = _read_properties('jo.properties')

auth = tweepy.OAuthHandler(properties["CONSUMER_KEY"], properties["CONSUMER_SECRET"])
auth.set_access_token(properties["OAUTH_TOKEN"], properties["OAUTH_TOKEN_SECRET"])
api = tweepy.API(auth)

In [3]:
QUERIES_DIR = 'queries/'


def save_tweets(query, tweets_container, directory=None):
    """Write a non-empty tweet container to a timestamped JSON file.

    The target folder is ``QUERIES_DIR + query + '/'``, extended with
    ``directory + '/'`` when ``directory`` is given; it is created if it does
    not exist yet. The file is named after the current local time in ISO
    format with a ``.json`` suffix.

    Nothing is written (and no folder is created) when ``tweets_container``
    is empty or otherwise falsy. Returns None.
    """
    if not tweets_container:
        return

    # Take the timestamp first, before any filesystem work.
    stamp = datetime.datetime.now()

    target_dir = QUERIES_DIR + query + '/'
    if directory:
        target_dir = target_dir + directory + '/'

    if not os.path.exists(target_dir):
        os.makedirs(target_dir)

    file_path = target_dir + stamp.isoformat() + '.json'
    with open(file_path, 'w') as out:
        json.dump(tweets_container, out)


def get_tweets(query, limit):
    """Collect tweets matching ``query`` until ``limit`` of them are stored.

    Tweets are pulled one by one through a ``tweepy.Cursor`` over
    ``api.search`` (English only, entities included) and stored in a dict
    keyed by tweet id. Each stored tweet is a copy of the raw JSON payload
    with an extra ``"from_query"`` entry holding ``query``.

    Collection stops when the cursor is exhausted, when fetching raises any
    exception, or when the number of stored tweets reaches ``limit``. The
    number of stored tweets is printed before returning.

    Background kept from the original notes:
      * Cursor approach: https://github.com/tweepy/tweepy/issues/197
      * Search endpoint: https://dev.twitter.com/rest/reference/get/search/tweets
        (allowed params: 'q', 'lang', 'locale', 'since_id', 'geocode',
        'max_id', 'since', 'until', 'result_type', 'count',
        'include_entities', 'from', 'to', 'source')
      * Rate-limit handling with cursors:
        http://docs.tweepy.org/en/v3.5.0/code_snippet.html#handling-the-rate-limit-using-cursors

    :param query: search string passed as ``q``.
    :param limit: stop once this many distinct tweets are stored (may be a float).
    :return: dict mapping tweet id to the tweet's JSON dict.
    """

    def _guarded(item_iter, label):
        # Yield items from item_iter; on exhaustion or on any error, report it
        # and yield None, which the consuming loop treats as "stop".
        while True:
            try:
                fetched = item_iter.next()
            except StopIteration:
                print("StopIteration for %s" % label)
                fetched = None
            except Exception as err:
                # NOTE(review): every exception is reported as a rate-limit
                # error; the original author could not find the dedicated
                # exception class. Waiting (e.g. time.sleep(15 * 60)) was
                # considered here but left disabled.
                print("Rate Limit Error for %s: %s" % (label, err))
                fetched = None
            yield fetched

    # Notes from the original author: count=100 may be unnecessary (the
    # parameter might actually be called rpp), and result_type="recent" was
    # left out because it returned too few results.
    search_cursor = tweepy.Cursor(api.search,
                                  q=query,
                                  count=100,
                                  include_entities=True,
                                  lang="en")

    collected = {}
    size = 0
    for status in _guarded(search_cursor.items(), query):
        if not status:
            break

        # JSON round trip gives an independent copy of the raw payload.
        record = json.loads(json.dumps(status._json))
        # Remember which query produced this tweet.
        record["from_query"] = query
        collected[record['id']] = record

        size = len(collected)
        if size >= limit:
            break

    print("Number of results: %d" % size)
    return collected

def get_and_save_tweets(since=None, until=None):
    """Run every search term worldwide and once per country, saving the results.

    For each hard-coded search term the function first searches without a
    place filter, then once per country with ``place:<id>`` appended. Every
    result set is handed to ``save_tweets`` under the original search term.

    When BOTH ``since`` and ``until`` are given, `` since:<since> until:<until>``
    is appended to the query and results are stored below a
    ``<since>-<until>`` folder; otherwise no date filter is applied.
    Per-country results go into a ``<country>`` sub-folder.

    :param since: start date string for the search window, or None.
    :param until: end date string for the search window, or None.
    """
    # Example query
    search_terms = [
        "#DonaldTrump OR #HillaryClinton OR Trump OR Clinton OR #Trump OR #Clinton",
    ]
    # Country name -> Twitter place id.
    place_ids = {
        "USA": "96683cc9126741d1",
        "Canada": "3376992a082d67c7",
        "France": "f3bfc7dcc928977f",
        "Denmark": "c29833e68a86e703",
        "Mexico": "25530ba03b7d90c6",
        "Brazil": "1b107df3ccc0aaa1",
        "Germany": "fdcd221ac44fa326",
        "China": "4797714c95971ac1",
        "UK": "6416b8512febefc9",
        "Russia": "5714382051c06d1e",
        "Panama": "9d8ae4b0fac2036a",
        "Australia": "3f14ce28dc7c4566",
        "Sweden": "82b141af443cb1b8",
        "India": "b850c1bfd38f30e0",
        "UAE": "3f63906fc8aa5a7d",
        "South Africa": "dd9c0d7d7e07eb49",
    }

    # One unfiltered search plus one per country, for every search term.
    searches_per_term = len(place_ids) + 1
    total_searches = len(search_terms) * searches_per_term
    print(total_searches)

    results_per_request = 100
    print("Number of tweets we can at maximum have : %d" % (total_searches * results_per_request))

    # Share the 180-request budget evenly between the searches (whole
    # requests only), then convert requests into a tweet count.
    per_search_limit = (180 // total_searches) * results_per_request
    # The author observed being rate limited earlier than expected, so the
    # budget is lowered by a factor of 1.5 (this makes the limit a float).
    per_search_limit = per_search_limit / 1.5

    print("Expected number of queries : %d" % (total_searches * (per_search_limit / results_per_request)))

    for term in search_terms:
        if since and until:
            full_query = term + " since:" + since + " until:" + until
            base_dir = since + "-" + until
        else:
            full_query = term
            base_dir = None

        # Unfiltered (no place) search first.
        print(full_query)
        unfiltered = get_tweets(full_query, limit=per_search_limit)
        save_tweets(term, unfiltered, directory=base_dir)

        # Then the same search restricted to each country.
        for country, place_id in place_ids.items():
            print("%s %s" % (full_query, country))
            place_query = "%s place:%s" % (full_query, place_id)
            found = get_tweets(place_query, limit=per_search_limit)

            if not found:
                print("No results for %s" % place_query)
                continue

            country_dir = base_dir + '/' + country if base_dir else country
            save_tweets(term, found, directory=country_dir)

In [18]:
# Collect tweets for a single date window; both dates are required for the
# window to be applied to the query.
since, until = "2016-11-16", "2016-11-17"
get_and_save_tweets(since=since, until=until)

# Variant without a date window:
# get_and_save_tweets()

51
5100
#DonaldTrump OR #HillaryClinton OR Trump OR Clinton OR #Trump OR #Clinton since:2016-11-1 until:2016-11-2
Number of results: 300
#DonaldTrump OR #HillaryClinton OR Trump OR Clinton OR #Trump OR #Clinton since:2016-11-1 until:2016-11-2 Canada
Number of results: 300
#DonaldTrump OR #HillaryClinton OR Trump OR Clinton OR #Trump OR #Clinton since:2016-11-1 until:2016-11-2 Brazil
StopIteration for #DonaldTrump OR #HillaryClinton OR Trump OR Clinton OR #Trump OR #Clinton since:2016-11-1 until:2016-11-2 place:1b107df3ccc0aaa1
Number of results: 45
#DonaldTrump OR #HillaryClinton OR Trump OR Clinton OR #Trump OR #Clinton since:2016-11-1 until:2016-11-2 Australia
StopIteration for #DonaldTrump OR #HillaryClinton OR Trump OR Clinton OR #Trump OR #Clinton since:2016-11-1 until:2016-11-2 place:3f14ce28dc7c4566
Number of results: 229
#DonaldTrump OR #HillaryClinton OR Trump OR Clinton OR #Trump OR #Clinton since:2016-11-1 until:2016-11-2 USA
Number of results: 300
#DonaldTrump OR #HillaryCl

In [6]:
# Peek at the first two (tweet_id, tweet) pairs of a collected result set.
# NOTE(review): in the visible cells `tweets_container` is only ever assigned
# inside functions, never at notebook top level, so this cell appears to rely
# on a value left over in the kernel - confirm where it is defined.
print [(tweet_id, content) for tweet_id, content in tweets_container.iteritems()][:2]

[(795576391647444992, {u'contributors': None, u'truncated': False, u'text': u'RT @OnlineMagazin: \U0001f198\u203c\ufe0f\U0001f602\U0001f525 Do not be fooled!!! Not even with free concert and celebrities like Jay-Z and Beyonce... #HillaryClinton can not f\u2026', u'is_quote_status': False, u'in_reply_to_status_id': None, u'id': 795576391647444992, u'favorite_count': 0, u'source': u'<a href="http://twitter.com/download/android" rel="nofollow">Twitter for Android</a>', u'retweeted': False, u'coordinates': None, u'entities': {u'symbols': [], u'user_mentions': [{u'indices': [3, 17], u'screen_name': u'OnlineMagazin', u'id': 113987669, u'name': u'Onlinemagazin', u'id_str': u'113987669'}], u'hashtags': [{u'indices': [114, 129], u'text': u'HillaryClinton'}], u'urls': []}, u'in_reply_to_screen_name': None, u'id_str': u'795576391647444992', u'retweet_count': 31, u'in_reply_to_user_id': None, u'favorited': False, u'retweeted_status': {u'contributors': None, u'truncated': True, u'text': u'\U0001f19

In [30]:
# Look up the Twitter place ids for a selection of countries and store them
# in places_id.json as a list of
# {"real_name", "twitter_name", "place_id"} records.
api = tweepy.API(auth)

countries = ["USA", "Canada", "France", "Denmark", "Mexico", "Brazil", "Germany", "India", "China", "South Africa", 
             "United Kingdom", "Russia", "Australia", "Cuba", "Sweden",
             "Argentina", "Japan", "Spain", "Italy", "Norway", "Finland", "UAE"] # Last line not taken (only first 15)

# How many times one country is attempted before it is given up on.
GEO_SEARCH_ATTEMPTS = 2

countries_place_id = []

for country in countries[:15]:
    places = None
    for attempt in range(GEO_SEARCH_ATTEMPTS):
        try:
            # TODO: request English results (lang=en?) - the names currently
            # come back localised (e.g. in Danish).
            places = api.geo_search(query=country, granularity="country")
            break
        except Exception as e:
            # Any failure is treated as a rate-limit hit: wait out the
            # 15-minute window (plus a small margin) and then retry the SAME
            # country, so it is not silently missing from the output.
            print("Rate Limit Error: %s" % e)
            time.sleep(15 * 60 + 15)

    if places is None:
        print("Giving up on %s after %d attempts" % (country, GEO_SEARCH_ATTEMPTS))
        continue

    # One search can return several places; all of them are recorded.
    for place in places:
        place_id = place.id
        try:
            country_name = place.name.decode('utf-8')
        except Exception:
            # Presumably the name is already decoded text; keep it as is.
            country_name = place.name

        country_place_id = {"real_name": country, "twitter_name": country_name, "place_id": place_id}
        countries_place_id.append(country_place_id)

with open('places_id.json', 'w') as f:
    json.dump(countries_place_id, f)

In [32]:
# List the place names Twitter returned, one per line.
for country in countries_place_id:
    print(country["twitter_name"])

USA
United States Minor Outlying Islands
Canada
Fransk Guyana
Frankrig
French Polynesia
French Southern and Antarctic Lands
Danmark
Mexico
Brasilien
Tyskland
Indien
British Indian Ocean Territory
中华人民共和国
South Africa
South Georgia and the South Sandwich Islands
Central African Republic
Storbritannien
USA
Forenede Arabiske Emirater
United States Minor Outlying Islands
المملكة الأردنية الهاشمية
Saudi Arabien
Hviderusland
Россия
Australia
Cuba
Sverige


In [35]:
# Show what an ISO-formatted local timestamp looks like.
current_time = datetime.datetime.now()

print(current_time.isoformat())

2016-11-04T18:15:56.788111


In [20]:
# Country-level place search for Panama; the printed result includes its id.
places = api.geo_search(query="Panama", granularity="country")
print(places)

[Place(_api=<tweepy.api.API object at 0x7f7f259ad7d0>, country_code=u'PA', url=u'https://api.twitter.com/1.1/geo/id/9d8ae4b0fac2036a.json', country=u'Panama', place_type=u'country', bounding_box=BoundingBox(_api=<tweepy.api.API object at 0x7f7f259ad7d0>, type=u'Polygon', coordinates=[[[-83.051445, 7.20386], [-83.051445, 9.6394858], [-77.13939, 9.6394858], [-77.13939, 7.20386], [-83.051445, 7.20386]]]), contained_within=[], centroid=[-81.48476972454017, 8.41827865], full_name=u'Panama', attributes={}, id=u'9d8ae4b0fac2036a', name=u'Panama')]


In [1]:
# Read one saved result file back into a dict.
# NOTE(review): the recorded output of this cell is an IOError - the file at
# this hard-coded path did not exist when the cell was last run.
saved_path = './queries/' + r"#Elections2016 OR #ElectionDay" + '/2016-11-1-2016-11-2/2016-11-08T20:42:37.580399.json'
with open(saved_path, 'r') as f:
    d = json.load(f)

IOError: [Errno 2] No such file or directory: './queries/#Elections2016 OR #ElectionDay/2016-11-1-2016-11-2/2016-11-08T20:42:37.580399.json'

dict