In [1]:
import pandas as pd
import requests
import os
import tweepy
import json


## 1. Gathering data

### 1.1 Read the manually downloaded twitter archive:

In [2]:
df = pd.read_csv('twitter-archive-enhanced.csv')
df.head()

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
0,892420643555336193,,,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,,,,https://twitter.com/dog_rates/status/892420643...,13,10,Phineas,,,,
1,892177421306343426,,,2017-08-01 00:17:27 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Tilly. She's just checking pup on you....,,,,https://twitter.com/dog_rates/status/892177421...,13,10,Tilly,,,,
2,891815181378084864,,,2017-07-31 00:18:03 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Archie. He is a rare Norwegian Pouncin...,,,,https://twitter.com/dog_rates/status/891815181...,12,10,Archie,,,,
3,891689557279858688,,,2017-07-30 15:58:51 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Darla. She commenced a snooze mid meal...,,,,https://twitter.com/dog_rates/status/891689557...,13,10,Darla,,,,
4,891327558926688256,,,2017-07-29 16:00:24 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Franklin. He would like you to stop ca...,,,,https://twitter.com/dog_rates/status/891327558...,12,10,Franklin,,,,


### 1.2 Read the file with breed predictions from server using requests:

In [3]:
url = 'https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv'
response = requests.get(url)

In [4]:
# if we get a correct response
if(response.status_code == 200):
    
    # get the file name from URL
    file_path = url.split('/')[-1]
    
    # save file to disc
    with open(file_path, 'wb') as f:
        f.write(response.content)
        
    # check if the file is downloaded
    if(os.path.isfile(file_path)):
        print('File successfully downloaded.')

File successfully downloaded.


In [5]:
df_breeds = pd.read_csv('image-predictions.tsv', sep = '\t')
df_breeds.head()

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
0,666020888022790149,https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg,1,Welsh_springer_spaniel,0.465074,True,collie,0.156665,True,Shetland_sheepdog,0.061428,True
1,666029285002620928,https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg,1,redbone,0.506826,True,miniature_pinscher,0.074192,True,Rhodesian_ridgeback,0.07201,True
2,666033412701032449,https://pbs.twimg.com/media/CT4521TWwAEvMyu.jpg,1,German_shepherd,0.596461,True,malinois,0.138584,True,bloodhound,0.116197,True
3,666044226329800704,https://pbs.twimg.com/media/CT5Dr8HUEAA-lEu.jpg,1,Rhodesian_ridgeback,0.408143,True,redbone,0.360687,True,miniature_pinscher,0.222752,True
4,666049248165822465,https://pbs.twimg.com/media/CT5IQmsXIAAKY4A.jpg,1,miniature_pinscher,0.560311,True,Rottweiler,0.243682,True,Doberman,0.154629,True


# TODO : 
## - Perform API requests and save to file.

### 1.3 Request additional tweet info using API

In [13]:
# gather information or print that the information already gathered and stored in a file (for review)
if(os.path.isfile('keys.txt')):
    
    # Read all the twitter access keys from file
    with open('keys.txt', 'r') as keys:
        consumer_key = keys.readline().strip()
        consumer_secret = keys.readline().strip()
        access_token = keys.readline().strip()
        access_token_secret = keys.readline().strip()

    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_token_secret)
    api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True, compression=True)

    errors = 0 
    successes = 0
    error_statusids = []
    processed_count = 0  
    
    for status_id in df['tweet_id'].values:
        # print every 100th status and current info
        if(processed_count % 100 == 0):
            print('Status % 2d of 2356.... successes : % 2d, errors : % 2d' %(processed_count, successes, errors))
        with open('statuses.txt', 'w') as file:
            try:
                status = api.get_status(status_id, tweet_mode = 'extended')
                json.dump(status._json, file)
                successes += 1
            except:
                errors += 1
                error_statusids.append(status_id)
        processed_count += 1
    print("Gathering information completed. Successes : % 2d, Errors : % 2d" %(successes, errors))
    
elif(os.path.isfile('statuses.txt')):
    print('Additional information already gathered via API and stored in "statuses.txt" file')

Status  0 of 2356.... successes :  0, errors :  0
Status  100 of 2356.... successes :  98, errors :  2
Status  200 of 2356.... successes :  192, errors :  8
Status  300 of 2356.... successes :  287, errors :  13
Status  400 of 2356.... successes :  385, errors :  15
Status  500 of 2356.... successes :  485, errors :  15
Status  600 of 2356.... successes :  583, errors :  17


Rate limit reached. Sleeping for: 168


Status  700 of 2356.... successes :  683, errors :  17
Status  800 of 2356.... successes :  782, errors :  18
Status  900 of 2356.... successes :  881, errors :  19
Status  1000 of 2356.... successes :  980, errors :  20
Status  1100 of 2356.... successes :  1080, errors :  20
Status  1200 of 2356.... successes :  1180, errors :  20
Status  1300 of 2356.... successes :  1280, errors :  20
Status  1400 of 2356.... successes :  1380, errors :  20
Status  1500 of 2356.... successes :  1480, errors :  20


Rate limit reached. Sleeping for: 540


Status  1600 of 2356.... successes :  1580, errors :  20
Status  1700 of 2356.... successes :  1680, errors :  20
Status  1800 of 2356.... successes :  1779, errors :  21
Status  1900 of 2356.... successes :  1879, errors :  21
Status  2000 of 2356.... successes :  1979, errors :  21
Status  2100 of 2356.... successes :  2079, errors :  21
Status  2200 of 2356.... successes :  2179, errors :  21
Status  2300 of 2356.... successes :  2279, errors :  21
Gathering information completed. Successes :  2335, Errors :  21


In [70]:
df.shape

(2356, 17)

In [62]:
status = api.get_status(892420643555336193)
status._json

{'created_at': 'Tue Aug 01 16:23:56 +0000 2017',
 'id': 892420643555336193,
 'id_str': '892420643555336193',
 'text': "This is Phineas. He's a mystical boy. Only ever appears in the hole of a donut. 13/10 https://t.co/MgUWQ76dJU",
 'truncated': False,
 'entities': {'hashtags': [],
  'symbols': [],
  'user_mentions': [],
  'urls': [],
  'media': [{'id': 892420639486877696,
    'id_str': '892420639486877696',
    'indices': [86, 109],
    'media_url': 'http://pbs.twimg.com/media/DGKD1-bXoAAIAUK.jpg',
    'media_url_https': 'https://pbs.twimg.com/media/DGKD1-bXoAAIAUK.jpg',
    'url': 'https://t.co/MgUWQ76dJU',
    'display_url': 'pic.twitter.com/MgUWQ76dJU',
    'expanded_url': 'https://twitter.com/dog_rates/status/892420643555336193/photo/1',
    'type': 'photo',
    'sizes': {'thumb': {'w': 150, 'h': 150, 'resize': 'crop'},
     'medium': {'w': 540, 'h': 528, 'resize': 'fit'},
     'small': {'w': 540, 'h': 528, 'resize': 'fit'},
     'large': {'w': 540, 'h': 528, 'resize': 'fit'}}}]},


In [65]:
with open('statuses.txt', 'r') as file:
    st = file.readline()

In [66]:
st

'{"created_at": "Sat Jul 29 16:00:24 +0000 2017", "id": 891327558926688256, "id_str": "891327558926688256", "text": "This is Franklin. He would like you to stop calling him \\"cute.\\" He is a very fierce shark and should be respected a\\u2026 https://t.co/0g0KMIVXZ3", "truncated": true, "entities": {"hashtags": [], "symbols": [], "user_mentions": [], "urls": [{"url": "https://t.co/0g0KMIVXZ3", "expanded_url": "https://twitter.com/i/web/status/891327558926688256", "display_url": "twitter.com/i/web/status/8\\u2026", "indices": [117, 140]}]}, "source": "<a href=\\"http://twitter.com/download/iphone\\" rel=\\"nofollow\\">Twitter for iPhone</a>", "in_reply_to_status_id": null, "in_reply_to_status_id_str": null, "in_reply_to_user_id": null, "in_reply_to_user_id_str": null, "in_reply_to_screen_name": null, "user": {"id": 4196983835, "id_str": "4196983835", "name": "WeRateDogs\\u2122 \\ud83c\\udff3\\ufe0f\\u200d\\ud83c\\udf08", "screen_name": "dog_rates", "location": "\\u300c DM YOUR DOGS \\u