In [1]:
import pandas as pd
import requests
import os
import tweepy
import json

# 1. Gather data

### 1.1 Read the manually downloaded twitter archive:

In [102]:
df_archive = pd.read_csv('twitter-archive-enhanced.csv')

### 1.2 Read the file with breed predictions from server using requests:

In [103]:
url = 'https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv'
response = requests.get(url)

In [104]:
# if we get a correct response
if(response.status_code == 200):
    
    # get the file name from URL
    file_path = url.split('/')[-1]
    
    # save file to disc
    with open(file_path, 'wb') as f:
        f.write(response.content)
        
    # check if the file is downloaded
    if(os.path.isfile(file_path)):
        print('File successfully downloaded.')

File successfully downloaded.


In [105]:
df_breeds = pd.read_csv('image-predictions.tsv', sep = '\t')

### 1.3 Request additional tweet info using API

In [74]:
# gather information or print that the information already gathered and stored in a file (for review)
if(os.path.isfile('keys.txt')):
    
    # Read all the twitter access keys from file
    with open('keys.txt', 'r') as keys:
        consumer_key = keys.readline().strip()
        consumer_secret = keys.readline().strip()
        access_token = keys.readline().strip()
        access_token_secret = keys.readline().strip()

    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_token_secret)
    api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True, compression=True)

    errors = 0 
    successes = 0
    processed_count = 0  
    error_statusids = []
    
    for status_id in df['tweet_id'].values:
        # print every 100th status and current info
        if(processed_count % 100 == 0):
            print('Status % 2d of 2356.... successes : % 2d, errors : % 2d' %(processed_count, successes, errors))

        try:
            status = api.get_status(status_id, tweet_mode = 'extended')
            with open('statuses.txt', 'a') as file:
                json.dump(status._json, file)
                file.write('\n')
            successes += 1
        except:
            errors += 1
            error_statusids.append(status_id)
        processed_count += 1
    print("Gathering information completed. Successes : % 2d, Errors : % 2d" %(successes, errors))
    
elif(os.path.isfile('statuses.txt')):
    print('Additional information already gathered via API and stored in "statuses.txt" file')

Additional information already gathered via API and stored in "statuses.txt" file


**Check some status that couldn't be processed**

In [80]:
# show all error tweets IDs
error_statusids

[888202515573088257,
 873697596434513921,
 872668790621863937,
 872261713294495745,
 869988702071779329,
 866816280283807744,
 861769973181624320,
 856602993587888130,
 851953902622658560,
 845459076796616705,
 844704788403113984,
 842892208864923648,
 837012587749474308,
 829374341691346946,
 827228250799742977,
 812747805718642688,
 802247111496568832,
 775096608509886464,
 770743923962707968,
 754011816964026368,
 680055455951884288]

In [93]:
# show urls for errored tweets
df[df['tweet_id'].isin(error_statusids)]['expanded_urls'].head()

19     <a href="http://twitter.com/download/iphone" r...
95     <a href="http://twitter.com/download/iphone" r...
101    <a href="http://twitter.com/download/iphone" r...
104    <a href="http://twitter.com/download/iphone" r...
118    <a href="http://twitter.com/download/iphone" r...
Name: source, dtype: object

In [95]:
# check one of the urls
df[df['tweet_id'].isin(error_statusids)]['expanded_urls'].iloc[19]

'https://twitter.com/dog_rates/status/754011816964026368/photo/1,https://twitter.com/dog_rates/status/754011816964026368/photo/1'

In [101]:
# request first url
url = df[df['tweet_id'].isin(error_statusids)]['expanded_urls'].iloc[19].split(',')[0]
response = requests.get(url)
response.status_code

404

Ok, this tweet was deleted. That's why we can't get the information anymore.

**Read collected data to dataframe**

In [76]:
# Read the file line by line and store statuses in the array of jsons

statuses = [json.loads(v.rstrip('\n')) for v in open('statuses.txt')]

# Create dataframe from list of dictionaries (jsons)

df_statuses = pd.DataFrame(statuses)
df_statuses.shape

(2337, 32)

For some reason, there are 2 more rows than successfully processed IDs. 

This could happen because of reaching the rate limit twice.

Need to check this DataFrame for duplicates during assess

**Gathering data completed. There are 3 DataFrames:**

- ```df_archive``` : information from manually downloaded archive
- ```df_breeds``` : information from tsv file, downloaded using HTTP request
- ```df_statuses``` : information acquired via Twitter API



# 2. Access data

In [60]:
statuses_df['id'].nunique()

2335

In [68]:
statuses_df[statuses_df.duplicated(['id'], keep=False)]

Unnamed: 0,contributors,coordinates,created_at,display_text_range,entities,extended_entities,favorite_count,favorited,full_text,geo,...,quoted_status,quoted_status_id,quoted_status_id_str,quoted_status_permalink,retweet_count,retweeted,retweeted_status,source,truncated,user
0,,,Tue Aug 01 16:23:56 +0000 2017,"[0, 85]","{'hashtags': [], 'symbols': [], 'user_mentions...","{'media': [{'id': 892420639486877696, 'id_str'...",37454,False,This is Phineas. He's a mystical boy. Only eve...,,...,,,,,8147,False,,"<a href=""http://twitter.com/download/iphone"" r...",False,"{'id': 4196983835, 'id_str': '4196983835', 'na..."
1,,,Tue Aug 01 00:17:27 +0000 2017,"[0, 138]","{'hashtags': [], 'symbols': [], 'user_mentions...","{'media': [{'id': 892177413194625024, 'id_str'...",32188,False,This is Tilly. She's just checking pup on you....,,...,,,,,6033,False,,"<a href=""http://twitter.com/download/iphone"" r...",False,"{'id': 4196983835, 'id_str': '4196983835', 'na..."
2,,,Tue Aug 01 16:23:56 +0000 2017,"[0, 85]","{'hashtags': [], 'symbols': [], 'user_mentions...","{'media': [{'id': 892420639486877696, 'id_str'...",37454,False,This is Phineas. He's a mystical boy. Only eve...,,...,,,,,8147,False,,"<a href=""http://twitter.com/download/iphone"" r...",False,"{'id': 4196983835, 'id_str': '4196983835', 'na..."
3,,,Tue Aug 01 00:17:27 +0000 2017,"[0, 138]","{'hashtags': [], 'symbols': [], 'user_mentions...","{'media': [{'id': 892177413194625024, 'id_str'...",32188,False,This is Tilly. She's just checking pup on you....,,...,,,,,6033,False,,"<a href=""http://twitter.com/download/iphone"" r...",False,"{'id': 4196983835, 'id_str': '4196983835', 'na..."
