# Wrangle and Analyze Data

## Table of Contents
- [Gather](#gather)
- [Assess](#assess)
- [Clean](#clean)

In [1]:
import pandas as pd
import numpy as np
import requests
import os
import tweepy
import json
import config

<a id='gather'></a>
<h2 id="-Gather" style="
    background-color: #555;
    color: #eee;
    padding: 10px 5px;
">Gather</h2>

In [2]:
# Load the local twitter-archive-enhanced.csv file into a DataFrame.
df_twitter_archive_enhanced = pd.read_csv('twitter-archive-enhanced.csv')

In [3]:
# Download the image-predictions TSV, save it in the current working
# directory, and load the saved copy into a DataFrame.
url = 'https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv'
response = requests.get(url)
# Fail loudly on an HTTP error (404, 5xx, ...) instead of silently saving the
# error page as the TSV and parsing it as if it were data.
response.raise_for_status()

# Build the path once so the file that is written is exactly the file that is read.
image_predictions_path = os.path.join(os.getcwd(), 'image-predictions.tsv')
with open(image_predictions_path, mode='wb') as file:
    file.write(response.content)

df_image_predictions = pd.read_csv(image_predictions_path, sep='\t')

In [28]:
# Build an authenticated Twitter API client. The credentials are read from the
# local `config` module rather than being written in the notebook.
auth = tweepy.OAuthHandler(config.consumer_key, config.consumer_secret)
auth.set_access_token(config.access_token, config.access_secret)

# With the rate-limit flags enabled, the run output further down shows the
# client printing "Rate limit reached. Sleeping for: N" and then continuing.
# NOTE(review): `wait_on_rate_limit_notify` presumably ties this notebook to an
# older tweepy release — confirm against the installed version.
api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

In [29]:
# Every tweet id that appears in either source table, with repeated ids
# collapsed to their first occurrence.
tweet_id_list = (
    pd.concat(
        [
            df_image_predictions['tweet_id'],
            df_twitter_archive_enhanced['tweet_id'],
        ],
        ignore_index=True,
    )
    .drop_duplicates()
)

In [30]:
# Progress reporter for long-running loops: prints the completion percentage
# on every call and, at roughly every 10% of the work, the time elapsed since
# the previous checkpoint.
time_prv_step = time_step = None
def performance(count,iterations):
    """Print loop progress and the time spent since the last checkpoint.

    Parameters
    ----------
    count : int
        Index of the item about to be processed (the caller starts at 0).
    iterations : int
        Total number of items in the loop.

    Notes
    -----
    Checkpoint timestamps are kept in the module-level globals
    ``time_prv_step`` and ``time_step``. Nothing is returned; all output
    goes to stdout.
    """
    global time_prv_step
    global time_step

    if iterations <= 0:
        # Nothing to process; also avoids dividing by zero below.
        return

    progress = round((count/iterations)*100,2)
    print('processing {}%'.format(progress), end='\r')

    # round(iterations/10) is 0 for very short loops (e.g. 4 items), which
    # made the modulo raise ZeroDivisionError. Fall back to a checkpoint on
    # every item in that case; larger loops keep the original cadence.
    checkpoint_every = max(1, round(iterations/10))
    if(count%checkpoint_every == 0):
        # Restart the clock on the first item, and also when no checkpoint has
        # been recorded yet (otherwise the subtraction below would hit None).
        restart_clock = count == 0 or time_step is None
        time_prv_step = pd.Timestamp.now() if restart_clock else time_step
        time_step = pd.Timestamp.now()

        duration = (time_step - time_prv_step).total_seconds()
        print('{} samples, during {}s'.format(count, round(duration,2) ))

In [31]:
# Query the Twitter API once per tweet id and keep each status' raw JSON.
# Ids whose lookup fails are printed together with the error and collected in
# tweet_id_list_error instead of aborting the run.
df_tweets, tweet_id_list_error = [], []
iterations = len(tweet_id_list)
count = 0
for count, tweet_id in enumerate(tweet_id_list, start=1):
    # performance() is given the zero-based position of the current item.
    performance(count - 1, iterations)
    try:
        status = api.get_status(tweet_id)
        df_tweets.append(status._json)
    except Exception as err:
        print(str(tweet_id) + ": " + str(err))
        tweet_id_list_error.append(tweet_id)

0 samples, during 0.0s
236 samples, during 237.16s
472 samples, during 216.98s
680055455951884288: [{'code': 144, 'message': 'No status found with that ID.'}]
708 samples, during 209.0s
processing 38.2%%

Rate limit reached. Sleeping for: 77


944 samples, during 285.19s
1180 samples, during 212.78s
754011816964026368: [{'code': 144, 'message': 'No status found with that ID.'}]
1416 samples, during 214.6s
802247111496568832: [{'code': 144, 'message': 'No status found with that ID.'}]
1652 samples, during 201.68s
829374341691346946: [{'code': 144, 'message': 'No status found with that ID.'}]
processing 76.4%%

Rate limit reached. Sleeping for: 102


837012587749474308: [{'code': 144, 'message': 'No status found with that ID.'}]
837366284874571778: [{'code': 144, 'message': 'No status found with that ID.'}]
842892208864923648: [{'code': 144, 'message': 'No status found with that ID.'}]
844704788403113984: [{'code': 144, 'message': 'No status found with that ID.'}]
1888 samples, during 319.52s
851953902622658560: [{'code': 144, 'message': 'No status found with that ID.'}]
861769973181624320: [{'code': 144, 'message': 'No status found with that ID.'}]
872261713294495745: [{'code': 144, 'message': 'No status found with that ID.'}]
873697596434513921: [{'code': 144, 'message': 'No status found with that ID.'}]
888202515573088257: [{'code': 144, 'message': 'No status found with that ID.'}]
872668790621863937: [{'code': 144, 'message': 'No status found with that ID.'}]
869988702071779329: [{'code': 144, 'message': 'No status found with that ID.'}]
866816280283807744: [{'code': 144, 'message': 'No status found with that ID.'}]
85660299358

In [67]:
# Convert the collected tweet JSON records into a DataFrame and write it to
# tweets.csv without the index column.
# NOTE(review): nested JSON fields are presumably written to the CSV as their
# string representation — confirm this is acceptable for later steps.
df_tweets = pd.DataFrame(df_tweets)
df_tweets.to_csv('tweets.csv', index=False)

> There are 22 tweets that no longer exist.

<a id='assess'></a>
<h2 id="-Assess" style="
    background-color: #555;
    color: #eee;
    padding: 10px 5px;
">Assess</h2>

#### Quality
##### `name` table
- TODO: list the data quality issues found during assessment.

#### Tidiness
- TODO: list the tidiness issues found during assessment.

<a id='clean'></a>
<h2 id="-Clean" style="
    background-color: #555;
    color: #eee;
    padding: 10px 5px;
">Clean</h2>