# Wrangle and Analyze Data

### Import Libraries

In [355]:
# Import libraries
import pandas as pd
import numpy
import requests
import tweepy 
import json
import config
import numpy as np

### Import Data

In [179]:
### Import data ###

# Save csv file as df
twitter_archive = pd.read_csv('twitter-archive-enhanced.csv')

# Image prediction file to df
image_tsv = requests.get(
    'https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv',
    timeout=30,  # do not let a stalled connection hang the notebook forever
)
image_tsv.raise_for_status()  # stop here instead of saving an HTTP error page as the TSV
# The context manager guarantees the file is flushed and closed before it is read back
with open('image_predictions.tsv', 'wb') as tsv_file:
    tsv_file.write(image_tsv.content) # write to file
image_pred_df = pd.read_csv('image_predictions.tsv', sep = '\t')

# Connect to tweepy auth and instantiate API
auth = tweepy.OAuthHandler(config.CONSUMER_KEY, config.CONSUMER_SECRET)
auth.set_access_token(config.ACCESS_TOKEN, config.ACCESS_TOKEN_SECRET)

# wait_on_rate_limit makes the client pause when Twitter's rate limit is hit
# instead of raising, so a long run of per-tweet lookups is not cut short.
# NOTE(review): presumably rate limiting is why far fewer tweets than archive
# rows were retrieved -- confirm against the recorded failures.
api = tweepy.API(auth, wait_on_rate_limit=True)

In [180]:
twitter_archive.head() # view archive header

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
0,892420643555336193,,,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,,,,https://twitter.com/dog_rates/status/892420643...,13,10,Phineas,,,,
1,892177421306343426,,,2017-08-01 00:17:27 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Tilly. She's just checking pup on you....,,,,https://twitter.com/dog_rates/status/892177421...,13,10,Tilly,,,,
2,891815181378084864,,,2017-07-31 00:18:03 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Archie. He is a rare Norwegian Pouncin...,,,,https://twitter.com/dog_rates/status/891815181...,12,10,Archie,,,,
3,891689557279858688,,,2017-07-30 15:58:51 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Darla. She commenced a snooze mid meal...,,,,https://twitter.com/dog_rates/status/891689557...,13,10,Darla,,,,
4,891327558926688256,,,2017-07-29 16:00:24 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Franklin. He would like you to stop ca...,,,,https://twitter.com/dog_rates/status/891327558...,12,10,Franklin,,,,


In [117]:
# Create a list to store tweet records
raw_data = []
not_found = [] # list to store IDs of tweets that could not get status on
fetch_errors = {} # tweet ID -> description of the error, so failures can be diagnosed

# Look up every archived tweet through the API and keep its retweet and
# favorite counts. A failed lookup is recorded and the loop moves on.
for i in twitter_archive.tweet_id:
    temp_dict = {} # temporary dict
    try:
        status = api.get_status(i)
        temp_dict['tweet_id'] = i
        temp_dict['retweet_count'] = status._json['retweet_count']
        temp_dict['favorite_count'] = status._json['favorite_count']
        raw_data.append(temp_dict)
    except Exception as exc:
        # `Exception` (not a bare `except`) keeps the best-effort behaviour but
        # lets KeyboardInterrupt / SystemExit stop this long-running loop.
        not_found.append(i)
        fetch_errors[i] = repr(exc)

In [118]:
# Persist the collected tweet records to disk as one JSON document
with open('tweet_json.txt', 'w') as json_file:
    json_file.write(json.dumps(raw_data))


In [120]:
# json data to pandas df
# Read the records stored in 'tweet_json.txt' into a DataFrame and preview the first rows
tweets_df = pd.read_json('tweet_json.txt')
tweets_df.head()

Unnamed: 0,tweet_id,retweet_count,favorite_count
0,892420643555336193,7416,35161
1,892177421306343426,5517,30426
2,891815181378084864,3641,22880
3,891689557279858688,7592,38449
4,891327558926688256,8166,36696


# Data Quality Issues

## Completeness

In [166]:
def _missing_counts(frame):
    """Return the per-column null counts of `frame`, keeping only columns with at least one null."""
    null_totals = frame.isnull().sum()
    return null_totals[null_totals > 0].dropna()


# Report, for each of the three data sets, which columns contain missing values
print("WeRateDogs Archive Data:\n", _missing_counts(twitter_archive), '\n')
print("Image Predictions Data:\n", _missing_counts(image_pred_df), '\n')
print("Tweet RT & Favorite Count Data:\n", _missing_counts(tweets_df))

WeRateDogs Archive Data:
 in_reply_to_status_id         2278
in_reply_to_user_id           2278
retweeted_status_id           2175
retweeted_status_user_id      2175
retweeted_status_timestamp    2175
expanded_urls                   59
dtype: int64 

Image Predictions Data:
 Series([], dtype: int64) 

Tweet RT & Favorite Count Data:
 Series([], dtype: int64)


The WeRateDogs Twitter archive data has 6 fields out of 17 total with missing values. The other data sets do not have missing values in their fields. 

## Validity

### Image Prediction Data

In [185]:
# Preview the first rows of the image-prediction table
image_pred_df.head()

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
0,666020888022790149,https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg,1,Welsh_springer_spaniel,0.465074,True,collie,0.156665,True,Shetland_sheepdog,0.061428,True
1,666029285002620928,https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg,1,redbone,0.506826,True,miniature_pinscher,0.074192,True,Rhodesian_ridgeback,0.07201,True
2,666033412701032449,https://pbs.twimg.com/media/CT4521TWwAEvMyu.jpg,1,German_shepherd,0.596461,True,malinois,0.138584,True,bloodhound,0.116197,True
3,666044226329800704,https://pbs.twimg.com/media/CT5Dr8HUEAA-lEu.jpg,1,Rhodesian_ridgeback,0.408143,True,redbone,0.360687,True,miniature_pinscher,0.222752,True
4,666049248165822465,https://pbs.twimg.com/media/CT5IQmsXIAAKY4A.jpg,1,miniature_pinscher,0.560311,True,Rottweiler,0.243682,True,Doberman,0.154629,True


In [183]:
# Image predictions df info
image_pred_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2075 entries, 0 to 2074
Data columns (total 12 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   tweet_id  2075 non-null   int64  
 1   jpg_url   2075 non-null   object 
 2   img_num   2075 non-null   int64  
 3   p1        2075 non-null   object 
 4   p1_conf   2075 non-null   float64
 5   p1_dog    2075 non-null   bool   
 6   p2        2075 non-null   object 
 7   p2_conf   2075 non-null   float64
 8   p2_dog    2075 non-null   bool   
 9   p3        2075 non-null   object 
 10  p3_conf   2075 non-null   float64
 11  p3_dog    2075 non-null   bool   
dtypes: bool(3), float64(3), int64(2), object(4)
memory usage: 152.1+ KB


- `tweet_id` should be a string.

### Tweet RT Count & Favorite Data

In [162]:
# Tweet rt count and favorite df info
tweets_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1208 entries, 0 to 1207
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   tweet_id        1208 non-null   int64
 1   retweet_count   1208 non-null   int64
 2   favorite_count  1208 non-null   int64
dtypes: int64(3)
memory usage: 28.4 KB


- `tweet_id` should be a string.

In [191]:
# Display the full `text` value of row 3 to inspect what the field contains
twitter_archive.text[3]

'This is Darla. She commenced a snooze mid meal. 13/10 happens to the best of us https://t.co/tD36da7qLQ'

- The `text` field is not fully parsed down to text information only: the image URL is still included.

### WeRateDogs Twitter Archive Data

In [151]:
# Archive data info
twitter_archive.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 17 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   tweet_id                    2356 non-null   int64  
 1   in_reply_to_status_id       78 non-null     float64
 2   in_reply_to_user_id         78 non-null     float64
 3   timestamp                   2356 non-null   object 
 4   source                      2356 non-null   object 
 5   text                        2356 non-null   object 
 6   retweeted_status_id         181 non-null    float64
 7   retweeted_status_user_id    181 non-null    float64
 8   retweeted_status_timestamp  181 non-null    object 
 9   expanded_urls               2297 non-null   object 
 10  rating_numerator            2356 non-null   int64  
 11  rating_denominator          2356 non-null   int64  
 12  name                        2356 non-null   object 
 13  doggo                       2356 

In [389]:
# Upper-case every name so the article check below is case-insensitive.
# The vectorised `.str.upper()` leaves missing values untouched, whereas a
# `lambda x: x.upper()` raises on any non-string entry. Bracket assignment
# always targets the column rather than an instance attribute.
twitter_archive['name'] = twitter_archive['name'].str.upper()

# Rows whose "name" is actually an article, i.e. no real dog name was captured
twitter_archive.loc[twitter_archive['name'].isin(['THE', 'A', 'AN']), 'name']

56       A
649      A
759     AN
801      A
1002     A
        ..
2349    AN
2350     A
2352     A
2353     A
2354     A
Name: name, Length: 70, dtype: object

## Consistency

In [194]:
# Use the row labelled 3 as a worked example: raw tweet text first,
# then the rating fields stored for that same row
sample_idx = 3
print(twitter_archive.at[sample_idx, 'text'], '\n')
print(twitter_archive.at[sample_idx, 'rating_numerator'], twitter_archive.at[sample_idx, 'rating_denominator'])

This is Darla. She commenced a snooze mid meal. 13/10 happens to the best of us https://t.co/tD36da7qLQ 

13 10


- The `text` field contains dog rating information that is captured in other fields `rating_numerator` and `rating_denominator`. This presents an issue of consistency.

## Accuracy

Seeing as the `text` field from the archive data contained the dog ratings, I decided to extract the ratings in this field and compare them to the `rating_numerator` and `rating_denominator` fields to test for accuracy.

In [309]:
# Extract ratings from [text] field in twitter_archive using regex expressions.
# Numerator and denominator are captured by ONE pattern so that both always come
# from the same "N/D" occurrence. Two independent searches could pair the first
# "digits/" with the first "/digits" found elsewhere in the tweet.
# NOTE(review): only whole-number ratings are captured; a decimal such as
# "13.5/10" would yield 5/10 -- confirm whether decimals need handling.
rating_parts = twitter_archive.text.str.extract(r'([0-9]{1,4})\/([0-9]{1,3})')

# Rows without any "N/D" match become 0 so the integer cast cannot fail
numerator_test = pd.to_numeric(rating_parts[0], errors='coerce').fillna(0).astype(np.int64) # extracted rating numerator
denominator_test = pd.to_numeric(rating_parts[1], errors='coerce').fillna(0).astype(np.int64) # extracted rating denominator

# Create series with rating as decimal; a zero denominator gives inf, which is reset to 0
ratings_test_series = (numerator_test / denominator_test).replace(np.inf, 0)

The rating numerators and denominators are extracted from the `text` field and saved as separate Series. Then, I create a Series that calculates the rating by dividing the extracted numerator by the extracted denominator. Many of the values are greater than 1 because the WeRateDogs page nearly always gives ratings greater than 10/10: the animals are just so cute that they deserve 12/10--or even greater in some instances!

In [300]:
ratings_test_series.max() # extract max rating

66.6

The greatest rating extracted from the text was 66.6 (numerator divided by denominator)! Quite the prankster rating.

In [308]:
# Show the tweet text of row 979 next to the rating stored for it
outlier_idx = 979
print(twitter_archive.at[outlier_idx, 'text'])
print(twitter_archive.at[outlier_idx, 'rating_numerator'], '/' , twitter_archive.at[outlier_idx, 'rating_denominator'])

This is Atticus. He's quite simply America af. 1776/10 https://t.co/GRXwMxLBkh
1776 / 10


In [296]:
# Archive rating as a decimal: [rating_numerator] divided by [rating_denominator].
# Infinite results (zero denominator) are replaced with 0.
archive_rating = twitter_archive.rating_numerator.div(twitter_archive.rating_denominator).replace(np.inf, 0)

In [311]:
# Compare the archive ratings with the ratings re-extracted from the text;
# the comparison keeps only the rows where the two series differ
rating_diff = archive_rating.compare(ratings_test_series)
print(rating_diff)
print("\nCount --> ", len(rating_diff))

Empty DataFrame
Columns: [self, other]
Index: []

Count -->  0


The dog ratings do appear to be successfully parsed from the tweet text since the comparison yields no differences between the two series. 

# Tidiness Issues

- The retweet and favorite counts are in a separate table from the tweet archive table.
- The `doggo`, `floofer`, `pupper`, and `puppo` fields represent an ordinal ranking that should be one column. 

# Cleaning