<a href="https://colab.research.google.com/github/bthodla/danano/blob/master/prj4/wrangle_act.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Gathering data

1.   Load the "twitter-archive-enhanced.csv" into a dataframe
2.   Load the "image_predictions.tsv" (the Tweet image predictions) hosted on Udacity servers programmatically using the "requests" library (URL: https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv)
3. Using tweet ids from (1) above and the Twitter API, load information such as "retweet count" and "favorite count" and any other interesting data items and store the entire set of data relating to a tweet in a JSON file called "tweet_json.txt"; then read this file line by line into a data frame and include tweet id, retweet count, favorite count and any other data items of interest
4. Consumer API keys: 
    
    Z22dLMPEOtCL2ayqnPTbFg3sK (API key)
    
    SuaxZay016BTIpzrYszo9Dbo0jQd38FZ8SUi0bDyy8hU6jMKdj (API secret key)
    
    Access token & access token secret

    14299634-DHvEoZI9bR2D5WOZWf82MjHX6vEnPRIosIrli6ueb (Access token)

    nvba5wXGFrEcYKmQjztu40wsr0vG2pXnQShE2ZQv6I313 (Access token secret)

    Read and write (Access level)




In [0]:
# Import the necessary libraries
import pandas as pd
import numpy as np
import requests
import io
import tweepy
import json
from google.colab import drive
import yaml
import time
import os


In [0]:
def load_yaml_file(filename: str) -> dict:
    twitter_keys = {}

    with open(filename, 'r') as ymlfile:
        cfg = yaml.safe_load(ymlfile)
    
    twitter_keys['consumer_key'] = cfg['twitter']['consumer']['key']
    twitter_keys['consumer_secret'] = cfg['twitter']['consumer']['secret']
    twitter_keys['access_token'] = cfg['twitter']['access']['token']
    twitter_keys['access_token_secret'] = cfg['twitter']['access']['token-secret']

    return twitter_keys

In [0]:
def print_file(file_name, lines = 10):
    with open(file_name, 'r') as f:
        for _ in range(lines):
            print (f.readline())
        

In [4]:
drive.mount('/content/gdrive')
twitter_keys_yaml_file = '/content/gdrive/My Drive/Colab Notebooks/twitter_keys.yaml'
tweet_json_file = '/content/gdrive/My Drive/Colab Notebooks/tweet_json.txt'

"""
with open(twitter_keys_yaml_file, encoding='utf-8') as file:
    line = file.readline()
    while line:
        line = file.readline()
        print(line)
"""
print(load_yaml_file(twitter_keys_yaml_file))


Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive
{'consumer_key': 'Z22dLMPEOtCL2ayqnPTbFg3sK', 'consumer_secret': 'SuaxZay016BTIpzrYszo9Dbo0jQd38FZ8SUi0bDyy8hU6jMKdj', 'access_token': '14299634-DHvEoZI9bR2D5WOZWf82MjHX6vEnPRIosIrli6ueb', 'access_token_secret': 'nvba5wXGFrEcYKmQjztu40wsr0vG2pXnQShE2ZQv6I313'}


In [5]:
# 1. Load the file "twitter-archive-enhanced.csv" into a dataframe
twitter_archive_file_url = 'https://raw.githubusercontent.com/bthodla/danano/master/prj4/twitter-archive-enhanced.csv'

tweets_df = pd.read_csv(twitter_archive_file_url)

tweet_count = tweets_df.shape[0]
print ('%s %d' % ('Tweet Count: ', tweet_count))

Tweet Count:  2356


In [0]:
# 2. Load the "image_predictions.tsv" (the Tweet image predictions) hosted on Udacity servers programmatically using the "requests" library

image_predictions_file_url = 'https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv'

url_data = requests.get(image_predictions_file_url).content

img_pred_df = pd.read_csv(io.StringIO(url_data.decode('utf-8')), sep = '\t')


In [0]:
""" 3. Using tweet ids from tweet_df and the Twitter API, load information such as "retweet count" and "favorite count" and any other interesting data items 
and store the entire set of data relating to a tweet in a JSON file called "tweet_json.txt"; then read this file line by line into a data frame and include 
tweet id, retweet count, favorite count and any other data items of interest
"""

twitter_keys = load_yaml_file(twitter_keys_yaml_file)
consumer_key = twitter_keys['consumer_key']
consumer_secret = twitter_keys['consumer_secret']
access_token = twitter_keys['access_token']
access_token_secret = twitter_keys['access_token_secret']

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)

api = tweepy.API(auth, wait_on_rate_limit = True, wait_on_rate_limit_notify = True)

error_tweets = {}
tweet_offset = 0
tweet_batch = 100
tweets_remaining = 0

# Delete the file first so that each run can add content fresh
if os.path.isfile(tweet_json_file):
    os.remove(tweet_json_file)

with open(tweet_json_file, mode = 'a', encoding = 'utf-8') as f:
    while tweet_batch:
        tweet_ids = tweets_df[tweet_offset:tweet_offset + tweet_batch]['tweet_id']
        for tweet_id in tweet_ids:
            try:
                status = api.get_status(tweet_id)
                f.write(json.dumps(status._json) + '\n')

            except tweepy.TweepError as e:
                error_tweets[status.id] = e
            
        tweets_remaining = tweet_count - tweet_offset
        tweet_batch = tweet_batch if tweet_batch < tweets_remaining else tweets_remaining
        tweet_offset += tweet_batch
        # print ('%s %d %s %d' % ('Tweets Remaining: ', tweets_remaining, 'Tweet Offset: ', tweet_offset))
        # time.sleep(1 * 5)


Rate limit reached. Sleeping for: 765
Rate limit reached. Sleeping for: 768


In [0]:
with open(tweet_json_file, mode = 'r', encoding = 'utf-8') as f:
    print('%s %d %s %d' % ('Valid tweet count: ', len(f.readlines()), 'Error tweet count: ', len(error_tweets)))

In [0]:
# Read "tweet_json.txt" file line by line into a data frame and include tweet id, retweet count, favorite count and any other data items of interest

columns = ['tweet_id', 'text', 'retweet_count', 'favorite_count', 'retweeted', 'favorited', 'media_url']
tweets_mini_df = pd.DataFrame(columns = columns)

index = 0
with open(tweet_json_file, mode = 'r', encoding = 'utf-8') as f:
    for tweet in f:
        tweet_json = json.loads(tweet)
        tweet_id = tweet_json['id']
        text = tweet_json['text']
        retweet_count = tweet_json['retweet_count']
        favorite_count = tweet_json['favorite_count']
        retweeted = tweet_json['retweeted']
        favorited = tweet_json['favorited']
        try:
            media_url = tweet_json['entities']['media'][0]['media_url_https']
        except KeyError as e:
            media_url = ''

        tweets_mini_df.loc[index] = pd.Series({'tweet_id': tweet_id, 
                                        'text': text, 
                                        'retweet_count': retweet_count, 
                                        'favorite_count': favorite_count,
                                        'retweeted': retweeted,
                                        'favorited': favorited,
                                        'media_url': media_url})
        index +=1


### Assessing data

Key Points

* You only want original ratings (no retweets) that have images. Though there are 5000+ tweets in the dataset, not all are dog ratings and some are retweets.

* Assessing and cleaning the entire dataset completely would require a lot of time, and is not necessary to practice and demonstrate your skills in data wrangling. Therefore, the requirements of this project are only to assess and clean at least 8 quality issues and at least 2 tidiness issues in this dataset.

* Cleaning includes merging individual pieces of data according to the rules of tidy data.

* The fact that the rating numerators are greater than the denominators does not need to be cleaned. This unique rating system is a big part of the popularity of WeRateDogs.

* You do not need to gather the tweets beyond August 1st, 2017. You can, but note that you won't be able to gather the image predictions for these tweets since you don't have access to the algorithm used.


#### twitter-archive-enhanced.csv

In [8]:
tweets_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 17 columns):
tweet_id                      2356 non-null int64
in_reply_to_status_id         78 non-null float64
in_reply_to_user_id           78 non-null float64
timestamp                     2356 non-null object
source                        2356 non-null object
text                          2356 non-null object
retweeted_status_id           181 non-null float64
retweeted_status_user_id      181 non-null float64
retweeted_status_timestamp    181 non-null object
expanded_urls                 2297 non-null object
rating_numerator              2356 non-null int64
rating_denominator            2356 non-null int64
name                          2356 non-null object
doggo                         2356 non-null object
floofer                       2356 non-null object
pupper                        2356 non-null object
puppo                         2356 non-null object
dtypes: float64(4), int64(3), ob

In [9]:
tweets_df.query('name == "None"').sample(5)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
2202,668643542311546881,,,2015-11-23 04:13:37 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Fascinating dog here. Loves beach. Oddly long ...,,,,https://twitter.com/dog_rates/status/668643542...,3,10,,,,,
149,863079547188785154,6.671522e+17,4196984000.0,2017-05-12 17:12:53 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Ladies and gentlemen... I found Pipsy. He may ...,,,,https://twitter.com/dog_rates/status/863079547...,14,10,,,,,
1767,678396796259975168,,,2015-12-20 02:09:34 +0000,"<a href=""http://twitter.com/download/iphone"" r...",These little fellas have opposite facial expre...,,,,https://twitter.com/dog_rates/status/678396796...,12,10,,,,,
1837,676089483918516224,,,2015-12-13 17:21:08 +0000,"<a href=""http://twitter.com/download/iphone"" r...","""Yes hello I'ma just snag this here toasted ba...",,,,https://twitter.com/dog_rates/status/676089483...,9,10,,,,,
189,855860136149123072,8.558585e+17,13615720.0,2017-04-22 19:05:32 +0000,"<a href=""http://twitter.com/download/iphone"" r...",@s8n You tried very hard to portray this good ...,,,,,666,10,,,,,


In [10]:
tweets_df.isnull().sum()
tweets_df.query('name == "None"').shape

(745, 17)

#### image_predictions.tsv

In [11]:
img_pred_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2075 entries, 0 to 2074
Data columns (total 12 columns):
tweet_id    2075 non-null int64
jpg_url     2075 non-null object
img_num     2075 non-null int64
p1          2075 non-null object
p1_conf     2075 non-null float64
p1_dog      2075 non-null bool
p2          2075 non-null object
p2_conf     2075 non-null float64
p2_dog      2075 non-null bool
p3          2075 non-null object
p3_conf     2075 non-null float64
p3_dog      2075 non-null bool
dtypes: bool(3), float64(3), int64(2), object(4)
memory usage: 152.1+ KB


In [12]:
img_pred_df.sample(5)

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
1451,776477788987613185,https://pbs.twimg.com/media/CsaaaaxWgAEfzM7.jpg,1,Labrador_retriever,0.884839,True,Chesapeake_Bay_retriever,0.057565,True,paintbrush,0.005766,False
1339,758474966123810816,https://pbs.twimg.com/media/Coak48zWAAAhBxV.jpg,1,Pembroke,0.546145,True,Cardigan,0.2442,True,German_shepherd,0.100429,True
11,666071193221509120,https://pbs.twimg.com/media/CT5cN_3WEAAlOoZ.jpg,1,Gordon_setter,0.503672,True,Yorkshire_terrier,0.174201,True,Pekinese,0.109454,True
1515,786709082849828864,https://pbs.twimg.com/media/CurzvFTXgAA2_AP.jpg,1,Pomeranian,0.467321,True,Persian_cat,0.122978,False,chow,0.102654,True
649,681891461017812993,https://pbs.twimg.com/media/CXaQqGbWMAAKEgN.jpg,1,Chihuahua,0.20357,True,doormat,0.134316,False,toy_terrier,0.084482,True


In [13]:
list(img_pred_df)

['tweet_id',
 'jpg_url',
 'img_num',
 'p1',
 'p1_conf',
 'p1_dog',
 'p2',
 'p2_conf',
 'p2_dog',
 'p3',
 'p3_conf',
 'p3_dog']

#### tweets_mini_df

In [14]:
tweets_mini_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2339 entries, 0 to 2338
Data columns (total 7 columns):
tweet_id          2339 non-null object
text              2339 non-null object
retweet_count     2339 non-null object
favorite_count    2339 non-null object
retweeted         2339 non-null object
favorited         2339 non-null object
media_url         2339 non-null object
dtypes: object(7)
memory usage: 146.2+ KB


In [16]:
tweets_mini_df.sample(5)

Unnamed: 0,tweet_id,text,retweet_count,favorite_count,retweeted,favorited,media_url
1664,682003177596559360,Unique dog here. Wrinkly as hell. Weird segmen...,1626,3276,False,False,https://pbs.twimg.com/media/CXb2RcDUsAEnkJb.jpg
897,757596066325864448,Here's another picture without a dog in it. Id...,1132,4558,False,False,https://pbs.twimg.com/media/CoOFmk3WEAAG6ql.jpg
1081,736225175608430592,We only rate dogs. Please stop sending in non-...,2923,8440,False,False,https://pbs.twimg.com/media/CjeY5DKXEAA3WkD.jpg
1743,678708137298427904,Here we are witnessing a wild field pupper. Lo...,2551,5753,False,False,
471,815390420867969024,Happy New Year from the squad! 13/10 for all h...,4123,10961,False,False,https://pbs.twimg.com/media/C1DZQiTXgAUqgRI.jpg


In [17]:
tweets_mini_df.isnull().sum()

tweet_id          0
text              0
retweet_count     0
favorite_count    0
retweeted         0
favorited         0
media_url         0
dtype: int64

#### Cleaning data

#### 1. twitter-archive-enhanced.csv
##### 1.1 Data Quality Issues:

1. There are 181 rows in this dataset that are retweets. I have identified them based non-null values in the "retweeted_status_id" column. These need to be removed from the dataset as we only want original tweets (1.1.1)

2. Remove the following columns from the dataset. I don't think that we need these columns for our analysis (1.1.2)

    *  in_reply_to_status_id
    *  in_reply_to_user_id
    *  retweeted_status_id
    *  retweeted_status_user_id
    *  retweeted_status_timestamp
    *  expanded_urls

3. The "timestamp" column is represented as a string and needs to be converted to datetime (1.1.3)
4. Drop all tweets with timestamps greater than 8/1/2017 as per the project requirements (1.1.4)
5. Dog stages are not fully recorded and many of them contain null values or values labeled as "None". We need to scan the tweet text and see if we can extract these values for each row (1.1.5)
6. *** Not Addressed *** There are 745 rows in this dataset where the name is marked as "None". I am not addressing this issue right now since I don't have sufficient data to obtain this information (1.1.6)

##### 1.2 Tidiness Issues:
1. Dog stages are represented in separate columns as a sparse-array matrix which is wasteful and also not scalable in case we identify new stages. These need to be converted to a single column titled "stage". If they are not mutually exclusive (meaning, if there are multiple tweets about the same dog at different points in time where their stages change), we can put them into a separate dataframe for normalization purposes (1.2.1)


#### 2. image-predictions.tsv

##### 2.1 Tidiness Issues:

1. In the case of this dataset, we will deal with the tidiness issues first before we deal with data quality issues. There is a specific reason for it - there are three breed predictions in this dataset with a percentage of confidence against each and a boolean flag indicating whether the image prediction is actually canine or not. That makes it a total of 9 columns. I would like to reduce them to three columns to begin with: breed, prediction_confidence, is_canine. Addressing this tidiness issue will make it easier to fix the data quality issues identified below. We will begin with separating breed predictions to its own dataframe so that we can work on the data quality issues later (2.1.1)

##### 2.2 Data Quality Issues:

1. Once we address the tidiness issue above and spin off the breed predictions to their own dataframe, we will eliminate all predictions where the "is_canine" flag is false. We will then pick the prediction with the higher percentage of confidence and retain them as the only breed predictions (assuming that each dog can be classified into a single breed) and drop the ones with lower prediction confidence scores (2.2.1)
2. We can then merge this dataset back to the original dataset (2.2.2)
3. After merging, we will drop the following columns as they are no longer needed (2.2.3): 
    *  jpg_url (we have obtained the media_url using the Twitter API and we will retain that in case it needs to be used)
    *  img_num (don't see how this information will be useful)
    *  p1
    *  p1_conf
    *  p1_dog
    *  p2
    *  p2_conf
    *  p2_dog
    *  p3
    *  p3_conf
    *  p3_dog




#### 3. tweet_mini_df

##### 3.1 Data Quality Issues
1. We need to drop all the tweets in this dataset that were originally marked as retweets in the "twitter-archive-enhanced" dataset as we are only interested in original tweets (3.1.1)


#### 4. Tidiness Issues: combining data from all the datasets

##### 4.1 Tidiness

1. Having addressed the data quality and tidiness issues at the level of individual datasets, we can now merge the data in these datasets into one. Since the data granularity in each of these datasets is at the "tweet_id" level, we will use it to combine the data into a single dataset and persist it to a file (4.1.1)

##### 4.2 Data Quality

1. We will use "inner joins" to merge these datasets thus eliminating any rows for whcih no valid information is available online. For example, there are 17 tweet ids in the original "twitter-archive-enhanced" dataset for which the Twitter API returned errors and these will be eliminated during the merging process. (4.2.1)

In [19]:
# First, I am making clean copies of each of the dataframes and leaving the originals intact

tweets_df_clean = tweets_df.copy()
img_pred_df_clean = img_pred_df.copy()
tweets_mini_df_clean = tweets_mini_df.copy()

print ('%s %d %s %d' %('Original Row Count: ', tweets_df.shape[0], 'Copy Row Count: ', tweets_df_clean.shape[0]))
print ('%s %d %s %d' %('Original Row Count: ', img_pred_df.shape[0], 'Copy Row Count: ', img_pred_df_clean.shape[0]))
print ('%s %d %s %d' %('Original Row Count: ', tweets_mini_df.shape[0], 'Copy Row Count: ', tweets_mini_df_clean.shape[0]))

Original Row Count:  2356 Copy Row Count:  2356
Original Row Count:  2075 Copy Row Count:  2075
Original Row Count:  2339 Copy Row Count:  2339


In [20]:
"""
1.1.1 Define (twitter-archive-enhanced.csv)
There are 181 rows in this dataset that are retweets. I have identified them based non-null values in the "retweeted_status_id" column. 
These need to be removed from the dataset as we only want original tweets
"""

tweets_df_clean[tweets_df_clean['retweeted_status_id'].notnull()].shape[0]

181

In [0]:
"""
1.1.1 Code
"""

tweets_df_clean.drop(tweets_df_clean[tweets_df_clean['retweeted_status_id'].notnull()].index, inplace = True)


In [0]:
"""
1.1.1 Test
Assertion: Original DF Row Count - Cleaned DF Row Count = Retweet Count
"""

assert (tweets_df.shape[0] - tweets_df_clean.shape[0] == tweets_df[tweets_df['retweeted_status_id'].notnull()].shape[0])
# print('%s %d %s %d %s %d' % ('Org. Count: ', tweets_df.shape[0], 'New Count: ', tweets_df_clean.shape[0], 'Diff: ',  tweets_df[tweets_df['retweeted_status_id'].notnull()].shape[0]))

In [23]:
"""
1.1.2 Define (twitter-archive-enhanced.csv)
Remove the following columns from the dataset:

in_reply_to_status_id
in_reply_to_user_id
retweeted_status_id
retweeted_status_user_id
retweeted_status_timestamp
expanded_urls
"""
tweets_df_clean_drop_cols = ['in_reply_to_status_id', 'in_reply_to_user_id', 'retweeted_status_id', 'retweeted_status_user_id', 'retweeted_status_timestamp', 'expanded_urls']
tweets_df_clean[tweets_df_clean_drop_cols].head()

Unnamed: 0,in_reply_to_status_id,in_reply_to_user_id,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls
0,,,,,,https://twitter.com/dog_rates/status/892420643...
1,,,,,,https://twitter.com/dog_rates/status/892177421...
2,,,,,,https://twitter.com/dog_rates/status/891815181...
3,,,,,,https://twitter.com/dog_rates/status/891689557...
4,,,,,,https://twitter.com/dog_rates/status/891327558...


In [0]:
"""
1.1.2 Code
"""
tweets_df_clean.drop(tweets_df_clean_drop_cols, axis = 1, inplace = True)

In [0]:
"""
1.1.2 Test
"""

assert not set(tweets_df_clean_drop_cols).issubset(tweets_df_clean.columns)

In [26]:
""" 
1.1.3 Define (twitter-archive-enhanced.csv)
The "timestamp" column is represented as a string and needs to be converted to datetime
"""

tweets_df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2175 entries, 0 to 2355
Data columns (total 11 columns):
tweet_id              2175 non-null int64
timestamp             2175 non-null object
source                2175 non-null object
text                  2175 non-null object
rating_numerator      2175 non-null int64
rating_denominator    2175 non-null int64
name                  2175 non-null object
doggo                 2175 non-null object
floofer               2175 non-null object
pupper                2175 non-null object
puppo                 2175 non-null object
dtypes: int64(3), object(8)
memory usage: 203.9+ KB


In [0]:
"""
1.1.3 Code
"""

tweets_df_clean['timestamp'] = pd.to_datetime(tweets_df_clean['timestamp'])

In [28]:
"""
1.1.3 Test
"""

tweets_df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2175 entries, 0 to 2355
Data columns (total 11 columns):
tweet_id              2175 non-null int64
timestamp             2175 non-null datetime64[ns]
source                2175 non-null object
text                  2175 non-null object
rating_numerator      2175 non-null int64
rating_denominator    2175 non-null int64
name                  2175 non-null object
doggo                 2175 non-null object
floofer               2175 non-null object
pupper                2175 non-null object
puppo                 2175 non-null object
dtypes: datetime64[ns](1), int64(3), object(7)
memory usage: 203.9+ KB


In [29]:
"""
1.1.4 Define (twitter-archive-enhanced.csv)
Drop all tweets with timestamps greater than 8/1/2017 as per the project requirements
"""
tweets_df_clean.query('timestamp > "08/01/2017"')

Unnamed: 0,tweet_id,timestamp,source,text,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
0,892420643555336193,2017-08-01 16:23:56,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,13,10,Phineas,,,,
1,892177421306343426,2017-08-01 00:17:27,"<a href=""http://twitter.com/download/iphone"" r...",This is Tilly. She's just checking pup on you....,13,10,Tilly,,,,


In [0]:
"""
1.1.4 Code
"""

tweets_df_clean.drop(tweets_df_clean.query('timestamp > "08/01/2017"').index, inplace = True)

In [0]:
"""
1.1.4 Test
"""

assert tweets_df_clean.query('timestamp > "08/01/2017"').shape[0] == 0

In [32]:
"""
1.1.5 Define (twitter-archive-enhanced.csv)
Dog stages are not fully recorded and many of them contain null values or values labeled as "None". 
We need to scan the tweet text and see if we can extract these values for each row
"""

dog_stages = ['doggo', 'floofer', 'pupper', 'puppo']
[tweets_df_clean[dog_stages] == 'None']

[      doggo  floofer  pupper  puppo
 2      True     True    True   True
 3      True     True    True   True
 4      True     True    True   True
 5      True     True    True   True
 6      True     True    True   True
 7      True     True    True   True
 8      True     True    True   True
 9     False     True    True   True
 10     True     True    True   True
 11     True     True    True   True
 12     True     True    True  False
 13     True     True    True   True
 14     True     True    True  False
 15     True     True    True   True
 16     True     True    True   True
 17     True     True    True   True
 18     True     True    True   True
 20     True     True    True   True
 21     True     True    True   True
 22     True     True    True   True
 23     True     True    True   True
 24     True     True    True   True
 25     True     True    True   True
 26     True     True    True   True
 27     True     True    True   True
 28     True     True    True   True
 

In [0]:
"""
1.1.5 Code
In this cleaning exercise, I am going to transpose the the "dog_stage columns" to rows first to make the next steps easier
"""

dog_stage_df = pd.melt(tweets_df_clean[['tweet_id', 'text', 'doggo', 'floofer', 'pupper', 'puppo']], id_vars = ['tweet_id', 'text'], var_name = 'dog_stage_col', value_name = 'dog_stage')

# Next, I am assigning the value "doggo" to the column 'dog_stage' in cases where the tweet text has the word "doggo" in it and the 'dog_stage' column has the value "None"
dog_stage_df.loc[dog_stage_df[dog_stage_df['text'].str.contains('doggo ', case = False)].query('dog_stage_col == "doggo" and dog_stage == "None"').index, 'dog_stage'] = 'doggo'

# Next, I am repeating the above for other 'dog_stage' values
dog_stage_df.loc[dog_stage_df[dog_stage_df['text'].str.contains('floofer ', case = False)].query('dog_stage_col == "floofer" and dog_stage == "None"').index, 'dog_stage'] = 'floofer'
dog_stage_df.loc[dog_stage_df[dog_stage_df['text'].str.contains('pupper ', case = False)].query('dog_stage_col == "pupper" and dog_stage == "None"').index, 'dog_stage'] = 'pupper'
dog_stage_df.loc[dog_stage_df[dog_stage_df['text'].str.contains('puppo ', case = False)].query('dog_stage_col == "puppo" and dog_stage == "None"').index, 'dog_stage'] = 'puppo'


In [34]:
"""
The test below confirm that there are multiple dog stages tied to a single tweet id.
This seems to indicate that a dog can go through multiple stages.
Therefore, i am going to retain this as a separate dataset
However, I am going to get rid of all rows where the dog stage is None even after fixing the quality
"""
dog_stage_df[['tweet_id', 'dog_stage']].query('dog_stage != "None"').groupby('tweet_id').filter(lambda x: len(x) > 1).sort_values(by=['tweet_id'])

Unnamed: 0,tweet_id,dog_stage
933,733109485275860992,doggo
5279,733109485275860992,pupper
883,741067306818797568,doggo
5229,741067306818797568,pupper
779,751583847268179968,doggo
5125,751583847268179968,pupper
720,759793422261743616,doggo
5066,759793422261743616,pupper
591,781308096455073793,doggo
4937,781308096455073793,pupper


In [35]:
"""
We will begin with dropping the unnecessary columns
"""
dog_stage_df.drop(['text', 'dog_stage_col'], axis = 1, inplace = True)
dog_stage_df.info()

"""
Next, we will drop all rows where the dog_stage column contains the value "None"
"""

dog_stage_df.drop(dog_stage_df.query('dog_stage == "None"').index, inplace = True)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8692 entries, 0 to 8691
Data columns (total 2 columns):
tweet_id     8692 non-null int64
dog_stage    8692 non-null object
dtypes: int64(1), object(1)
memory usage: 135.9+ KB


In [0]:
"""
1.1.5 Test
"""

assert dog_stage_df.query('dog_stage == "None"').shape[0] == 0

In [37]:
"""
2.1.1 Define (image-predictions.tsv)
In the case of this dataset, we will deal with the tidiness issues first before we deal with data quality issues. 
There is a specific reason for it - there are three breed predictions in this dataset with a percentage of confidence 
against each and a boolean flag indicating whether the image prediction is actually canine or not. 
That makes it a total of 9 columns. I would like to reduce them to three columns to begin with: breed, prediction_confidence, is_canine. 
Addressing this tidiness issue will make it easier to fix the data quality issues identified below. 
We will begin with separating breed predictions to its own dataframe so that we can work on the data quality issues later
"""

img_pred_df.sample(5)

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
899,699788877217865730,https://pbs.twimg.com/media/CbYmRHyWEAASNzm.jpg,1,Border_terrier,0.35506,True,toy_poodle,0.169736,True,Norwegian_elkhound,0.099884,True
289,671163268581498880,https://pbs.twimg.com/media/CVBzbWsWsAEyNMA.jpg,1,African_hunting_dog,0.733025,False,plow,0.119377,False,Scottish_deerhound,0.026983,True
1757,825147591692263424,https://pbs.twimg.com/media/C3ODWpfXAAAP1fb.jpg,1,Pekinese,0.354823,True,Pomeranian,0.24539,True,toy_poodle,0.136545,True
1816,833863086058651648,https://pbs.twimg.com/media/C5J6DIpWQAEosSz.jpg,1,kuvasz,0.494969,True,Great_Pyrenees,0.312632,True,golden_retriever,0.141736,True
511,676191832485810177,https://pbs.twimg.com/media/CWJQ4UmWoAIJ29t.jpg,2,Chihuahua,0.376741,True,Italian_greyhound,0.173114,True,muzzle,0.071485,False


In [0]:
"""
2.1.1 Code
"""

img_pred_df_p1 = img_pred_df[['tweet_id', 'p1', 'p1_conf', 'p1_dog']].copy()
img_pred_df_p1.columns = ['tweet_id', 'breed', 'prediction_confidence', 'is_canine']

img_pred_df_p2 = img_pred_df[['tweet_id', 'p2', 'p2_conf', 'p2_dog']].copy()
img_pred_df_p2.columns = ['tweet_id', 'breed', 'prediction_confidence', 'is_canine']

img_pred_df_p3 = img_pred_df[['tweet_id', 'p3', 'p3_conf', 'p3_dog']].copy()
img_pred_df_p3.columns = ['tweet_id', 'breed', 'prediction_confidence', 'is_canine']

img_pred_df_breed = img_pred_df_p1.append(img_pred_df_p2.append(img_pred_df_p3)).copy()

img_pred_df_breed = img_pred_df_breed.query('is_canine')


In [0]:
"""
2.1.1 Test
"""

assert img_pred_df_breed.query('not is_canine').shape[0] == 0