# Wrangle and Analyze Data

## Table of Contents
- [Gather](#gather)
- [Assess](#assess)
- [Clean](#clean)

In [1]:
import pandas as pd
import numpy as np
import requests
import os
import tweepy
import json
import config
import matplotlib.pyplot as plt
%matplotlib inline

<a id='gather'></a>
<h2 id="-Gather" style="
    background-color: #555;
    color: #eee;
    padding: 10px 5px;
">Gather</h2>

In [229]:
# SOURCE 1: reading the data "archive".

df_archive = pd.read_csv('twitter-archive-enhanced.csv')

In [230]:
# SOURCE 2: downloading, writeing & reading the data "predictions".

url = 'https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv'
response = requests.get(url)

with open(os.path.join(os.getcwd(),'image-predictions.tsv'), mode='wb') as file:
    file.write(response.content)

df_predictions = pd.read_csv('image-predictions.tsv', sep='\t')

In [231]:
# SOURCE 3: retrieving the data from twitter via APIs.

auth = tweepy.OAuthHandler(config.consumer_key, config.consumer_secret)
auth.set_access_token(config.access_token, config.access_secret)

api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

In [232]:
tweet_id_list = pd.concat( [df_predictions['tweet_id'], 
                          df_archive['tweet_id']], ignore_index=True)

tweet_id_list = tweet_id_list.drop_duplicates()

In [233]:
# this function just to calculate the performance 
time_prv_step = time_step = None
def performance(count,iterations):
    progress = round((count/iterations)*100,2)
    print('processing {}%'.format(progress), end='\r')
    if(count%round(iterations/10) == 0): 
        global time_prv_step
        global time_step
        
        time_prv_step = pd.Timestamp.now() if count == 0 else time_step
        time_step = pd.Timestamp.now()
        
        duration = (time_step - time_prv_step).total_seconds()
        print('{} samples, during {}s'.format(count, round(duration,2) ))
        time_start = time_step

In [7]:
tweets = []
tweet_id_list_error = []
iterations = tweet_id_list.shape[0]
count = 0
for tweet_id in tweet_id_list:
    performance(count, iterations)
    count += 1
    try:
        tweets.append(api.get_status(tweet_id)._json)
    except Exception as e:
        print(str(tweet_id) + ": " + str(e))
        tweet_id_list_error.append(tweet_id)

# df_tweets = pd.DataFrame(df_tweets)
# df_tweets.to_csv('tweet_json.txt', index=False)

0 samples, during 0.0s
236 samples, during 222.07s
472 samples, during 219.68s
680055455951884288: [{'code': 144, 'message': 'No status found with that ID.'}]
708 samples, during 242.47s
944 samples, during 266.55s
1180 samples, during 264.89s
754011816964026368: [{'code': 144, 'message': 'No status found with that ID.'}]
1416 samples, during 261.54s
802247111496568832: [{'code': 144, 'message': 'No status found with that ID.'}]
1652 samples, during 242.69s
829374341691346946: [{'code': 144, 'message': 'No status found with that ID.'}]
837012587749474308: [{'code': 144, 'message': 'No status found with that ID.'}]
837366284874571778: [{'code': 144, 'message': 'No status found with that ID.'}]
842892208864923648: [{'code': 144, 'message': 'No status found with that ID.'}]
844704788403113984: [{'code': 144, 'message': 'No status found with that ID.'}]
1888 samples, during 243.05s
851953902622658560: [{'code': 144, 'message': 'No status found with that ID.'}]
861769973181624320: [{'code':

NameError: name 'df_tweets' is not defined

In [7]:
tweets = pd.DataFrame(tweets)
tweets.to_json('tweet_json.txt', orient='records')

NameError: name 'tweets' is not defined

In [234]:
df_tweets = pd.read_json('tweet_json.txt')

> There are 22 tweets that no longer exist.

<a id='assess'></a>
<h2 id="-Gather" style="
    background-color: #555;
    color: #eee;
    padding: 10px 5px;
">Assess</h2>

<h3 id="-Gather" style="
    background-color: #bbb;
    color: #fff;
    padding: 10px 5px;
">twitter_archive_enhanced table</h3>

In [235]:
df_archive.sample(10)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
884,760190180481531904,,,2016-08-01 19:07:17 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Meet Sadie. She's addicted to balloons. It's t...,,,,https://twitter.com/dog_rates/status/760190180...,10,10,Sadie,,,,
562,802600418706604034,,,2016-11-26 19:50:26 +0000,"<a href=""http://vine.co"" rel=""nofollow"">Vine -...",This is Bailey. She has mastered the head tilt...,,,,https://vine.co/v/5FwUWjYaW0Y,11,10,Bailey,,,,
898,758854675097526272,,,2016-07-29 02:40:28 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Lilli Bee &amp; Honey Bear. Unfortunat...,,,,https://twitter.com/dog_rates/status/758854675...,11,10,Lilli,,,,
1540,689659372465688576,,,2016-01-20 04:03:02 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Ricky. He's being escorted out of the ...,,,,https://twitter.com/dog_rates/status/689659372...,8,10,Ricky,,,,
1520,690728923253055490,,,2016-01-23 02:53:03 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Vinscent. He was just questioned about...,,,,https://twitter.com/dog_rates/status/690728923...,8,10,Vinscent,,,,
1158,724049859469295616,,,2016-04-24 01:38:33 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Kreggory. He just took a look at his s...,,,,https://twitter.com/dog_rates/status/724049859...,10,10,Kreggory,,,,
1242,711998809858043904,,,2016-03-21 19:31:59 +0000,"<a href=""http://twitter.com/download/iphone"" r...",RT @twitter: @dog_rates Awesome Tweet! 12/10. ...,7.119983e+17,783214.0,2016-03-21 19:29:52 +0000,https://twitter.com/twitter/status/71199827977...,12,10,,,,,
262,842765311967449089,,,2017-03-17 15:51:22 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Meet Indie. She's not a fan of baths but she's...,,,,"https://www.gofundme.com/get-indie-home/,https...",12,10,Indie,,,,
2032,671763349865160704,,,2015-12-01 18:50:38 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Say hello to Mark. He's a good dog. Always rea...,,,,https://twitter.com/dog_rates/status/671763349...,9,10,Mark,,,,
2318,666454714377183233,,,2015-11-17 03:16:00 +0000,"<a href=""http://twitter.com/download/iphone"" r...",I'll name the dogs from now on. This is Kreggo...,,,,https://twitter.com/dog_rates/status/666454714...,10,10,Kreggory,,,,


In [236]:
df_archive.describe()

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,retweeted_status_id,retweeted_status_user_id,rating_numerator,rating_denominator
count,2356.0,78.0,78.0,181.0,181.0,2356.0,2356.0
mean,7.427716e+17,7.455079e+17,2.014171e+16,7.7204e+17,1.241698e+16,13.126486,10.455433
std,6.856705e+16,7.582492e+16,1.252797e+17,6.236928e+16,9.599254e+16,45.876648,6.745237
min,6.660209e+17,6.658147e+17,11856340.0,6.661041e+17,783214.0,0.0,0.0
25%,6.783989e+17,6.757419e+17,308637400.0,7.186315e+17,4196984000.0,10.0,10.0
50%,7.196279e+17,7.038708e+17,4196984000.0,7.804657e+17,4196984000.0,11.0,10.0
75%,7.993373e+17,8.257804e+17,4196984000.0,8.203146e+17,4196984000.0,12.0,10.0
max,8.924206e+17,8.862664e+17,8.405479e+17,8.87474e+17,7.874618e+17,1776.0,170.0


In [237]:
df_archive.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 17 columns):
tweet_id                      2356 non-null int64
in_reply_to_status_id         78 non-null float64
in_reply_to_user_id           78 non-null float64
timestamp                     2356 non-null object
source                        2356 non-null object
text                          2356 non-null object
retweeted_status_id           181 non-null float64
retweeted_status_user_id      181 non-null float64
retweeted_status_timestamp    181 non-null object
expanded_urls                 2297 non-null object
rating_numerator              2356 non-null int64
rating_denominator            2356 non-null int64
name                          2356 non-null object
doggo                         2356 non-null object
floofer                       2356 non-null object
pupper                        2356 non-null object
puppo                         2356 non-null object
dtypes: float64(4), int64(3), ob

In [238]:
# this function to explore counts of categorical values in each column of provided df.
def categorical_scan(df):
    for col in df.columns:
        print(df[col].value_counts().count(), '\t', col)

In [239]:
categorical_scan(df_archive)

2356 	 tweet_id
77 	 in_reply_to_status_id
31 	 in_reply_to_user_id
2356 	 timestamp
4 	 source
2356 	 text
181 	 retweeted_status_id
25 	 retweeted_status_user_id
181 	 retweeted_status_timestamp
2218 	 expanded_urls
40 	 rating_numerator
18 	 rating_denominator
957 	 name
2 	 doggo
2 	 floofer
2 	 pupper
2 	 puppo


In [240]:
df_archive.rating_denominator.value_counts()

10     2333
11        3
50        3
80        2
20        2
2         1
16        1
40        1
70        1
15        1
90        1
110       1
120       1
130       1
150       1
170       1
7         1
0         1
Name: rating_denominator, dtype: int64

In [241]:
df_archive.text.str.contains(r'\d+\/\d+').value_counts()

True    2356
Name: text, dtype: int64

In [242]:
df_archive.text.str.extract(r'(?P<text>\d+\/\d+)').text.value_counts()

12/10      558
11/10      463
10/10      461
13/10      351
9/10       156
8/10       102
14/10       54
7/10        53
5/10        37
6/10        32
3/10        19
4/10        15
2/10         9
1/10         8
420/10       2
75/10        2
4/20         2
15/10        2
0/10         2
9/11         2
84/70        1
17/10        1
50/50        1
11/15        1
007/10       1
165/150      1
1776/10      1
143/130      1
44/40        1
27/10        1
99/90        1
60/50        1
7/11         1
80/80        1
45/50        1
24/7         1
144/120      1
182/10       1
88/80        1
26/10        1
204/170      1
1/2          1
121/110      1
666/10       1
20/16        1
960/00       1
Name: text, dtype: int64

In [243]:
# retweets in tweets
df_archive[
    df_archive.retweeted_status_id.isin(df_archive.tweet_id)
].shape[0]


156

In [244]:
# replies in tweets
df_archive[
    df_archive.in_reply_to_status_id.isin(df_archive.tweet_id)
].shape[0]

44

<h3 id="-Gather" style="
    background-color: #bbb;
    color: #fff;
    padding: 10px 5px;
">image_predictions table</h3>

In [245]:
df_predictions.sample(20)

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
1538,790723298204217344,https://pbs.twimg.com/media/CvaYgDOWgAEfjls.jpg,1,tub,0.479477,False,bathtub,0.325106,False,golden_retriever,0.07853,True
1408,770293558247038976,https://pbs.twimg.com/media/CrCh5RgW8AAXW4U.jpg,1,Italian_greyhound,0.931668,True,Mexican_hairless,0.038896,True,whippet,0.013151,True
591,679158373988876288,https://pbs.twimg.com/media/CWza7kpWcAAdYLc.jpg,1,pug,0.272205,True,bull_mastiff,0.25153,True,bath_towel,0.116806,False
1185,738885046782832640,https://pbs.twimg.com/media/CkEMBz9WYAAGLaa.jpg,1,bath_towel,0.87832,False,swab,0.020633,False,American_Staffordshire_terrier,0.015535,True
1352,759923798737051648,https://pbs.twimg.com/media/CovKqSYVIAAUbUW.jpg,1,Labrador_retriever,0.324579,True,seat_belt,0.109168,False,pug,0.102466,True
594,679503373272485890,https://pbs.twimg.com/media/CW4UtmYWsAAEjqA.jpg,1,porcupine,0.999846,False,meerkat,7.2e-05,False,echidna,4.4e-05,False
500,675845657354215424,https://pbs.twimg.com/media/CWEWClfW4AAnqhG.jpg,1,pug,0.883952,True,Boston_bull,0.011057,True,French_bulldog,0.00984,True
167,668986018524233728,https://pbs.twimg.com/media/CUi3PIrWoAAPvPT.jpg,1,doormat,0.976103,False,Chihuahua,0.00564,True,Norfolk_terrier,0.003913,True
1095,720043174954147842,https://pbs.twimg.com/media/Cf4bcm8XEAAX4xV.jpg,1,Samoyed,0.954517,True,Eskimo_dog,0.02913,True,white_wolf,0.004462,False
304,671518598289059840,https://pbs.twimg.com/media/CVG2l9jUYAAwg-w.jpg,1,Lakeland_terrier,0.428275,True,wire-haired_fox_terrier,0.111472,True,toy_poodle,0.105016,True


In [246]:
df_predictions.describe()

Unnamed: 0,tweet_id,img_num,p1_conf,p2_conf,p3_conf
count,2075.0,2075.0,2075.0,2075.0,2075.0
mean,7.384514e+17,1.203855,0.594548,0.1345886,0.06032417
std,6.785203e+16,0.561875,0.271174,0.1006657,0.05090593
min,6.660209e+17,1.0,0.044333,1.0113e-08,1.74017e-10
25%,6.764835e+17,1.0,0.364412,0.05388625,0.0162224
50%,7.119988e+17,1.0,0.58823,0.118181,0.0494438
75%,7.932034e+17,1.0,0.843855,0.1955655,0.09180755
max,8.924206e+17,4.0,1.0,0.488014,0.273419


In [247]:
df_predictions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2075 entries, 0 to 2074
Data columns (total 12 columns):
tweet_id    2075 non-null int64
jpg_url     2075 non-null object
img_num     2075 non-null int64
p1          2075 non-null object
p1_conf     2075 non-null float64
p1_dog      2075 non-null bool
p2          2075 non-null object
p2_conf     2075 non-null float64
p2_dog      2075 non-null bool
p3          2075 non-null object
p3_conf     2075 non-null float64
p3_dog      2075 non-null bool
dtypes: bool(3), float64(3), int64(2), object(4)
memory usage: 152.1+ KB


In [248]:
categorical_scan(df_predictions)

2075 	 tweet_id
2009 	 jpg_url
4 	 img_num
378 	 p1
2006 	 p1_conf
2 	 p1_dog
405 	 p2
2004 	 p2_conf
2 	 p2_dog
408 	 p3
2006 	 p3_conf
2 	 p3_dog


In [249]:
df_predictions.p1.value_counts()

golden_retriever             150
Labrador_retriever           100
Pembroke                      89
Chihuahua                     83
pug                           57
chow                          44
Samoyed                       43
toy_poodle                    39
Pomeranian                    38
cocker_spaniel                30
malamute                      30
French_bulldog                26
Chesapeake_Bay_retriever      23
miniature_pinscher            23
seat_belt                     22
Staffordshire_bullterrier     20
German_shepherd               20
Siberian_husky                20
Cardigan                      19
web_site                      19
beagle                        18
Shetland_sheepdog             18
Maltese_dog                   18
Eskimo_dog                    18
teddy                         18
Shih-Tzu                      17
Rottweiler                    17
Lakeland_terrier              17
Italian_greyhound             16
kuvasz                        16
          

In [250]:
df_predictions.p2.value_counts()

Labrador_retriever                104
golden_retriever                   92
Cardigan                           73
Chihuahua                          44
Pomeranian                         42
French_bulldog                     41
Chesapeake_Bay_retriever           41
toy_poodle                         37
cocker_spaniel                     34
miniature_poodle                   33
Siberian_husky                     33
beagle                             28
Pembroke                           27
collie                             27
Eskimo_dog                         27
kuvasz                             26
Italian_greyhound                  22
Pekinese                           21
American_Staffordshire_terrier     21
malinois                           20
toy_terrier                        20
chow                               20
miniature_pinscher                 20
Samoyed                            20
Boston_bull                        19
Norwegian_elkhound                 19
Staffordshir

In [251]:
df_predictions.p3.value_counts()

Labrador_retriever                79
Chihuahua                         58
golden_retriever                  48
Eskimo_dog                        38
kelpie                            35
kuvasz                            34
chow                              32
Staffordshire_bullterrier         32
beagle                            31
cocker_spaniel                    31
Pekinese                          29
toy_poodle                        29
Pomeranian                        29
Pembroke                          27
Great_Pyrenees                    27
Chesapeake_Bay_retriever          27
malamute                          26
French_bulldog                    26
American_Staffordshire_terrier    24
Cardigan                          23
pug                               23
basenji                           21
toy_terrier                       20
bull_mastiff                      20
Siberian_husky                    19
Shetland_sheepdog                 17
Boston_bull                       17
b

In [252]:
df_predictions.query('p1_dog == False & p2_dog == False & p3_dog == False').shape[0]

324

<h3 id="-Gather" style="
    background-color: #bbb;
    color: #fff;
    padding: 10px 5px;
">tweets table</h3>

In [253]:
df_tweets.sample(10)

Unnamed: 0,contributors,coordinates,created_at,entities,extended_entities,favorite_count,favorited,geo,id,id_str,...,quoted_status,quoted_status_id,quoted_status_id_str,retweet_count,retweeted,retweeted_status,source,text,truncated,user
1989,,,2017-06-18 16:57:37,"{'hashtags': [], 'symbols': [], 'user_mentions...",,18174,False,,876484053909872640,876484053909872640,...,,,,2290,False,,"<a href=""http://twitter.com/download/iphone"" r...",This is Benedict. He wants to thank you for th...,True,"{'id': 4196983835, 'id_str': '4196983835', 'na..."
1725,,,2017-01-15 17:52:40,"{'hashtags': [], 'symbols': [], 'user_mentions...","{'media': [{'id': 820690162338279425, 'id_str'...",12743,False,,820690176645140481,820690176645140480,...,,,,3418,False,,"<a href=""http://twitter.com/download/iphone"" r...",The floofs have been released I repeat the flo...,False,"{'id': 4196983835, 'id_str': '4196983835', 'na..."
167,,,2015-11-24 02:54:30,"{'hashtags': [], 'symbols': [], 'user_mentions...","{'media': [{'id': 668986015319760896, 'id_str'...",549,False,,668986018524233728,668986018524233728,...,,,,171,False,,"<a href=""http://twitter.com/download/iphone"" r...",This is Ruby. She's a Bimmington Fettuccini. O...,False,"{'id': 4196983835, 'id_str': '4196983835', 'na..."
2062,,,2017-07-15 16:51:35,"{'hashtags': [], 'symbols': [], 'user_mentions...",,116,False,,886267009285017600,886267009285017600,...,,,,4,False,,"<a href=""http://twitter.com/download/iphone"" r...",@NonWhiteHat @MayhewMayhem omg hello tanner yo...,False,"{'id': 4196983835, 'id_str': '4196983835', 'na..."
1121,,,2016-05-02 00:43:25,"{'hashtags': [], 'symbols': [], 'user_mentions...","{'media': [{'id': 726935081290452993, 'id_str'...",7069,False,,726935089318363137,726935089318363136,...,,,,2538,False,,"<a href=""http://twitter.com/download/iphone"" r...",This is Sprout. He's just precious af. 12/10 I...,False,"{'id': 4196983835, 'id_str': '4196983835', 'na..."
576,,,2015-12-20 20:38:24,"{'hashtags': [], 'symbols': [], 'user_mentions...","{'media': [{'id': 678675831426797568, 'id_str'...",2926,False,,678675843183484930,678675843183484928,...,,,,1526,False,,"<a href=""http://twitter.com/download/iphone"" r...",Exotic pup here. Tail long af. Throat looks sw...,False,"{'id': 4196983835, 'id_str': '4196983835', 'na..."
1067,,,2016-04-01 15:46:52,"{'hashtags': [], 'symbols': [], 'user_mentions...","{'media': [{'id': 715928416713895936, 'id_str'...",3271,False,,715928423106027520,715928423106027520,...,,,,907,False,,"<a href=""http://twitter.com/download/iphone"" r...",This is Bubbles. He's a Yorkshire Piccolope. 1...,False,"{'id': 4196983835, 'id_str': '4196983835', 'na..."
1387,,,2016-08-18 23:55:18,"{'hashtags': [], 'symbols': [], 'user_mentions...","{'media': [{'id': 766423252247994368, 'id_str'...",6275,False,,766423258543644672,766423258543644672,...,,,,1676,False,,"<a href=""http://twitter.com/download/iphone"" r...",This is Shadoe. Her tongue flies out of her mo...,False,"{'id': 4196983835, 'id_str': '4196983835', 'na..."
477,,,2015-12-11 04:14:49,"{'hashtags': [], 'symbols': [], 'user_mentions...","{'media': [{'id': 675166815044173826, 'id_str'...",3639,False,,675166823650848770,675166823650848768,...,,,,1637,False,,"<a href=""http://twitter.com/download/iphone"" r...",This is Arnold. He broke his leg saving a hand...,False,"{'id': 4196983835, 'id_str': '4196983835', 'na..."
1064,,,2016-03-31 23:22:53,"{'hashtags': [], 'symbols': [], 'user_mentions...","{'media': [{'id': 715680780459098112, 'id_str'...",4432,False,,715680795826982913,715680795826982912,...,,,,1675,False,,"<a href=""http://twitter.com/download/iphone"" r...",This is Zeus. He's downright fabulous. 12/10 h...,False,"{'id': 4196983835, 'id_str': '4196983835', 'na..."


In [254]:
df_tweets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2334 entries, 0 to 2333
Data columns (total 30 columns):
contributors                     0 non-null float64
coordinates                      0 non-null float64
created_at                       2334 non-null datetime64[ns]
entities                         2334 non-null object
extended_entities                1819 non-null object
favorite_count                   2334 non-null int64
favorited                        2334 non-null bool
geo                              0 non-null float64
id                               2334 non-null int64
id_str                           2334 non-null int64
in_reply_to_screen_name          77 non-null object
in_reply_to_status_id            77 non-null float64
in_reply_to_status_id_str        77 non-null float64
in_reply_to_user_id              77 non-null float64
in_reply_to_user_id_str          77 non-null float64
is_quote_status                  2334 non-null bool
lang                             2334 no

In [255]:
categorical_scan(df_tweets)

0 	 contributors
0 	 coordinates
2334 	 created_at
2241 	 entities
1819 	 extended_entities
1995 	 favorite_count
1 	 favorited
0 	 geo
2334 	 id
2334 	 id_str
30 	 in_reply_to_screen_name
76 	 in_reply_to_status_id
76 	 in_reply_to_status_id_str
30 	 in_reply_to_user_id
30 	 in_reply_to_user_id_str
2 	 is_quote_status
9 	 lang
1 	 place
1 	 possibly_sensitive
1 	 possibly_sensitive_appealable
24 	 quoted_status
26 	 quoted_status_id
26 	 quoted_status_id_str
1733 	 retweet_count
1 	 retweeted
165 	 retweeted_status
4 	 source
2334 	 text
2 	 truncated
250 	 user


### Quality
#### `twitter_archive_enhanced` table
- There are "None" as value needs to be corrected.
- There are number of observations have `doggo` with others.
- There are 5 columns, end with `_id`, are int64 & float64 type while they need to be string.
- There are 2 columns, end with `timestamp`, are object type while they need to be time formate.
- `source` column has values in HTML fromat.
- `rating_denominator` has values differ 10.
- `rating_numerator` has anomalous values >= 45.9 (std) which we can consider them as outliers.
- There are 156 tweets as retweets.
- There are unwanted columns of `twitter_archive_enhanced` should be removed.

#### `image_predictions` table
- `id` column should be `string` type.
- There are 324 observations are not predected as a dog.
- There are unwanted columns of `image_predictions` should be removed.

#### `tweets` table
- `id` column should be `string` type.
- There are columns should be droped becuase either they have 1 repeated value or nothing.
- There are unwanted columns of `tweets` should be removed.

### Tidiness
- `doggo`, `floofer`, `pupper` & `puppo` columns of `twitter_archive_enhanced` have to be values in the new column called "type".
- join all tables to create `twitter_archive_master.csv`.

<a id='clean'></a>
<h2 id="-Gather" style="
    background-color: #555;
    color: #eee;
    padding: 10px 5px;
">Clean</h2>

In [256]:
df2_archive = df_archive.copy()
df2_predictions = df_predictions.copy()
df2_tweets = df_tweets.copy()

<h3 id="-Gather" style="
    background-color: #bbb;
    color: #fff;
    padding: 10px 5px;
">twitter_archive_enhanced table</h3>

## There are "None" as value needs to be corrected.

### Define

> change "None" value to be np.nan

### Code

In [257]:
df2_archive = df2_archive.applymap(lambda x: np.nan if x == 'None' else x)

### Test

In [258]:
# if there is a different of counts, means it is correct
(df_archive.count() == df2_archive.count()).all() != True

True

## There are number of observations have `doggo` with others.

### Define

> merge `doggoe` with others into `multiple` column.

### Code

In [259]:
df2_archive['multiple'] = df2_archive[['doggo','floofer','pupper','puppo']].apply(
    lambda x: ','.join(x.dropna().astype('str')), axis=1
)
df2_archive.multiple = df2_archive.multiple.map(lambda x: np.nan if len(x)==0 else x)

### Test

In [260]:
df2_archive.multiple.value_counts()

pupper           245
doggo             83
puppo             29
doggo,pupper      12
floofer            9
doggo,floofer      1
doggo,puppo        1
Name: multiple, dtype: int64

## There are 5 columns, end with `_id`, are int64 & float64 type while they need to be string.

### Define

> change the type of columns end with `_id` to string.

### Code

In [261]:
for col in df2_archive.columns[df2_archive.columns.str.endswith('_id')]:
    df2_archive[col] = df2_archive[col].fillna(0)
    df2_archive[col] = df2_archive[col].astype('str')

### Test

In [262]:
df2_archive.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 18 columns):
tweet_id                      2356 non-null object
in_reply_to_status_id         2356 non-null object
in_reply_to_user_id           2356 non-null object
timestamp                     2356 non-null object
source                        2356 non-null object
text                          2356 non-null object
retweeted_status_id           2356 non-null object
retweeted_status_user_id      2356 non-null object
retweeted_status_timestamp    181 non-null object
expanded_urls                 2297 non-null object
rating_numerator              2356 non-null int64
rating_denominator            2356 non-null int64
name                          1611 non-null object
doggo                         97 non-null object
floofer                       10 non-null object
pupper                        257 non-null object
puppo                         30 non-null object
multiple                      380 no

## There are 2 columns, end with `timestamp`, are object type while they need to be `timestamp`.

### Define

> change the type of columns end with `timestamp` from object to timestamp. 

### Code

In [263]:
for col in df2_archive.columns[df2_archive.columns.str.endswith('timestamp')]:
    df2_archive[col] = pd.to_datetime(df2_archive[col])

### Test

In [264]:
df2_archive.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 18 columns):
tweet_id                      2356 non-null object
in_reply_to_status_id         2356 non-null object
in_reply_to_user_id           2356 non-null object
timestamp                     2356 non-null datetime64[ns, UTC]
source                        2356 non-null object
text                          2356 non-null object
retweeted_status_id           2356 non-null object
retweeted_status_user_id      2356 non-null object
retweeted_status_timestamp    181 non-null datetime64[ns, UTC]
expanded_urls                 2297 non-null object
rating_numerator              2356 non-null int64
rating_denominator            2356 non-null int64
name                          1611 non-null object
doggo                         97 non-null object
floofer                       10 non-null object
pupper                        257 non-null object
puppo                         30 non-null object
multiple  

## `source` column has values in HTML fromat.

### Define

> correct the value by remvoing HMTL tag

### Code

In [265]:
df2_archive['source'] = df2_archive.source.str.replace('<[^>]*>','')

### Test

In [266]:
df2_archive.source.value_counts()

Twitter for iPhone     2221
Vine - Make a Scene      91
Twitter Web Client       33
TweetDeck                11
Name: source, dtype: int64

## `rating_denominator` has values differ 10.

### Define

> remove all observations have `rating_denominator` differ 10.

### Code

In [267]:
df2_archive.drop(df2_archive[
    df2_archive.rating_denominator != 10
].index, inplace=True)

### Test

In [268]:
# the lengths should be difference
df2_archive.shape[0] != df_archive.shape[0]

True

## `rating_numerator` has anomalous values >= 50 which are outliers.

### Define

> remvoe all observations have `rating_numerator` >= 45.9 (std) which we can consider them as outliers.

### Code

In [269]:
df2_archive.drop(
    df2_archive[df2_archive.rating_numerator >= 50].index
, inplace=True)

### Test

In [270]:
df2_archive.groupby('rating_numerator').rating_numerator.count()

rating_numerator
0       2
1       8
2       9
3      19
4      15
5      37
6      32
7      54
8     102
9     156
10    461
11    463
12    558
13    351
14     54
15      2
17      1
26      1
27      1
Name: rating_numerator, dtype: int64

## There are 156 tweets as retweets.

### Define

> remove all observations which are actually retweets.

### Code

In [271]:
df2_archive.head()

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo,multiple
0,892420643555336193,0.0,0.0,2017-08-01 16:23:56+00:00,Twitter for iPhone,This is Phineas. He's a mystical boy. Only eve...,0.0,0.0,NaT,https://twitter.com/dog_rates/status/892420643...,13,10,Phineas,,,,,
1,892177421306343426,0.0,0.0,2017-08-01 00:17:27+00:00,Twitter for iPhone,This is Tilly. She's just checking pup on you....,0.0,0.0,NaT,https://twitter.com/dog_rates/status/892177421...,13,10,Tilly,,,,,
2,891815181378084864,0.0,0.0,2017-07-31 00:18:03+00:00,Twitter for iPhone,This is Archie. He is a rare Norwegian Pouncin...,0.0,0.0,NaT,https://twitter.com/dog_rates/status/891815181...,12,10,Archie,,,,,
3,891689557279858688,0.0,0.0,2017-07-30 15:58:51+00:00,Twitter for iPhone,This is Darla. She commenced a snooze mid meal...,0.0,0.0,NaT,https://twitter.com/dog_rates/status/891689557...,13,10,Darla,,,,,
4,891327558926688256,0.0,0.0,2017-07-29 16:00:24+00:00,Twitter for iPhone,This is Franklin. He would like you to stop ca...,0.0,0.0,NaT,https://twitter.com/dog_rates/status/891327558...,12,10,Franklin,,,,,


In [272]:
df2_archive.drop(
    df2_archive[
        df2_archive.retweeted_status_id.isin(df2_archive.tweet_id)
    ].index,
    inplace=True
)

### Test

In [273]:
df2_archive.shape[0] != df_archive.shape[0]

True

## There are unwanted columns of `twitter_archive_enhanced` should be removed.

### Define

> rmove `in_reply_to_status_id`, `in_reply_to_user_id`, `retweeted_status_id`, `retweeted_status_user_id`, `retweeted_status_timestamp`, `expanded_urls` & `text` columns.

### Code

In [274]:
df2_archive.drop(
    ['in_reply_to_status_id',
    'in_reply_to_user_id',
    'retweeted_status_id',
    'retweeted_status_user_id',
    'retweeted_status_timestamp',
    'expanded_urls',
    'text'], axis=1, inplace=True
)

### Test

In [275]:
list(df2_archive)

['tweet_id',
 'timestamp',
 'source',
 'rating_numerator',
 'rating_denominator',
 'name',
 'doggo',
 'floofer',
 'pupper',
 'puppo',
 'multiple']

<h3 id="-Gather" style="
    background-color: #bbb;
    color: #fff;
    padding: 10px 5px;
">image_predictions table</h3>

## `id` column should be `string` type.

### Define

> change the type of `tweet_id` to be string.

### Code

In [303]:
df2_predictions.tweet_id = df2_predictions.tweet_id.astype('str')

### Test

In [304]:
df2_predictions.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1751 entries, 0 to 2073
Data columns (total 10 columns):
tweet_id    1751 non-null object
p1          1751 non-null object
p1_conf     1751 non-null float64
p1_dog      1751 non-null bool
p2          1751 non-null object
p2_conf     1751 non-null float64
p2_dog      1751 non-null bool
p3          1751 non-null object
p3_conf     1751 non-null float64
p3_dog      1751 non-null bool
dtypes: bool(3), float64(3), object(4)
memory usage: 114.6+ KB


## there are 324 observations are not predected as a dog.

### Define

> remove all observations were not predected as a dog.

### Code

In [305]:
df2_predictions.drop(
    df2_predictions.query('p1_dog == False & p2_dog == False & p3_dog == False').index
    , inplace=True
)

### Test

In [306]:
# they should be not equal
df2_predictions.shape[0] != df_predictions.shape[0]

True

## There are unwanted columns of `image_predictions` should be removed.

### Define

> remvoe `jpg_url` & `img_num` columns.

### Code

In [307]:
df2_predictions.drop(['jpg_url', 'img_num'], axis=1, inplace=True)

KeyError: "['jpg_url' 'img_num'] not found in axis"

### Test

In [308]:
list(df2_predictions)

['tweet_id',
 'p1',
 'p1_conf',
 'p1_dog',
 'p2',
 'p2_conf',
 'p2_dog',
 'p3',
 'p3_conf',
 'p3_dog']

<h3 id="-Gather" style="
    background-color: #bbb;
    color: #fff;
    padding: 10px 5px;
">twitter table</h3>

## `id` column should be `string` type.

### Define

> change the type of `id` to be string.

### Code

In [309]:
df2_tweets.id = df2_tweets.id.astype('str')

### Test

In [310]:
df2_tweets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2334 entries, 0 to 2333
Data columns (total 3 columns):
favorite_count    2334 non-null int64
id                2334 non-null object
retweet_count     2334 non-null int64
dtypes: int64(2), object(1)
memory usage: 54.8+ KB


## There are columns should be droped becuase either they have 1 repeated value or nothing.

### Define

> remove all columns that have 1 repeatted value or nothing.

### Code

In [311]:
cols = []
for col in df2_tweets.columns:
    if(df2_tweets[col].value_counts().count() <=1):
        cols.append(col)


In [312]:
df2_tweets.drop(cols, axis=1, inplace=True)

### Test

In [313]:
for col in df2_tweets.columns:
    print(df2_tweets[col].value_counts().count(), '\t', col)

1995 	 favorite_count
2334 	 id
1733 	 retweet_count


## There are unwanted columns of `tweets` should be removed.

### Define

> remvoe `id`, `favorite_count` & `retweet_count` columns.

### Code

In [314]:
df2_tweets.drop(
    df2_tweets.columns.difference(['id', 'favorite_count', 'retweet_count']), axis=1, inplace=True
)

### Test

In [315]:
list(df2_tweets)

['favorite_count', 'id', 'retweet_count']

<h3 id="-Gather" style="
    background-color: #bbb;
    color: #fff;
    padding: 10px 5px;
">Tidiness</h3>

In [316]:
df3_predictions = df2_predictions.copy()
df3_archive = df2_archive.copy()
df3_tweets = df2_tweets.copy()

## `doggo`, `floofer`, `pupper` & `puppo` columns of "twitter_archive_enhanced" have to be values in the new column called `type`.


In [317]:
value_vars = ['multiple']
cols = df3_archive.columns.difference(value_vars)

df3_archive = df3_archive.melt(
    id_vars=cols, value_vars=value_vars, value_name='stage'
)

df3_archive.reset_index(inplace=True)
df3_archive.drop('variable', axis=1, inplace=True)

In [318]:
df3_archive.stage.value_counts()

pupper           245
doggo             83
puppo             29
doggo,pupper      12
floofer            9
doggo,floofer      1
doggo,puppo        1
Name: stage, dtype: int64

## join all tables to create `twitter_archive_master.csv`.

In [319]:
df3_tweets.rename({'id':'tweet_id'}, axis=1, inplace=True)

In [320]:
# without reset index, the merge will not working properly

df3_master = df3_archive.merge(
    df3_predictions, how='outer', on='tweet_id'
).merge(
    df3_tweets, how='outer', on='tweet_id'
)

In [321]:
df3_master.to_csv('twitter_archive_master.csv', index=False)