In [1]:
import numpy as np
import pandas as pd
import requests
import os
import tweepy
import json
from PIL import Image
import re
from io import BytesIO
import re
import seaborn as sns
import datetime
import matplotlib.pyplot as plt
%matplotlib inline

# Gathering Data

In [2]:
archive_df=pd.read_csv('twitter-archive-enhanced.csv')

In [3]:
url = 'https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv'
filename= url.split('/')[-1]
r=requests.get(url)
if not os.path.isfile(filename):
    with open (filename,mode='wb') as imgp_file:
        imgp_file.write(r.content) 
image_predictions_df=pd.read_csv('image-predictions.tsv',sep = "\t")        

In [4]:
df_list = []

with open('tweet-json.txt', 'r') as file:
    for line in file:
        tweet = json.loads(line)
        tweet_id = tweet['id']
        retweet_count = tweet['retweet_count']
        fav_count = tweet['favorite_count']
        df_list.append({'tweet_id':tweet_id,
                       'retweet_count': retweet_count,
                       'favorite_count': fav_count})
        
api_df = pd.DataFrame(df_list)

# -----------------------------------------------------------------------------------------

# • Assessing Data Set:

In [5]:
archive_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 17 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   tweet_id                    2356 non-null   int64  
 1   in_reply_to_status_id       78 non-null     float64
 2   in_reply_to_user_id         78 non-null     float64
 3   timestamp                   2356 non-null   object 
 4   source                      2356 non-null   object 
 5   text                        2356 non-null   object 
 6   retweeted_status_id         181 non-null    float64
 7   retweeted_status_user_id    181 non-null    float64
 8   retweeted_status_timestamp  181 non-null    object 
 9   expanded_urls               2297 non-null   object 
 10  rating_numerator            2356 non-null   int64  
 11  rating_denominator          2356 non-null   int64  
 12  name                        2356 non-null   object 
 13  doggo                       2356 

In [6]:
image_predictions_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2075 entries, 0 to 2074
Data columns (total 12 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   tweet_id  2075 non-null   int64  
 1   jpg_url   2075 non-null   object 
 2   img_num   2075 non-null   int64  
 3   p1        2075 non-null   object 
 4   p1_conf   2075 non-null   float64
 5   p1_dog    2075 non-null   bool   
 6   p2        2075 non-null   object 
 7   p2_conf   2075 non-null   float64
 8   p2_dog    2075 non-null   bool   
 9   p3        2075 non-null   object 
 10  p3_conf   2075 non-null   float64
 11  p3_dog    2075 non-null   bool   
dtypes: bool(3), float64(3), int64(2), object(4)
memory usage: 152.1+ KB


In [7]:
api_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2354 entries, 0 to 2353
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   tweet_id        2354 non-null   int64
 1   retweet_count   2354 non-null   int64
 2   favorite_count  2354 non-null   int64
dtypes: int64(3)
memory usage: 55.3 KB


### Tidiness Issues:
`archive_df`
- related rows to retweets and replies (259 row)
- 4 columns of dogs classification
- 5 columns of un needed data related to retweets and replies

`api_df`
- favorites and retweet count columns needed to be merged with the first table to create a unit of tweets' obeservation

### Quality Issues:
`archive_df`
- tweet_id: is int    <font color='blue'>#programatically</font>
- timestamp: is object <font color='blue'>#programatically</font>
- None are represented as string not NaN object <font color='blue'>#programatically</font>
- tweets with empty images (extended_url) <font color='blue'>#programatically</font>
- tweets has two classifications together <font color='blue'>#visually</font>
- some dog names are not extracted correctly <font color='blue'>#visually</font>
- there are (rating_denominator) > 10 which manipulate the scale of its numerator likr rows 903 and 1121 <font color='blue'>#programatically</font>
- source column is not represented in clear naming <font color='blue'>#visually</font>
- tweets with no dog classification <font color='blue'>#visually</font>

`image_predictions_df`
- tweet_id: is int <font color='blue'>#programatically</font>
- there images in this table is not in archive table -visually- <font color='blue'>#visually</font>

`api_df`
- tweet_id: is int -programatically-

# -----------------------------------------------------------------------------------------

# • Cleaning Data Set:

##### <Font color='green'> First we need to make copies of the data in case we needed to iterate through the data again</font>

In [8]:
archive_df_c = archive_df.copy()
image_predictions_df_c = image_predictions_df.copy()
api_df_c = api_df.copy()

##### `archive_df_c`
- timestamp column is object
- tweet_id: is in

##### Define
- change the type of the timestamp column to datetime
- change the type of the tweet_id column type to string/object

##### Code

In [9]:
archive_df_c['timestamp'] = pd.to_datetime(archive_df_c['timestamp'])
archive_df_c['tweet_id'] = archive_df_c['tweet_id'].apply(str)

##### Test

In [10]:
archive_df_c.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 17 columns):
 #   Column                      Non-Null Count  Dtype              
---  ------                      --------------  -----              
 0   tweet_id                    2356 non-null   object             
 1   in_reply_to_status_id       78 non-null     float64            
 2   in_reply_to_user_id         78 non-null     float64            
 3   timestamp                   2356 non-null   datetime64[ns, UTC]
 4   source                      2356 non-null   object             
 5   text                        2356 non-null   object             
 6   retweeted_status_id         181 non-null    float64            
 7   retweeted_status_user_id    181 non-null    float64            
 8   retweeted_status_timestamp  181 non-null    object             
 9   expanded_urls               2297 non-null   object             
 10  rating_numerator            2356 non-null   int64           

#### <Font color='green'> ----------------------------------------------------------------------------------------- </font>

#### • None are represented as string not NaN object  `archive_df_c`
##### Define
change replace 'None' string with NaN value

##### Code

In [11]:
 archive_df_c.replace("None",np.nan,inplace=True)

##### Test

In [12]:
archive_df_c.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 17 columns):
 #   Column                      Non-Null Count  Dtype              
---  ------                      --------------  -----              
 0   tweet_id                    2356 non-null   object             
 1   in_reply_to_status_id       78 non-null     float64            
 2   in_reply_to_user_id         78 non-null     float64            
 3   timestamp                   2356 non-null   datetime64[ns, UTC]
 4   source                      2356 non-null   object             
 5   text                        2356 non-null   object             
 6   retweeted_status_id         181 non-null    float64            
 7   retweeted_status_user_id    181 non-null    float64            
 8   retweeted_status_timestamp  181 non-null    object             
 9   expanded_urls               2297 non-null   object             
 10  rating_numerator            2356 non-null   int64           

#### <Font color='green'> ----------------------------------------------------------------------------------------- </font>

#### • related rows to retweets and replies (259 row)  `archive_df_c`
##### Define
delete records related to replies and retweets (as we need orignal tweets)

##### Code

In [13]:
df_no_reply = archive_df_c.query('in_reply_to_status_id != in_reply_to_status_id')
archive_df_c = df_no_reply.query('retweeted_status_id != retweeted_status_id')

##### Test

In [14]:
archive_df_c.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2097 entries, 0 to 2355
Data columns (total 17 columns):
 #   Column                      Non-Null Count  Dtype              
---  ------                      --------------  -----              
 0   tweet_id                    2097 non-null   object             
 1   in_reply_to_status_id       0 non-null      float64            
 2   in_reply_to_user_id         0 non-null      float64            
 3   timestamp                   2097 non-null   datetime64[ns, UTC]
 4   source                      2097 non-null   object             
 5   text                        2097 non-null   object             
 6   retweeted_status_id         0 non-null      float64            
 7   retweeted_status_user_id    0 non-null      float64            
 8   retweeted_status_timestamp  0 non-null      object             
 9   expanded_urls               2094 non-null   object             
 10  rating_numerator            2097 non-null   int64           

#### <Font color='green'> ----------------------------------------------------------------------------------------- </font>

#### • 5 columns of un needed data related to retweets and replies `archive_df_c`
##### Define
remove columns(in_reply_to_status_id,in_reply_to_user_id,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp)

##### Code

In [15]:
archive_df_c.drop(['in_reply_to_status_id', 'in_reply_to_user_id','retweeted_status_id','retweeted_status_user_id','retweeted_status_timestamp'],axis=1,inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


##### Test

In [16]:
archive_df_c.columns

Index(['tweet_id', 'timestamp', 'source', 'text', 'expanded_urls',
       'rating_numerator', 'rating_denominator', 'name', 'doggo', 'floofer',
       'pupper', 'puppo'],
      dtype='object')

#### <Font color='green'> ----------------------------------------------------------------------------------------- </font>

#### • tweets with empty images (extended_urls) `archive_df_c`
##### Define
remove rows with empty images

##### Code

In [17]:
archive_df_c = archive_df_c.dropna(how='any', subset=['expanded_urls'])

##### Test

In [18]:
archive_df_c.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2094 entries, 0 to 2355
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype              
---  ------              --------------  -----              
 0   tweet_id            2094 non-null   object             
 1   timestamp           2094 non-null   datetime64[ns, UTC]
 2   source              2094 non-null   object             
 3   text                2094 non-null   object             
 4   expanded_urls       2094 non-null   object             
 5   rating_numerator    2094 non-null   int64              
 6   rating_denominator  2094 non-null   int64              
 7   name                1494 non-null   object             
 8   doggo               83 non-null     object             
 9   floofer             10 non-null     object             
 10  pupper              229 non-null    object             
 11  puppo               24 non-null     object             
dtypes: datetime64[ns, UTC](1), int64(2

#### <Font color='green'> ----------------------------------------------------------------------------------------- </font>

#### -  4 columns of dogs classification
#### - tweets has two classifications together
`archive_df_c`
##### Define
- creating data frames for each dog class
- create a data frame for the non classified
- deleting duplictaes
- merging them

##### Code

In [19]:
archive_df_c_doggo = archive_df_c.query('doggo == doggo')
archive_df_c_doggo.drop(['floofer','pupper','puppo'],axis=1,inplace=True)
archive_df_c_doggo = archive_df_c_doggo.rename(columns={'doggo':'classification'})

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [20]:
archive_df_c_floofer = archive_df_c.query('floofer == floofer')
archive_df_c_floofer.drop(['doggo','pupper','puppo'],axis=1,inplace=True)
archive_df_c_floofer = archive_df_c_floofer.rename(columns={'floofer':'classification'})

In [21]:
archive_df_c_pupper = archive_df_c.query('pupper == pupper')
archive_df_c_pupper.drop(['doggo','floofer','puppo'],axis=1,inplace=True)
archive_df_c_pupper = archive_df_c_pupper.rename(columns={'pupper':'classification'})

In [22]:
archive_df_c_puppo = archive_df_c.query('puppo == puppo')
archive_df_c_puppo.drop(['doggo','floofer','pupper'],axis=1,inplace=True)
archive_df_c_puppo = archive_df_c_puppo.rename(columns={'puppo':'classification'})

In [23]:
archive_df_11 = archive_df_c.query('doggo != doggo')
archive_df_12 =  archive_df_11.query('floofer != floofer')
archive_df_13 = archive_df_12.query('pupper != pupper')
archive_df_noclass = archive_df_13.query('puppo != puppo')

In [24]:
archive_df_noclass['classification'] = np.nan 
archive_df_noclass.drop(['doggo', 'floofer','pupper','puppo'],axis=1,inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  archive_df_noclass['classification'] = np.nan


In [25]:
archive_df_c2 = pd.concat([archive_df_c_doggo, archive_df_c_floofer,archive_df_c_pupper,archive_df_c_puppo,archive_df_noclass],
                             ignore_index=True)
archive_df_c2 = archive_df_c2[~(archive_df_c2.tweet_id.duplicated())]

##### Test

In [26]:
archive_df_c_doggo.nunique()

tweet_id              83
timestamp             83
source                 3
text                  83
expanded_urls         83
rating_numerator       7
rating_denominator     1
name                  45
classification         1
dtype: int64

In [27]:
archive_df_c_floofer.nunique()

tweet_id              10
timestamp             10
source                 2
text                  10
expanded_urls         10
rating_numerator       4
rating_denominator     1
name                   5
classification         1
dtype: int64

In [28]:
archive_df_c_pupper.nunique()

tweet_id              229
timestamp             229
source                  4
text                  229
expanded_urls         229
rating_numerator       12
rating_denominator      1
name                  127
classification          1
dtype: int64

In [29]:
archive_df_c_puppo.nunique()

tweet_id              24
timestamp             24
source                 2
text                  24
expanded_urls         24
rating_numerator       6
rating_denominator     1
name                  16
classification         1
dtype: int64

In [30]:
archive_df_noclass.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1759 entries, 0 to 2355
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype              
---  ------              --------------  -----              
 0   tweet_id            1759 non-null   object             
 1   timestamp           1759 non-null   datetime64[ns, UTC]
 2   source              1759 non-null   object             
 3   text                1759 non-null   object             
 4   expanded_urls       1759 non-null   object             
 5   rating_numerator    1759 non-null   int64              
 6   rating_denominator  1759 non-null   int64              
 7   name                1296 non-null   object             
 8   classification      0 non-null      float64            
dtypes: datetime64[ns, UTC](1), float64(1), int64(2), object(5)
memory usage: 137.4+ KB


In [31]:
archive_df_c2.nunique()

tweet_id              2094
timestamp             2094
source                   4
text                  2094
expanded_urls         2093
rating_numerator        33
rating_denominator      14
name                   954
classification           4
dtype: int64

#### <Font color='green'> ----------------------------------------------------------------------------------------- </font>

#### • there are (rating_denominator) > 10 which manipulate the scale of its numerator likr rows 903 and 1121 -programatically- `archive_df_c`
##### Define
- scale down the ratio of the numerator and denominator to equal (something /10) to help us is calculating describtive statistics

##### Code

In [32]:
archive_df_c3 = archive_df_c2.query('rating_denominator > 10')

archive_df_c3

Unnamed: 0,tweet_id,timestamp,source,text,expanded_urls,rating_numerator,rating_denominator,name,classification
626,820690176645140481,2017-01-15 17:52:40+00:00,"<a href=""http://twitter.com/download/iphone"" r...",The floofs have been released I repeat the flo...,https://twitter.com/dog_rates/status/820690176...,84,70,,
925,758467244762497024,2016-07-28 01:00:57+00:00,"<a href=""http://twitter.com/download/iphone"" r...",Why does this never happen at my front door......,https://twitter.com/dog_rates/status/758467244...,165,150,,
1039,740373189193256964,2016-06-08 02:41:38+00:00,"<a href=""http://twitter.com/download/iphone"" r...","After so many requests, this is Bretagne. She ...",https://twitter.com/dog_rates/status/740373189...,9,11,,
1077,731156023742988288,2016-05-13 16:15:54+00:00,"<a href=""http://twitter.com/download/iphone"" r...",Say hello to this unbelievably well behaved sq...,https://twitter.com/dog_rates/status/731156023...,204,170,this,
1111,722974582966214656,2016-04-21 02:25:47+00:00,"<a href=""http://twitter.com/download/iphone"" r...",Happy 4/20 from the squad! 13/10 for all https...,https://twitter.com/dog_rates/status/722974582...,4,20,,
1141,716439118184652801,2016-04-03 01:36:11+00:00,"<a href=""http://twitter.com/download/iphone"" r...",This is Bluebert. He just saw that both #Final...,https://twitter.com/dog_rates/status/716439118...,50,50,Bluebert,
1166,713900603437621249,2016-03-27 01:29:02+00:00,"<a href=""http://twitter.com/download/iphone"" r...",Happy Saturday here's 9 puppers on a bench. 99...,https://twitter.com/dog_rates/status/713900603...,99,90,,
1188,710658690886586372,2016-03-18 02:46:49+00:00,"<a href=""http://twitter.com/download/iphone"" r...",Here's a brigade of puppers. All look very pre...,https://twitter.com/dog_rates/status/710658690...,80,80,,
1207,709198395643068416,2016-03-14 02:04:08+00:00,"<a href=""http://twitter.com/download/iphone"" r...","From left to right:\nCletus, Jerome, Alejandro...",https://twitter.com/dog_rates/status/709198395...,45,50,,
1270,704054845121142784,2016-02-28 21:25:30+00:00,"<a href=""http://twitter.com/download/iphone"" r...",Here is a whole flock of puppers. 60/50 I'll ...,https://twitter.com/dog_rates/status/704054845...,60,50,a,


In [33]:
archive_df_c3.nunique()

tweet_id              15
timestamp             15
source                 1
text                  15
expanded_urls         15
rating_numerator      15
rating_denominator    11
name                   4
classification         0
dtype: int64

In [34]:
archive_df_c4=archive_df_c3.copy()
archive_df_c4['rating_numerator']=((10*archive_df_c3['rating_numerator'])/archive_df_c3['rating_denominator'])
archive_df_c4['rating_denominator'] = 10

##### Test

In [35]:
archive_df_c4

Unnamed: 0,tweet_id,timestamp,source,text,expanded_urls,rating_numerator,rating_denominator,name,classification
626,820690176645140481,2017-01-15 17:52:40+00:00,"<a href=""http://twitter.com/download/iphone"" r...",The floofs have been released I repeat the flo...,https://twitter.com/dog_rates/status/820690176...,12.0,10,,
925,758467244762497024,2016-07-28 01:00:57+00:00,"<a href=""http://twitter.com/download/iphone"" r...",Why does this never happen at my front door......,https://twitter.com/dog_rates/status/758467244...,11.0,10,,
1039,740373189193256964,2016-06-08 02:41:38+00:00,"<a href=""http://twitter.com/download/iphone"" r...","After so many requests, this is Bretagne. She ...",https://twitter.com/dog_rates/status/740373189...,8.181818,10,,
1077,731156023742988288,2016-05-13 16:15:54+00:00,"<a href=""http://twitter.com/download/iphone"" r...",Say hello to this unbelievably well behaved sq...,https://twitter.com/dog_rates/status/731156023...,12.0,10,this,
1111,722974582966214656,2016-04-21 02:25:47+00:00,"<a href=""http://twitter.com/download/iphone"" r...",Happy 4/20 from the squad! 13/10 for all https...,https://twitter.com/dog_rates/status/722974582...,2.0,10,,
1141,716439118184652801,2016-04-03 01:36:11+00:00,"<a href=""http://twitter.com/download/iphone"" r...",This is Bluebert. He just saw that both #Final...,https://twitter.com/dog_rates/status/716439118...,10.0,10,Bluebert,
1166,713900603437621249,2016-03-27 01:29:02+00:00,"<a href=""http://twitter.com/download/iphone"" r...",Happy Saturday here's 9 puppers on a bench. 99...,https://twitter.com/dog_rates/status/713900603...,11.0,10,,
1188,710658690886586372,2016-03-18 02:46:49+00:00,"<a href=""http://twitter.com/download/iphone"" r...",Here's a brigade of puppers. All look very pre...,https://twitter.com/dog_rates/status/710658690...,10.0,10,,
1207,709198395643068416,2016-03-14 02:04:08+00:00,"<a href=""http://twitter.com/download/iphone"" r...","From left to right:\nCletus, Jerome, Alejandro...",https://twitter.com/dog_rates/status/709198395...,9.0,10,,
1270,704054845121142784,2016-02-28 21:25:30+00:00,"<a href=""http://twitter.com/download/iphone"" r...",Here is a whole flock of puppers. 60/50 I'll ...,https://twitter.com/dog_rates/status/704054845...,12.0,10,a,


In [36]:
archive_df_c4.nunique()

tweet_id              15
timestamp             15
source                 1
text                  15
expanded_urls         15
rating_numerator       7
rating_denominator     1
name                   4
classification         0
dtype: int64

Thus this worked, let's normalize this all over our new data frame (archive_df_c2) and creating new one till this stage (archive_df_c5)

In [37]:
archive_df_c5 = archive_df_c2.copy()
archive_df_c5['rating_numerator']=((10*archive_df_c2['rating_numerator'])/archive_df_c2['rating_denominator'])
archive_df_c5['rating_denominator'] = 10
archive_df_c5.nunique()

tweet_id              2094
timestamp             2094
source                   4
text                  2094
expanded_urls         2093
rating_numerator        23
rating_denominator       1
name                   954
classification           4
dtype: int64

In [38]:
archive_df_c5.describe()

Unnamed: 0,rating_numerator,rating_denominator
count,2094.0,2094.0
mean,11.694284,10.0
std,39.687698,0.0
min,0.0,10.0
25%,10.0,10.0
50%,11.0,10.0
75%,12.0,10.0
max,1776.0,10.0


In [39]:
archive_df_c5.query('rating_numerator == 177 ')

Unnamed: 0,tweet_id,timestamp,source,text,expanded_urls,rating_numerator,rating_denominator,name,classification


In [40]:
archive_df_c5['rating_numerator'][130] = 27
archive_df_c5['rating_numerator'][677] = 34
archive_df_c5['rating_numerator'][783] = 75/3
archive_df_c5['rating_numerator'][976] = 17.7
archive_df_c5['rating_numerator'][1541] = 1776/100
archive_df_c5['rating_numerator'][1829] = 420/12

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  archive_df_c5['rating_numerator'][130] = 27
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  archive_df_c5['rating_numerator'][677] = 34
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  archive_df_c5['rating_numerator'][783] = 75/3
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  archive_df_c5['rating_numerator'][

In [41]:
archive_df_c5.describe()

Unnamed: 0,rating_numerator,rating_denominator
count,2094.0,2094.0
mean,10.642792,10.0
std,2.341674,0.0
min,0.0,10.0
25%,10.0,10.0
50%,11.0,10.0
75%,12.0,10.0
max,35.0,10.0


#### <Font color='green'> ----------------------------------------------------------------------------------------- </font>

#### • source column is not represented in clear naming -visually- `archive_df_c`
##### Define
- slicing the source column to extract clear source name

##### Code

In [42]:
archive_df_c6 = archive_df_c5.copy()
archive_df_c6['source']=archive_df_c6.source.str.split('>',expand = True)[1]
archive_df_c6['source']=archive_df_c6.source.str.split('<',expand = True)[0]
archive_df_c6.head()

Unnamed: 0,tweet_id,timestamp,source,text,expanded_urls,rating_numerator,rating_denominator,name,classification
0,890240255349198849,2017-07-26 15:59:51+00:00,Twitter for iPhone,This is Cassie. She is a college pup. Studying...,https://twitter.com/dog_rates/status/890240255...,14.0,10,Cassie,doggo
1,884162670584377345,2017-07-09 21:29:42+00:00,Twitter for iPhone,Meet Yogi. He doesn't have any important dog m...,https://twitter.com/dog_rates/status/884162670...,12.0,10,Yogi,doggo
2,872967104147763200,2017-06-09 00:02:31+00:00,Twitter for iPhone,Here's a very large dog. He has a date later. ...,https://twitter.com/dog_rates/status/872967104...,12.0,10,,doggo
3,871515927908634625,2017-06-04 23:56:03+00:00,Twitter for iPhone,This is Napolean. He's a Raggedy East Nicaragu...,https://twitter.com/dog_rates/status/871515927...,12.0,10,Napolean,doggo
4,871102520638267392,2017-06-03 20:33:19+00:00,Twitter for iPhone,Never doubt a doggo 14/10 https://t.co/AbBLh2FZCH,https://twitter.com/animalcog/status/871075758...,14.0,10,,doggo


##### Test

In [43]:
archive_df_c6.source.value_counts()

Twitter for iPhone     1962
Vine - Make a Scene      91
Twitter Web Client       30
TweetDeck                11
Name: source, dtype: int64

#### <Font color='green'> ----------------------------------------------------------------------------------------- </font>

#### • favorites and retweet count columns needed to be merged with the first table to create a unit of tweets' obeservation `api_df`
##### Define
- merging `api_df` and `archive_df_c6`

##### Code

In [44]:
api_df['tweet_id'] = api_df['tweet_id'].apply(str)
archive_df_c7 = pd.merge(archive_df_c6, api_df,
                            on=['tweet_id'], how='left')

##### Test

In [45]:
archive_df_c7.head()

Unnamed: 0,tweet_id,timestamp,source,text,expanded_urls,rating_numerator,rating_denominator,name,classification,retweet_count,favorite_count
0,890240255349198849,2017-07-26 15:59:51+00:00,Twitter for iPhone,This is Cassie. She is a college pup. Studying...,https://twitter.com/dog_rates/status/890240255...,14.0,10,Cassie,doggo,7711,32467
1,884162670584377345,2017-07-09 21:29:42+00:00,Twitter for iPhone,Meet Yogi. He doesn't have any important dog m...,https://twitter.com/dog_rates/status/884162670...,12.0,10,Yogi,doggo,3128,20771
2,872967104147763200,2017-06-09 00:02:31+00:00,Twitter for iPhone,Here's a very large dog. He has a date later. ...,https://twitter.com/dog_rates/status/872967104...,12.0,10,,doggo,5669,28031
3,871515927908634625,2017-06-04 23:56:03+00:00,Twitter for iPhone,This is Napolean. He's a Raggedy East Nicaragu...,https://twitter.com/dog_rates/status/871515927...,12.0,10,Napolean,doggo,3628,20730
4,871102520638267392,2017-06-03 20:33:19+00:00,Twitter for iPhone,Never doubt a doggo 14/10 https://t.co/AbBLh2FZCH,https://twitter.com/animalcog/status/871075758...,14.0,10,,doggo,5764,21461


In [46]:
archive_df_c7.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2094 entries, 0 to 2093
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype              
---  ------              --------------  -----              
 0   tweet_id            2094 non-null   object             
 1   timestamp           2094 non-null   datetime64[ns, UTC]
 2   source              2094 non-null   object             
 3   text                2094 non-null   object             
 4   expanded_urls       2094 non-null   object             
 5   rating_numerator    2094 non-null   float64            
 6   rating_denominator  2094 non-null   int64              
 7   name                1494 non-null   object             
 8   classification      335 non-null    object             
 9   retweet_count       2094 non-null   int64              
 10  favorite_count      2094 non-null   int64              
dtypes: datetime64[ns, UTC](1), float64(1), int64(3), object(6)
memory usage: 196.3+ KB


In [47]:
archive_df_c7.nunique()

tweet_id              2094
timestamp             2094
source                   4
text                  2094
expanded_urls         2093
rating_numerator        23
rating_denominator       1
name                   954
classification           4
retweet_count         1671
favorite_count        1940
dtype: int64

#### <Font color='green'> ----------------------------------------------------------------------------------------- </font>

#### • some dog names are not extracted correctly -visually- `archive_df_c`
##### Define
- re-extracting the dogs names form text `archive_df_c7`

##### Code

In [48]:
archive_df_c8 = archive_df_c7.copy()
names=[]
for i in range(len(archive_df_c8)):
    try:
        my_text= archive_df_c8['text'][i]
        text_names = re.findall(r"This is [A-Z][a-z]+|Meet [A-Z][a-z]+|name is [A-Z][a-z]+|hello to [A-Z][a-z]+|named [A-Z][a-z]+|That is [A-Z][a-z]", my_text)[0].split(" ")[-1]
    except:
        text_names = np.nan
    names.append(text_names)
    print (i)
    print(text_names)

0
Cassie
1
Yogi
2
nan
3
Napolean
4
nan
5
Scout
6
nan
7
nan
8
nan
9
Barney
10
Mimosa
11
Meera
12
nan
13
nan
14
Rhino
15
Smiley
16
Miguel
17
Emanuel
18
Pete
19
nan
20
Astrid
21
Doobert
22
Loki
23
Cupid
24
Pilot
25
nan
26
Duchess
27
Sundance
28
nan
29
nan
30
Sunny
31
Bo
32
Dido
33
Chubbs
34
nan
35
nan
36
Rocky
37
nan
38
Bones
39
nan
40
Sobe
41
nan
42
nan
43
Rizzo
44
Pinot
45
Deacon
46
Sampson
47
Combo
48
nan
49
nan
50
Anakin
51
Finley
52
nan
53
Gerald
54
nan
55
Wishes
56
nan
57
Maggie
58
nan
59
nan
60
nan
61
nan
62
nan
63
nan
64
nan
65
nan
66
Piper
67
Boomer
68
Divine
69
Qu
70
Lenox
71
nan
72
nan
73
nan
74
nan
75
nan
76
Kellogg
77
nan
78
Kyle
79
nan
80
nan
81
nan
82
nan
83
Grizzwald
84
Doc
85
nan
86
Blu
87
nan
88
nan
89
Moose
90
nan
91
Petrick
92
Roscoe
93
Gus
94
nan
95
Ginger
96
Jed
97
Sierra
98
Rover
99
Jamesy
100
nan
101
Boomer
102
Pickles
103
nan
104
Clark
105
Ava
106
Gidget
107
nan
108
Kona
109
nan
110
Gabe
111
nan
112
nan
113
Cooper
114
Craig
115
nan
116
Ollie
117
nan
118
nan
119
na

Karll
1085
nan
1086
Sprout
1087
Blitz
1088
Bloop
1089
nan
1090
Colby
1091
Lola
1092
nan
1093
Fred
1094
Kreggory
1095
Sarge
1096
Sugar
1097
Reginald
1098
Ivar
1099
Jangle
1100
nan
1101
Schnitzel
1102
Panda
1103
Archie
1104
Berkeley
1105
nan
1106
Ralph
1107
Charleson
1108
Neptune
1109
Harnold
1110
Sid
1111
Lucy
1112
Pippa
1113
Sadie
1114
Otis
1115
nan
1116
Carper
1117
nan
1118
Bowie
1119
nan
1120
Alexanderson
1121
Suki
1122
Barclay
1123
nan
1124
Skittle
1125
Ebby
1126
Fl
1127
Link
1128
Jennifur
1129
nan
1130
Bluebert
1131
Stephanus
1132
Bubbles
1133
nan
1134
nan
1135
Bentley
1136
Toby
1137
Zeus
1138
Bertson
1139
Oscar
1140
Nico
1141
Michelangelope
1142
Siba
1143
Calbert
1144
nan
1145
Curtis
1146
Benedict
1147
nan
1148
Blitz
1149
Travis
1150
Thumas
1151
nan
1152
nan
1153
Kanu
1154
Doug
1155
nan
1156
Piper
1157
nan
1158
Lance
1159
Opie
1160
Stubert
1161
nan
1162
Sunny
1163
Kane
1164
nan
1165
Steven
1166
Olive
1167
Chester
1168
Roosevelt
1169
nan
1170
nan
1171
Gary
1172
nan
1173
nan
1174
Mi

nan
2085
nan
2086
nan
2087
nan
2088
nan
2089
nan
2090
nan
2091
nan
2092
nan
2093
nan


In [49]:
archive_df_c8['name']=names
archive_df_c8.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2094 entries, 0 to 2093
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype              
---  ------              --------------  -----              
 0   tweet_id            2094 non-null   object             
 1   timestamp           2094 non-null   datetime64[ns, UTC]
 2   source              2094 non-null   object             
 3   text                2094 non-null   object             
 4   expanded_urls       2094 non-null   object             
 5   rating_numerator    2094 non-null   float64            
 6   rating_denominator  2094 non-null   int64              
 7   name                1415 non-null   object             
 8   classification      335 non-null    object             
 9   retweet_count       2094 non-null   int64              
 10  favorite_count      2094 non-null   int64              
dtypes: datetime64[ns, UTC](1), float64(1), int64(3), object(6)
memory usage: 276.3+ KB


##### Test

In [50]:
print(len(names))

2094


In [51]:
archive_df_c8.query('tweet_id == "704054845121142784"')

Unnamed: 0,tweet_id,timestamp,source,text,expanded_urls,rating_numerator,rating_denominator,name,classification,retweet_count,favorite_count
1259,704054845121142784,2016-02-28 21:25:30+00:00,Twitter for iPhone,Here is a whole flock of puppers. 60/50 I'll ...,https://twitter.com/dog_rates/status/704054845...,12.0,10,,,1028,3201


In [52]:
archive_df_c8.iloc[1259]

tweet_id                                             704054845121142784
timestamp                                     2016-02-28 21:25:30+00:00
source                                               Twitter for iPhone
text                  Here is a whole flock of puppers.  60/50 I'll ...
expanded_urls         https://twitter.com/dog_rates/status/704054845...
rating_numerator                                                     12
rating_denominator                                                   10
name                                                                NaN
classification                                                      NaN
retweet_count                                                      1028
favorite_count                                                     3201
Name: 1259, dtype: object

#### <Font color='green'> ----------------------------------------------------------------------------------------- </font>

#### • tweet_id: is int -programatically- `image_predictions_df`
##### Define
- change the type of the tweet_id column type to string/object

##### Code

In [53]:
image_predictions_df_c['tweet_id'] = image_predictions_df_c['tweet_id'].apply(str)

##### Test

In [54]:
image_predictions_df_c.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2075 entries, 0 to 2074
Data columns (total 12 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   tweet_id  2075 non-null   object 
 1   jpg_url   2075 non-null   object 
 2   img_num   2075 non-null   int64  
 3   p1        2075 non-null   object 
 4   p1_conf   2075 non-null   float64
 5   p1_dog    2075 non-null   bool   
 6   p2        2075 non-null   object 
 7   p2_conf   2075 non-null   float64
 8   p2_dog    2075 non-null   bool   
 9   p3        2075 non-null   object 
 10  p3_conf   2075 non-null   float64
 11  p3_dog    2075 non-null   bool   
dtypes: bool(3), float64(3), int64(1), object(5)
memory usage: 152.1+ KB


#### <Font color='green'> ----------------------------------------------------------------------------------------- </font>

# • Storing Data Set:

`archive_df`

In [55]:
archive_df_c8.to_csv('twitter_archive_master.csv',index = False)

`image_predictions_df`

In [56]:
image_predictions_df_c.to_csv('image_predictions_new.csv',index = False)

#### <Font color='green'> ----------------------------------------------------------------------------------------- </font>

# • For Visualizing Part check the file named:
## Wrangle_act- Visualizing.ipynb