# Gathering

In [1]:
# import packages
import json
import os

import numpy as np
import pandas as pd
import requests
import tweepy as tp

In [2]:
# Load the twitter archive CSV into `df` and preview its first rows
df = pd.read_csv('twitter-archive-enhanced.csv')
df.head()

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
0,892420643555336193,,,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,,,,https://twitter.com/dog_rates/status/892420643...,13,10,Phineas,,,,
1,892177421306343426,,,2017-08-01 00:17:27 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Tilly. She's just checking pup on you....,,,,https://twitter.com/dog_rates/status/892177421...,13,10,Tilly,,,,
2,891815181378084864,,,2017-07-31 00:18:03 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Archie. He is a rare Norwegian Pouncin...,,,,https://twitter.com/dog_rates/status/891815181...,12,10,Archie,,,,
3,891689557279858688,,,2017-07-30 15:58:51 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Darla. She commenced a snooze mid meal...,,,,https://twitter.com/dog_rates/status/891689557...,13,10,Darla,,,,
4,891327558926688256,,,2017-07-29 16:00:24 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Franklin. He would like you to stop ca...,,,,https://twitter.com/dog_rates/status/891327558...,12,10,Franklin,,,,


In [3]:
# Download 'image-predictions.tsv' and store it in the working directory.
# raise_for_status() aborts on an HTTP error response, so an error page is never
# written to disk and later parsed as if it were the TSV; the timeout keeps the
# cell from hanging indefinitely on a stalled connection.
r = requests.get('https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv',
                 timeout=30)
r.raise_for_status()
with open('image-predictions.tsv', mode='wb') as file:
    file.write(r.content)

In [4]:
# Read the tab-separated 'image-predictions.tsv' into `df2` and preview its first rows
df2 = pd.read_csv('image-predictions.tsv', sep='\t')
df2.head()

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
0,666020888022790149,https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg,1,Welsh_springer_spaniel,0.465074,True,collie,0.156665,True,Shetland_sheepdog,0.061428,True
1,666029285002620928,https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg,1,redbone,0.506826,True,miniature_pinscher,0.074192,True,Rhodesian_ridgeback,0.07201,True
2,666033412701032449,https://pbs.twimg.com/media/CT4521TWwAEvMyu.jpg,1,German_shepherd,0.596461,True,malinois,0.138584,True,bloodhound,0.116197,True
3,666044226329800704,https://pbs.twimg.com/media/CT5Dr8HUEAA-lEu.jpg,1,Rhodesian_ridgeback,0.408143,True,redbone,0.360687,True,miniature_pinscher,0.222752,True
4,666049248165822465,https://pbs.twimg.com/media/CT5IQmsXIAAKY4A.jpg,1,miniature_pinscher,0.560311,True,Rottweiler,0.243682,True,Doberman,0.154629,True


In [None]:
# connect to the twitter api
# The four credentials are read from environment variables so that real keys never
# have to be written into the notebook source; a missing variable raises KeyError
# naming the variable that has to be set.
consumer_key = os.environ['TWITTER_CONSUMER_KEY']
consumer_secret = os.environ['TWITTER_CONSUMER_SECRET']
access_token = os.environ['TWITTER_ACCESS_TOKEN']
access_secret = os.environ['TWITTER_ACCESS_SECRET']

auth = tp.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)

# wait (and print a notice) instead of failing when the rate limit is reached
api = tp.API(auth, wait_on_rate_limit = True, wait_on_rate_limit_notify = True)

In [None]:
# Retrieves favorite and retweet counts for every tweet id in df from the twitter api.
# Successful lookups are collected in df_list (one dictionary per tweet);
# ids whose lookup fails are printed together with the error and collected in errors.
df_list = []
errors = []
# the loop variable is named tweet_id (not id) so the builtin id() is not shadowed
# at notebook scope for the rest of the kernel session
for tweet_id in df['tweet_id']:
    try:
        tweet = api.get_status(tweet_id, tweet_mode='extended')
    except tp.TweepError as e:
        # only API failures count as "unretrievable"; any other exception is a
        # programming error and should surface instead of being logged as a missing tweet
        print(str(tweet_id) + " : " + str(e))
        errors.append(tweet_id)
    else:
        df_list.append({'tweet_id': str(tweet.id),
                        'favorite_count': int(tweet.favorite_count),
                        'retweet_count': int(tweet.retweet_count)})

In [None]:
# number of tweet ids collected in `errors`, i.e. ids whose lookup failed
len(errors)

In [None]:
# write the collected tweet dictionaries as one JSON document to 'tweet_json.txt'
serialized_tweets = json.dumps(df_list)
with open('tweet_json.txt', 'w') as json_out:
    json_out.write(serialized_tweets)

In [5]:
# read the JSON document back and build df3 with a fixed column order
with open('tweet_json.txt', 'r') as json_in:
    tweet_records = json.load(json_in)
df3 = pd.DataFrame(tweet_records, columns=['tweet_id', 'favorite_count', 'retweet_count'])

In [6]:
# preview the first rows of the favorite/retweet count table
df3.head()

Unnamed: 0,tweet_id,favorite_count,retweet_count
0,892420643555336193,38302,8404
1,892177421306343426,32827,6196
2,891815181378084864,24720,4100
3,891689557279858688,41634,8531
4,891327558926688256,39797,9249


# Assessing the Data

In [7]:
# visual assessment: five randomly chosen rows of the archive
df.sample(5)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
1531,690015576308211712,,,2016-01-21 03:38:27 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This pupper can only sleep on shoes. It's a cr...,,,,https://twitter.com/dog_rates/status/690015576...,12,10,,,,pupper,
1674,682259524040966145,,,2015-12-30 17:58:40 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Meet Jax. He's an Iglesias Hufflepoof. Quite t...,,,,https://twitter.com/dog_rates/status/682259524...,9,10,Jax,,,,
521,809920764300447744,,,2016-12-17 00:38:52 +0000,"<a href=""http://twitter.com/download/iphone"" r...","Please only send in dogs. We only rate dogs, n...",,,,https://twitter.com/dog_rates/status/809920764...,10,10,,,,,
211,851953902622658560,,,2017-04-12 00:23:33 +0000,"<a href=""http://twitter.com/download/iphone"" r...",RT @dog_rates: This is Astrid. She's a guide d...,8.293743e+17,4196984000.0,2017-02-08 17:00:26 +0000,https://twitter.com/dog_rates/status/829374341...,13,10,Astrid,doggo,,,
2267,667524857454854144,,,2015-11-20 02:08:22 +0000,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",Another topnotch dog. His name is Big Jumpy Ra...,,,,https://twitter.com/dog_rates/status/667524857...,12,10,,,,,


In [8]:
# programmatic assessment: column dtypes and non-null counts of the archive
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 17 columns):
tweet_id                      2356 non-null int64
in_reply_to_status_id         78 non-null float64
in_reply_to_user_id           78 non-null float64
timestamp                     2356 non-null object
source                        2356 non-null object
text                          2356 non-null object
retweeted_status_id           181 non-null float64
retweeted_status_user_id      181 non-null float64
retweeted_status_timestamp    181 non-null object
expanded_urls                 2297 non-null object
rating_numerator              2356 non-null int64
rating_denominator            2356 non-null int64
name                          2356 non-null object
doggo                         2356 non-null object
floofer                       2356 non-null object
pupper                        2356 non-null object
puppo                         2356 non-null object
dtypes: float64(4), int64(3), ob

In [9]:
# frequency of each rating denominator (to spot values other than 10)
df['rating_denominator'].value_counts()

10     2333
11        3
50        3
80        2
20        2
2         1
16        1
40        1
70        1
15        1
90        1
110       1
120       1
130       1
150       1
170       1
7         1
0         1
Name: rating_denominator, dtype: int64

In [10]:
# frequency of each rating numerator (to spot unusually high values)
df['rating_numerator'].value_counts()

12      558
11      464
10      461
13      351
9       158
8       102
7        55
14       54
5        37
6        32
3        19
4        17
1         9
2         9
420       2
0         2
15        2
75        2
80        1
20        1
24        1
26        1
44        1
50        1
60        1
165       1
84        1
88        1
144       1
182       1
143       1
666       1
960       1
1776      1
17        1
27        1
45        1
99        1
121       1
204       1
Name: rating_numerator, dtype: int64

In [11]:
# print the full text of every tweet whose rating numerator is above 20,
# to see where these high numerators come from
high_numerator_texts = df.loc[df['rating_numerator'] > 20, 'text']
for tweet_text in high_numerator_texts:
    print(tweet_text)

@dhmontgomery We also gave snoop dogg a 420/10 but I think that predated your research
@s8n You tried very hard to portray this good boy as not so good, but you have ultimately failed. His goodness shines through. 666/10
@markhoppus 182/10
@jonnysun @Lin_Manuel ok jomny I know you're excited but 960/00 isn't a valid rating, 13/10 is tho
RT @dog_rates: This is Logan, the Chow who lived. He solemnly swears he's up to lots of good. H*ckin magical af 9.75/10 https://t.co/yBO5wu…
The floofs have been released I repeat the floofs have been released. 84/70 https://t.co/NIYC820tmd
Meet Sam. She smiles 24/7 &amp; secretly aspires to be a reindeer. 
Keep Sam smiling by clicking and sharing this link:
https://t.co/98tB8y7y7t https://t.co/LouL5vdvxx
This is Logan, the Chow who lived. He solemnly swears he's up to lots of good. H*ckin magical af 9.75/10 https://t.co/yBO5wuqaPS
This is Sophie. She's a Jubilant Bush Pupper. Super h*ckin rare. Appears at random just to smile at the locals. 11.27/10 wo

In [12]:
# rows whose tweet_id already occurred in an earlier row (an empty result means no duplicates)
df[df.tweet_id.duplicated()]

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo


In [13]:
# the ten most frequent values in the name column
df['name'].value_counts()[:10]

None       745
a           55
Charlie     12
Oliver      11
Lucy        11
Cooper      11
Lola        10
Tucker      10
Penny       10
Bo           9
Name: name, dtype: int64

In [14]:
# column dtypes and non-null counts of the image predictions
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2075 entries, 0 to 2074
Data columns (total 12 columns):
tweet_id    2075 non-null int64
jpg_url     2075 non-null object
img_num     2075 non-null int64
p1          2075 non-null object
p1_conf     2075 non-null float64
p1_dog      2075 non-null bool
p2          2075 non-null object
p2_conf     2075 non-null float64
p2_dog      2075 non-null bool
p3          2075 non-null object
p3_conf     2075 non-null float64
p3_dog      2075 non-null bool
dtypes: bool(3), float64(3), int64(2), object(4)
memory usage: 152.1+ KB


In [15]:
# column dtypes and non-null counts of the favorite/retweet count table
df3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2341 entries, 0 to 2340
Data columns (total 3 columns):
tweet_id          2341 non-null object
favorite_count    2341 non-null int64
retweet_count     2341 non-null int64
dtypes: int64(2), object(1)
memory usage: 54.9+ KB


In [16]:
# summary statistics of the retweet counts
df3.retweet_count.describe()

count     2341.000000
mean      2961.133276
std       4971.707929
min          0.000000
25%        595.000000
50%       1382.000000
75%       3452.000000
max      84382.000000
Name: retweet_count, dtype: float64

### Quality Issues (to be fixed):
- <s>the rating denominators are inconsistent</s>
- <s>some rating numerators are relatively high</s>
- <s>a high number of entries in df are retweets or replies</s>
- <s>a number of entries with 'name' being 'a', 'an', 'the'</s>
- <s>missing data in 'expanded_urls' in df</s>
- <s>dog's stages should be categorical data</s>
- <s>missing dog names in df, recorded as 'None' string instead of NaN</s>
- <s>inconsistency in dog breeds in df2</s>
- <s>'tweet_id' columns should be string types</s>
- <s>timestamp column should be of datetime type</s>

### Tidiness Issues (to be fixed):
- <s>tables are separated, although they contain the same observations</s>
- <s>the variable for the dog's stage (e.g. doggo, floofer, pupper etc.) is spread in different columns</s>

# Cleaning (tidiness)

- tables are separated, although they contain the same observations

**Definition:** Joining the 3 dataframes in one master dataframe on the 'tweet_id' primary key. Adding the predicted dog breeds, favorite counts, and retweet counts.

In [17]:
# show a single row of df2 as a reminder of the prediction columns
df2.head(1)

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
0,666020888022790149,https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg,1,Welsh_springer_spaniel,0.465074,True,collie,0.156665,True,Shetland_sheepdog,0.061428,True


In [18]:
# Creates the predicted dog breed column: the highest-ranked prediction (p1, then p2,
# then p3) that is flagged as a dog and has a confidence above 0.2 (20%);
# NaN when none of the three predictions qualifies.
# Implemented with nested Series.where calls (keep the value where the condition holds,
# otherwise fall through to the next prediction) instead of iterating row by row.
df2['breed_pred'] = df2['p1'].where(
    (df2['p1_dog'] == True) & (df2['p1_conf'] > 0.2),
    df2['p2'].where(
        (df2['p2_dog'] == True) & (df2['p2_conf'] > 0.2),
        # innermost fallback: where() without `other` leaves NaN for non-qualifying rows
        df2['p3'].where((df2['p3_dog'] == True) & (df2['p3_conf'] > 0.2))
    )
)

In [19]:
# test: the ten most frequently predicted breeds
df2.breed_pred.value_counts()[:10]

golden_retriever      152
Labrador_retriever     99
Pembroke               90
Chihuahua              79
pug                    56
Samoyed                44
chow                   41
toy_poodle             39
Pomeranian             37
malamute               30
Name: breed_pred, dtype: int64

In [20]:
# list the archive columns again to choose the ones kept in the master dataframe
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 17 columns):
tweet_id                      2356 non-null int64
in_reply_to_status_id         78 non-null float64
in_reply_to_user_id           78 non-null float64
timestamp                     2356 non-null object
source                        2356 non-null object
text                          2356 non-null object
retweeted_status_id           181 non-null float64
retweeted_status_user_id      181 non-null float64
retweeted_status_timestamp    181 non-null object
expanded_urls                 2297 non-null object
rating_numerator              2356 non-null int64
rating_denominator            2356 non-null int64
name                          2356 non-null object
doggo                         2356 non-null object
floofer                       2356 non-null object
pupper                        2356 non-null object
puppo                         2356 non-null object
dtypes: float64(4), int64(3), ob

In [21]:
# Build the master dataframe: the relevant archive columns, left-joined with the
# predicted breed from df2 on 'tweet_id', so archive rows without a prediction are kept.
table1 = df.loc[:, ['tweet_id', 'in_reply_to_status_id', 'retweeted_status_id', 'timestamp',
                    'text', 'expanded_urls', 'rating_numerator', 'rating_denominator', 'name',
                    'doggo', 'floofer', 'pupper', 'puppo']]
table2 = df2.loc[:, ['tweet_id', 'breed_pred']]

df_master = table1.merge(table2, how='left', on='tweet_id')

In [22]:
# test: the first rows should now carry the breed_pred column
df_master.head(3)

Unnamed: 0,tweet_id,in_reply_to_status_id,retweeted_status_id,timestamp,text,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo,breed_pred
0,892420643555336193,,,2017-08-01 16:23:56 +0000,This is Phineas. He's a mystical boy. Only eve...,https://twitter.com/dog_rates/status/892420643...,13,10,Phineas,,,,,
1,892177421306343426,,,2017-08-01 00:17:27 +0000,This is Tilly. She's just checking pup on you....,https://twitter.com/dog_rates/status/892177421...,13,10,Tilly,,,,,Chihuahua
2,891815181378084864,,,2017-07-31 00:18:03 +0000,This is Archie. He is a rare Norwegian Pouncin...,https://twitter.com/dog_rates/status/891815181...,12,10,Archie,,,,,Chihuahua


In [23]:
# df3 holds tweet_id as strings; cast to int64 so the key dtype matches df_master for the merge
df3['tweet_id'] = df3['tweet_id'].astype('int64')
# left-join the 'favorite count' and 'retweet count' columns from df3 on the master df
df_master = pd.merge(df_master, df3, on=['tweet_id'], how='left')
# Store the ids as strings: they are identifiers, not quantities.
# astype(str) is required for that; astype('object') would only change the dtype
# label and leave integer values inside the column.
df_master['tweet_id'] = df_master['tweet_id'].astype(str)

In [24]:
# test: the first rows should now carry favorite_count and retweet_count
df_master.head(3)

Unnamed: 0,tweet_id,in_reply_to_status_id,retweeted_status_id,timestamp,text,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo,breed_pred,favorite_count,retweet_count
0,892420643555336193,,,2017-08-01 16:23:56 +0000,This is Phineas. He's a mystical boy. Only eve...,https://twitter.com/dog_rates/status/892420643...,13,10,Phineas,,,,,,38302.0,8404.0
1,892177421306343426,,,2017-08-01 00:17:27 +0000,This is Tilly. She's just checking pup on you....,https://twitter.com/dog_rates/status/892177421...,13,10,Tilly,,,,,Chihuahua,32827.0,6196.0
2,891815181378084864,,,2017-07-31 00:18:03 +0000,This is Archie. He is a rare Norwegian Pouncin...,https://twitter.com/dog_rates/status/891815181...,12,10,Archie,,,,,Chihuahua,24720.0,4100.0


- The variable for the dog's stage (e.g. doggo, floofer, pupper etc.) is spread in different columns

**Definition:** Create a new categorical column that records whether the dog has been described as doggo, floofer/floof, pupper, or puppo

In [25]:
# Derive a single categorical 'stage' column from the lower-cased tweet text and
# drop the four per-stage columns it replaces.
def _stage_from_text(lowered_text):
    """Return the first stage keyword (in priority order) found in the text, else NaN."""
    # 'floofer' is tested before 'floof' because 'floof' is a substring of it
    for keyword in ('doggo', 'pupper', 'floofer', 'floof', 'puppo'):
        if keyword in lowered_text:
            return keyword
    return np.nan

df_master['stage'] = [_stage_from_text(lowered) for lowered in df_master['text'].str.lower()]
# turn into categorical
df_master['stage'] = df_master['stage'].astype('category')
# the information of the four original columns is superseded by 'stage'
df_master = df_master.drop(columns=['doggo', 'floofer', 'pupper', 'puppo'])

In [26]:
# test: the stage column should report a categorical dtype
df_master['stage'].dtype

CategoricalDtype(categories=['doggo', 'floof', 'floofer', 'pupper', 'puppo'], ordered=False)

In [27]:
# preview the master dataframe after the tidiness steps
df_master.head()

Unnamed: 0,tweet_id,in_reply_to_status_id,retweeted_status_id,timestamp,text,expanded_urls,rating_numerator,rating_denominator,name,breed_pred,favorite_count,retweet_count,stage
0,892420643555336193,,,2017-08-01 16:23:56 +0000,This is Phineas. He's a mystical boy. Only eve...,https://twitter.com/dog_rates/status/892420643...,13,10,Phineas,,38302.0,8404.0,
1,892177421306343426,,,2017-08-01 00:17:27 +0000,This is Tilly. She's just checking pup on you....,https://twitter.com/dog_rates/status/892177421...,13,10,Tilly,Chihuahua,32827.0,6196.0,
2,891815181378084864,,,2017-07-31 00:18:03 +0000,This is Archie. He is a rare Norwegian Pouncin...,https://twitter.com/dog_rates/status/891815181...,12,10,Archie,Chihuahua,24720.0,4100.0,
3,891689557279858688,,,2017-07-30 15:58:51 +0000,This is Darla. She commenced a snooze mid meal...,https://twitter.com/dog_rates/status/891689557...,13,10,Darla,,41634.0,8531.0,
4,891327558926688256,,,2017-07-29 16:00:24 +0000,This is Franklin. He would like you to stop ca...,https://twitter.com/dog_rates/status/891327558...,12,10,Franklin,basset,39797.0,9249.0,


# Cleaning (quality)

- missing dog names in df, recorded as 'None' string instead of NaN
- a number of entries in `name` being 'a', 'an', 'the'

**Definition:** 
- replace 'None' with NaN in `name` column
- replace 'a', 'an', 'the' with NaN in `name` column

In [28]:
# the string 'None' marks a missing name; store it as a real missing value (NaN)
df_master['name'] = df_master['name'].replace('None', np.nan)

In [29]:
# test: no entry of the name column equals the string 'None' any more
assert not (df_master['name'] == 'None').any()

In [30]:
# the articles 'a', 'an' and 'the' are not dog names; replace them with NaN
df_master['name'] = df_master['name'].replace(['a', 'an', 'the'], np.nan)

In [31]:
# test: none of the three replaced articles ('a', 'an', 'the') is left in the name column
assert not df_master['name'].isin(['a', 'an', 'the']).any()

- inconsistency in dog breeds

**Definition:** replace all '_' and '-' with spaces, and convert everything to lower case.

In [32]:
# Normalise the breed labels: lower case, with '_' and '-' replaced by spaces.
# The column is cast to str first, which turns NaN into the text 'nan';
# the final replace() turns that text back into a real NaN.
df_master['breed_pred'] = (
    df_master['breed_pred']
    .astype('str')
    .str.lower()
    .str.replace('_', ' ')
    .str.replace('-', ' ')
    .replace('nan', np.nan)
)

In [33]:
# test: the ten most frequent breed labels after normalisation
df_master['breed_pred'].value_counts()[:10]

golden retriever      152
labrador retriever     99
pembroke               90
chihuahua              79
pug                    56
samoyed                44
chow                   41
toy poodle             39
pomeranian             37
malamute               30
Name: breed_pred, dtype: int64

- missing data in 'expanded_urls' in df

**Definition:** for the missing entries, generate the url using the tweet_id's

In [34]:
# Fill only the missing URLs with the status link built from the tweet id;
# URLs that are already present in the archive are kept unchanged.
generated_urls = 'https://twitter.com/dog_rates/status/' + df_master['tweet_id'].astype(str)
df_master['expanded_urls'] = df_master['expanded_urls'].fillna(generated_urls)

In [35]:
# test: every row has a non-null expanded_urls entry
assert not df_master['expanded_urls'].isnull().any()

- a high number of entries in df are retweets or replies

**Definition:** Identify retweet and reply rows by their non-null `retweeted_status_id` and `in_reply_to_status_id` values. Delete the rows which are retweets or replies.

In [36]:
# a reply or retweet carries a non-null status id in the corresponding column
is_reply = df_master['in_reply_to_status_id'].notnull()
is_retweet = df_master['retweeted_status_id'].notnull()
# drop those rows
df_master = df_master.drop(index=df_master.index[is_reply | is_retweet])
# the two status id columns are empty for all remaining rows, so drop them as well
df_master = df_master.drop(columns=['in_reply_to_status_id', 'retweeted_status_id'])

In [37]:
# test: remaining number of rows and columns
df_master.shape

(2097, 11)

- the rating denominators are inconsistent
- some rating numerators are relatively high

**Definition:** I figured that some of the dog ratings are exaggerated, e.g. 420/10, and would distort the whole analysis, since ratings are usually between 10/10 and 15/10. Those rows will be dropped and excluded from the analysis.

Some of the ratings also have denominators other than the usual */10. I figured that this can be the case for a group of dogs (e.g. a group of 5 dogs with an 11/10 rating each would result in a 55/50 rating). If that is the case, the rating will be standardized to a denominator of 10.

In [38]:
# standardizing to a denominator of 10 for groups of dogs
def _standardize_rating(num, denom):
    """Return (numerator, denominator), rescaled to a denominator of 10 where applicable.

    A rating is rescaled when its denominator is not 10 and numerator/denominator is
    at most 2. All other ratings are returned unchanged. A denominator of 0 (the
    archive contains a '960/00' rating) cannot be rescaled and is returned unchanged
    instead of being divided by.
    """
    if denom not in (0, 10) and num / denom <= 2:
        return int(round(num / (denom / 10))), 10
    return num, denom

# evaluate the rule once per row and split the result into the two new columns
standardized = [_standardize_rating(num, denom)
                for num, denom in zip(df_master['rating_numerator'], df_master['rating_denominator'])]
df_master['rating_num'] = [rating[0] for rating in standardized]
df_master['rating_denom'] = [rating[1] for rating in standardized]
# drop other columns
df_master = df_master.drop(['rating_numerator', 'rating_denominator'], axis=1)

In [39]:
# test denominators: distinct values left after standardizing
df_master['rating_denom'].unique()

array([10,  7], dtype=int64)

In [40]:
# test numerators: distinct values left after standardizing
df_master['rating_num'].unique()

array([  13,   12,   14,    5,   11,    6,   10,    0,   24,   75,   27,
          3,    7,    8,    9,    4, 1776,    2,   26,    1,  420],
      dtype=int64)

Most of the high numerators have been standardized in the process, but there are still some outliers. Proceeding with removing the rows which couldn't be standardized and the rows which contain unusually high ratings (chosen as above 20/10).

In [41]:
# remove ratings that could not be standardized (denominator is not 10)
# and ratings with a numerator above 20
not_standardized = df_master['rating_denom'] != 10
too_high = df_master['rating_num'] > 20
df_master = df_master.drop(index=df_master.index[not_standardized | too_high])

In [42]:
# test: distinct denominators and numerators left after removing the outliers
df_master['rating_denom'].unique(), df_master['rating_num'].unique()

(array([10], dtype=int64),
 array([13, 12, 14,  5, 11,  6, 10,  0,  3,  7,  8,  9,  4,  2,  1],
       dtype=int64))

- timestamp column should be of datetime type

**Definition:** Change the `timestamp` column to datetime format with the to_datetime function

In [43]:
# The raw timestamps carry a UTC offset (e.g. '2017-08-01 16:23:56 +0000'), which a
# format string of '%Y-%m-%d %H:%M:%S' does not describe. Parse them as UTC instead
# and then drop the timezone again, so the column holds naive datetimes in UTC.
df_master['timestamp'] = pd.to_datetime(df_master['timestamp'], utc=True).dt.tz_localize(None)

In [44]:
# test: summary of the converted timestamp column
df_master['timestamp'].describe()

count                    2091
unique                   2091
top       2016-09-12 15:10:21
freq                        1
first     2015-11-15 22:32:08
last      2017-08-01 16:23:56
Name: timestamp, dtype: object

In [45]:
# final visual check: five randomly chosen rows of the cleaned master dataframe
df_master.sample(5)

Unnamed: 0,tweet_id,timestamp,text,expanded_urls,name,breed_pred,favorite_count,retweet_count,stage,rating_num,rating_denom
1258,710283270106132480,2016-03-17 01:55:02,This is Gunner. He's a Figamus Newton. King of...,https://twitter.com/dog_rates/status/710283270...,Gunner,shih tzu,2244.0,552.0,,11,10
238,847116187444137987,2017-03-29 16:00:12,Unbelievable... We. Only. Rate. Dogs. Please s...,https://twitter.com/dog_rates/status/847116187...,,,22348.0,3400.0,,11,10
454,818536468981415936,2017-01-09 19:14:36,This is Tom. He's a silly dog. Known for his u...,https://twitter.com/dog_rates/status/818536468...,Tom,,11696.0,2730.0,,11,10
758,778408200802557953,2016-09-21 01:39:11,RIP Loki. Thank you for the good times. You wi...,https://twitter.com/dog_rates/status/778408200...,,pembroke,14653.0,4791.0,,14,10
2245,667885044254572545,2015-11-21 01:59:37,Meet Stu. Stu has stacks on stacks and an eye ...,https://twitter.com/dog_rates/status/667885044...,Stu,,837.0,504.0,,10,10


In [46]:
# save the master dataframe to a csv file (index=False: the row index is not written)
df_master.to_csv('twitter_archive_master.csv', index=False)