## Wrangle and Analyse Data

In [390]:
#import packages
import pandas as pd 
import numpy as np 
import seaborn as sns 
import matplotlib.pyplot as plt 
import tweepy as tw 
import requests 
import json
import re
import os 

#API key variables stored in separate file 
%run twitter_keys

## Gathering

1. Supplied twitter_enhanced dataset must be loaded from file 
2. Image_prediction dataset must be scraped from Udacity website 
3. Archive tweet data must be queried through twitter API 
   - As we'll be using the tweet_id from the supplied twitter_enhanced dataset, the tweet_id column must first be cleaned before we gather more data, removing any duplicates, retweets and replies to other tweets. This step reduces the collection time associated with the API and avoids duplicates in the third dataset.

In [391]:
# Load the supplied twitter archive (CSV format) into a dataframe
twitter_enhanced = pd.read_csv('twitter-archive-enhanced.csv')

In [392]:
# head() is not the last expression in this cell, so it has to be displayed
# explicitly; otherwise its result is silently discarded.
display(twitter_enhanced.head(4))
# column dtypes and non-null counts
twitter_enhanced.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 17 columns):
tweet_id                      2356 non-null int64
in_reply_to_status_id         78 non-null float64
in_reply_to_user_id           78 non-null float64
timestamp                     2356 non-null object
source                        2356 non-null object
text                          2356 non-null object
retweeted_status_id           181 non-null float64
retweeted_status_user_id      181 non-null float64
retweeted_status_timestamp    181 non-null object
expanded_urls                 2297 non-null object
rating_numerator              2356 non-null int64
rating_denominator            2356 non-null int64
name                          2356 non-null object
doggo                         2356 non-null object
floofer                       2356 non-null object
pupper                        2356 non-null object
puppo                         2356 non-null object
dtypes: float64(4), int64(3), ob

In [393]:
# count the rows whose tweet_id has already appeared earlier in the frame
twitter_enhanced.tweet_id.duplicated().sum()

0

In [394]:
# retweets are the rows that carry a retweeted_status_id
retweets = twitter_enhanced.loc[twitter_enhanced.retweeted_status_id.notnull()]
retweets.head()

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
19,888202515573088257,,,2017-07-21 01:02:36 +0000,"<a href=""http://twitter.com/download/iphone"" r...",RT @dog_rates: This is Canela. She attempted s...,8.87474e+17,4196984000.0,2017-07-19 00:47:34 +0000,https://twitter.com/dog_rates/status/887473957...,13,10,Canela,,,,
32,886054160059072513,,,2017-07-15 02:45:48 +0000,"<a href=""http://twitter.com/download/iphone"" r...",RT @Athletics: 12/10 #BATP https://t.co/WxwJmv...,8.860537e+17,19607400.0,2017-07-15 02:44:07 +0000,https://twitter.com/dog_rates/status/886053434...,12,10,,,,,
36,885311592912609280,,,2017-07-13 01:35:06 +0000,"<a href=""http://twitter.com/download/iphone"" r...",RT @dog_rates: This is Lilly. She just paralle...,8.305833e+17,4196984000.0,2017-02-12 01:04:29 +0000,https://twitter.com/dog_rates/status/830583320...,13,10,Lilly,,,,
68,879130579576475649,,,2017-06-26 00:13:58 +0000,"<a href=""http://twitter.com/download/iphone"" r...",RT @dog_rates: This is Emmy. She was adopted t...,8.780576e+17,4196984000.0,2017-06-23 01:10:23 +0000,https://twitter.com/dog_rates/status/878057613...,14,10,Emmy,,,,
73,878404777348136964,,,2017-06-24 00:09:53 +0000,"<a href=""http://twitter.com/download/iphone"" r...",RT @dog_rates: Meet Shadow. In an attempt to r...,8.782815e+17,4196984000.0,2017-06-23 16:00:04 +0000,"https://www.gofundme.com/3yd6y1c,https://twitt...",13,10,Shadow,,,,


In [395]:
# keep only rows without a retweeted_status_id (i.e. drop the retweets)
is_original = twitter_enhanced['retweeted_status_id'].isnull()
twitter_enhanced = twitter_enhanced.loc[is_original]
twitter_enhanced.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2175 entries, 0 to 2355
Data columns (total 17 columns):
tweet_id                      2175 non-null int64
in_reply_to_status_id         78 non-null float64
in_reply_to_user_id           78 non-null float64
timestamp                     2175 non-null object
source                        2175 non-null object
text                          2175 non-null object
retweeted_status_id           0 non-null float64
retweeted_status_user_id      0 non-null float64
retweeted_status_timestamp    0 non-null object
expanded_urls                 2117 non-null object
rating_numerator              2175 non-null int64
rating_denominator            2175 non-null int64
name                          2175 non-null object
doggo                         2175 non-null object
floofer                       2175 non-null object
pupper                        2175 non-null object
puppo                         2175 non-null object
dtypes: float64(4), int64(3), object(1

In [396]:
# keep only rows without an in_reply_to_status_id (i.e. drop the replies)
is_not_reply = twitter_enhanced['in_reply_to_status_id'].isnull()
twitter_enhanced = twitter_enhanced.loc[is_not_reply]
twitter_enhanced.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2097 entries, 0 to 2355
Data columns (total 17 columns):
tweet_id                      2097 non-null int64
in_reply_to_status_id         0 non-null float64
in_reply_to_user_id           0 non-null float64
timestamp                     2097 non-null object
source                        2097 non-null object
text                          2097 non-null object
retweeted_status_id           0 non-null float64
retweeted_status_user_id      0 non-null float64
retweeted_status_timestamp    0 non-null object
expanded_urls                 2094 non-null object
rating_numerator              2097 non-null int64
rating_denominator            2097 non-null int64
name                          2097 non-null object
doggo                         2097 non-null object
floofer                       2097 non-null object
pupper                        2097 non-null object
puppo                         2097 non-null object
dtypes: float64(4), int64(3), object(10)

In [397]:
# Instantiate the tweepy client.
# The four credential variables are not defined in this notebook; they are
# declared in a private python file that the import cell loads via `%run twitter_keys`.
auth = tw.OAuthHandler(api_key, api_secret)
auth.set_access_token(access_token, access_secret)
# JSONParser: the query loop below indexes each response like a dictionary (tweet['created_at'])
api = tw.API(auth, parser=tw.parsers.JSONParser())

In [154]:
#query API for tweet data 
#store only the data needed
tweet_data = []   # one record (dict) per tweet that was retrieved
exceptions = []   # tweet_ids whose lookup raised (e.g. 'No status found with that ID.')
for tweet_id in twitter_enhanced['tweet_id']:
    try:
        tweet = api.get_status(tweet_id, tweet_mode='extended',
                               wait_on_rate_limit = True,
                               wait_on_rate_limit_notify = True)
        # Read the fields straight from the response instead of binding them to
        # cell-level names: a loop variable called `retweets` would overwrite the
        # `retweets` dataframe created earlier in the notebook.
        tweet_data.append({'tweet_id':int(tweet_id),
                           'creation_date':pd.to_datetime(tweet['created_at']),
                           'favourites':int(tweet['favorite_count']),
                           'retweets':int(tweet['retweet_count'])})
    except Exception as e:
        # best-effort collection: report the failure, remember the id and carry on
        print(str(e) + str(tweet_id))
        exceptions.append(tweet_id)

[{'code': 144, 'message': 'No status found with that ID.'}]872261713294495745
[{'code': 144, 'message': 'No status found with that ID.'}]844704788403113984
[{'code': 144, 'message': 'No status found with that ID.'}]837366284874571778
[{'code': 144, 'message': 'No status found with that ID.'}]829374341691346946


Rate limit reached. Sleeping for: 155


[{'code': 144, 'message': 'No status found with that ID.'}]779123168116150273
[{'code': 144, 'message': 'No status found with that ID.'}]754011816964026368


Rate limit reached. Sleeping for: 297


[{'code': 144, 'message': 'No status found with that ID.'}]680055455951884288


In [398]:
# Create a dataframe from the collected tweet records and store it as a JSON file
# NOTE(review): tweet_data is built by the API query cell (In [154]); that cell
# has to have been run in the current kernel before this one.
tweets = pd.DataFrame(tweet_data)
tweets.to_json('tweet_json.txt', orient='columns')

In [399]:
# The JSON file holds creation_date as epoch milliseconds, so convert it back
# to datetimes after loading and renumber the rows from 0.
tweets = pd.read_json('tweet_json.txt')
tweets = (
    tweets
    .assign(creation_date=pd.to_datetime(tweets['creation_date'], unit='ms'))
    .reset_index(drop=True)
)
tweets.head()

Unnamed: 0,creation_date,favourites,retweets,tweet_id
0,2017-08-01 16:23:56,36398,7741,892420643555336193
1,2017-08-01 00:17:27,31348,5735,892177421306343426
2,2017-07-26 00:31:25,28900,6729,890006608113172480
3,2017-05-31 23:43:25,34798,7753,870063196459192321
4,2016-03-28 00:43:43,3280,829,714251586676113411


In [400]:
# count the rows whose tweet_id has already appeared earlier in the frame
tweets.tweet_id.duplicated().sum()

0

In [425]:
#scrape image_predictions.tsv file from Udacity site 
url = "https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv"
response = requests.get(url)
# fail loudly on an HTTP error instead of saving an error page as the .tsv
response.raise_for_status()

#write the raw response bytes to file 
with open('image_predictions.tsv', mode = 'wb') as file:
    file.write(response.content)

In [426]:
# Load the downloaded image predictions; the file is tab-separated
image_predictions = pd.read_csv('image_predictions.tsv', sep='\t')

In [427]:
# Preview the first row to see which prediction columns are available
image_predictions.head(1)

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
0,666020888022790149,https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg,1,Welsh_springer_spaniel,0.465074,True,collie,0.156665,True,Shetland_sheepdog,0.061428,True


In [428]:
# Summary statistics for the numeric columns (retweets and replies were removed above)
twitter_enhanced.describe()

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,retweeted_status_id,retweeted_status_user_id,rating_numerator,rating_denominator
count,2097.0,0.0,0.0,0.0,0.0,2097.0,2097.0
mean,7.365594e+17,,,,,12.189318,10.448736
std,6.710178e+16,,,,,40.364996,6.645061
min,6.660209e+17,,,,,0.0,2.0
25%,6.768197e+17,,,,,10.0,10.0
50%,7.098528e+17,,,,,11.0,10.0
75%,7.877176e+17,,,,,12.0,10.0
max,8.924206e+17,,,,,1776.0,170.0


In [429]:
# frequency of each value in the name column
twitter_enhanced['name'].value_counts()

None         603
a             55
Lucy          11
Charlie       11
Cooper        10
Oliver        10
Penny          9
Tucker         9
Lola           8
Sadie          8
the            8
Winston        8
Toby           7
Daisy          7
Oscar          6
Bo             6
Bella          6
Jax            6
Bailey         6
Stanley        6
an             6
Koda           6
Leo            5
Rusty          5
Bentley        5
Chester        5
Dave           5
Milo           5
Buddy          5
Scout          5
            ... 
Eevee          1
Bobble         1
Willy          1
Kellogg        1
Alexander      1
Sundance       1
Storkson       1
Obi            1
Timison        1
Murphy         1
Yoda           1
Sailor         1
such           1
Kevon          1
Zeus           1
JD             1
Gromit         1
Kingsley       1
Devón          1
Ginger         1
Tommy          1
Benny          1
Shelby         1
Socks          1
Brockly        1
Bruno          1
Bauer          1
Mairi         

In [430]:
# Column dtypes and non-null counts for the API dataframe
tweets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2090 entries, 0 to 2089
Data columns (total 4 columns):
creation_date    2090 non-null datetime64[ns]
favourites       2090 non-null int64
retweets         2090 non-null int64
tweet_id         2090 non-null int64
dtypes: datetime64[ns](1), int64(3)
memory usage: 65.4 KB


In [431]:
# Summary statistics for the numeric columns of the API dataframe
tweets.describe()

Unnamed: 0,favourites,retweets,tweet_id
count,2090.0,2090.0,2090.0
mean,8393.688517,2536.132536,7.363484e+17
std,12194.900435,4441.547449,6.702051e+16
min,72.0,11.0,6.660209e+17
25%,1865.0,560.5,6.767853e+17
50%,3837.5,1230.5,7.095381e+17
75%,10473.0,2882.0,7.872326e+17
max,157142.0,78286.0,8.924206e+17


In [432]:
# how often each image URL occurs; counts above 1 indicate repeated images
image_predictions['jpg_url'].value_counts()

https://pbs.twimg.com/ext_tw_video_thumb/815965888126062592/pu/img/JleSw4wRhgKDWQj5.jpg    2
https://pbs.twimg.com/media/Cq9guJ5WgAADfpF.jpg                                            2
https://pbs.twimg.com/media/CiibOMzUYAA9Mxz.jpg                                            2
https://pbs.twimg.com/media/Ct72q9jWcAAhlnw.jpg                                            2
https://pbs.twimg.com/media/CsVO7ljW8AAckRD.jpg                                            2
https://pbs.twimg.com/media/CVgdFjNWEAAxmbq.jpg                                            2
https://pbs.twimg.com/media/CpmyNumW8AAAJGj.jpg                                            2
https://pbs.twimg.com/tweet_video_thumb/CeBym7oXEAEWbEg.jpg                                2
https://pbs.twimg.com/media/C4bTH6nWMAAX_bJ.jpg                                            2
https://pbs.twimg.com/media/Cp6db4-XYAAMmqL.jpg                                            2
https://pbs.twimg.com/media/Crwxb5yWgAAX5P_.jpg                       

# Assessment
### Quality 

**twitter_enhanced Dataframe**

  1. There are six columns missing values 
  2. The name column often contains None, "a" and "the" instead of real names.
  3. If a rating contains a decimal point, the whole-number part appears to be cut off in rating_numerator
  4. The 181 non-null retweet fields constitute non-original tweets in the dataset 
  5. Rows with non-null reply columns are replies rather than original tweets
  6. timestamp fields are type object not datetime 
  7. source text is wrapped in html tags
  
**twitter API dataframe**

  1. values are missing from several rows in the dataframe
  2. There are fewer rows overall than in the twitter_enhanced dataframe, presumably because some tweets could not be retrieved from twitter's archive
  3. tweet_id is an integer instead of a string
  
**image_prediction dataframe**
  1. There appear to be duplicate URLs for a number of images
  
### Tidiness 
**twitter_enhanced Dataframe**
- retweet columns can be dropped as they are non-essential
- 181 rows of retweets are non-required
- reply columns are not required
- dog types are categorical and should be reduced to one column 

**twitter API dataframe**
- there are fewer rows in this table than the twitter enhanced table.
- only retweet count and favourite count are needed from this table, the rest can be dropped 
- dataframe needs to be merged with twitter_enhanced data

**image predictions dataframe**
- fewer rows than in twitter_enhanced dataframe
- image_prediction dataframe needs to be merged with twitter_enhanced data

# Cleaning

### Quality issue 1/2 - removing rows that aren't original tweets

We removed the retweets and replies from the twitter_enhanced dataframe earlier. Those columns can now be dropped.

In [433]:
# Confirm the retweet and reply columns hold no values before they are dropped
twitter_enhanced.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2097 entries, 0 to 2355
Data columns (total 17 columns):
tweet_id                      2097 non-null int64
in_reply_to_status_id         0 non-null float64
in_reply_to_user_id           0 non-null float64
timestamp                     2097 non-null object
source                        2097 non-null object
text                          2097 non-null object
retweeted_status_id           0 non-null float64
retweeted_status_user_id      0 non-null float64
retweeted_status_timestamp    0 non-null object
expanded_urls                 2094 non-null object
rating_numerator              2097 non-null int64
rating_denominator            2097 non-null int64
name                          2097 non-null object
doggo                         2097 non-null object
floofer                       2097 non-null object
pupper                        2097 non-null object
puppo                         2097 non-null object
dtypes: float64(4), int64(3), object(10)

### Tidiness Issue 1 - dropping unrequired columns from twitter enhanced 

In [434]:
# Build the cleaning copy in one chain: renumber the rows from 0, then drop
# the reply / retweet / expanded_urls columns that are no longer required.
columns = ['in_reply_to_status_id','in_reply_to_user_id','retweeted_status_id','retweeted_status_user_id','retweeted_status_timestamp','expanded_urls']
twitter_enhanced_clean = (
    twitter_enhanced
    .reset_index(drop=True)
    .drop(columns, axis=1)
)
twitter_enhanced_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2097 entries, 0 to 2096
Data columns (total 11 columns):
tweet_id              2097 non-null int64
timestamp             2097 non-null object
source                2097 non-null object
text                  2097 non-null object
rating_numerator      2097 non-null int64
rating_denominator    2097 non-null int64
name                  2097 non-null object
doggo                 2097 non-null object
floofer               2097 non-null object
pupper                2097 non-null object
puppo                 2097 non-null object
dtypes: int64(3), object(8)
memory usage: 180.3+ KB


In [435]:
#collapse dog stages into one column 
# The stage is taken from the tweet text: the first occurrence of one of the
# four stage words (case-sensitive) is kept, NaN when none is present.
# expand=False makes str.extract return a Series on every pandas version
# (the default for a single capture group changed between releases).
twitter_enhanced_clean['stage'] = twitter_enhanced_clean['text'].str.extract(
    '(doggo|floofer|pupper|puppo)', expand=False)
# the four one-per-stage columns are now redundant
columns = ['doggo','floofer','pupper','puppo']
twitter_enhanced_clean.drop(columns, axis=1, inplace=True)

  


In [436]:
# Preview the result: the four stage columns are replaced by a single 'stage' column
twitter_enhanced_clean.head()

Unnamed: 0,tweet_id,timestamp,source,text,rating_numerator,rating_denominator,name,stage
0,892420643555336193,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,13,10,Phineas,
1,892177421306343426,2017-08-01 00:17:27 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Tilly. She's just checking pup on you....,13,10,Tilly,
2,891815181378084864,2017-07-31 00:18:03 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Archie. He is a rare Norwegian Pouncin...,12,10,Archie,
3,891689557279858688,2017-07-30 15:58:51 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Darla. She commenced a snooze mid meal...,13,10,Darla,
4,891327558926688256,2017-07-29 16:00:24 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Franklin. He would like you to stop ca...,12,10,Franklin,


### Quality issue 3/4 - fixing values in Name column

In [437]:
# Collect the distinct "names" that start with a lowercase letter, in order of
# first appearance; these are ordinary words rather than real names.
# dict.fromkeys de-duplicates while preserving insertion order.
words = list(dict.fromkeys(
    candidate
    for candidate in twitter_enhanced_clean['name']
    if candidate[0].islower()
))

In [438]:
# Show the lowercase non-name words that were collected
print(words)

['such', 'a', 'quite', 'not', 'one', 'incredibly', 'very', 'my', 'his', 'an', 'actually', 'just', 'getting', 'mad', 'this', 'unacceptable', 'all', 'old', 'infuriating', 'the', 'by', 'officially', 'life', 'light', 'space']


In [439]:
#replace occurrences of the non-name words with NaN
# Assign the result back instead of calling replace(inplace=True) on the column
# accessor: the in-place form acts on an intermediate object and does not
# update the dataframe when pandas Copy-on-Write is enabled.
twitter_enhanced_clean['name'] = twitter_enhanced_clean['name'].replace(words, np.nan)

In [440]:
#replace the 'None' placeholder string with NaN
# Assign the result back instead of calling replace(inplace=True) on the column
# accessor: the in-place form acts on an intermediate object and does not
# update the dataframe when pandas Copy-on-Write is enabled.
twitter_enhanced_clean['name'] = twitter_enhanced_clean['name'].replace('None', np.nan)

In [441]:
# test: 'None' and the lowercase words should no longer be counted as names
twitter_enhanced_clean['name'].value_counts()

Lucy         11
Charlie      11
Oliver       10
Cooper       10
Penny         9
Tucker        9
Winston       8
Lola          8
Sadie         8
Daisy         7
Toby          7
Bo            6
Jax           6
Oscar         6
Bailey        6
Koda          6
Bella         6
Stanley       6
Louis         5
Buddy         5
Chester       5
Leo           5
Rusty         5
Bentley       5
Milo          5
Dave          5
Scout         5
Winnie        4
Brody         4
Oakley        4
             ..
Willy         1
Kellogg       1
Charleson     1
Dale          1
Hamrick       1
BeBe          1
Raphael       1
Fletcher      1
Shelby        1
Kevon         1
Zeus          1
JD            1
Gromit        1
Kingsley      1
Devón         1
Ginger        1
Tommy         1
Benny         1
Socks         1
Ralph         1
Brockly       1
Bruno         1
Bauer         1
Mairi         1
Mason         1
Obi           1
Storkson      1
Sundance      1
Alexander     1
Alejandro     1
Name: name, Length: 929,

In [442]:
# Test: check the non-null counts after the name clean-up
twitter_enhanced_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2097 entries, 0 to 2096
Data columns (total 8 columns):
tweet_id              2097 non-null int64
timestamp             2097 non-null object
source                2097 non-null object
text                  2097 non-null object
rating_numerator      2097 non-null int64
rating_denominator    2097 non-null int64
name                  1390 non-null object
stage                 353 non-null object
dtypes: int64(3), object(5)
memory usage: 131.1+ KB


### Quality issue 5 - collapsing image_prediction columns and removing non-dog types from DataFrame

In [444]:
# Work on a copy, then cut out one four-column frame per prediction rank.
# The p2 and p3 frames are renamed to the p1 column names so the three frames
# line up when they are stacked.
image_predictions_clean = image_predictions.copy()
p1 = image_predictions_clean[['tweet_id','p1', 'p1_conf','p1_dog']].copy()
p2 = image_predictions_clean[['tweet_id','p2', 'p2_conf','p2_dog']].rename(
    columns={"p2": "p1", "p2_conf":"p1_conf", "p2_dog": "p1_dog"})
p3 = image_predictions_clean[['tweet_id','p3', 'p3_conf','p3_dog']].rename(
    columns={"p3": "p1", "p3_conf":"p1_conf", "p3_dog": "p1_dog"})
p3.head()

Unnamed: 0,tweet_id,p1,p1_conf,p1_dog
0,666020888022790149,Shetland_sheepdog,0.061428,True
1,666029285002620928,Rhodesian_ridgeback,0.07201,True
2,666033412701032449,bloodhound,0.116197,True
3,666044226329800704,miniature_pinscher,0.222752,True
4,666049248165822465,Doberman,0.154629,True


In [445]:
#aggregate dataframes
# Stack the three per-rank frames into one long frame (p1 rows, then p2, then p3).
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
p1 = pd.concat([p1, p2, p3], ignore_index=True)
# give the stacked columns rank-neutral names
p1 = p1.rename(columns={'p1':'prediction','p1_conf':'confidence','p1_dog':'is_dog'})
p1.head()

Unnamed: 0,tweet_id,prediction,confidence,is_dog
0,666020888022790149,Welsh_springer_spaniel,0.465074,True
1,666029285002620928,redbone,0.506826,True
2,666033412701032449,German_shepherd,0.596461,True
3,666044226329800704,Rhodesian_ridgeback,0.408143,True
4,666049248165822465,miniature_pinscher,0.560311,True


In [446]:
# Attach the long-format prediction columns to the image data, matching on
# tweet_id; each tweet_id matches one stacked row per prediction rank.
image_predictions_clean = pd.merge(image_predictions_clean, p1, on="tweet_id")

In [447]:
# Inspect the merged frame: the original p1..p3 columns plus prediction / confidence / is_dog
image_predictions_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6225 entries, 0 to 6224
Data columns (total 15 columns):
tweet_id      6225 non-null int64
jpg_url       6225 non-null object
img_num       6225 non-null int64
p1            6225 non-null object
p1_conf       6225 non-null float64
p1_dog        6225 non-null bool
p2            6225 non-null object
p2_conf       6225 non-null float64
p2_dog        6225 non-null bool
p3            6225 non-null object
p3_conf       6225 non-null float64
p3_dog        6225 non-null bool
prediction    6225 non-null object
confidence    6225 non-null float64
is_dog        6225 non-null bool
dtypes: bool(4), float64(4), int64(2), object(5)
memory usage: 607.9+ KB


In [448]:
#drop unrequired columns 
# the per-rank p1..p3 columns (and img_num) are superseded by the long-format
# prediction / confidence / is_dog columns; each label is listed exactly once
columns = ['p1','p1_conf','p1_dog','p2','p2_conf','p2_dog','p3','p3_conf','p3_dog','img_num']
image_predictions_clean.drop(columns, axis=1, inplace=True)
image_predictions_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6225 entries, 0 to 6224
Data columns (total 5 columns):
tweet_id      6225 non-null int64
jpg_url       6225 non-null object
prediction    6225 non-null object
confidence    6225 non-null float64
is_dog        6225 non-null bool
dtypes: bool(1), float64(1), int64(1), object(2)
memory usage: 249.2+ KB


### Quality issue 6 - dropping non-dog rows from image_predictions

In [449]:
#clean: remove every row whose prediction is not a dog
non_dog_rows = image_predictions_clean.index[~image_predictions_clean['is_dog']]
image_predictions_clean.drop(non_dog_rows, inplace=True)
#test: only True should remain
image_predictions_clean['is_dog'].value_counts()

True    4584
Name: is_dog, dtype: int64

### Tidiness issue 2 - Merging Dataframes (twitter_enhanced and tweets)

In [450]:
# Combine the cleaned archive with the API counts; the default inner join
# keeps only the tweet_ids present in both frames.
twitter = pd.merge(twitter_enhanced_clean, tweets, on='tweet_id')
twitter.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2090 entries, 0 to 2089
Data columns (total 11 columns):
tweet_id              2090 non-null int64
timestamp             2090 non-null object
source                2090 non-null object
text                  2090 non-null object
rating_numerator      2090 non-null int64
rating_denominator    2090 non-null int64
name                  1383 non-null object
stage                 351 non-null object
creation_date         2090 non-null datetime64[ns]
favourites            2090 non-null int64
retweets              2090 non-null int64
dtypes: datetime64[ns](1), int64(5), object(5)
memory usage: 195.9+ KB


In [451]:
# creation_date duplicates the timestamp column, so remove it, then test
twitter = twitter.drop(columns=['creation_date'])
twitter.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2090 entries, 0 to 2089
Data columns (total 10 columns):
tweet_id              2090 non-null int64
timestamp             2090 non-null object
source                2090 non-null object
text                  2090 non-null object
rating_numerator      2090 non-null int64
rating_denominator    2090 non-null int64
name                  1383 non-null object
stage                 351 non-null object
favourites            2090 non-null int64
retweets              2090 non-null int64
dtypes: int64(5), object(5)
memory usage: 179.6+ KB


### Quality issue 7 - changing columns to correct datatypes 

In [452]:
# parse the timestamp strings into datetimes
twitter = twitter.assign(timestamp=pd.to_datetime(twitter['timestamp']))

In [453]:
# the dog stage is one of a small set of labels, so store it as a category
twitter = twitter.assign(stage=twitter['stage'].astype('category'))

In [454]:
# tweet_id is an identifier, not a quantity: store it as a string in both frames
for frame in (twitter, image_predictions_clean):
    frame['tweet_id'] = frame['tweet_id'].astype(str)

In [455]:
# Test: confirm the new dtypes
twitter.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2090 entries, 0 to 2089
Data columns (total 10 columns):
tweet_id              2090 non-null object
timestamp             2090 non-null datetime64[ns]
source                2090 non-null object
text                  2090 non-null object
rating_numerator      2090 non-null int64
rating_denominator    2090 non-null int64
name                  1383 non-null object
stage                 351 non-null category
favourites            2090 non-null int64
retweets              2090 non-null int64
dtypes: category(1), datetime64[ns](1), int64(4), object(4)
memory usage: 165.5+ KB


### Quality issue 8 - removing html from source column values

In [456]:
#code
# Keep only the text between the opening and closing anchor tag.
# str.extract yields NaN where the pattern does not match, and fillna restores
# the existing value in that case. Unlike re.findall(...)[0], this does not
# raise IndexError on a value without tags, so the cell can be re-run safely.
anchor_text = twitter['source'].str.extract(r'>(.*)<', expand=False)
twitter['source'] = anchor_text.fillna(twitter['source'])

In [457]:
#test: the values should now be plain client names without html
twitter['source'].value_counts()

Twitter for iPhone     1958
Vine - Make a Scene      91
Twitter Web Client       31
TweetDeck                10
Name: source, dtype: int64

## Storing cleaned datasets

In [458]:
# index=False: the row index of both frames is only a leftover of the merge /
# row-drop steps and would otherwise be written as an extra unnamed column.
twitter.to_csv('twitter_archive_master.csv', index=False)
image_predictions_clean.to_csv('image_predictions_master.csv', index=False)

# References 
https://stackoverflow.com/questions/13851535/delete-rows-from-a-pandas-dataframe-based-on-a-conditional-expression-involving
https://thispointer.com/pandas-apply-apply-a-function-to-each-row-column-in-dataframe/