<a href="https://colab.research.google.com/github/bundickm/trump_twitter_archive/blob/master/trump_tweets_cleaning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Source: [TrumpTwitterArchive](http://www.trumptwitterarchive.com/archive)

In [0]:
#basic importing and aliasing
import numpy as np
import pandas as pd

#prevent dataframe truncation by adjusting pandas display options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [0]:
#uploading a file
from google.colab import files
upload = files.upload()

Saving trump_tweets.csv to trump_tweets (1).csv


In [0]:
tweets = pd.read_csv('trump_tweets.csv')
tweets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34414 entries, 0 to 34413
Data columns (total 7 columns):
source            34414 non-null object
text              34413 non-null object
created_at        34410 non-null object
retweet_count     34410 non-null float64
favorite_count    34410 non-null object
is_retweet        34352 non-null object
id_str            34407 non-null float64
dtypes: float64(2), object(5)
memory usage: 1.8+ MB


In [0]:
source_map = {'Twitter for Android':'Android',
              'Twitter Web Client':'Web Client',
              'Twitter for iPhone':'iPhone',
              'TweetDeck':'TweetDeck',
              'TwitLonger Beta':'TwitLonger',
              'Media Studio':'Media Studio',
              'Instagram':'Instagram',
              'Facebook':'Facebook',
              'Twitter Ads':'Twitter Ads',
              'Twitter for BlackBerry':'Blackberry',
              'Twitter for iPad':'iPad',
              'Mobile Web (M5)':'iOs, BB, or Android',
              'Twitter Media Studio':'Twitter Media Studio',
              'Twitlonger':'TwitLonger',
              'Twitter QandA':'Twitter QandA',
              'Vine - Make a Scene':'Vine',
              'Periscope':'Periscope',
              'Neatly For BlackBerry 10':'Blackberry',
              'Twitter Mirror for iPad':'iPad',
              'Twitter for Websites':'Web Client'}
tweets['source'] = tweets['source'].map(source_map)
tweets['source'].value_counts()

Android                 12898
Web Client              11298
iPhone                   8656
TweetDeck                 476
TwitLonger                355
Media Studio              154
Instagram                 127
Facebook                   98
Twitter Ads                97
Blackberry                 92
iPad                       58
iOs, BB, or Android        55
Twitter Media Studio       21
Twitter QandA              10
Vine                        8
Periscope                   7
Name: source, dtype: int64

In [0]:
#look at all tweets that may have an incorrect datetime format by 
#pulling anything before 2010(does not contain '201').
tweets['created_at'] = tweets['created_at'].astype(str)
date_time_index = list(np.where(tweets['created_at'].str.contains('201') == False))
date_time_index
for index in date_time_index:
  problem_tweets = tweets.loc[index]
  
problem_tweets.head(64)

Unnamed: 0,source,text,created_at,retweet_count,favorite_count,is_retweet,id_str
33891,TweetDeck,My interview which recently aired on CNBC's Sq...,,,,,
33892,,09-19-2011 17:54:28,18,7.0,false,115846232714391552,
34015,TweetDeck,http://goo.gl/AMNEE Countdown to @AmericaNowRa...,,,,,
34016,,08-09-2011 19:33:31,20,3.0,false,101013255950053376,
34039,TweetDeck,http://bit.ly/pwgGsQ,,,,,
34040,,,,,,,
34041,,08-02-2011 19:56:31,13,12.0,false,98482327390396416,
34358,Web Client,From Donald Trump: Wishing everyone a wonderfu...,12-23-2009 17:38:18,28.0,12,false,6971080000.0
34359,Web Client,Trump International Tower in Chicago ranked 6t...,12-03-2009 19:39:09,33.0,6,false,6312794000.0
34360,Web Client,Wishing you and yours a very Happy and Bountif...,11-26-2009 19:55:38,13.0,11,false,6090840000.0


In [0]:
#drop the problem tweets identified and convert to datetime
tweets = tweets.drop([33891,33892,34015,34016,34039,34040,34041])
tweets['created_at'] = pd.to_datetime(tweets['created_at'])
tweets.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 34407 entries, 0 to 34413
Data columns (total 7 columns):
source            34407 non-null object
text              34407 non-null object
created_at        34407 non-null datetime64[ns]
retweet_count     34407 non-null float64
favorite_count    34407 non-null object
is_retweet        34349 non-null object
id_str            34407 non-null float64
dtypes: datetime64[ns](1), float64(2), object(4)
memory usage: 2.1+ MB


In [0]:
#Assign nulls in 'is_retweet' to False as all read as though...
#they originate from Trump
retweet_nulls = tweets['is_retweet'].index[tweets['is_retweet'].isnull()]
tweets['is_retweet'].loc[retweet_nulls] = False
tweets.loc[retweet_nulls]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


Unnamed: 0,source,text,created_at,retweet_count,favorite_count,is_retweet,id_str
4383,iPhone,Thanks to all of the Republican and Democratic...,2018-01-09 22:51:50,11718.0,58068,False,9.508627e+17
4384,iPhone,It was my great honor to sign H.R. 267 the “Ma...,2018-01-09 20:17:17,17249.0,73554,False,9.508238e+17
4385,iPhone,On behalf of the American people THANK YOU to ...,2018-01-09 17:11:23,26793.0,112613,False,9.50777e+17
4386,iPhone,We are fighting for our farmers for our countr...,2018-01-08 23:19:47,22044.0,104968,False,9.505074e+17
4387,iPhone,In every decision we make we are honoring Amer...,2018-01-08 22:59:37,17769.0,81027,False,9.505023e+17
4388,iPhone,We have been working every day to DELIVER for ...,2018-01-08 22:58:13,11561.0,57809,False,9.505019e+17
4389,iPhone,Can’t wait to be back in the amazing state of ...,2018-01-08 19:06:40,10242.0,54355,False,9.504437e+17
4390,iPhone,African American unemployment is the lowest ev...,2018-01-08 14:20:25,34648.0,132122,False,9.503716e+17
4391,iPhone,...Clinton in the WH doubling down on Barack O...,2018-01-08 03:24:23,18904.0,83265,False,9.502065e+17
4392,iPhone,“His is turning out to be an enormously conseq...,2018-01-08 03:23:39,16880.0,82631,False,9.502063e+17


In [0]:
tweets.isnull().sum()

source            0
text              0
created_at        0
retweet_count     0
favorite_count    0
is_retweet        0
id_str            0
dtype: int64

In [0]:
tweets.to_csv('trump_tweets_as_of_03-22-2019.csv')
files.download('trump_tweets_as_of_03-22-2019.csv')