In [1]:
import pandas as pd
import numpy as np
import datetime as dt

# --- Configuration and ingest ------------------------------------------------
# Both file names are relative to the 'data' folder.
input_trump = 'trumparchive_may_rest.csv'  # new export to ingest
output_trump = 'trump_clean.csv'  # main dataset that new tweets get added to (may not exist yet)

# Load the CSV export from the trump twitter archive into a DataFrame.
df = pd.read_csv('data//' + input_trump, encoding='utf-8')

# Remember how many rows were ingested; the retweet-removal step later
# subtracts the remaining row count from this number.
num_tweets_read = len(df)

# Log line. The timestamp is the local wall-clock time; the "EST" suffix is a
# fixed label in the format string, it is not derived from the clock's timezone.
log_stamp = dt.datetime.today().strftime('%b-%d-%Y %H:%M:%S EST')
print('{} - {} tweets read from file "{}".'.format(log_stamp, num_tweets_read, input_trump))

df.head()

Jun-01-2020 18:40:15 EST - 129 tweets read from file "trumparchive_may_rest.csv".


Unnamed: 0,source,text,created_at,retweet_count,favorite_count,is_retweet,id_str
0,Twitter for iPhone,RT @realDonaldTrump: The Lamestream Media is d...,06-01-2020 00:10:27,70675,0,True,1267247098648559620
1,Twitter for iPhone,FAKE NEWS!,06-01-2020 00:09:21,56186,350187,False,1267246821597941764
2,Twitter for iPhone,RT @realDonaldTrump: The United States of Amer...,05-31-2020 22:52:42,214947,0,True,1267227532455219200
3,Twitter for iPhone,LAW &amp; ORDER!,05-31-2020 22:52:09,65809,344069,False,1267227396341669889
4,Twitter for iPhone,RT @realDonaldTrump: Much more “disinformation...,05-31-2020 22:09:57,39261,0,True,1267216774304497669


In [2]:
# Rename the archive's columns to fit our schema, then make sure every retweet
# is flagged in 'isRT'.
df = df.rename(columns={'source': 'Source', 'text': 'Tweets', 'created_at': 'Date',
                        'retweet_count': 'RTs', 'is_retweet': 'isRT',
                        'favorite_count': 'Favourites'})

# Flags exactly as delivered in the input file. .eq(True) yields a plain
# boolean Series and counts a missing value as "not a retweet", matching the
# `== True` comparison used for the original count.
flagged_in_source = df['isRT'].eq(True)
num_RTs_orig = flagged_in_source.sum()

# Text-based detection: the tweet text starts with 'RT'. Casting to str first
# means a missing or non-string text evaluates to False instead of raising.
starts_with_rt = df['Tweets'].astype(str).str.startswith('RT')

# Union of both signals: a flag already set in the source is never cleared, so
# the "additional" count below can never be negative.
df['isRT'] = flagged_in_source | starts_with_rt
num_RTs_added = df['isRT'].sum() - num_RTs_orig

print(dt.datetime.today().strftime('%b-%d-%Y %H:%M:%S EST') + ' - ' + str(num_RTs_added) + ' additional RTs flagged.')

Jun-01-2020 18:40:15 EST - 0 additional RTs flagged.


In [3]:
import os

# --- Merge the new (non-retweet) tweets into the main dataset, save it, and
# --- move the processed input file out of the way.

data_dir = 'data'
input_path = os.path.join(data_dir, input_trump)
output_path = os.path.join(data_dir, output_trump)  # single source of truth for both read and write


def _log(message):
    """Print `message` prefixed with the notebook's timestamp format."""
    print(dt.datetime.today().strftime("%b-%d-%Y %H:%M:%S EST") + " - " + message)


df['Date'] = pd.to_datetime(df['Date'])  # format date properly
df['Month'] = df['Date'].dt.strftime('%B')  # full month name, e.g. 'May'

df = df.loc[df['isRT'] == False].copy()  # remove retweets
num_RTs = num_tweets_read - df.shape[0]  # (number of tweets read) - (number remaining after removing RTs)
_log(str(num_RTs) + ' RTs removed.')

if os.path.exists(output_path):  # there is existing data: append to it
    main_data = pd.read_csv(output_path, encoding='utf-8')
    num_rows_main = main_data.shape[0]  # rows in the main dataset before adding to it
    # ignore_index=True gives the combined frame a fresh 0..n-1 index
    # (the previous reset_index call discarded its result and did nothing).
    updated_df = pd.concat([main_data, df], ignore_index=True)
else:
    num_rows_main = 0
    _log("Adding data for the first time to data/" + output_trump)
    updated_df = df

# Drop duplicates in case old tweets were downloaded mistakenly. keep='first'
# retains the row already present in the main dataset. The non-inplace form
# returns a new frame, so `df` is never mutated through the alias created in
# the first-run branch above.
temp_num_rows = updated_df.shape[0]
updated_df = updated_df.drop_duplicates(subset=['id_str'], keep='first')
num_rows_updated = updated_df.shape[0]  # rows in the updated main dataset
num_rows_dropped = temp_num_rows - num_rows_updated
_log(str(num_rows_dropped) + " rows dropped.")
num_new_rows = num_rows_updated - num_rows_main  # rows actually added to the main dataset

# Write to the same path that was checked/read above, then report the save.
updated_df.to_csv(output_path, encoding='utf-8', index=False)
_log(str(num_new_rows) + " new rows saved to " + output_trump)

if os.path.exists(input_path):
    # Move the input data file since we don't need it anymore. Create the
    # archive folder on demand, and use os.replace so an earlier copy with the
    # same name is overwritten on every platform instead of raising on Windows.
    archive_dir = os.path.join(data_dir, 'old')
    os.makedirs(archive_dir, exist_ok=True)
    os.replace(input_path, os.path.join(archive_dir, input_trump))

df.head()

Jun-01-2020 18:40:15 EST - 57 RTs removed.
Jun-01-2020 18:40:15 EST - 35 rows dropped.
Jun-01-2020 18:40:15 EST - 37 new rows saved to trump_clean.csv


Unnamed: 0,Source,Tweets,Date,RTs,Favourites,isRT,id_str,Month
1,Twitter for iPhone,FAKE NEWS!,2020-06-01 00:09:21,56186,350187,False,1267246821597941764,June
3,Twitter for iPhone,LAW &amp; ORDER!,2020-05-31 22:52:09,65809,344069,False,1267227396341669889,May
5,Twitter for iPhone,Our National Guard stopped them cold last nigh...,2020-05-31 22:08:51,26572,111826,False,1267216497686056960,May
9,Twitter for iPhone,SO TERRIBLE! Where are the arrests and LONG TE...,2020-05-31 20:54:41,25468,93821,False,1267197835067502592,May
10,Twitter for iPhone,STRENGTH! https://t.co/xHE5AdK9Ka,2020-05-31 20:20:39,27148,85417,False,1267189270353186821,May
