In [8]:
import pandas as pd
import numpy as np
import datetime as dt

# File names, both resolved relative to the 'data' folder.
input_trump = 'trumparchive_may_orig.csv'  # raw export from the trump twitter archive
output_trump = 'trump_clean.csv'  # main dataset we add to; may already exist

# Load the raw archive export into a DataFrame.
df = pd.read_csv('data//' + input_trump, encoding='utf-8')

# Number of tweets ingested from the input file.
num_tweets_read = len(df)

# Log message: timestamp, number of tweets read and the file they came from.
read_stamp = dt.datetime.today().strftime('%b-%d-%Y %H:%M:%S EST')
print(''.join([read_stamp, ' - ', str(num_tweets_read),
               ' tweets read from file "', input_trump, '".']))
df.head()

Jun-01-2020 22:34:20 EST - 1089 tweets read from file "trumparchive_may_orig.csv".


Unnamed: 0,source,text,created_at,retweet_count,favorite_count,is_retweet,id_str
0,Twitter for iPhone,RT @realDonaldTrump: The National Guard has be...,05-31-2020 02:56:21,62393,0,True,1266926462944321538
1,Twitter for iPhone,RT @Mike_Pence: Today @SecondLady and I had th...,05-31-2020 02:27:41,11267,0,True,1266919246426189830
2,Twitter for iPhone,RT @ericbolling: This will likely be seen as @...,05-31-2020 02:27:36,7928,0,True,1266919225974759427
3,Twitter for iPhone,RT @TeamTrump: President @realDonaldTrump: “Am...,05-31-2020 02:27:36,6793,0,True,1266919225974820864
4,Twitter for iPhone,RT @TeamTrump: President @realDonaldTrump: “In...,05-31-2020 02:27:36,7579,0,True,1266919225983197185


In [9]:
# Rename the archive's columns to fit our schema.
df = df.rename(columns={'source': 'Source', 'text': 'Tweets', 'created_at': 'Date',
                        'retweet_count': 'RTs', 'is_retweet': 'isRT',
                        'favorite_count': 'Favourites'})

# Retweets already flagged in the input data (anything other than True counts as not flagged).
archive_rt_flag = df['isRT'].eq(True)
num_RTs_orig = int(archive_rt_flag.sum())

# Additionally flag tweets whose text starts with 'RT'.
# na=False treats a missing tweet text as "not a retweet" instead of raising.
text_rt_flag = df['Tweets'].str.startswith('RT', na=False)

# Combine both signals so a tweet flagged in the input is never un-flagged by the
# text heuristic; this keeps the "additional" count below non-negative.
df['isRT'] = archive_rt_flag | text_rt_flag
num_RTs_added = int(df['isRT'].sum()) - num_RTs_orig

print(dt.datetime.today().strftime('%b-%d-%Y %H:%M:%S EST') + ' - ' + str(num_RTs_added) + ' additional RTs flagged.')

Jun-01-2020 22:34:22 EST - 283 additional RTs flagged.


In [11]:
import os

def _log(message):
    """Print `message` prefixed with the current local time in this notebook's log format."""
    print(dt.datetime.today().strftime("%b-%d-%Y %H:%M:%S EST") + " - " + message)


df['Date'] = pd.to_datetime(df['Date'])  # format date properly
df['Month'] = df['Date'].dt.strftime('%B')  # create month column for trump tweets

df = df.loc[df['isRT'].eq(False)].copy()  # remove retweets
num_RTs = num_tweets_read - df.shape[0]  # (number of tweets read) - (number remaining after removing RTs)
_log(str(num_RTs) + ' RTs removed.')

# Single source of truth for the main dataset's location: it is both read from and
# written to this path, so the two can no longer drift apart.
output_path = os.path.join('data', output_trump)

if os.path.exists(output_path):  # if there is existing data
    # Parse 'Date' on read so the combined column holds datetimes rather than a
    # mix of strings (old rows) and timestamps (new rows).
    main_data = pd.read_csv(output_path, encoding='utf-8', parse_dates=['Date'])
    num_rows_main = main_data.shape[0]  # number of rows in the main dataset before adding to it
    # ignore_index=True gives the combined frame a fresh 0..n-1 index.
    updated_df = pd.concat([main_data, df], ignore_index=True)  # update data by appending new tweets
else:
    num_rows_main = 0
    _log("Adding data for the first time to data/" + output_trump)
    updated_df = df

temp_num_rows = updated_df.shape[0]  # row count before de-duplication
# Drop duplicates in case old tweets were downloaded mistakenly. Existing rows come
# first in the concat, so keep='first' keeps the already-saved copy.
# Not done in place, so `df` itself is left untouched.
updated_df = updated_df.drop_duplicates(subset=['id_str'], keep='first')
num_rows_updated = updated_df.shape[0]  # number of rows in the updated main dataset
num_rows_dropped = temp_num_rows - num_rows_updated
_log(str(num_rows_dropped) + " rows dropped.")
num_new_rows = num_rows_updated - num_rows_main  # number of rows added to the main dataset

updated_df.to_csv(output_path, encoding='utf-8', index=False)  # save updated data as csv
_log(str(num_new_rows) + " new rows saved to " + output_trump)

# Move the input data file out of the way since we don't need it anymore.
source_path = os.path.join('data', input_trump)
if os.path.exists(source_path):
    archive_dir = os.path.join('data', 'old')
    os.makedirs(archive_dir, exist_ok=True)  # the move would fail if the folder were missing
    target_path = os.path.join(archive_dir, input_trump)
    # os.replace overwrites an existing target on every platform, whereas
    # os.rename raises on Windows when the target already exists.
    os.replace(source_path, target_path)

df.head()

Jun-01-2020 22:34:29 EST - 539 RTs removed.
Jun-01-2020 22:34:29 EST - 550 rows dropped.
Jun-01-2020 22:34:29 EST - 0 new rows saved to trump_clean.csv


Unnamed: 0,Source,Tweets,Date,RTs,Favourites,isRT,id_str,Month
9,Twitter for iPhone,Let New York’s Finest be New York’s Finest. Th...,2020-05-31 02:12:14,33863,202862,False,1266915358914621440,May
10,Twitter for iPhone,The National Guard has been released in Minnea...,2020-05-31 02:08:42,62393,293293,False,1266914470066036736,May
11,Twitter for iPhone,https://t.co/qD52HYuvpH,2020-05-31 00:54:36,27811,87057,False,1266895821083234304,May
12,Twitter for iPhone,MN Gov. Walz: We Estimate 80% of the Rioters A...,2020-05-31 00:20:50,20742,59221,False,1266887324505382912,May
19,Twitter for iPhone,Hopefully a great successful and safe ROCKET L...,2020-05-30 18:37:07,19300,143675,False,1266800827621990400,May
