In [1]:
import datetime as dt
import os

import numpy as np
import pandas as pd
from langdetect import DetectorFactory
from langdetect import detect

# File names inside the data/ folder: the scraper dump to ingest and the cleaned dataset to maintain.
input_csv = 'trudeau_raw.csv'
output_name = 'trudeau_clean.csv'

# Load the CSV holding the output of the Twitter scraper.
df = pd.read_csv('data//' + input_csv)

# Log how many tweets were ingested.
# NOTE: the "EST" suffix is a fixed label; the time itself is read from the local clock.
num_tweets_read = len(df)
timestamp = dt.datetime.today().strftime('%b-%d-%Y %H:%M:%S EST')
print('{} - {} tweets read from file "{}".'.format(timestamp, num_tweets_read, input_csv))

# Preview the first rows of the raw data.
df.head()

Jun-08-2020 22:31:15 EST - 2881 tweets read from file "trudeau_raw.csv".


Unnamed: 0,Tweets,Length,Date,Source,Favourites,RTs,Username,id_str
0,If you’ve been to Vancouver but never visited ...,297,2020-05-31 23:02:25,Twitter for iPhone,2498,345,JustinTrudeau,1267229979449966594
1,"Si vous êtes déjà allé à Vancouver, mais n’ave...",299,2020-05-31 23:02:17,Twitter for iPhone,286,39,JustinTrudeau,1267229946172379144
2,"RT @FilomenaTassi: Are you a student, aged 15-...",140,2020-05-31 22:43:03,Twitter for iPhone,0,114,JustinTrudeau,1267225104745992199
3,RT @FilomenaTassi: Vous êtes un étudiant de 15...,140,2020-05-31 22:43:00,Twitter for iPhone,0,22,JustinTrudeau,1267225093790449671
4,RT @BardishKW: While #AsianHeritageMonth may b...,139,2020-05-31 22:34:25,Twitter for iPhone,0,83,JustinTrudeau,1267222930758479872


In [2]:
# Label retweets (True/False). Retweets in this data start with "RT @<user>:", so match on
# "RT @" rather than a bare "RT", which would also flag original tweets that merely begin
# with those two letters.
has_text = df['Tweets'].notna()
df['isRT'] = df['Tweets'].str.startswith('RT @', na=False)  # na=False: missing text is not a retweet

# Count retweets directly from the flag so the figure does not depend on counters set in
# other cells (and stays correct if this cell is re-run).
num_RTs = int(df['isRT'].sum())

# Keep only non-retweets; rows with no tweet text at all are dropped as well, since there
# is nothing to analyse in them.
df = df.loc[has_text & ~df['isRT']].copy()

print(dt.datetime.today().strftime("%b-%d-%Y %H:%M:%S EST") + " - " + str(num_RTs) + ' RTs removed.')

Jun-08-2020 22:31:15 EST - 1250 RTs removed.


In [3]:
# Use the langdetect package to create a column specifying each tweet's language.
# langdetect is non-deterministic by default, so short or ambiguous tweets can be labelled
# differently from one run to the next; fixing the seed makes the labels (and therefore the
# set of rows removed below) reproducible.
DetectorFactory.seed = 0
df['Language'] = df['Tweets'].map(detect)
print(df['Language'].value_counts())

is_french = df['Language'] == 'fr'
num_french_tweets = int(is_french.sum())  # number of French tweets detected

# Keep non-French tweets (any other language classification is almost certainly incorrect,
# so everything that is not labelled 'fr' is treated as English).
df = df.loc[~is_french].copy()
print(dt.datetime.today().strftime("%b-%d-%Y %H:%M:%S EST") + " - " + str(num_french_tweets) + ' French tweets removed.')

en    825
fr    805
nl      1
Name: Language, dtype: int64
Jun-08-2020 22:31:25 EST - 805 French tweets removed.


In [4]:
# Parse the 'Date' strings into real timestamps, then derive the month name from them.
tweet_timestamps = pd.to_datetime(df['Date'])
df['Date'] = tweet_timestamps
df['Month'] = tweet_timestamps.dt.month_name()

In [5]:
# Merge the newly cleaned tweets into the cumulative dataset, save it, and archive the raw input.
output_path = os.path.join('data', output_name)

if os.path.exists(output_path):  # if there is existing data
    main_data = pd.read_csv(output_path, encoding='utf-8')  # read csv of existing data
    num_rows_main = main_data.shape[0]  # number of rows in the main dataset before adding to it
    # Existing rows go first so that keep='first' below prefers the copy that is already saved.
    updated_df = pd.concat([main_data, df], sort=False)  # update data by appending new tweets
else:
    num_rows_main = 0
    print(dt.datetime.today().strftime("%b-%d-%Y %H:%M:%S EST") + " - Adding data for the first time to data/" + output_name)
    updated_df = df

temp_num_rows = updated_df.shape[0]  # to calculate how many rows the de-duplication drops
# Drop duplicates in case old tweets were downloaded again. This is done out of place so that
# `df` (which `updated_df` aliases on the first run) is not mutated behind the reader's back,
# and the index is rebuilt so the combined frame has unique, consecutive labels.
updated_df = updated_df.drop_duplicates(subset=['id_str'], keep='first').reset_index(drop=True)
num_rows_updated = updated_df.shape[0]  # number of rows in the updated main dataset
num_rows_dropped = temp_num_rows - num_rows_updated
print(dt.datetime.today().strftime("%b-%d-%Y %H:%M:%S EST") + " - " + str(num_rows_dropped) + " rows dropped.")
num_new_rows = num_rows_updated - num_rows_main  # number of rows added to the main dataset

print(dt.datetime.today().strftime("%b-%d-%Y %H:%M:%S EST") + " - " + str(num_new_rows) + " new rows saved to " + output_name)
updated_df.to_csv(output_path, encoding='utf-8', index=False)  # save updated data as csv

# Move the input data file out of the way since it is not needed anymore.
source_path = os.path.join('data', input_csv)
if os.path.exists(source_path):
    target_path = os.path.join('data', 'old', input_csv)
    # Create the archive folder on first use; a bare rename fails if it does not exist.
    os.makedirs(os.path.dirname(target_path), exist_ok=True)
    # os.replace overwrites a previously archived file of the same name on every platform,
    # whereas os.rename raises on Windows when the target already exists.
    os.replace(source_path, target_path)

df.head()

Jun-08-2020 22:31:25 EST - Adding data for the first time to data/trudeau_clean.csv
Jun-08-2020 22:31:25 EST - 0 rows dropped.
Jun-08-2020 22:31:25 EST - 826 new rows saved to trudeau_clean.csv


Unnamed: 0,Tweets,Length,Date,Source,Favourites,RTs,Username,id_str,isRT,Language,Month
0,If you’ve been to Vancouver but never visited ...,297,2020-05-31 23:02:25,Twitter for iPhone,2498,345,JustinTrudeau,1267229979449966594,False,en,May
6,"For decades, Michel Gauthier was a passionate ...",219,2020-05-31 14:46:23,Twitter for iPhone,1234,126,JustinTrudeau,1267105145638268929,False,en,May
8,Thank you. To those who are working on the fro...,288,2020-05-30 20:02:44,Twitter for iPhone,2140,283,JustinTrudeau,1266822370527936512,False,en,May
12,An estimated 1 in 385 Canadians live with MS. ...,298,2020-05-30 14:35:36,Twitter for iPhone,1281,313,JustinTrudeau,1266740044817203206,False,en,May
18,For more information on the new supports we an...,185,2020-05-29 17:08:31,Twitter for iPhone,517,81,JustinTrudeau,1266416139476893701,False,en,May
