Delete all variables in the current environment (if you have already run some cells) - clean state.

In [122]:
%reset

Import all necessary packages.

NOTE: Replace the download directory of the NLTK tokenizer files with your preferred directory.

In [124]:
import networkx as nx
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import os
import shutil
from datetime import datetime
from dateutil import parser

from sentistrength import PySentiStr
import re

import nltk
nltk.download('punkt', download_dir='/home/andreistoica12/research-internship')
from nltk.tokenize import word_tokenize
import nltk.data
# Load the punkt tokenizer data from the local directory
nltk.data.load('tokenizers/punkt/PY3/english.pickle')

[nltk_data] Downloading package punkt to
[nltk_data]     /home/andreistoica12/research-internship...
[nltk_data]   Package punkt is already up-to-date!


<nltk.tokenize.punkt.PunktSentenceTokenizer at 0x7f05f89ec220>

Replace with the path to the root folder of the project.

In [125]:
rootdir_path = '/home/andreistoica12/research-internship'

Replace with the path to the folder where we store the dataset.

In [126]:
data_path = '/home/andreistoica12/research-internship/data/covaxxy-csv-complete'

Create 2 subfolders to store important files and graphs, respectively. If they already existed (from previous runnings of the project), delete the folders and their contents and create empty folders to store the current files and graphs, relevant to the current state of the project.

In [127]:
files_path = os.path.join(rootdir_path, 'files')
if os.path.exists(files_path):
   shutil.rmtree(files_path, ignore_errors=False, onerror=None)
os.makedirs(files_path)

graphs_path = os.path.join(rootdir_path, 'graphs')
if os.path.exists(graphs_path):
   shutil.rmtree(graphs_path, ignore_errors=False, onerror=None)
os.makedirs(graphs_path)

In [128]:
covaxxy_graphs_path = os.path.join(graphs_path, 'covaxxy')
if os.path.exists(covaxxy_graphs_path):
   shutil.rmtree(covaxxy_graphs_path, ignore_errors=False, onerror=None)
os.makedirs(covaxxy_graphs_path)

In [129]:
covaxxy_longitudinal_analysis_graphs = os.path.join(covaxxy_graphs_path, 'longitudinal-analysis')
if os.path.exists(covaxxy_longitudinal_analysis_graphs):
   shutil.rmtree(covaxxy_longitudinal_analysis_graphs, ignore_errors=False, onerror=None)
os.makedirs(covaxxy_longitudinal_analysis_graphs)

In [130]:
def create_days(data_path):
    # A list of the current data files need for my analysis.
    file_list = os.listdir(data_path)

    # For simplicity's and consistency's sake, I will store all data in chronological order, 
    # so we sort the list of file names from the start.
    file_list.sort(key=lambda date: datetime.strptime(date, "tweet_ids--%Y-%m-%d.csv"))

    # I parse the date of the tweets from the file names and transform them into datetime objects. 
    # This makes it easier to get the day/month/year, as they are already properties of such type of objects.
    keys_datetime = [ datetime.strptime(key, "tweet_ids--%Y-%m-%d.csv") for key in file_list ]

    # Ultimately, I will store each .csv file as a pandas DataFrame in a dictionary, where the keys represent a 
    # simplified form of the date. So, here, I will format the dates from the datetime objects into simple strings.
    keys = [ f"{key.day}-{key.month}-{key.year}" for key in keys_datetime ]

    # In order to read the data from the files, I need the paths of the files to be passed on to the read_csv() function. 
    # The order of the days in the file paths needs to be consistent with the order of the dates in the keys
    paths = [ os.path.join(data_path, file) for file in file_list ]

    # Here, I will build the dictionary where the keys represent the formatted simple date and 
    # the values are dataframes corresponding to each file.
    days = dict()
    for i in range(len(file_list)):
        days[keys[i]] = pd.read_csv(paths[i], index_col=False)

    return days

We define a function remove_emoji() that takes a text string as input and uses a regular expression to match all Unicode characters that are classified as emojis. The regular expression includes different ranges of Unicode characters that represent different types of emojis, such as emoticons, symbols, and flags.

In [131]:
def remove_emoji(text):
    emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

In [132]:
def remove_stopwords(text):
    # # I can use the predefined list of stopwords provided by NLTK, but it's for general purpose
    # # and the results when computing the sentiment are worse than expected, e.g. it considers
    # # words, such as "not" and "all" to be stopwords in contexts where they are actually important.
    
    # nltk.download('stopwords')
    # from nltk.corpus import stopwords
    # stop_words = set(stopwords.words('english'))


    # To avoid this problem, I defined a custom list of stopwords, which I made sure doesn't contain wrong stopwords.
    stop_words = {"the", "and", "or", "a", "an", "in", "of", "on", "to", "that", "this", "is", "are", 
                  "was", "were", "am", "be", "been", "has", "have", "had", "do", "does", "did", "will", "would", 
                  "should", "can", "could", "may", "might", "must", "shall", "shouldn't", "wouldn't", "couldn't", 
                  "can't", "mustn't", "haven't", "hasn't", "hadn't", "didn't", "doesn't", "don't"}

    # Tokenize the text
    tokens = word_tokenize(text)

    # Remove the stopwords
    filtered_tokens = [token for token in tokens if token.lower() not in stop_words]

    # Join the filtered tokens back into a string
    filtered_text = ' '.join(filtered_tokens)

    return filtered_text

In [133]:
def clean_text(text):    
    # 1. Lowercase all words in the text
    text = text.lower()

    # Replace the new line character with empty string
    text = text.replace("\n", "")
    
    # 2. Remove words starting with '@' - tags (most common noise in replies)
    text = re.sub(r'@\w+', '', text, flags=re.MULTILINE)

    # 3. Remove words starting with 'http' - hyperlinks
    text = re.sub(r'http\S+|www.\S+', '', text, flags=re.MULTILINE)

    # 4. Remove punctuation from the text using regular expressions
    text = re.sub(r'[^\w\s]', '', text)

    import contractions
    # 5. Remove contractions, such as you're => you are
    contractions.fix(text)

    # 6. Remove emojis
    text = remove_emoji(text)

    # 7. Remove stopwords in English
    text = remove_stopwords(text)

    return text

In [134]:
days = create_days(data_path)

In [135]:
# Here, I merged all data (from all available days) into a single dataframe (they have the same structure).
# I did that because some replies to a tweet posted today can come some days after, so we need to take care
# of the dataset as a whole.
merged_days = pd.concat([df for key, df in days.items()])

In [136]:
merged_days

Unnamed: 0,id,created_at,tweet_id,credible,author_id,text,urls,name,username,verified,...,retweet_author_id,retweet_id,retweeted_screen_name,user_mentions_id,user_mentions_screen_name,in_reply_to_user_id,in_reply_to_tweet_id,in_reply_to_username,reference_type,reference_id
0,0,2021-03-01T00:01:56.000Z,1366176845561962503,1,14914686,@UK_Centrist @_PhB @RolandBakerIII @RicardLope...,#,☣️Cassandra☣️,EllieJellie2020,False,...,#,#,#,22389018,UK_Centrist,22389018,1366173564957843458,#,replied_to,1366173564957843458
1,1,2021-03-01T00:01:57.000Z,1366176846895738883,1,2402490445,"RT @THE_Russell: Berijiklian: ""There may be a ...",#,kesokoji,kesokoji,False,...,53751609,1366156265877954565,THE_Russell,53751609,THE_Russell,#,#,#,retweeted,1366156265877954565
2,2,2021-03-01T00:01:57.000Z,1366176847822811145,1,56147198,RT @YvetteCooperMP: Cases of the Brazil varian...,#,Rosemary #FBPA,rosepoet,False,...,328634628,1366118307435319296,YvetteCooperMP,328634628,YvetteCooperMP,#,#,#,retweeted,1366118307435319296
3,3,2021-03-01T00:01:57.000Z,1366176848225464323,1,1252300308165857280,RT @OfficialKat: Cannot wait for the vaccine. ...,#,Elektra Murdock 🔴 #WeSavedDaredevil,PattElektra,False,...,23544268,1366072756828131328,OfficialKat,23544268,OfficialKat,#,#,#,retweeted,1366072756828131328
4,4,2021-03-01T00:01:57.000Z,1366176848284057600,1,190474968,New vaccination appointments available tomorro...,"twitter.com,",Cameron Polom ABC,cpolom,True,...,#,#,#,9721292,abc15,#,#,#,quoted,1366176363103858701
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
541258,63,2021-03-10T03:58:32.000Z,1369497876213067777,1,552510895,RT @abcadelaide: The first COVID-19 vaccine ha...,#,Brad Coates,Bandit2809,False,...,16213139,1369444755751469057,abcadelaide,16213139,abcadelaide,#,#,#,retweeted,1369444755751469057
541259,64,2021-03-10T03:58:32.000Z,1369497877119197186,1,1173226982999580672,RT @KizzyPhD: Thanks for making the informed c...,#,Edward Patrick Vogel,MathArt4All,False,...,1215444012322172928,1369482754845466629,KizzyPhD,1215444012322172928,KizzyPhD,#,#,#,retweeted,1369482754845466629
541260,65,2021-03-10T03:58:32.000Z,1369497877349797894,1,1031738844230680576,RT @Ross_Greer: You live in a country where a ...,#,XoZXo,XoZXo2,False,...,50308678,1369409097213296652,Ross_Greer,50308678,Ross_Greer,#,#,#,retweeted,1369409097213296652
541261,66,2021-03-10T03:58:32.000Z,1369497877827977216,1,1291864731779837953,My mom got the vaccine...,#,October 5th 😌🇭🇹,Vie_de_martini,False,...,#,#,#,#,#,#,#,#,#,#


In [137]:
merged_days.columns

Index(['id', 'created_at', 'tweet_id', 'credible', 'author_id', 'text', 'urls',
       'name', 'username', 'verified', 'location', 'followers_count',
       'following_count', 'tweet_count', 'like_count', 'quote_count',
       'reply_count', 'retweet_count', 'retweet_author_id', 'retweet_id',
       'retweeted_screen_name', 'user_mentions_id',
       'user_mentions_screen_name', 'in_reply_to_user_id',
       'in_reply_to_tweet_id', 'in_reply_to_username', 'reference_type',
       'reference_id'],
      dtype='object')

In [138]:
# merged_days.insert(loc=merged_days.columns.get_loc('in_reply_to_tweet_id') + 1, column='no_of_replies_to_tweet', value=0)

In [139]:
replies = merged_days[merged_days['reference_type'] == 'replied_to'].copy()

In [140]:
replies

Unnamed: 0,id,created_at,tweet_id,credible,author_id,text,urls,name,username,verified,...,retweet_author_id,retweet_id,retweeted_screen_name,user_mentions_id,user_mentions_screen_name,in_reply_to_user_id,in_reply_to_tweet_id,in_reply_to_username,reference_type,reference_id
0,0,2021-03-01T00:01:56.000Z,1366176845561962503,1,14914686,@UK_Centrist @_PhB @RolandBakerIII @RicardLope...,#,☣️Cassandra☣️,EllieJellie2020,False,...,#,#,#,22389018,UK_Centrist,22389018,1366173564957843458,#,replied_to,1366173564957843458
8,8,2021-03-01T00:01:57.000Z,1366176849412444161,1,1201995227285864449,@devisridhar According to this 1 in 20 cases i...,"www.msn.com,",Dr Project Reality 💙,DrProjectReali1,False,...,#,#,#,174151902,devisridhar,174151902,1366076047150747649,devisridhar,replied_to,1366076047150747649
16,16,2021-03-01T00:01:58.000Z,1366176853187256325,1,1252565736431681539,"@darrenmark69 No, you're not alone. Unbelievab...",#,USS Liberty 💜,KatePatten19,False,...,#,#,#,65384825,darrenmark69,65384825,1366077505099202560,darrenmark69,replied_to,1366077505099202560
23,23,2021-03-01T00:02:00.000Z,1366176860267294721,1,251328613,@Scaramucci @cardsharkgal Didn’t ya hear? HE c...,#,Mike Nolan 🇺🇸☘️,mnolan49,False,...,#,#,#,24578794,Scaramucci,24578794,1366171066230595589,Scaramucci,replied_to,1366171066230595589
27,27,2021-03-01T00:02:00.000Z,1366176861554794500,1,1032120394877087744,@king_reinhardt Heck if I wrote up on some web...,#,Meike is taking a MH break 💜,MeikeTorkelson,False,...,#,#,#,1222237807596441601,king_reinhardt,1032120394877087744,1366176224863608837,MeikeTorkelson,replied_to,1366176224863608837
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
541194,57,2021-03-10T03:58:20.000Z,1369497827701895168,1,1266726937432637440,@Tinafierce1 Hahaha haha but honestly what wou...,#,ABUELO(grandpa) RAY,RayganRonald,False,...,#,#,#,708227052097441793,Tinafierce1,708227052097441793,1369495305905963011,Tinafierce1,replied_to,1369495305905963011
541199,4,2021-03-10T03:58:22.000Z,1369497834752446464,1,14240070,@KStarorzewski I want unhinged-Leslie-Knope-st...,"twitter.com,",Maia Nolan-Partnow,myster,False,...,#,#,#,800073,KStarorzewski,800073,1369495710240960519,KStarorzewski,replied_to,1369495710240960519
541222,27,2021-03-10T03:58:26.000Z,1369497850724290570,1,1369113112549289986,@Tednoir1 @BBCWorld use Chinese vaccine,#,bigerboss2021,bigerboss2021,False,...,#,#,#,742143,BBCWorld,1255745422020284417,1369467147445932033,#,replied_to,1369467147445932033
541237,42,2021-03-10T03:58:29.000Z,1369497862904688646,1,1345985245506580480,@taiandsashin @tiredicunurse @NightShiftMD Wow...,#,Gibson Girl on Acid,zoon_mappa,False,...,#,#,#,1312169449244553222,taiandsashin,1312169449244553222,1369474901380325382,taiandsashin,replied_to,1369474901380325382


In [141]:
# # count the occurrences of each combination of values in author_id and tweet_id
# counts = replies.groupby(['author_id', 'in_reply_to_tweet_id'])['no_of_replies_to_tweet'].transform('count')
# multiple_replies = replies[counts > 1].copy()

In [158]:
multiple_replies = replies[replies.duplicated(subset=['author_id', 'in_reply_to_tweet_id'], keep=False)].copy()

In [159]:
multiple_replies

Unnamed: 0,id,created_at,tweet_id,credible,author_id,text,urls,name,username,verified,...,retweet_author_id,retweet_id,retweeted_screen_name,user_mentions_id,user_mentions_screen_name,in_reply_to_user_id,in_reply_to_tweet_id,in_reply_to_username,reference_type,reference_id
540,0,2021-03-01T03:28:20.000Z,1366228784953786369,1,92928883,@RossAndRussel @3AW693 Are you aware that Pfiz...,"pfizer.com,twitter.com,twitter.com,twitter.com,",Elizabeth,welosthim,False,...,#,#,#,118499648,RossAndRussel,118499648,1366130800098308099,RossAndRussel,replied_to,1366130800098308099
1054,52,2021-03-01T03:29:20.000Z,1366229038285602818,1,1263980583027445761,@OHdeptofhealth Again I will ask how many peop...,#,Italianborn2,Italianborn2,False,...,#,#,#,90422822,OHdeptofhealth,90422822,1365313864888446983,OHdeptofhealth,replied_to,1365313864888446983
1063,61,2021-03-01T03:29:21.000Z,1366229041808740352,1,881874973027033093,@citizennacho No nation wide vaccination with ...,#,joeytrenas77,joeytrenas77,False,...,#,#,#,#,#,151333070,1366227671475130373,#,replied_to,1366227671475130373
1494,28,2021-03-01T03:30:15.000Z,1366229267521019904,1,92928883,@RossAndRussel “A 19-year-old was hospitalized...,"www.jpost.com,",Elizabeth,welosthim,False,...,#,#,#,118499648,RossAndRussel,118499648,1366130800098308099,RossAndRussel,replied_to,1366130800098308099
1592,55,2021-03-01T03:30:31.000Z,1366229335200452611,1,1052428997609566210,@redheadmom8 @Ragusaecomites @scrowder None of...,#,AltPan,altdrpan,False,...,#,#,#,2317141258,redheadmom8,2317141258,1366170829042683905,redheadmom8,replied_to,1366170829042683905
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
536813,14,2021-03-10T03:44:58.000Z,1369494463928745984,1,1369443700154236929,@Surgeon_General You guys take tge vaccine fir...,#,Rebecca Williams,Rebecca75876544,False,...,#,#,#,455024343,Surgeon_General,455024343,1369364110949847040,Surgeon_General,replied_to,1369364110949847040
538909,37,2021-03-10T03:51:17.000Z,1369496050642259969,1,1296776924656578564,@Ranjna_P @IntersticeNomad @ShivAroor @IndiaTo...,#,Akhil Mahajan,AkhilMa45664085,False,...,#,#,#,854584832990388224,Ranjna_P,854584832990388224,1369494676542267396,Ranjna_P,replied_to,1369494676542267396
539686,22,2021-03-10T03:53:37.000Z,1369496639459721218,1,1327450312152657921,@GovParsonMO It’s About the Urban areas !!Pars...,#,Lonnie Grosman,LonnieGrosman,False,...,#,#,#,1003672320781807616,GovParsonMO,1003672320781807616,1368998092419203072,GovParsonMO,replied_to,1368998092419203072
540624,53,2021-03-10T03:56:17.000Z,1369497311332605952,1,1296776924656578564,@Ranjna_P @IntersticeNomad @ShivAroor @IndiaTo...,#,Akhil Mahajan,AkhilMa45664085,False,...,#,#,#,854584832990388224,Ranjna_P,854584832990388224,1369494676542267396,Ranjna_P,replied_to,1369494676542267396


In [160]:
# group the rows by the two columns
grouped_df = multiple_replies.groupby(['author_id', 'in_reply_to_tweet_id'])

In [161]:
groups = grouped_df.groups

In [162]:
senti = PySentiStr()

In [163]:
# Set the path to the SentiStrength executable file
senti.setSentiStrengthPath('/home/andreistoica12/research-internship/SentiStrength/SentiStrengthCom.jar')

In [164]:
senti.setSentiStrengthLanguageFolderPath('/home/andreistoica12/research-internship/SentiStrength/LanguageFolder')

In [168]:
opinion_changes = []
group_counter = 0
print_checkpoint = 0.01
for group, rows in groups.items():
    group_counter += 1
    if(group_counter / len(groups) >= print_checkpoint):
        print(f"Group: {group}. {group_counter}/{len(groups)}")
        print_checkpoint += 0.01
    # print(f"Group: {group}. {group_counter}/{len(groups)}")

    sentiments = []
    for index in rows:
        sentiments.append(senti.getSentiment(clean_text(str(replies.loc[index, 'text'])), score='scale'))
    
    arr = np.array(sentiments)

    positive = np.any(arr > 0)
    negative = np.any(arr < 0)

    opinion_change = positive and negative
    opinion_changes.append(opinion_change)

    # TODO: nice results, but WAY TOO SLOW!!!
    if group_counter >= 200:
        break


Group: (14911096, '1368113977872556032'). 56/5587
Group: (17016516, '1366416562228506626'). 112/5587
Group: (19108553, '1369451924098007050'). 168/5587


In [169]:
len(opinion_changes)

200

In [170]:
opinion_changes.count(np.bool_('True'))

21

In [27]:
text = clean_text(text)

NameError: name 'text' is not defined

In [368]:
result = senti.getSentiment(text, score='scale')

In order to calculate the distribution of the tweets per hour, I will parse the "created_at" column, extract the hour property and create a separate column in each dataframe. I will place it next to the "created_at" column in order to be easily verifiable. Data originates frmo the Twitter API, so it comes in a standard ISO 8601 format, which can be easily parsed using the parser module from the dateutil package.

Note: the cell below runs for approximately 2m30' on my machine (~25-30 seconds for each file).

In [96]:
# for key, day in days.items():
#     if 'hour' not in day.columns:
#         day.insert(1, 'hour', day['created_at'].apply(lambda date: parser.parse(date).hour))
#         print(f"New 'hour' column inserted in the {key} dataframe")

In [97]:
# for key, day in days.items():
#     if 'hour' not in day.columns:
#         hours = []
#         for time in day.loc[:,"created_at"]:
#             hour = parser.parse(time).hour
#             hours.append(hour)
#         day.insert(1, "hour", hours, True)
#         print(key + " - added 'hour' column")


The final distribution is made up of the sum of all individual days' distributions. I save a figure in the graphs/ folder for each day, as well as an overall distribution.

In [24]:
# final_distribution = pd.Series(0, index=days['1-3-2021'].loc[:,'hour'].sort_values(ascending=True).unique())
# for key, day in days.items():
#     hour_column_ascending = day.loc[:,"hour"].sort_values(ascending=True)
#     distribution = hour_column_ascending.value_counts()[hour_column_ascending.unique()]
#     final_distribution = final_distribution.add(distribution)
#     axes = distribution.plot(kind='bar')
#     figure_path = f"{covaxxy_longitudinal_analysis_graphs}/{key}_distribution.png"
#     axes.figure.savefig(figure_path)
#     plt.close()
# axes = final_distribution.plot(kind='bar')
# figure_path = f"{covaxxy_longitudinal_analysis_graphs}/overall_distribution.png"
# axes.figure.savefig(figure_path)
# plt.close()
