Delete all variables in the current environment (if you have already run some cells) - clean state.

In [1]:
%reset

Import all necessary packages.

NOTE: Replace the download directory of the NLTK tokenizer files with your preferred directory (I chose the root directory of the Research Internship project)

In [2]:
import networkx as nx
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import os
import shutil
from datetime import datetime
from dateutil import parser

from sentistrength import PySentiStr
import re

import nltk
nltk.download('punkt', download_dir='/home/andreistoica12/research-internship')
from nltk.tokenize import word_tokenize
import nltk.data
# Load the punkt tokenizer data from the local directory
nltk.data.load('tokenizers/punkt/PY3/english.pickle')

import json

[nltk_data] Downloading package punkt to
[nltk_data]     /home/andreistoica12/research-internship...
[nltk_data]   Package punkt is already up-to-date!


Replace with the path to the root folder of the project.

In [3]:
rootdir_path = '/home/andreistoica12/research-internship'

Replace with the path to the folder where we store the dataset.

In [4]:
data_path = '/home/andreistoica12/research-internship/data/covaxxy-csv-complete'

In [5]:
path_to_sentistrength_jar = '/home/andreistoica12/research-internship/SentiStrength/SentiStrengthCom.jar'

In [6]:
path_to_sentistrength_language_folder = '/home/andreistoica12/research-internship/SentiStrength/LanguageFolder'

Create 2 subfolders to store important files and graphs, respectively. If they already existed (from previous runnings of the project), delete the folders and their contents and create empty folders to store the current files and graphs, relevant to the current state of the project.

In [7]:
files_path = os.path.join(rootdir_path, 'files')
if os.path.exists(files_path):
   shutil.rmtree(files_path, ignore_errors=False, onerror=None)
os.makedirs(files_path)

graphs_path = os.path.join(rootdir_path, 'graphs')
if os.path.exists(graphs_path):
   shutil.rmtree(graphs_path, ignore_errors=False, onerror=None)
os.makedirs(graphs_path)

In [8]:
covaxxy_graphs_path = os.path.join(graphs_path, 'covaxxy')
if os.path.exists(covaxxy_graphs_path):
   shutil.rmtree(covaxxy_graphs_path, ignore_errors=False, onerror=None)
os.makedirs(covaxxy_graphs_path)

In [9]:
covaxxy_longitudinal_analysis_graphs = os.path.join(covaxxy_graphs_path, 'longitudinal-analysis')
if os.path.exists(covaxxy_longitudinal_analysis_graphs):
   shutil.rmtree(covaxxy_longitudinal_analysis_graphs, ignore_errors=False, onerror=None)
os.makedirs(covaxxy_longitudinal_analysis_graphs)

In [10]:
def create_days(data_path):
    # A list of the current data files need for my analysis.
    file_list = os.listdir(data_path)

    # For simplicity's and consistency's sake, I will store all data in chronological order, 
    # so we sort the list of file names from the start.
    file_list.sort(key=lambda date: datetime.strptime(date, "tweet_ids--%Y-%m-%d.csv"))

    # I parse the date of the tweets from the file names and transform them into datetime objects. 
    # This makes it easier to get the day/month/year, as they are already properties of such type of objects.
    keys_datetime = [ datetime.strptime(key, "tweet_ids--%Y-%m-%d.csv") for key in file_list ]

    # Ultimately, I will store each .csv file as a pandas DataFrame in a dictionary, where the keys represent a 
    # simplified form of the date. So, here, I will format the dates from the datetime objects into simple strings.
    keys = [ f"{key.day}-{key.month}-{key.year}" for key in keys_datetime ]

    # In order to read the data from the files, I need the paths of the files to be passed on to the read_csv() function. 
    # The order of the days in the file paths needs to be consistent with the order of the dates in the keys
    paths = [ os.path.join(data_path, file) for file in file_list ]

    # Here, I will build the dictionary where the keys represent the formatted simple date and 
    # the values are dataframes corresponding to each file.
    days = dict()
    for i in range(len(file_list)):
        days[keys[i]] = pd.read_csv(paths[i], index_col=False)
        days[keys[i]].drop('id', axis=1, inplace=True)

    return days

We define a function remove_emoji() that takes a text string as input and uses a regular expression to match all Unicode characters that are classified as emojis. The regular expression includes different ranges of Unicode characters that represent different types of emojis, such as emoticons, symbols, and flags.

In [11]:
def remove_emoji(text):
    emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

In [12]:
def remove_stopwords(text):
    # # I can use the predefined list of stopwords provided by NLTK, but it's for general purpose
    # # and the results when computing the sentiment are worse than expected, e.g. it considers
    # # words, such as "not" and "all" to be stopwords in contexts where they are actually important.
    
    # nltk.download('stopwords')
    # from nltk.corpus import stopwords
    # stop_words = set(stopwords.words('english'))


    # To avoid this problem, I defined a custom list of stopwords, which I made sure doesn't contain wrong stopwords.
    stop_words = {"the", "and", "or", "a", "an", "in", "of", "on", "to", "that", "this", "is", "are", 
                  "was", "were", "am", "be", "been", "has", "have", "had", "do", "does", "did", "will", "would", 
                  "should", "can", "could", "may", "might", "must", "shall", "shouldn't", "wouldn't", "couldn't", 
                  "can't", "mustn't", "haven't", "hasn't", "hadn't", "didn't", "doesn't", "don't"}

    # Tokenize the text
    tokens = word_tokenize(text)

    # Remove the stopwords
    filtered_tokens = [token for token in tokens if token.lower() not in stop_words]

    # Join the filtered tokens back into a string
    filtered_text = ' '.join(filtered_tokens)

    return filtered_text

In [13]:
def clean_text(text):    
    # 1. Lowercase all words in the text
    text = text.lower()

    # Replace the new line character with empty string
    text = text.replace("\n", "")
    
    # 2. Remove words starting with '@' - tags (most common noise in replies)
    text = re.sub(r'@\w+', '', text, flags=re.MULTILINE)

    # 3. Remove words starting with 'http' - hyperlinks
    text = re.sub(r'http\S+|www.\S+', '', text, flags=re.MULTILINE)

    # 4. Remove punctuation from the text using regular expressions
    text = re.sub(r'[^\w\s]', '', text)

    import contractions
    # 5. Remove contractions, such as you're => you are
    contractions.fix(text)

    # 6. Remove emojis
    text = remove_emoji(text)

    # 7. Remove stopwords in English
    text = remove_stopwords(text)

    return text

In [15]:
def sentiment_analysis(data_path, path_to_sentistrength_jar, path_to_sentistrength_language_folder):

    days = create_days(data_path)
    
    # Here, I merged all data (from all available days) into a single dataframe (they have the same structure).
    # I did that because some replies to a tweet posted today can come some days after, so we need to take care
    # of the dataset as a whole.

    # concatenate the dataframes and reset the index
    merged_days = pd.concat([df for key, df in days.items()], ignore_index=True)

    # Convert string column to datetime
    merged_days['created_at'] = pd.to_datetime(merged_days['created_at'])
  
    # Sort dataframe based on datetime column
    # NOTE: I also reset the index so that I know later on which tweet was posted first based on the index (useful for opinion change)
    merged_days = merged_days.sort_values('created_at').reset_index(drop=True)

    # TODO: CONTINUE WHEN EVERYTHIN IS DONE

    


In [14]:
days = create_days(data_path)

In [15]:
# Here, I merged all data (from all available days) into a single dataframe (they have the same structure).
# I did that because some replies to a tweet posted today can come some days after, so we need to take care
# of the dataset as a whole.

# concatenate the dataframes and reset the index
merged_days = pd.concat([df for key, df in days.items()], ignore_index=True)

In [16]:
merged_days

Unnamed: 0,created_at,tweet_id,credible,author_id,text,urls,name,username,verified,location,...,retweet_author_id,retweet_id,retweeted_screen_name,user_mentions_id,user_mentions_screen_name,in_reply_to_user_id,in_reply_to_tweet_id,in_reply_to_username,reference_type,reference_id
0,2021-03-01T00:01:56.000Z,1366176845561962503,1,14914686,@UK_Centrist @_PhB @RolandBakerIII @RicardLope...,#,☣️Cassandra☣️,EllieJellie2020,False,#,...,#,#,#,22389018,UK_Centrist,22389018,1366173564957843458,#,replied_to,1366173564957843458
1,2021-03-01T00:01:57.000Z,1366176846895738883,1,2402490445,"RT @THE_Russell: Berijiklian: ""There may be a ...",#,kesokoji,kesokoji,False,#,...,53751609,1366156265877954565,THE_Russell,53751609,THE_Russell,#,#,#,retweeted,1366156265877954565
2,2021-03-01T00:01:57.000Z,1366176847822811145,1,56147198,RT @YvetteCooperMP: Cases of the Brazil varian...,#,Rosemary #FBPA,rosepoet,False,Brighton,...,328634628,1366118307435319296,YvetteCooperMP,328634628,YvetteCooperMP,#,#,#,retweeted,1366118307435319296
3,2021-03-01T00:01:57.000Z,1366176848225464323,1,1252300308165857280,RT @OfficialKat: Cannot wait for the vaccine. ...,#,Elektra Murdock 🔴 #WeSavedDaredevil,PattElektra,False,#,...,23544268,1366072756828131328,OfficialKat,23544268,OfficialKat,#,#,#,retweeted,1366072756828131328
4,2021-03-01T00:01:57.000Z,1366176848284057600,1,190474968,New vaccination appointments available tomorro...,"twitter.com,",Cameron Polom ABC,cpolom,True,#,...,#,#,#,9721292,abc15,#,#,#,quoted,1366176363103858701
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5123691,2021-03-10T03:58:32.000Z,1369497876213067777,1,552510895,RT @abcadelaide: The first COVID-19 vaccine ha...,#,Brad Coates,Bandit2809,False,#,...,16213139,1369444755751469057,abcadelaide,16213139,abcadelaide,#,#,#,retweeted,1369444755751469057
5123692,2021-03-10T03:58:32.000Z,1369497877119197186,1,1173226982999580672,RT @KizzyPhD: Thanks for making the informed c...,#,Edward Patrick Vogel,MathArt4All,False,55419,...,1215444012322172928,1369482754845466629,KizzyPhD,1215444012322172928,KizzyPhD,#,#,#,retweeted,1369482754845466629
5123693,2021-03-10T03:58:32.000Z,1369497877349797894,1,1031738844230680576,RT @Ross_Greer: You live in a country where a ...,#,XoZXo,XoZXo2,False,#,...,50308678,1369409097213296652,Ross_Greer,50308678,Ross_Greer,#,#,#,retweeted,1369409097213296652
5123694,2021-03-10T03:58:32.000Z,1369497877827977216,1,1291864731779837953,My mom got the vaccine...,#,October 5th 😌🇭🇹,Vie_de_martini,False,#,...,#,#,#,#,#,#,#,#,#,#


In [17]:
type(merged_days['created_at'][0])

str

In [18]:
# Convert string column to datetime
merged_days['created_at'] = pd.to_datetime(merged_days['created_at'])

In [19]:
# Sort dataframe based on datetime column
# NOTE: I also reset the index so that I know later on which tweet was posted first based on the index (useful for opinion change)
merged_days = merged_days.sort_values('created_at').reset_index(drop=True)

In [20]:
merged_days

Unnamed: 0,created_at,tweet_id,credible,author_id,text,urls,name,username,verified,location,...,retweet_author_id,retweet_id,retweeted_screen_name,user_mentions_id,user_mentions_screen_name,in_reply_to_user_id,in_reply_to_tweet_id,in_reply_to_username,reference_type,reference_id
0,2021-02-24 18:00:10+00:00,1364636249852502018,1,107501328,RT @Maricopahealth: At one of our community po...,#,2-1-1 Arizona,211arizona,False,Arizona,...,29816986,1364632754042802176,Maricopahealth,29816986,Maricopahealth,#,#,#,retweeted,1364632754042802176
1,2021-02-24 18:00:18+00:00,1364636282664574978,1,26761523,Ready for DAY 2 of State of the Valley? Join u...,"jointventure.org,twitter.com,",Joint Venture SV,JointVentureSVN,False,"San Jose, CA",...,#,#,#,#,#,#,#,#,#,#
2,2021-02-24 18:00:30+00:00,1364636333596008449,1,1234926105234034689,RT @SteveStaeger: When #COVID19Colorado is ove...,#,Colorado Coronavirus Updates,COVIDinColorado,False,"Denver, Colorado",...,182037688,1364293582157307906,SteveStaeger,182037688,SteveStaeger,#,#,#,retweeted,1364293582157307906
3,2021-02-24 18:03:16+00:00,1364637028948709377,1,1329106574082641920,"#SD37: Starting next week, @OCHealth will star...","bit.ly,www.ocregister.com,",Senator Dave Min,SenDaveMin,True,"Irvine, CA",...,#,#,#,36069538,ochealth,#,#,#,#,#
4,2021-02-24 18:03:35+00:00,1364637110951583746,1,1363750425459970048,RT @jatinde45666597: Vaccination has been star...,#,Reena Sharma,write2reena,False,"Auckland, New Zealand",...,1295748297529884673,1364087633538859008,jatinde45666597,1295748297529884673,jatinde45666597,#,#,#,retweeted,1364087633538859008
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5123691,2021-03-10 23:59:52+00:00,1369800203939745796,1,434360613,RT @Philo: The boys of #SouthPark are at it ag...,#,ami_,ami_tvdfan,False,#,...,81766872,1369799981763162113,philoTV,23827692,ComedyCentral,#,#,#,retweeted,1369799981763162113
5123692,2021-03-10 23:59:52+00:00,1369800204094963712,1,3083078947,RT @ericswalwell: The #AmericanRescuePlan puts...,#,Thomas Albrecht 🇺🇸☮️,TomAlb88,False,#,...,377609596,1369727803768201218,ericswalwell,377609596,ericswalwell,#,#,#,retweeted,1369727803768201218
5123693,2021-03-10 23:59:53+00:00,1369800205592363018,1,3538956135,"RT @Doc_Wolverine: ""Gee Doc, why are you pisse...",#,Jackaxed,Jackaxed,False,United States,...,898321581444911108,1369771222481985543,Doc_Wolverine,898321581444911108,Doc_Wolverine,#,#,#,retweeted,1369771222481985543
5123694,2021-03-10 23:59:53+00:00,1369800204761899011,1,29801287,"RT @TheDweck: Wow, vision boards work",#,Fauxnly Fans,thenickkontz,False,"ÜT: 43.508306,-96.779489",...,98247788,1369742802590990336,TheDweck,98247788,TheDweck,#,#,#,retweeted,1369742802590990336


In [21]:
replies = merged_days[merged_days['reference_type'] == 'replied_to'].copy()

In [22]:
replies

Unnamed: 0,created_at,tweet_id,credible,author_id,text,urls,name,username,verified,location,...,retweet_author_id,retweet_id,retweeted_screen_name,user_mentions_id,user_mentions_screen_name,in_reply_to_user_id,in_reply_to_tweet_id,in_reply_to_username,reference_type,reference_id
9,2021-02-24 18:06:06+00:00,1364637745302306825,1,178048979,He's never held publicly accountable for being...,#,B Randon,onestrikeaway,False,"Dallas, TX",...,#,#,#,#,#,178048979,1364637743951736834,onestrikeaway,replied_to,1364637743951736834
10,2021-02-24 18:06:33+00:00,1364637855125938182,1,65566588,@HandmaidAlberta @AHS_media Don’t be jealous b...,"twitter.com,",Barbara Larochelle,BarbLarochelle,False,#,...,#,#,#,926811657359077376,HandmaidAlberta,926811657359077376,1364598750816006147,HandmaidAlberta,replied_to,1364598750816006147
24,2021-02-24 18:19:25+00:00,1364641093447413762,1,409646935,@DrTedros @gavi @CEPIvaccines @UNICEF There ha...,#,Hope Torres,Evysdove,False,#,...,#,#,#,189868631,DrTedros,189868631,1364503901332979712,DrTedros,replied_to,1364503901332979712
33,2021-02-24 18:34:57+00:00,1364645005722226692,1,49764028,@jvipondmd @CMOH_Alberta Don't want to answer ...,#,Philip Turnbull,PhilipTurnbull,False,"Calgary, Alberta",...,#,#,#,4043106914,jvipondmd,4043106914,1364644663848669185,jvipondmd,replied_to,1364644663848669185
41,2021-02-24 18:40:46+00:00,1364646468099874817,1,1346178611405983745,@jkenney @AlbertaBoot Hope they are sh*tkicker...,"twitter.com,",Bernie Kuzzy,BernieKuzzy,False,#,...,#,#,#,21525682,jkenney,21525682,1364644332611899393,jkenney,replied_to,1364644332611899393
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5123625,2021-03-10 23:59:43+00:00,1369800166069448706,1,1177353170378461184,@iammickyjones @CandiceBenbow Yeah that makes ...,#,Easy Fit Lives,EasyFitLives,False,#,...,#,#,#,152149495,iammickyjones,152149495,1369798166145626116,iammickyjones,replied_to,1369798166145626116
5123633,2021-03-10 23:59:44+00:00,1369800169643008003,1,1095347460145324032,@RichMuny @hilaryp60091407 @GovMikeDeWine Firs...,#,CryptoBond007,bond007_crypto,False,#,...,#,#,#,20570337,RichMuny,20570337,1369799734534225925,RichMuny,replied_to,1369799734534225925
5123644,2021-03-10 23:59:45+00:00,1369800173333909506,1,871162855898349568,@TheCoronaCure_ The vaccinated can pass on the...,#,Louise,Louisecanr,False,Costa Rica,...,#,#,#,1303797127651393547,TheCoronaCure_,1303797127651393547,1369787277652688903,TheCoronaCure_,replied_to,1369787277652688903
5123674,2021-03-10 23:59:49+00:00,1369800190819926016,1,1363146664429543431,@CalamityxVanity MESSENGER RNA?! So they’re se...,#,ß† 🌙,itsbritttbish,False,"Arizona, USA",...,#,#,#,#,#,1364946631527829504,1369798105802043393,#,replied_to,1369798105802043393


In [23]:
replies.columns

Index(['created_at', 'tweet_id', 'credible', 'author_id', 'text', 'urls',
       'name', 'username', 'verified', 'location', 'followers_count',
       'following_count', 'tweet_count', 'like_count', 'quote_count',
       'reply_count', 'retweet_count', 'retweet_author_id', 'retweet_id',
       'retweeted_screen_name', 'user_mentions_id',
       'user_mentions_screen_name', 'in_reply_to_user_id',
       'in_reply_to_tweet_id', 'in_reply_to_username', 'reference_type',
       'reference_id'],
      dtype='object')

In [24]:
multiple_replies = replies[replies.duplicated(subset=['author_id', 'in_reply_to_tweet_id'], keep=False)].copy()

In [25]:
multiple_replies_first_300 = multiple_replies.head(300).copy()

In [26]:
multiple_replies_first_300['in_reply_to_tweet_id'] = multiple_replies_first_300['in_reply_to_tweet_id'].astype(int)

In [27]:
multiple_replies_first_300

Unnamed: 0,created_at,tweet_id,credible,author_id,text,urls,name,username,verified,location,...,retweet_author_id,retweet_id,retweeted_screen_name,user_mentions_id,user_mentions_screen_name,in_reply_to_user_id,in_reply_to_tweet_id,in_reply_to_username,reference_type,reference_id
4433,2021-03-01 00:01:13+00:00,1366176662937862145,1,807410845026283520,@SandySue1958 You do NOT want shingles.\nHusba...,#,Real Bird🐦Lover❤ of the US,realBLOTUS,False,United States,...,#,#,#,505236411,SandySue1958,505236411,1366028474796945414,SandySue1958,replied_to,1366028474796945414
4891,2021-03-01 00:02:45+00:00,1366177051154264065,1,156254884,@BorisJohnson 🤣🤣🤣🤣😆of course they have 🤣🤣🤣🤣 t...,#,Helen mitchell🌸✝️,Helenmuk,False,Uk,...,#,#,#,3131144855,BorisJohnson,3131144855,1366041899421863937,BorisJohnson,replied_to,1366041899421863937
5346,2021-03-01 00:04:10+00:00,1366177405975465986,1,4450942574,@AaronBlake Moron. Trump is responsible for th...,#,DemInNameOnly,DemInNameOnly,False,#,...,#,#,#,136300373,AaronBlake,136300373,1366175228741767170,AaronBlake,replied_to,1366175228741767170
5455,2021-03-01 00:04:33+00:00,1366177500850761728,1,1365663369668288519,@IDPH THIS VACCINE IS GOING TO HURT MORE PEOPL...,#,MASKS R WORTHLESS,r_masks,False,#,...,#,#,#,71652085,IDPH,71652085,1366163065767669763,IDPH,replied_to,1366163065767669763
6032,2021-03-01 00:06:40+00:00,1366178034240409600,1,807410845026283520,@SandySue1958 Get the two-shot 💉💉 Shingrix. A...,#,Real Bird🐦Lover❤ of the US,realBLOTUS,False,United States,...,#,#,#,505236411,SandySue1958,505236411,1366028474796945414,SandySue1958,replied_to,1366028474796945414
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
178397,2021-03-01 07:40:21+00:00,1366292207829987328,1,1213928023092711424,@deanotheman @keithbutler01 @cheesedoff3 Funny...,#,KittyKat ❤️ 💙,jeds0808,False,#,...,#,#,#,68946743,deanotheman,68946743,1366187484988735490,deanotheman,replied_to,1366187484988735490
178420,2021-03-01 07:40:25+00:00,1366292224745607168,1,96094107,@Keir_Starmer Oh! And by the way. Well done to...,#,David Woodley,twitdwood,False,Southampton UK,...,#,#,#,2425571623,Keir_Starmer,2425571623,1366056212622700544,Keir_Starmer,replied_to,1366056212622700544
179467,2021-03-01 07:43:15+00:00,1366292938716717057,1,1337219846187061249,@ANI @RahulGandhi pls go ahead your party floo...,#,Indian 🇮🇳,shishya1112,False,#,...,#,#,#,355989081,ANI,355989081,1366285726057254912,ANI,replied_to,1366285726057254912
179741,2021-03-01 07:44:05+00:00,1366293148972969986,1,1337219846187061249,@ANI @kharge ji you mean Sonia G will not take...,#,Indian 🇮🇳,shishya1112,False,#,...,#,#,#,355989081,ANI,355989081,1366285726057254912,ANI,replied_to,1366285726057254912


In [28]:
multiple_replies_first_300[(multiple_replies_first_300['author_id'] == 932012546) & (multiple_replies_first_300['in_reply_to_tweet_id'] == 1366061818699989002)]

Unnamed: 0,created_at,tweet_id,credible,author_id,text,urls,name,username,verified,location,...,retweet_author_id,retweet_id,retweeted_screen_name,user_mentions_id,user_mentions_screen_name,in_reply_to_user_id,in_reply_to_tweet_id,in_reply_to_username,reference_type,reference_id
158573,2021-03-01 06:45:12+00:00,1366278329670770694,1,932012546,@KATIEDOLL1201 @daulan @SandySue1958 I've been...,#,annrkrist,annrkrist,False,#,...,#,#,#,200092108,KATIEDOLL1201,200092108,1366061818699989002,#,replied_to,1366061818699989002
173742,2021-03-01 07:27:26+00:00,1366288957252792322,1,932012546,"@KATIEDOLL1201 @daulan @SandySue1958 Sorry, th...",#,annrkrist,annrkrist,False,#,...,#,#,#,200092108,KATIEDOLL1201,200092108,1366061818699989002,#,replied_to,1366061818699989002


In [29]:
# group the rows by the two columns
grouped_df = multiple_replies_first_300.groupby(['author_id', 'in_reply_to_tweet_id'])

In [30]:
groups = grouped_df.groups

In [31]:
senti = PySentiStr()

In [32]:
# Set the path to the SentiStrength executable file
senti.setSentiStrengthPath(path_to_sentistrength_jar)

In [33]:
senti.setSentiStrengthLanguageFolderPath(path_to_sentistrength_language_folder)

In [34]:
def opinion_change(rows_indices):    
    texts = [ clean_text(replies.loc[index, 'text']) for index in rows_indices ]

    sentiments = senti.getSentiment(texts, score='scale')
    sentiments = np.array(sentiments)

    positive = np.any(sentiments > 0)
    negative = np.any(sentiments < 0)

    return positive and negative, sentiments

In [35]:
opinion_changes = { group: opinion_change(rows_indices)[1].tolist() for group, rows_indices in groups.items() 
                   if opinion_change(rows_indices)[0] == np.bool_('True') }

In [36]:
len(opinion_changes)

11

In [37]:
opinion_changes

{(626200532, 1366208618576502786): [1, -1],
 (626200532, 1366215607742390279): [2, -3, 0],
 (932012546, 1366061818699989002): [2, -4],
 (3083693142, 1366219890508500999): [1, -1],
 (803682424366497792, 1366201487689461760): [3, -1],
 (1123229496864174080, 1366200664402006016): [-1, 0, 0, 1, 1],
 (1296148097307832320, 1366229815725068288): [1, -1],
 (1325892488402440198, 1366077094493638660): [1, -1],
 (1327559978291367936, 1366291537319972865): [1, -2],
 (1333112666550923269, 1366041899421863937): [-1, -1, 1],
 (1345033938356502535, 1366200664402006016): [1, -2]}

In [38]:
def biggest_opinion_change(opinion_changes):
    change_type = 'negative'
    biggest_change = 0
    target_group = tuple()
    for group, sentiments in opinion_changes.items():
        change = max(biggest_change, max(sentiments) - min(sentiments))
        if change > biggest_change:
            biggest_change = change
            target_group = group
    
    min_sentiment_index = opinion_changes[target_group].index(min(opinion_changes[target_group]))
    max_sentiment_index = opinion_changes[target_group].index(max(opinion_changes[target_group]))
    change_type = 'positive' if min_sentiment_index < max_sentiment_index else change_type

    return target_group, change_type

In [39]:
group, change_type = biggest_opinion_change(opinion_changes)

In [40]:
group

(932012546, 1366061818699989002)

In [41]:
change_type

'negative'

In [42]:
def save_opinion_changes_to_JSON(opinion_changes, path):
    # create a new dictionary with string keys
    opinion_changes_for_JSON_file = {str(key): value for key, value in opinion_changes.items() }
    with open(path, 'w') as file:
        json.dump(opinion_changes_for_JSON_file, file, indent=4)

In [43]:
replies_opinion_changes_path = rootdir_path + '/replies_opinion_changes_sorted.json'

In [44]:
save_opinion_changes_to_JSON(opinion_changes, replies_opinion_changes_path)

In [79]:
# multiple_replies_first_300.loc[(multiple_replies_first_300['author_id'] == 932012546) & (multiple_replies_first_300['in_reply_to_tweet_id'] == 1366061818699989002), 'text'].to_csv('/home/andreistoica12/research-internship/biggest_change.txt')

In order to calculate the distribution of the tweets per hour, I will parse the "created_at" column, extract the hour property and create a separate column in each dataframe. I will place it next to the "created_at" column in order to be easily verifiable. Data originates frmo the Twitter API, so it comes in a standard ISO 8601 format, which can be easily parsed using the parser module from the dateutil package.

Note: the cell below runs for approximately 2m30' on my machine (~25-30 seconds for each file).

In [96]:
# for key, day in days.items():
#     if 'hour' not in day.columns:
#         day.insert(1, 'hour', day['created_at'].apply(lambda date: parser.parse(date).hour))
#         print(f"New 'hour' column inserted in the {key} dataframe")

In [97]:
# for key, day in days.items():
#     if 'hour' not in day.columns:
#         hours = []
#         for time in day.loc[:,"created_at"]:
#             hour = parser.parse(time).hour
#             hours.append(hour)
#         day.insert(1, "hour", hours, True)
#         print(key + " - added 'hour' column")


The final distribution is made up of the sum of all individual days' distributions. I save a figure in the graphs/ folder for each day, as well as an overall distribution.

In [24]:
# final_distribution = pd.Series(0, index=days['1-3-2021'].loc[:,'hour'].sort_values(ascending=True).unique())
# for key, day in days.items():
#     hour_column_ascending = day.loc[:,"hour"].sort_values(ascending=True)
#     distribution = hour_column_ascending.value_counts()[hour_column_ascending.unique()]
#     final_distribution = final_distribution.add(distribution)
#     axes = distribution.plot(kind='bar')
#     figure_path = f"{covaxxy_longitudinal_analysis_graphs}/{key}_distribution.png"
#     axes.figure.savefig(figure_path)
#     plt.close()
# axes = final_distribution.plot(kind='bar')
# figure_path = f"{covaxxy_longitudinal_analysis_graphs}/overall_distribution.png"
# axes.figure.savefig(figure_path)
# plt.close()
