Delete all variables in the current environment (if you have already run some cells) - clean state.

In [185]:
%reset

Import all necessary packages.

NOTE: Replace the download directory of the NLTK tokenizer files with your preferred directory (I chose the root directory of the Research Internship project)

In [186]:
import networkx as nx
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import os
import shutil
from datetime import datetime
from dateutil import parser

from sentistrength import PySentiStr
import re

import nltk
nltk.download('punkt', download_dir='/home/andreistoica12/research-internship')
from nltk.tokenize import word_tokenize
import nltk.data
# Load the punkt tokenizer data from the local directory
nltk.data.load('tokenizers/punkt/PY3/english.pickle')

import json
from collections import defaultdict

[nltk_data] Downloading package punkt to
[nltk_data]     /home/andreistoica12/research-internship...
[nltk_data]   Package punkt is already up-to-date!


Replace with the path to the root folder of the project.

In [187]:
# Root folder of the project; the 'files'/'graphs' output folders and the opinion-changes JSON path are built from it.
rootdir_path = '/home/andreistoica12/research-internship'

Replace with the path to the folder where we store the dataset.

In [188]:
# Folder holding the daily 'tweet_ids--YYYY-MM-DD.csv' files. Derived from
# rootdir_path so the project root only has to be edited in one place;
# point it elsewhere if the dataset lives outside the project folder.
data_path = os.path.join(rootdir_path, 'data', 'covaxxy-csv-complete')

In [189]:
# Location of the SentiStrength jar, derived from rootdir_path instead of
# repeating the absolute project path.
path_to_sentistrength_jar = os.path.join(rootdir_path, 'SentiStrength', 'SentiStrengthCom.jar')

In [190]:
# Location of the SentiStrength language data folder, derived from
# rootdir_path instead of repeating the absolute project path.
path_to_sentistrength_language_folder = os.path.join(rootdir_path, 'SentiStrength', 'LanguageFolder')

In [191]:
# JSON file in which the computed opinion changes are stored; built with
# os.path.join like the other paths derived from rootdir_path.
path_to_replies_opinion_changes = os.path.join(rootdir_path, 'replies_opinion_changes.json')

Create 2 subfolders to store important files and graphs, respectively. If they already exist (from previous runs of the project), delete the folders and their contents and create empty folders to store the current files and graphs, relevant to the current state of the project.

In [192]:
# Start every run from empty output folders: whatever a previous run left in
# 'files' or 'graphs' is deleted before the folder is created again.
# shutil.rmtree is called with its defaults (errors are raised); the explicit
# `onerror=None` argument was dropped because `onerror` is deprecated.
files_path = os.path.join(rootdir_path, 'files')
if os.path.exists(files_path):
    shutil.rmtree(files_path)
os.makedirs(files_path)

graphs_path = os.path.join(rootdir_path, 'graphs')
if os.path.exists(graphs_path):
    shutil.rmtree(graphs_path)
os.makedirs(graphs_path)

In [193]:
# Sub-folder of the graphs folder for the covaxxy plots; removed (if present)
# and recreated empty. shutil.rmtree is called with its defaults - the
# deprecated `onerror` argument is no longer passed.
covaxxy_graphs_path = os.path.join(graphs_path, 'covaxxy')
if os.path.exists(covaxxy_graphs_path):
    shutil.rmtree(covaxxy_graphs_path)
os.makedirs(covaxxy_graphs_path)

In [194]:
# Sub-folder for the longitudinal-analysis plots; removed (if present) and
# recreated empty. shutil.rmtree is called with its defaults - the deprecated
# `onerror` argument is no longer passed.
covaxxy_longitudinal_analysis_graphs = os.path.join(covaxxy_graphs_path, 'longitudinal-analysis')
if os.path.exists(covaxxy_longitudinal_analysis_graphs):
    shutil.rmtree(covaxxy_longitudinal_analysis_graphs)
os.makedirs(covaxxy_longitudinal_analysis_graphs)

We define a function remove_emoji() that takes a text string as input and removes every character matched by a regular expression covering the Unicode ranges commonly used for emojis. The regular expression includes different ranges of Unicode characters that represent different types of emojis, such as emoticons, symbols, and flags.

In [195]:
def remove_emoji(text):
    """Return ``text`` with every run of characters from the emoji ranges removed.

    The ranges are combined into a single regular-expression character class;
    each maximal run of matching characters is replaced by the empty string.

    NOTE(review): the last range spans U+24C2 through U+1F251, so it also
    removes non-emoji characters inside that span (CJK ideographs, for
    example). Confirm this is acceptable for the texts being cleaned.
    """
    emoji_ranges = (
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # flags (iOS)
        "\U00002702-\U000027B0"
        "\U000024C2-\U0001F251"
    )
    return re.sub("[" + emoji_ranges + "]+", "", text, flags=re.UNICODE)

In [196]:
def remove_stopwords(text):
    """Tokenize ``text`` and return it without the custom stop words.

    A hand-written stop-word list is used instead of NLTK's general-purpose
    English list, because that list also drops words such as "not" and "all",
    which matter when the sentiment of the text is computed afterwards.

    The surviving tokens are joined back together with single spaces.
    Membership is tested on the lower-cased token; the token itself is kept
    unchanged.
    """
    custom_stop_words = {
        "the", "and", "or", "a", "an", "in", "of", "on", "to", "that", "this", "is", "are",
        "was", "were", "am", "be", "been", "has", "have", "had", "do", "does", "did", "will", "would",
        "should", "can", "could", "may", "might", "must", "shall", "shouldn't", "wouldn't", "couldn't",
        "can't", "mustn't", "haven't", "hasn't", "hadn't", "didn't", "doesn't", "don't",
    }

    kept_words = (
        word for word in word_tokenize(text)
        if word.lower() not in custom_stop_words
    )
    return ' '.join(kept_words)

In [197]:
def clean_text(text):
    """Normalise a tweet text before it is passed to the sentiment scorer.

    Steps, in order:
      1. lowercase the text and turn newlines into spaces;
      2. remove @mentions;
      3. remove hyperlinks (``http...`` / ``www....``);
      4. expand contractions (e.g. "you're" => "you are");
      5. remove punctuation;
      6. remove emojis;
      7. remove the custom English stop words.

    Contractions are expanded BEFORE punctuation is stripped: once the
    apostrophes are gone the contraction can no longer be recognised. The
    expanded text is also assigned back, so the expansion actually takes
    effect.

    Parameters:
        text: the raw tweet text (str).

    Returns:
        The cleaned text (str).
    """
    # Third-party package that the notebook's import cell does not load, hence the local import.
    import contractions

    # 1. Lowercase all words in the text
    text = text.lower()

    # Newlines become spaces so that the last word of one line and the first
    # word of the next are not glued together (which could also make a
    # following word part of an @mention and get it deleted in step 2).
    text = text.replace("\n", " ")

    # 2. Remove words starting with '@' - tags (most common noise in replies)
    text = re.sub(r'@\w+', '', text)

    # 3. Remove words starting with 'http' or 'www.' - hyperlinks
    text = re.sub(r'http\S+|www\.\S+', '', text)

    # 4. Expand contractions, such as you're => you are.
    # NOTE(review): lower() is applied again in case the expansion returns
    # capitalised replacements - confirm against the contractions package.
    text = contractions.fix(text).lower()

    # 5. Remove punctuation from the text using regular expressions
    text = re.sub(r'[^\w\s]', '', text)

    # 6. Remove emojis
    text = remove_emoji(text)

    # 7. Remove stopwords in English
    text = remove_stopwords(text)

    return text

In [198]:
def create_days(data_path):
    """Load the daily tweet CSV files of ``data_path`` into a dictionary.

    Only entries named ``tweet_ids--YYYY-MM-DD.csv`` are loaded; anything else
    in the folder (stray files, checkpoint folders, ...) is ignored instead of
    aborting the whole load.

    Parameters:
        data_path: folder that contains the daily CSV files.

    Returns:
        dict mapping a simplified date string ``"day-month-year"`` (no zero
        padding, e.g. ``"1-3-2021"``) to the DataFrame read from that day's
        file, without its 'id' column. Entries are inserted in chronological
        order.

    Raises:
        KeyError: if a daily file has no 'id' column.
    """
    file_name_format = "tweet_ids--%Y-%m-%d.csv"

    # Parse the date out of every file name once; names that do not follow
    # the expected format are not data files and are skipped.
    dated_files = []
    for file_name in os.listdir(data_path):
        try:
            file_date = datetime.strptime(file_name, file_name_format)
        except ValueError:
            continue
        dated_files.append((file_date, file_name))

    # Keep everything in chronological order from the start.
    dated_files.sort(key=lambda dated_file: dated_file[0])

    days = {}
    for file_date, file_name in dated_files:
        key = f"{file_date.day}-{file_date.month}-{file_date.year}"
        day = pd.read_csv(os.path.join(data_path, file_name), index_col=False)
        days[key] = day.drop('id', axis=1)

    return days

In [199]:
# Load the daily CSV files found in data_path into a dict of DataFrames keyed by 'day-month-year'.
days = create_days(data_path)

In [200]:
# All daily dataframes share the same structure, so they are stacked into one
# dataframe. Working on the dataset as a whole is necessary because a reply
# to a tweet posted on one day can arrive several days later.

# Concatenate the per-day dataframes; ignore_index=True renumbers the rows.
merged_days = pd.concat(list(days.values()), ignore_index=True)

In [201]:
# Parse the 'created_at' column into pandas datetimes (replaces the column in place).
merged_days['created_at'] = pd.to_datetime(merged_days['created_at'])

In [202]:
# Sort the dataframe chronologically on the datetime column.
# NOTE: the index is reset as well, so that later on the index tells which
# tweet was posted first (useful for opinion change).
# A stable sort (mergesort) is requested explicitly: pandas' default
# quicksort may reorder rows that share the same timestamp, which would make
# the index-based ordering differ from run to run.
merged_days = merged_days.sort_values('created_at', kind='mergesort').reset_index(drop=True)

In [203]:
# Keep only the rows whose reference_type is 'replied_to'; .copy() makes the result an independent dataframe.
replies = merged_days[merged_days['reference_type'] == 'replied_to'].copy()

In [204]:
# Keep the replies where the same author replied more than once to the same tweet.
# keep=False flags every row of a duplicated (author_id, in_reply_to_tweet_id) pair, not only the later ones.
multiple_replies = replies[replies.duplicated(subset=['author_id', 'in_reply_to_tweet_id'], keep=False)].copy()

In [205]:
# multiple_replies_first_500 = multiple_replies.head(500).copy()

In [206]:
# Tweet IDs need 64 bits (e.g. 1368371467805728770). Plain `int` maps to a
# platform-dependent width in NumPy, so the 64-bit dtype is requested
# explicitly.
multiple_replies['in_reply_to_tweet_id'] = multiple_replies['in_reply_to_tweet_id'].astype('int64')

In [207]:
# Group the repeated replies by (author_id, in_reply_to_tweet_id): one group per author / parent-tweet pair.
grouped_df = multiple_replies.groupby(['author_id', 'in_reply_to_tweet_id'])

In [208]:
# Mapping from each (author_id, in_reply_to_tweet_id) key to the index labels of the rows in that group.
groups_of_replies = grouped_df.groups

In [209]:
# Spot check: show the row labels that belong to one (author_id, in_reply_to_tweet_id) group.
groups_of_replies[(5895742, 1368371467805728770)]

Int64Index([3424369, 3461244], dtype='int64')

In [132]:
# Create the SentiStrength wrapper and tell it where the SentiStrength jar and its language data folder are.
senti = PySentiStr()
senti.setSentiStrengthPath(path_to_sentistrength_jar)
senti.setSentiStrengthLanguageFolderPath(path_to_sentistrength_language_folder)

In [135]:
# Module-level progress state shared by all opinion_change() calls:
# number of calls made so far, and the next fraction at which to report.
group_counter = 0
progress = 0.01


def opinion_change(groups_of_replies, group, rows_indices):
    """Score the replies of one group and tell whether the opinion changed.

    Parameters:
        groups_of_replies: all groups; only its length is used, for the
            progress report.
        group: key of the current group (not used by the computation).
        rows_indices: index labels, in the module-level ``replies``
            dataframe, of the replies that belong to the group.

    Returns:
        (changed, sentiments): ``sentiments`` is a NumPy array with the
        'scale' score SentiStrength returns for each cleaned reply text;
        ``changed`` is true when at least one score is above zero and at
        least one is below zero.

    Side effects:
        Increments the global ``group_counter`` on every call and prints a
        progress line each time the processed fraction reaches the global
        ``progress`` threshold, which is then raised by 0.01.
    """
    global group_counter, progress

    group_counter += 1
    total_groups = len(groups_of_replies)
    if group_counter / total_groups >= progress:
        print(f"Progress: {group_counter} / {total_groups} groups of replies processed.")
        progress += 0.01

    # Clean the text of every reply in the group before scoring it.
    cleaned_texts = []
    for row_label in rows_indices:
        cleaned_texts.append(clean_text(replies.loc[row_label, 'text']))

    scores = np.array(senti.getSentiment(cleaned_texts, score='scale'))

    has_positive = np.any(scores > 0)
    has_negative = np.any(scores < 0)

    return has_positive and has_negative, scores

IMPORTANT NOTE:

THERE IS NO NEED TO RUN THE CELLS BETWEEN THE LINES BELOW!!!!!!!
It took more than 15 minutes when I ran it on the whole replies dataset...

I saved the resulting dictionary into a JSON file, which can be found in the root directory of the project. This can be imported into a dictionary with ease (code can be found in the next parts of the notebook).

--------------------------------------------------------------------------------------------------------------------------------------------

In [136]:
# Keep, for every group that contains both a positive and a negative reply
# (i.e. an opinion change), the list of sentiment scores of its replies.
# opinion_change() is called exactly once per group: every call runs
# SentiStrength on the group's texts and advances the progress counter, so
# calling it twice (once for the filter, once for the value) doubled the
# running time and made the reported progress exceed the number of groups.
opinion_changes = {}
for group, rows_indices in groups_of_replies.items():
    changed, sentiments = opinion_change(groups_of_replies, group, rows_indices)
    if changed:
        opinion_changes[group] = sentiments.tolist()

Progress: 56 / 5587 groups of replies processed.
Progress: 112 / 5587 groups of replies processed.
Progress: 168 / 5587 groups of replies processed.
Progress: 224 / 5587 groups of replies processed.
Progress: 280 / 5587 groups of replies processed.
Progress: 336 / 5587 groups of replies processed.
Progress: 392 / 5587 groups of replies processed.
Progress: 447 / 5587 groups of replies processed.
Progress: 503 / 5587 groups of replies processed.
Progress: 559 / 5587 groups of replies processed.
Progress: 615 / 5587 groups of replies processed.
Progress: 671 / 5587 groups of replies processed.
Progress: 727 / 5587 groups of replies processed.
Progress: 783 / 5587 groups of replies processed.
Progress: 839 / 5587 groups of replies processed.
Progress: 894 / 5587 groups of replies processed.
Progress: 950 / 5587 groups of replies processed.
Progress: 1006 / 5587 groups of replies processed.
Progress: 1062 / 5587 groups of replies processed.
Progress: 1118 / 5587 groups of replies processed

In [210]:
def save_opinion_changes_to_JSON(opinion_changes, path):
    """Write ``opinion_changes`` to ``path`` as JSON indented by 4 spaces.

    JSON object keys must be strings, so every key is stored as its ``str()``
    representation; the values are written unchanged.
    """
    serialisable = {}
    for group, sentiments in opinion_changes.items():
        serialisable[str(group)] = sentiments

    with open(path, 'w') as output_file:
        json.dump(serialisable, output_file, indent=4)

In [211]:
# Persist the computed opinion changes to the JSON file so that the slow sentiment computation does not have to be repeated.
save_opinion_changes_to_JSON(opinion_changes, path_to_replies_opinion_changes)

NameError: name 'opinion_changes' is not defined

--------------------------------------------------------------------------------------------------------------------------------------------

In [212]:
def load_opinion_changes(path_to_replies_opinion_changes):
    """Load the opinion changes stored in a JSON file.

    The keys in the file are the string form of tuples (e.g. ``"(1, 2)"``);
    each one is converted back into the tuple it represents.

    The keys are parsed with ``ast.literal_eval`` rather than ``eval``: it
    only accepts Python literals, so a tampered file cannot execute code.

    Parameters:
        path_to_replies_opinion_changes: path of the JSON file to read.

    Returns:
        dict mapping each tuple key to the value stored for it in the file.

    Raises:
        ValueError or SyntaxError: if a key is not a valid Python literal.
    """
    # Local import: the notebook's import cell does not load ast.
    import ast

    with open(path_to_replies_opinion_changes) as f:
        # Load the JSON data into a Python dictionary
        opinion_changes_from_file = json.load(f)

    # Rebuild the dictionary with tuple keys
    return {
        ast.literal_eval(key): sentiments
        for key, sentiments in opinion_changes_from_file.items()
    }

In [213]:
# Read the previously saved opinion changes back from the JSON file.
opinion_changes = load_opinion_changes(path_to_replies_opinion_changes)

In [230]:
# Share of the reply groups in which an opinion change was recorded, as a percentage rounded to one decimal.
print(f"Percentage of opinion changes from the replies: {round(len(opinion_changes) / len(groups_of_replies) * 100, 1)}%.")

Percentage of opinion changes from the replies: 15.6%.


In [215]:
def biggest_opinion_change(opinion_changes):
    """Find the group whose sentiments span the widest range.

    Parameters:
        opinion_changes: dict mapping a group key to the list of sentiment
            scores of its replies.

    Returns:
        (target_group, change_type): the key of the first group with the
        largest ``max - min`` spread (the spread must be above zero), and
        'positive' when the first occurrence of its lowest score comes before
        the first occurrence of its highest score, otherwise 'negative'.
    """
    widest_spread = 0
    target_group = tuple()
    for candidate, scores in opinion_changes.items():
        spread = max(scores) - min(scores)
        # Strict comparison: on a tie the earlier group is kept.
        if spread > widest_spread:
            widest_spread, target_group = spread, candidate

    target_scores = opinion_changes[target_group]
    lowest_position = target_scores.index(min(target_scores))
    highest_position = target_scores.index(max(target_scores))

    if lowest_position < highest_position:
        return target_group, 'positive'
    return target_group, 'negative'

In [216]:
# Group with the widest sentiment spread, and whether that change is labelled 'positive' or 'negative'.
target_group, change_type = biggest_opinion_change(opinion_changes)

In [220]:
def replies_with_biggest_opinion_change(replies, target_group):
    """Return the texts of the replies that belong to ``target_group``.

    Parameters:
        replies: dataframe with 'author_id', 'in_reply_to_tweet_id' and
            'text' columns.
        target_group: pair ``(author_id, in_reply_to_tweet_id)``.

    Returns:
        list with the 'text' value of every matching row, in dataframe order.
    """
    author_id, parent_tweet_id = target_group[0], target_group[1]
    in_group = (replies['author_id'] == author_id) & (replies['in_reply_to_tweet_id'] == parent_tweet_id)
    return replies.loc[in_group, 'text'].tolist()

In [221]:
# Texts of the replies that belong to the group with the biggest opinion change.
replies_biggest_change = replies_with_biggest_opinion_change(multiple_replies, target_group)

In [222]:
# Display those reply texts.
replies_biggest_change

["@KATIEDOLL1201 @daulan @SandySue1958 I've been taking vaccines since I was very young..  I got the vax -my SIL was SORRY she didn't.  My mom and her sis just got both covid shots -NP.  Mom's 90, her sis it is 94.  Grandma was born in 1899, saw the 1st pandemic.. She loved vaccines. She lived to 94, maybe that's why.",
 "@KATIEDOLL1201 @daulan @SandySue1958 Sorry, they're being so hard on you, but the 'majority' of people who get Shingles say it is very painful, and the 'majority' of people who have the vaccine do not have an issue.  That is by the numbers.  And right now there is soo much misinformation being spread- It's just sad."]

In [223]:
def opinion_changes_types(opinion_changes):
    """Label every opinion change as 'positive' or 'negative'.

    Parameters:
        opinion_changes: dict mapping a group key to the list of sentiment
            scores of its replies.

    Returns:
        dict mapping each group key to 'positive' when the first occurrence
        of the group's lowest score comes before the first occurrence of its
        highest score, and to 'negative' otherwise.
    """
    def direction(scores):
        lowest_position = scores.index(min(scores))
        highest_position = scores.index(max(scores))
        return 'positive' if lowest_position < highest_position else 'negative'

    return {group: direction(scores) for group, scores in opinion_changes.items()}

In [224]:
# Label every recorded opinion change as 'positive' or 'negative'.
changes_types = opinion_changes_types(opinion_changes)

In [225]:
def value_count_in_dict(dict, value_to_count):
    """Count how many entries of ``dict`` have the value ``value_to_count``.

    The count is taken over the dictionary passed in as argument; the
    previous version ignored the argument and always counted the values of
    the module-level ``changes_types``.

    Parameters:
        dict: the dictionary whose values are inspected. The name shadows
            the built-in ``dict``; it is kept so that callers passing it by
            keyword keep working.
        value_to_count: the value to look for.

    Returns:
        int: number of entries whose value equals ``value_to_count``
        (0 when there is none or the dictionary is empty).
    """
    return sum(1 for value in dict.values() if value == value_to_count)

In [229]:
# Share of the opinion changes labelled 'positive', as a percentage rounded to one decimal.
print(f"Percentage of positive opinion changes from the replies: {round(value_count_in_dict(changes_types, 'positive') / len(changes_types) * 100, 1)}%")

Percentage of positive opinion changes from the replies: 46.1%


In [228]:
# Share of the opinion changes labelled 'negative', as a percentage rounded to one decimal.
print(f"Percentage of negative opinion changes from the replies: {round(value_count_in_dict(changes_types, 'negative') / len(changes_types) * 100, 1)}%")

Percentage of negative opinion changes from the replies: 53.9%


In order to calculate the distribution of the tweets per hour, I will parse the "created_at" column, extract the hour property and create a separate column in each dataframe. I will place it next to the "created_at" column so that it can be easily verified. The data originates from the Twitter API, so it comes in a standard ISO 8601 format, which can be easily parsed using the parser module from the dateutil package.

Note: the cell below runs for approximately 2m30' on my machine (~25-30 seconds for each file).

In [96]:
# for key, day in days.items():
#     if 'hour' not in day.columns:
#         day.insert(1, 'hour', day['created_at'].apply(lambda date: parser.parse(date).hour))
#         print(f"New 'hour' column inserted in the {key} dataframe")

In [97]:
# for key, day in days.items():
#     if 'hour' not in day.columns:
#         hours = []
#         for time in day.loc[:,"created_at"]:
#             hour = parser.parse(time).hour
#             hours.append(hour)
#         day.insert(1, "hour", hours, True)
#         print(key + " - added 'hour' column")


The final distribution is made up of the sum of all individual days' distributions. I save a figure in the graphs/ folder for each day, as well as an overall distribution.

In [24]:
# final_distribution = pd.Series(0, index=days['1-3-2021'].loc[:,'hour'].sort_values(ascending=True).unique())
# for key, day in days.items():
#     hour_column_ascending = day.loc[:,"hour"].sort_values(ascending=True)
#     distribution = hour_column_ascending.value_counts()[hour_column_ascending.unique()]
#     final_distribution = final_distribution.add(distribution)
#     axes = distribution.plot(kind='bar')
#     figure_path = f"{covaxxy_longitudinal_analysis_graphs}/{key}_distribution.png"
#     axes.figure.savefig(figure_path)
#     plt.close()
# axes = final_distribution.plot(kind='bar')
# figure_path = f"{covaxxy_longitudinal_analysis_graphs}/overall_distribution.png"
# axes.figure.savefig(figure_path)
# plt.close()
