Delete all variables in the current environment (if you have already run some cells) - clean state.

In [1]:
%reset

Import all necessary packages.

NOTE: Replace the download directory of the NLTK tokenizer files with your preferred directory (I chose the root directory of the Research Internship project)

In [2]:
import sys

if not hasattr(sys, 'argv'):
    sys.argv = ['']

In [3]:
import numpy as np
import pandas as pd
import os
import shutil
from datetime import datetime
from multiprocessing import Pool
import multiprocessing

from sentistrength import PySentiStr

import re
import contractions
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk.data
nltk.download('punkt')
# Load the punkt tokenizer data from the local directory
nltk.data.load('tokenizers/punkt/PY3/english.pickle')

import json
from collections import defaultdict

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/andreistoica12/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /home/andreistoica12/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Replace with the path to the root folder of the project.

In [4]:
rootdir_path = '/home/andreistoica12/research-internship'

Create 1 subfolder to store important graphs. If it already existed (from previous runnings of the project), delete the folder and its contents and create an empty folder to store the current graphs, relevant to the current state of the project.

In [5]:
graphs_path = os.path.join(rootdir_path, 'graphs')
if os.path.exists(graphs_path):
   shutil.rmtree(graphs_path, ignore_errors=False, onerror=None)
os.makedirs(graphs_path)

In [6]:
covaxxy_graphs_path = os.path.join(graphs_path, 'covaxxy')
if os.path.exists(covaxxy_graphs_path):
   shutil.rmtree(covaxxy_graphs_path, ignore_errors=False, onerror=None)
os.makedirs(covaxxy_graphs_path)

In [7]:
covaxxy_longitudinal_analysis_graphs = os.path.join(covaxxy_graphs_path, 'longitudinal-analysis')
if os.path.exists(covaxxy_longitudinal_analysis_graphs):
   shutil.rmtree(covaxxy_longitudinal_analysis_graphs, ignore_errors=False, onerror=None)
os.makedirs(covaxxy_longitudinal_analysis_graphs)

Replace with the path to the folder where we store the dataset.

In [8]:
data_path = rootdir_path + '/data/covaxxy-csv-complete'

In [9]:
path_to_sentistrength = rootdir_path + '/SentiStrength'

In [10]:
path_to_sentistrength_jar = path_to_sentistrength + '/SentiStrengthCom.jar'

In [11]:
path_to_sentistrength_language_folder = path_to_sentistrength + '/LanguageFolder'

In [12]:
files_path = rootdir_path + '/files'

In [13]:
path_to_replies_opinion_changes = files_path + '/replies_opinion_changes.json'

In [14]:
path_to_stopwords = files_path + '/stopwords.txt'

In [15]:
def custom_stop_words(path_to_stopwords):
    stop_words = set()
    with open(path_to_stopwords, 'r') as f:
        for line in f:
            word = line.strip()  # remove whitespace and newline characters
            stop_words.add(word)
    return stop_words

In [16]:
stop_words = custom_stop_words(path_to_stopwords)

In [17]:
# # I can use the predefined list of stopwords provided by NLTK, but it's for general purpose
# # and the results when computing the sentiment are worse than expected, e.g. it considers
# # words, such as "not" and "all" to be stopwords in contexts where they are actually important.
# # So, I will use a custom stopwords list.
# stop_words = set(stopwords.words('english'))

We define a function remove_emoji() that takes a text string as input and uses a regular expression to match all Unicode characters that are classified as emojis. The regular expression includes different ranges of Unicode characters that represent different types of emojis, such as emoticons, symbols, and flags.

In [18]:
def remove_emoji(text):
    emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    
    return emoji_pattern.sub(r'', text)

In [19]:
def remove_stopwords(text, stop_words):
    # Tokenize the text
    tokens = word_tokenize(text)

    # Remove the stopwords
    filtered_tokens = [token for token in tokens if token.lower() not in stop_words]

    # Join the filtered tokens back into a string
    filtered_text = ' '.join(filtered_tokens)

    return filtered_text

In [20]:
def clean_text(text, stop_words):    
    # 1. Lowercase all words in the text
    text = text.lower()

    # 2. Replace the new line character with empty string
    text = text.replace("\n", "")
    
    # 3. Remove words starting with '@' - tags (most common noise in replies)
    text = re.sub(r'@\w+', '', text, flags=re.MULTILINE)

    # 4. Remove words starting with 'http' - hyperlinks
    text = re.sub(r'http\S+|www.\S+', '', text, flags=re.MULTILINE)

    # 5. Remove punctuation from the text using regular expressions
    text = re.sub(r'[^\w\s]', '', text)

    # 6. Remove contractions, such as you're => you are
    contractions.fix(text)

    # 7. Remove emojis
    text = remove_emoji(text)

    # 8. Remove stopwords in English
    text = remove_stopwords(text, stop_words)

    return text

In [21]:
def filter_df_by_date(path):
        # Read the CSV file into a pandas dataframe
        df_from_file = pd.read_csv(path, index_col= False)
            
        # Convert the "created_at" column to a pandas datetime object
        df_from_file['created_at'] = pd.to_datetime(df_from_file['created_at'])

        # Get all unique timestamp values from the "created_at" column
        unique_dates = df_from_file['created_at'].dt.date.unique()

        # Create a dictionary where the keys are the unique timestamp values
        # and the values are dataframes that correspond to each unique timestamp value
        days = {}
        for date in unique_dates:
            # Extract the rows that have the current timestamp value
            mask = df_from_file['created_at'].dt.date == date
            filtered_df = df_from_file[mask]
            # Store the resulting subset of rows as a dataframe in the dictionary
            days[date] = filtered_df
        
        return days

In [22]:
def create_days(data_path):
    # In order to read the data from the files, I need the paths of the files to be passed on to the read_csv() function. 
    # The order of the days in the file paths needs to be consistent with the order of the dates in the keys
    file_paths = [ os.path.join(data_path, file) for file in os.listdir(data_path) ]

    # Set the number of processes to run in parallel
    num_processes = multiprocessing.cpu_count() * 2
    # Create a pool of workers to execute the filter_df_by_date function
    with Pool(processes=num_processes) as pool:
        # Use the pool to execute the filter_df_by_date function on each file in parallel
        results = pool.map(filter_df_by_date, file_paths)

    days = dict()
    for result in results:
        days = {k: pd.concat([days.get(k, pd.DataFrame()), result.get(k, pd.DataFrame())]) for k in set(days) | set(result)}

    # Dictionary comprehension to format datetime object keys to strings - useful for ease of accessing
    days = {datetime_key.strftime('%d-%m-%Y'): df for datetime_key, df in days.items()}

    # Iterate over all the keys in the dictionary
    for key in days.keys():
        days[key].sort_values('created_at', inplace=True)
        # Drop the "id" column from the dataframe corresponding to the key
        days[key].drop('id', axis=1, inplace=True)


    return days

In [23]:
days = create_days(data_path)

In [25]:
def create_merged_days(days):
    # Here, I merged all data (from all available days) into a single dataframe (they have the same structure).
    # I did that because some replies to a tweet posted today can come some days after, so we need to take care
    # of the dataset as a whole.

    
    # Convert string keys to datetime objects and sort them
    sorted_keys = sorted([datetime.strptime(k, '%d-%m-%Y') for k in days.keys()])

    # Convert datetime objects back to string keys with format '%d-%m-%Y'
    sorted_key_strings = [k.strftime('%d-%m-%Y') for k in sorted_keys]

    # concatenate the dataframes and reset the index
    merged_days = pd.concat([days[key] for key in sorted_key_strings], ignore_index=True)

    # Convert string column to datetime
    merged_days['created_at'] = pd.to_datetime(merged_days['created_at'])

    return merged_days

In [26]:
merged_days = create_merged_days(days)

In [27]:
merged_days

Unnamed: 0,created_at,tweet_id,credible,author_id,text,urls,name,username,verified,location,...,retweet_author_id,retweet_id,retweeted_screen_name,user_mentions_id,user_mentions_screen_name,in_reply_to_user_id,in_reply_to_tweet_id,in_reply_to_username,reference_type,reference_id
0,2021-02-24 18:00:10+00:00,1364636249852502018,1,107501328,RT @Maricopahealth: At one of our community po...,#,2-1-1 Arizona,211arizona,False,Arizona,...,29816986,1364632754042802176,Maricopahealth,29816986,Maricopahealth,#,#,#,retweeted,1364632754042802176
1,2021-02-24 18:00:18+00:00,1364636282664574978,1,26761523,Ready for DAY 2 of State of the Valley? Join u...,"jointventure.org,twitter.com,",Joint Venture SV,JointVentureSVN,False,"San Jose, CA",...,#,#,#,#,#,#,#,#,#,#
2,2021-02-24 18:00:30+00:00,1364636333596008449,1,1234926105234034689,RT @SteveStaeger: When #COVID19Colorado is ove...,#,Colorado Coronavirus Updates,COVIDinColorado,False,"Denver, Colorado",...,182037688,1364293582157307906,SteveStaeger,182037688,SteveStaeger,#,#,#,retweeted,1364293582157307906
3,2021-02-24 18:03:16+00:00,1364637028948709377,1,1329106574082641920,"#SD37: Starting next week, @OCHealth will star...","bit.ly,www.ocregister.com,",Senator Dave Min,SenDaveMin,True,"Irvine, CA",...,#,#,#,36069538,ochealth,#,#,#,#,#
4,2021-02-24 18:03:35+00:00,1364637110951583746,1,1363750425459970048,RT @jatinde45666597: Vaccination has been star...,#,Reena Sharma,write2reena,False,"Auckland, New Zealand",...,1295748297529884673,1364087633538859008,jatinde45666597,1295748297529884673,jatinde45666597,#,#,#,retweeted,1364087633538859008
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5123691,2021-03-10 23:59:52+00:00,1369800203939745796,1,434360613,RT @Philo: The boys of #SouthPark are at it ag...,#,ami_,ami_tvdfan,False,#,...,81766872,1369799981763162113,philoTV,23827692,ComedyCentral,#,#,#,retweeted,1369799981763162113
5123692,2021-03-10 23:59:52+00:00,1369800204094963712,1,3083078947,RT @ericswalwell: The #AmericanRescuePlan puts...,#,Thomas Albrecht 🇺🇸☮️,TomAlb88,False,#,...,377609596,1369727803768201218,ericswalwell,377609596,ericswalwell,#,#,#,retweeted,1369727803768201218
5123693,2021-03-10 23:59:53+00:00,1369800204761899011,1,29801287,"RT @TheDweck: Wow, vision boards work",#,Fauxnly Fans,thenickkontz,False,"ÜT: 43.508306,-96.779489",...,98247788,1369742802590990336,TheDweck,98247788,TheDweck,#,#,#,retweeted,1369742802590990336
5123694,2021-03-10 23:59:53+00:00,1369800205445521409,1,1095155084,just saw some lady on the news say she’s not g...,#,cristal✨,cristal_guz,False,Raleigh |22|,...,#,#,#,#,#,#,#,#,#,#


QUOTES

In [28]:
quotes = merged_days[merged_days['reference_type'] == 'quoted'].copy()

In [29]:
quotes

Unnamed: 0,created_at,tweet_id,credible,author_id,text,urls,name,username,verified,location,...,retweet_author_id,retweet_id,retweeted_screen_name,user_mentions_id,user_mentions_screen_name,in_reply_to_user_id,in_reply_to_tweet_id,in_reply_to_username,reference_type,reference_id
8,2021-02-24 18:05:18+00:00,1364637540330872838,1,1069764336279683072,SO many people don't realize how many inmates ...,"twitter.com,",Chern like 🧈,VisualsByChern,False,"California, USA",...,#,#,#,#,#,#,#,#,quoted,1364636471387246592
28,2021-02-24 18:27:32+00:00,1364643136199299072,1,43545377,"Just a reminder, we will have plenty of vaccin...","twitter.com,",Ann Bibby,anniegirl1138,False,Tea Time,...,#,#,#,#,#,#,#,#,quoted,1364583293295943681
45,2021-02-24 18:42:16+00:00,1364646846434435074,1,139836595,Funny how he supports the Genocide of the Abor...,"twitter.com,",mart chris,marchris52,False,BC 🇨🇦,...,#,#,#,#,#,#,#,#,quoted,1364351771209043970
69,2021-02-24 19:05:30+00:00,1364652691750748166,1,754541231372275712,Gd what a genius idea I WILL be taking ur advi...,"twitter.com,",Aidan Chase,HPEveryoneLives,False,#,...,#,#,#,427037072,waywardskyeyes,#,#,#,quoted,1364289562734960641
84,2021-02-24 19:23:33+00:00,1364657233888309249,1,24785956,"Think this will be a policy change, soon... ht...","twitter.com,",Mark I Williams M.D.,CameraGuyBakoCA,False,"Bakersfield, CA",...,#,#,#,#,#,#,#,#,quoted,1364654324387872772
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5123640,2021-03-10 23:59:44+00:00,1369800167386357762,1,1244589997186887683,I Trump had acted responsibly and listened to ...,"twitter.com,",Rand Paul's Neighbor - parody it's good to laugh,jbrown11871,False,Earth,...,#,#,#,#,#,#,#,#,quoted,1369793211833741312
5123641,2021-03-10 23:59:44+00:00,1369800166698582025,1,1100444916,That’s that chalkasain in you https://t.co/WBO...,"twitter.com,",Amiri ambassador,treyfive_,False,Switzerland,...,#,#,#,#,#,#,#,#,quoted,1369797672660594695
5123661,2021-03-10 23:59:47+00:00,1369800181881962497,1,2422329920,Remember this in the Coming Dark Days; \nIn 19...,"twitter.com,",❌❌ McFixit1 ❌❌,navstadt,False,Panther Burn MS,...,#,#,#,#,#,#,#,#,quoted,1369644619747778562
5123665,2021-03-10 23:59:48+00:00,1369800185786765314,1,3730426754,💥BREAKING NEWS💥\n\n#PostcardsToVoters #Resist ...,"twitter.com,",Tall Miracle 🇺🇦☮️🗳📬,miracleguppy,False,"Boise, ID USA",...,#,#,#,#,#,#,#,#,quoted,1369774083118891009


In [30]:
quotes.loc[84, 'text']

'Think this will be a policy change, soon... https://t.co/Pffvij3RKf'

In [31]:
quotes.loc[84, 'tweet_id']

1364657233888309249

In [32]:
quotes.loc[84, 'reference_id']

'1364654324387872772'

In [33]:
list_of_quote_texts = quotes.head(200).loc[:, 'text'].tolist()

In [34]:
# Open the file for writing (use 'a' instead of 'w' to append)
with open(files_path + '/first_200_quotes.txt', 'w') as file:
    # Loop through the list_of_quote_texts and write each text to a new line in the file
    for text in list_of_quote_texts:
        file.write(text + '\n')

REACTIONS

In [35]:
reaction_types = ['replied_to', 'quoted', 'retweeted']

In [36]:
def group_reactions(merged_days, reaction_types):
    reactions = merged_days[merged_days['reference_type'].isin(reaction_types)]
    multiple_reactions = reactions[reactions.duplicated(subset=['author_id', 'reference_id'], keep=False)].copy()
    multiple_reactions['reference_id'] = multiple_reactions['reference_id'].astype(int)

    # group the rows by the two columns
    grouped_df = multiple_reactions.groupby(['author_id', 'reference_id'])
    groups_of_reactions = grouped_df.groups

    return groups_of_reactions

In [37]:
groups_of_reactions = group_reactions(merged_days, reaction_types)

In [38]:
len(groups_of_reactions.keys())

34501

In [39]:
senti = PySentiStr()
senti.setSentiStrengthPath(path_to_sentistrength_jar)
senti.setSentiStrengthLanguageFolderPath(path_to_sentistrength_language_folder)

In [40]:
def compute_sentiments(rows_indices, reactions, stop_words):
    texts = [ clean_text(reactions.loc[index, 'text'], stop_words) 
             if reactions.loc[index, 'reference_type'] != 'retweeted' else 'extremely fabulous'
             for index in rows_indices ]
    
    sentiments = senti.getSentiment(texts, score='scale')

    return sentiments

In [41]:
def opinion_change(rows_indices, reactions, stop_words):
    """Function to detect whether an opinion change occured within a group of reactions (replies/quotes/retweets).

    Args:
        rows_indices (pandas.core.indexes.numeric.Int64Index): list of indices in the original dataframe
                                                               where an opinion change has been detected
                                                               (e.g. Int64Index([1848965, 1850146, 1850687], dtype='int64'))

    Returns:
        bool: boolean value which confirms or denies the existence of an opinion change within a group
    """ 
    sentiments = compute_sentiments(rows_indices, reactions, stop_words)
    sentiments = np.array(sentiments)

    positive = np.any(sentiments > 0)
    negative = np.any(sentiments < 0)

    return positive and negative

CREATION OF TEST DATA

In [80]:
def create_test_data(merged_days):
    test_replies = merged_days[merged_days['reference_type'] == 'replied_to']
    multiple_test_replies = test_replies[test_replies.duplicated(subset=['author_id', 'reference_id'], keep=False)].copy()
    multiple_test_replies['reference_id'] = multiple_test_replies['reference_id'].astype(int)
    small_multiple_test_replies = multiple_test_replies.head(2000).copy()

    # group the rows by the two columns
    grouped_df = small_multiple_test_replies.groupby(['author_id', 'reference_id'])
    # grouped_df = multiple_test_replies.groupby(['author_id','reference_id'])
    groups_of_test_replies = grouped_df.groups

    return groups_of_test_replies

In [81]:
groups_test_data = create_test_data(merged_days)

IMPORTANT NOTE:

THERE IS NO NEED TO RUN THE COMMENTED CELLS BETWEEN THE LINES BELOW!
It took more than 15 minutes when I ran the creation of the opinion_changes dictionary on the whole replies dataset...

I saved the resulting dictionary into a JSON file, which can be found in the root directory of the project. This can be imported into a dictionary with ease (code can be found in the next parts of the notebook).

--------------------------------------------------------------------------------------------------------------------------------------------

SEQUENTIAL COMPUTATION - OPINION CHANGES

In [84]:
group_counter = 0
progress = 0.001

def print_progress(groups_of_reactions):
    """Function that prints the progress of the computation of the opinion_changes dictionary,
    as it takes a lot of time for large datasets.

    Args:
        groups_of_replies (dict): dictionary of replies grouped by some columns
    """    
    global group_counter
    global progress
    group_counter += 1

    if ((group_counter / len(groups_of_reactions)) >= progress):
        print(f"Progress: {group_counter} / {len(groups_of_reactions)} groups of reactions processed.")
        progress += 0.001
    if group_counter == len(groups_of_reactions):
        print("All groups have been processed.")

In [85]:
def create_opinion_changes(groups_of_reactions, reactions, progress_printing, stop_words):
    """Function to create the data structure associated with the groups (pairs of user id-s who interacted through replies)
    where an opinio change occured, i.e. when, between their interactions (e.g. one's replies to the other's original post),
    there have been both positive and negative opinions.

    Args:
        groups_of_replies (dict): dictionary of replies grouped by some columns
        progress_printing (bool): boolean value indicating whether the user wishes to print the progress of the groups processed or not
                                  (this can be useful to track when processing large datasets - they usually take a lot of time)

    Returns:
        dict: dictionary where the keys represent the groups where opinion changes occured (as tuples) and the values are
              lists of the sentiments associated to the interactions within each group
    """    
    if progress_printing == True:
        opinion_changes = {}
        for group, rows_indices in groups_of_reactions.items():
            print_progress(groups_of_reactions)
            if opinion_change(rows_indices, reactions, stop_words) == True:
                opinion_changes[group] = compute_sentiments(rows_indices, reactions, stop_words)
    else:
        print("Gradual progress will not be printed.")
        print("If you wish to see it, change the value of the progress_printing input parameter to True.")
        opinion_changes = { group: compute_sentiments(rows_indices, reactions, stop_words) for group, rows_indices in groups_of_reactions.items() 
                        if opinion_change(rows_indices, reactions, stop_words) == True }
    
    return opinion_changes

In [86]:
progress_printing = True

In [85]:
opinion_changes = create_opinion_changes(groups_test_data, merged_days, progress_printing, stop_words)

PARALLEL COMPUTATION - OPINION CHANGES

In [42]:
def process_values(rows_indexes):
    global merged_days
    global stop_words
    processed_values = []
    if opinion_change(rows_indexes, merged_days, stop_words):
        texts = [ clean_text(merged_days.loc[index, 'text'], stop_words) for index in rows_indexes ]
        
        processed_values = senti.getSentiment(texts, score='scale')

    return processed_values

In [43]:
def process_dict_chunk(input_dict):
    # Process a chunk of the input dictionary
    processed_dict = {}
    counter = 0
    progress = 0.0001
    
    for key, index_list in input_dict.items():
        processed_values = process_values(index_list)
        if processed_values:  # only add non-empty lists to the dictionary
            processed_dict[key] = processed_values

        counter += 1
        if ((counter / len(input_dict)) >= progress):
            print(f"Processed {counter} / {len(input_dict)} entries\n")
            progress += 0.0001
        if counter == len(input_dict):
            print(f"Process has finished processing all {len(input_dict)} entries.")


    return processed_dict

In [44]:
def process_dict_in_parallel(input_dict, num_processes=None):
    # Default to using all available CPU cores
    if num_processes is None:
        num_processes = multiprocessing.cpu_count()

    # Split the input dictionary into smaller chunks for parallel processing
    chunk_size = len(input_dict) // num_processes
    input_chunks = [dict(list(input_dict.items())[i:i + chunk_size]) for i in range(0, len(input_dict), chunk_size)]

    # Process the input chunks in parallel using a pool of worker processes
    with multiprocessing.Pool(processes=num_processes) as pool:
        processed_dicts = pool.map(process_dict_chunk, input_chunks)

    # Merge the processed dictionaries from each input chunk
    processed_dict = {}
    for d in processed_dicts:
        processed_dict.update(d)

    return processed_dict

In [84]:
opinion_changes_parallel = process_dict_in_parallel(groups_of_reactions)

In [64]:
len(opinion_changes_parallel)

7482

SAVE DICTIONARY TO JSON FILE

In [65]:
def create_path_to_opinion_changes(reaction_types):
    type = "_".join(reaction_types)
    path = files_path + f"/{type}_opinion_changes.json"

    return path

In [66]:
def save_opinion_changes_to_JSON(opinion_changes, reaction_types):
    """Function to save the dictionary of opinion changes to a JSON file.

    Args:
        opinion_changes (dict): dictionary with opinion changes
        path (str): path where you wish to save the JSON file
    """    
    path = create_path_to_opinion_changes(reaction_types)

    # create a new dictionary with string keys
    opinion_changes_for_JSON_file = {str(key): value for key, value in opinion_changes.items() }
    with open(path, 'w') as file:
        json.dump(opinion_changes_for_JSON_file, file, indent=4)

In [67]:
save_opinion_changes_to_JSON(opinion_changes_parallel, reaction_types)

--------------------------------------------------------------------------------------------------------------------------------------------

LOAD DICTIONARY FROM JSON FILE

In [68]:
def load_opinion_changes(path_to_opinion_changes):
    """Function that generates a dictionary based on a JSON file which contains the opinion changes within the replies of the dataset.

    Args:
        path_to_replies_opinion_changes (str): path to the JSON file associated with the opinion changes within the replies
                                               (e.g. /your/path/to/research-internship)

    Returns:
        dict: the original dictionary containing opinion changes from replies
    """    
    with open(path_to_opinion_changes) as f:
        # Load the JSON data into a Python dictionary
        opinion_changes_from_file = json.load(f)
        # Create a new dictionary with tuple keys
        original_opinion_changes = {}
        for key in opinion_changes_from_file:
            # Convert the string key to a tuple
            new_key = eval(key)
            # Add the key-value pair to the new dictionary
            original_opinion_changes[new_key] = opinion_changes_from_file[key]
            
    return original_opinion_changes

In [69]:
path_to_opinion_changes = create_path_to_opinion_changes(reaction_types)

In [70]:
opinion_changes = load_opinion_changes(path_to_opinion_changes)

In [82]:
len(opinion_changes)

7482

In [83]:
opinion_changes == opinion_changes_parallel

True

INSIGHTS

In [72]:
print(f"Percentage of opinion changes out of the interactions where one user replied multiple times to a source tweet:")
print(f"{round(len(opinion_changes) / len(groups_of_reactions) * 100, 1)}%.")

Percentage of opinion changes out of the interactions where one user replied multiple times to a source tweet:
21.7%.


In [73]:
def biggest_opinion_change(opinion_changes):
    """Function that returns the group (pair of user id-s) which interacted more than once in the context of a single source tweet,
    i.e. one user posted more than one reply to the same source tweet, where the user who reacted had the most drastic opinion change,
    based on the previously computed sentiments of the text.

    Args:
        opinion_changes (dict): dictionary with opinion changes

    Returns:
        tuple: pair of user id-s where the biggest opinion change occured
        str: type of change that occured, e.g. one user tends to agree with the source tweet after some time, 
             when initially he disagreed or vice-versa
    """    
    change_type = 'negative'
    biggest_change = 0
    target_group = tuple()
    for group, sentiments in opinion_changes.items():
        change = max(biggest_change, max(sentiments) - min(sentiments))
        if change > biggest_change:
            biggest_change = change
            target_group = group
    
    min_sentiment_index = opinion_changes[target_group].index(min(opinion_changes[target_group]))
    max_sentiment_index = opinion_changes[target_group].index(max(opinion_changes[target_group]))
    change_type = 'positive' if min_sentiment_index < max_sentiment_index else change_type

    return target_group, change_type

In [74]:
target_group, change_type = biggest_opinion_change(opinion_changes)

In [75]:
target_group

(118788479, 1367613364923424769)

In [76]:
change_type

'negative'

In [58]:
def replies_with_biggest_opinion_change(multiple_replies, target_group):
    """Function that queries the multiple_replies dataset and returns a list of the actual texts that the pair of users posted.
     The user id-s of these users are passed on as input parameters (the target group).

    Args:
        replies (pandas Dataframe): the dataframe with the replies
        target_group (tuple): pair of user id-s whose posts had the biggest opinion change

    Returns:
        list: list of texts posted by the 2 users
    """    
    condition1 = multiple_replies['author_id'] == target_group[0]
    condition2 = multiple_replies['in_reply_to_tweet_id'] == target_group[1]

    return multiple_replies[condition1 & condition2].loc[:, 'text'].tolist()

In [59]:
replies_biggest_change = replies_with_biggest_opinion_change(multiple_replies, target_group)

In [60]:
replies_biggest_change

["@KATIEDOLL1201 @daulan @SandySue1958 I've been taking vaccines since I was very young..  I got the vax -my SIL was SORRY she didn't.  My mom and her sis just got both covid shots -NP.  Mom's 90, her sis it is 94.  Grandma was born in 1899, saw the 1st pandemic.. She loved vaccines. She lived to 94, maybe that's why.",
 "@KATIEDOLL1201 @daulan @SandySue1958 Sorry, they're being so hard on you, but the 'majority' of people who get Shingles say it is very painful, and the 'majority' of people who have the vaccine do not have an issue.  That is by the numbers.  And right now there is soo much misinformation being spread- It's just sad."]

In [77]:
def opinion_change_type(opinion_changes, group):
    """Function to detect what type of opinion change occured in the case of a group (pair of user ids-s) which interacted
    through replies

    Args:
        opinion_changes (dict): dictionary with opinion changes
        group (tuple): pair of user id-s that interacted through replies and the respondent changed his viewpoint w.r.t. a source tweet

    Returns:
        str: either 'positive' (if the respondent now agrees after initially disagreeing) or 'negative'
    """    
    min_sentiment_index = opinion_changes[group].index(min(opinion_changes[group]))
    max_sentiment_index = opinion_changes[group].index(max(opinion_changes[group]))
    
    change_type = 'negative'
    change_type = 'positive' if min_sentiment_index < max_sentiment_index else change_type

    return change_type

In [78]:
# Create a boolean mask indicating what type of opinion change each group has
mask = {group: opinion_change_type(opinion_changes, group) for group in opinion_changes}

In [79]:
def value_count_in_dict(dict, value_to_count):
    """Function to count the occurences of a certain value in a dictionary.

    Args:
        dict (dict): dictionary where we need to count the occurences of a value
        value_to_count (any): value to be counted

    Returns:
        int: number of occurences of value_to_count
    """    
    # Create a reverse dictionary that maps values to their frequencies
    reverse_dict = defaultdict(int)
    for value in dict.values():
        reverse_dict[value] += 1

    # Count the occurrences of the specific value
    count = reverse_dict.get(value_to_count, 0)

    return count

In [80]:
print(f"Percentage of positive opinion changes out of:")
print(f"- the interactions where one user replied multiple times to a source tweet and an opinion change was detected => {round(value_count_in_dict(mask, 'positive') / len(mask) * 100, 1)}%")

Percentage of positive opinion changes out of:
- the interactions where one user replied multiple times to a source tweet and an opinion change was detected => 32.2%


In [81]:
print(f"Percentage of negative opinion changes out of:")
print(f"- the interactions where one user replied multiple times to a source tweet and an opinion change was detected => {round(value_count_in_dict(mask, 'negative') / len(mask) * 100, 1)}%")

Percentage of negative opinion changes out of:
- the interactions where one user replied multiple times to a source tweet and an opinion change was detected => 67.8%


In order to calculate the distribution of the tweets per hour, I will parse the "created_at" column, extract the hour property and create a separate column in each dataframe. I will place it next to the "created_at" column in order to be easily verifiable. Data originates frmo the Twitter API, so it comes in a standard ISO 8601 format, which can be easily parsed using the parser module from the dateutil package.

Note: the cell below runs for approximately 2m30' on my machine (~25-30 seconds for each file).

In [341]:
# for key, day in days.items():
#     if 'hour' not in day.columns:
#         day.insert(1, 'hour', day['created_at'].apply(lambda date: parser.parse(date).hour))
#         print(f"New 'hour' column inserted in the {key} dataframe")

In [342]:
# for key, day in days.items():
#     if 'hour' not in day.columns:
#         hours = []
#         for time in day.loc[:,"created_at"]:
#             hour = parser.parse(time).hour
#             hours.append(hour)
#         day.insert(1, "hour", hours, True)
#         print(key + " - added 'hour' column")


The final distribution is made up of the sum of all individual days' distributions. I save a figure in the graphs/ folder for each day, as well as an overall distribution.

In [343]:
# final_distribution = pd.Series(0, index=days['1-3-2021'].loc[:,'hour'].sort_values(ascending=True).unique())
# for key, day in days.items():
#     hour_column_ascending = day.loc[:,"hour"].sort_values(ascending=True)
#     distribution = hour_column_ascending.value_counts()[hour_column_ascending.unique()]
#     final_distribution = final_distribution.add(distribution)
#     axes = distribution.plot(kind='bar')
#     figure_path = f"{covaxxy_longitudinal_analysis_graphs}/{key}_distribution.png"
#     axes.figure.savefig(figure_path)
#     plt.close()
# axes = final_distribution.plot(kind='bar')
# figure_path = f"{covaxxy_longitudinal_analysis_graphs}/overall_distribution.png"
# axes.figure.savefig(figure_path)
# plt.close()
