Delete all variables in the current environment (if you have already run some cells) - clean state.

In [1]:
%reset

Import all necessary packages.

NOTE: Replace the download directory of the NLTK tokenizer files with your preferred directory (I chose the root directory of the Research Internship project)

In [2]:
import networkx as nx
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import os
import shutil
from datetime import datetime
from dateutil import parser

from sentistrength import PySentiStr
import re

import nltk
nltk.download('punkt', download_dir='/home/andreistoica12/research-internship')
from nltk.tokenize import word_tokenize
import nltk.data
# Load the punkt tokenizer data from the local directory
nltk.data.load('tokenizers/punkt/PY3/english.pickle')

import json
from collections import defaultdict

[nltk_data] Downloading package punkt to
[nltk_data]     /home/andreistoica12/research-internship...
[nltk_data]   Package punkt is already up-to-date!


Replace with the path to the root folder of the project.

In [3]:
rootdir_path = '/home/andreistoica12/research-internship'

Replace with the path to the folder where we store the dataset.

In [4]:
data_path = '/home/andreistoica12/research-internship/data/covaxxy-csv-complete'

In [5]:
path_to_sentistrength_jar = '/home/andreistoica12/research-internship/SentiStrength/SentiStrengthCom.jar'

In [6]:
path_to_sentistrength_language_folder = '/home/andreistoica12/research-internship/SentiStrength/LanguageFolder'

In [7]:
path_to_replies_opinion_changes = rootdir_path + '/replies_opinion_changes.json'

Create 2 subfolders to store important files and graphs, respectively. If they already existed (from previous runnings of the project), delete the folders and their contents and create empty folders to store the current files and graphs, relevant to the current state of the project.

In [8]:
files_path = os.path.join(rootdir_path, 'files')
if os.path.exists(files_path):
   shutil.rmtree(files_path, ignore_errors=False, onerror=None)
os.makedirs(files_path)

graphs_path = os.path.join(rootdir_path, 'graphs')
if os.path.exists(graphs_path):
   shutil.rmtree(graphs_path, ignore_errors=False, onerror=None)
os.makedirs(graphs_path)

In [9]:
covaxxy_graphs_path = os.path.join(graphs_path, 'covaxxy')
if os.path.exists(covaxxy_graphs_path):
   shutil.rmtree(covaxxy_graphs_path, ignore_errors=False, onerror=None)
os.makedirs(covaxxy_graphs_path)

In [10]:
covaxxy_longitudinal_analysis_graphs = os.path.join(covaxxy_graphs_path, 'longitudinal-analysis')
if os.path.exists(covaxxy_longitudinal_analysis_graphs):
   shutil.rmtree(covaxxy_longitudinal_analysis_graphs, ignore_errors=False, onerror=None)
os.makedirs(covaxxy_longitudinal_analysis_graphs)

We define a function remove_emoji() that takes a text string as input and uses a regular expression to match all Unicode characters that are classified as emojis. The regular expression includes different ranges of Unicode characters that represent different types of emojis, such as emoticons, symbols, and flags.

In [11]:
def remove_emoji(text):
    emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    
    return emoji_pattern.sub(r'', text)

In [12]:
def remove_stopwords(text):
    # # I can use the predefined list of stopwords provided by NLTK, but it's for general purpose
    # # and the results when computing the sentiment are worse than expected, e.g. it considers
    # # words, such as "not" and "all" to be stopwords in contexts where they are actually important.
    
    # nltk.download('stopwords')
    # from nltk.corpus import stopwords
    # stop_words = set(stopwords.words('english'))


    # To avoid this problem, I defined a custom list of stopwords, which I made sure doesn't contain wrong stopwords.
    stop_words = {"the", "and", "or", "a", "an", "in", "of", "on", "to", "that", "this", "is", "are", 
                  "was", "were", "am", "be", "been", "has", "have", "had", "do", "does", "did", "will", "would", 
                  "should", "can", "could", "may", "might", "must", "shall", "shouldn't", "wouldn't", "couldn't", 
                  "can't", "mustn't", "haven't", "hasn't", "hadn't", "didn't", "doesn't", "don't"}

    # Tokenize the text
    tokens = word_tokenize(text)

    # Remove the stopwords
    filtered_tokens = [token for token in tokens if token.lower() not in stop_words]

    # Join the filtered tokens back into a string
    filtered_text = ' '.join(filtered_tokens)

    return filtered_text

In [13]:
def clean_text(text):    
    # 1. Lowercase all words in the text
    text = text.lower()

    # Replace the new line character with empty string
    text = text.replace("\n", "")
    
    # 2. Remove words starting with '@' - tags (most common noise in replies)
    text = re.sub(r'@\w+', '', text, flags=re.MULTILINE)

    # 3. Remove words starting with 'http' - hyperlinks
    text = re.sub(r'http\S+|www.\S+', '', text, flags=re.MULTILINE)

    # 4. Remove punctuation from the text using regular expressions
    text = re.sub(r'[^\w\s]', '', text)

    import contractions
    # 5. Remove contractions, such as you're => you are
    contractions.fix(text)

    # 6. Remove emojis
    text = remove_emoji(text)

    # 7. Remove stopwords in English
    text = remove_stopwords(text)

    return text

In [14]:
def create_days(data_path):
    # A list of the current data files need for my analysis.
    file_list = os.listdir(data_path)

    # For simplicity's and consistency's sake, I will store all data in chronological order, 
    # so we sort the list of file names from the start.
    file_list.sort(key=lambda date: datetime.strptime(date, "tweet_ids--%Y-%m-%d.csv"))

    # I parse the date of the tweets from the file names and transform them into datetime objects. 
    # This makes it easier to get the day/month/year, as they are already properties of such type of objects.
    keys_datetime = [ datetime.strptime(key, "tweet_ids--%Y-%m-%d.csv") for key in file_list ]

    # Ultimately, I will store each .csv file as a pandas DataFrame in a dictionary, where the keys represent a 
    # simplified form of the date. So, here, I will format the dates from the datetime objects into simple strings.
    keys = [ f"{key.day}-{key.month}-{key.year}" for key in keys_datetime ]

    # In order to read the data from the files, I need the paths of the files to be passed on to the read_csv() function. 
    # The order of the days in the file paths needs to be consistent with the order of the dates in the keys
    paths = [ os.path.join(data_path, file) for file in file_list ]

    # Here, I will build the dictionary where the keys represent the formatted simple date and 
    # the values are dataframes corresponding to each file.
    days = dict()
    for i in range(len(file_list)):
        days[keys[i]] = pd.read_csv(paths[i], index_col=False)
        days[keys[i]].drop('id', axis=1, inplace=True)

    return days

In [15]:
days = create_days(data_path)

In [16]:
def create_merged_days(days):
    # Here, I merged all data (from all available days) into a single dataframe (they have the same structure).
    # I did that because some replies to a tweet posted today can come some days after, so we need to take care
    # of the dataset as a whole.

    # concatenate the dataframes and reset the index
    merged_days = pd.concat([df for key, df in days.items()], ignore_index=True)

    # Convert string column to datetime
    merged_days['created_at'] = pd.to_datetime(merged_days['created_at'])

    # Sort dataframe based on datetime column
    # NOTE: I also reset the index so that I know later on which tweet was posted first based on the index (useful for opinion change)
    merged_days = merged_days.sort_values('created_at').reset_index(drop=True)

    return merged_days

In [17]:
merged_days = create_merged_days(days)

In [46]:
merged_days

Unnamed: 0,created_at,tweet_id,credible,author_id,text,urls,name,username,verified,location,...,retweet_author_id,retweet_id,retweeted_screen_name,user_mentions_id,user_mentions_screen_name,in_reply_to_user_id,in_reply_to_tweet_id,in_reply_to_username,reference_type,reference_id
0,2021-02-24 18:00:10+00:00,1364636249852502018,1,107501328,RT @Maricopahealth: At one of our community po...,#,2-1-1 Arizona,211arizona,False,Arizona,...,29816986,1364632754042802176,Maricopahealth,29816986,Maricopahealth,#,#,#,retweeted,1364632754042802176
1,2021-02-24 18:00:18+00:00,1364636282664574978,1,26761523,Ready for DAY 2 of State of the Valley? Join u...,"jointventure.org,twitter.com,",Joint Venture SV,JointVentureSVN,False,"San Jose, CA",...,#,#,#,#,#,#,#,#,#,#
2,2021-02-24 18:00:30+00:00,1364636333596008449,1,1234926105234034689,RT @SteveStaeger: When #COVID19Colorado is ove...,#,Colorado Coronavirus Updates,COVIDinColorado,False,"Denver, Colorado",...,182037688,1364293582157307906,SteveStaeger,182037688,SteveStaeger,#,#,#,retweeted,1364293582157307906
3,2021-02-24 18:03:16+00:00,1364637028948709377,1,1329106574082641920,"#SD37: Starting next week, @OCHealth will star...","bit.ly,www.ocregister.com,",Senator Dave Min,SenDaveMin,True,"Irvine, CA",...,#,#,#,36069538,ochealth,#,#,#,#,#
4,2021-02-24 18:03:35+00:00,1364637110951583746,1,1363750425459970048,RT @jatinde45666597: Vaccination has been star...,#,Reena Sharma,write2reena,False,"Auckland, New Zealand",...,1295748297529884673,1364087633538859008,jatinde45666597,1295748297529884673,jatinde45666597,#,#,#,retweeted,1364087633538859008
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5123691,2021-03-10 23:59:52+00:00,1369800203939745796,1,434360613,RT @Philo: The boys of #SouthPark are at it ag...,#,ami_,ami_tvdfan,False,#,...,81766872,1369799981763162113,philoTV,23827692,ComedyCentral,#,#,#,retweeted,1369799981763162113
5123692,2021-03-10 23:59:52+00:00,1369800204094963712,1,3083078947,RT @ericswalwell: The #AmericanRescuePlan puts...,#,Thomas Albrecht 🇺🇸☮️,TomAlb88,False,#,...,377609596,1369727803768201218,ericswalwell,377609596,ericswalwell,#,#,#,retweeted,1369727803768201218
5123693,2021-03-10 23:59:53+00:00,1369800205592363018,1,3538956135,"RT @Doc_Wolverine: ""Gee Doc, why are you pisse...",#,Jackaxed,Jackaxed,False,United States,...,898321581444911108,1369771222481985543,Doc_Wolverine,898321581444911108,Doc_Wolverine,#,#,#,retweeted,1369771222481985543
5123694,2021-03-10 23:59:53+00:00,1369800204761899011,1,29801287,"RT @TheDweck: Wow, vision boards work",#,Fauxnly Fans,thenickkontz,False,"ÜT: 43.508306,-96.779489",...,98247788,1369742802590990336,TheDweck,98247788,TheDweck,#,#,#,retweeted,1369742802590990336


In [47]:
merged_days.columns

Index(['created_at', 'tweet_id', 'credible', 'author_id', 'text', 'urls',
       'name', 'username', 'verified', 'location', 'followers_count',
       'following_count', 'tweet_count', 'like_count', 'quote_count',
       'reply_count', 'retweet_count', 'retweet_author_id', 'retweet_id',
       'retweeted_screen_name', 'user_mentions_id',
       'user_mentions_screen_name', 'in_reply_to_user_id',
       'in_reply_to_tweet_id', 'in_reply_to_username', 'reference_type',
       'reference_id'],
      dtype='object')

QUOTES

In [48]:
quotes = merged_days[merged_days['reference_type'] == 'quoted'].copy()

In [55]:
list_of_quote_texts = quotes.head(200).loc[:, 'text'].tolist()

In [57]:
# Open the file for writing (use 'a' instead of 'w' to append)
with open('/home/andreistoica12/research-internship/first_200_quotes.txt', 'w') as file:
    # Loop through the list_of_quote_texts and write each text to a new line in the file
    for text in list_of_quote_texts:
        file.write(text + '\n')

REPLIES

In [18]:
replies = merged_days[merged_days['reference_type'] == 'replied_to'].copy()

In [19]:
multiple_replies = replies[replies.duplicated(subset=['author_id', 'in_reply_to_tweet_id'], keep=False)].copy()

In [20]:
multiple_replies['in_reply_to_tweet_id'] = multiple_replies['in_reply_to_tweet_id'].astype(int)

In [21]:
# small_multiple_replies = multiple_replies.head(300).copy()

In [22]:
def create_groups_of_replies_for_opinion_change(multiple_replies):
    # group the rows by the two columns
    grouped_df = multiple_replies.groupby(['author_id', 'in_reply_to_tweet_id'])
    # grouped_df = small_multiple_replies.groupby(['author_id', 'in_reply_to_tweet_id'])

    groups_of_replies = grouped_df.groups

    return groups_of_replies

In [23]:
groups_of_replies = create_groups_of_replies_for_opinion_change(multiple_replies)

In [24]:
senti = PySentiStr()
senti.setSentiStrengthPath(path_to_sentistrength_jar)
senti.setSentiStrengthLanguageFolderPath(path_to_sentistrength_language_folder)

In [25]:
def opinion_change(rows_indices, replies):
    """Function to detect whether an opinion change occured within a group of replies.

    Args:
        rows_indices (pandas.core.indexes.numeric.Int64Index): list of indices in the original dataframe
                                                               where an opinion change has been detected
                                                               (e.g. Int64Index([1848965, 1850146, 1850687], dtype='int64'))

    Returns:
        bool: boolean value which confirms or denies the existence of an opinion change within a group
    """    
    texts = [ clean_text(replies.loc[index, 'text']) for index in rows_indices ]

    sentiments = senti.getSentiment(texts, score='scale')
    sentiments = np.array(sentiments)

    positive = np.any(sentiments > 0)
    negative = np.any(sentiments < 0)

    return positive and negative, sentiments

IMPORTANT NOTE:

THERE IS NO NEED TO RUN THE COMMENTED CELLS BETWEEN THE LINES BELOW!
It took more than 15 minutes when I ran the creation of the opinion_changes dictionary on the whole replies dataset...

I saved the resulting dictionary into a JSON file, which can be found in the root directory of the project. This can be imported into a dictionary with ease (code can be found in the next parts of the notebook).

--------------------------------------------------------------------------------------------------------------------------------------------

In [173]:
group_counter = 0
progress = 0.01

def print_progress(groups_of_replies):
    """Function that prints the progress of the computation of the opinion_changes dictionary,
    as it takes a lot of time for large datasets.

    Args:
        groups_of_replies (dict): dictionary of replies grouped by some columns
    """    
    global group_counter
    global progress
    group_counter += 1

    if ((group_counter / len(groups_of_replies)) >= progress):
        print(f"Progress: {group_counter} / {len(groups_of_replies)} groups of replies processed.")
        progress += 0.01
    if group_counter == len(groups_of_replies):
        print("All groups have been processed.")

In [174]:
def create_opinion_changes(groups_of_replies, progress_printing):
    """Function to create the data structure associated with the groups (pairs of user id-s who interacted through replies)
    where an opinio change occured, i.e. when, between their interactions (e.g. one's replies to the other's original post),
    there have been both positive and negative opinions.

    Args:
        groups_of_replies (dict): dictionary of replies grouped by some columns
        progress_printing (bool): boolean value indicating whether the user wishes to print the progress of the groups processed or not
                                  (this can be useful to track when processing large datasets - they usually take a lot of time)

    Returns:
        dict: dictionary where the keys represent the groups where opinion changes occured (as tuples) and the values are
              lists of the sentiments associated to the interactions within each group
    """    
    if progress_printing == True:
        opinion_changes = {}
        for group, rows_indices in groups_of_replies.items():
            print_progress(groups_of_replies)
            if opinion_change(rows_indices, replies)[0] == True:
                opinion_changes[group] = opinion_change(rows_indices, replies)[1].tolist()
    else:
        print("Gradual progress will not be printed.")
        print("If you wish to see it, change the value of the progress_printing input parameter to True.")
        opinion_changes = { group: opinion_change(rows_indices, replies)[1].tolist() for group, rows_indices in groups_of_replies.items() 
                        if opinion_change(rows_indices, replies)[0] == True }
    
    return opinion_changes

In [175]:
progress_printing = True

In [176]:
opinion_changes = create_opinion_changes(groups_of_replies, progress_printing)

Progress: 56 / 5587 groups of replies processed.
Progress: 112 / 5587 groups of replies processed.
Progress: 168 / 5587 groups of replies processed.
Progress: 224 / 5587 groups of replies processed.
Progress: 280 / 5587 groups of replies processed.
Progress: 336 / 5587 groups of replies processed.
Progress: 392 / 5587 groups of replies processed.
Progress: 447 / 5587 groups of replies processed.
Progress: 503 / 5587 groups of replies processed.
Progress: 559 / 5587 groups of replies processed.
Progress: 615 / 5587 groups of replies processed.
Progress: 671 / 5587 groups of replies processed.
Progress: 727 / 5587 groups of replies processed.
Progress: 783 / 5587 groups of replies processed.
Progress: 839 / 5587 groups of replies processed.
Progress: 894 / 5587 groups of replies processed.
Progress: 950 / 5587 groups of replies processed.
Progress: 1006 / 5587 groups of replies processed.
Progress: 1062 / 5587 groups of replies processed.
Progress: 1118 / 5587 groups of replies processed

In [177]:
def save_opinion_changes_to_JSON(opinion_changes, path):
    """Function to save the dictionary pf opinion changes to a JSON file.

    Args:
        opinion_changes (dict): dictionary with opinion changes
        path (str): path where you wish to save the JSON file
    """    
    # create a new dictionary with string keys
    opinion_changes_for_JSON_file = {str(key): value for key, value in opinion_changes.items() }
    with open(path, 'w') as file:
        json.dump(opinion_changes_for_JSON_file, file, indent=4)

In [178]:
save_opinion_changes_to_JSON(opinion_changes, path_to_replies_opinion_changes)

--------------------------------------------------------------------------------------------------------------------------------------------

In [26]:
def load_opinion_changes(path_to_replies_opinion_changes):
    """Function that generates a dictionary based on a JSON file which contains the opinion changes within the replies of the dataset.

    Args:
        path_to_replies_opinion_changes (str): path to the JSON file associated with the opinion changes within the replies
                                               (e.g. /your/path/to/research-internship)

    Returns:
        dict: the original dictionary containing opinion changes from replies
    """    
    with open(path_to_replies_opinion_changes) as f:
        # Load the JSON data into a Python dictionary
        opinion_changes_from_file = json.load(f)
        # Create a new dictionary with tuple keys
        original_opinion_changes = {}
        for key in opinion_changes_from_file:
            # Convert the string key to a tuple
            new_key = eval(key)
            # Add the key-value pair to the new dictionary
            original_opinion_changes[new_key] = opinion_changes_from_file[key]
            
    return original_opinion_changes

In [27]:
opinion_changes = load_opinion_changes(path_to_replies_opinion_changes)

In [41]:
print(f"Percentage of opinion changes out of the interactions where one user replied multiple times to a source tweet:")
print(f"{round(len(opinion_changes) / len(groups_of_replies) * 100, 1)}%.")

Percentage of opinion changes out of the interactions where one user replied multiple times to a source tweet:
15.6%.


In [29]:
def biggest_opinion_change(opinion_changes):
    """Function that returns the group (pair of user id-s) which interacted more than once in the context of a single source tweet,
    i.e. one user posted more than one reply to the same source tweet, where the user who reacted had the most drastic opinion change,
    based on the previously computed sentiments of the text.

    Args:
        opinion_changes (dict): dictionary with opinion changes

    Returns:
        tuple: pair of user id-s where the biggest opinion change occured
        str: type of change that occured, e.g. one user tends to agree with the source tweet after some time, 
             when initially he disagreed or vice-versa
    """    
    change_type = 'negative'
    biggest_change = 0
    target_group = tuple()
    for group, sentiments in opinion_changes.items():
        change = max(biggest_change, max(sentiments) - min(sentiments))
        if change > biggest_change:
            biggest_change = change
            target_group = group
    
    min_sentiment_index = opinion_changes[target_group].index(min(opinion_changes[target_group]))
    max_sentiment_index = opinion_changes[target_group].index(max(opinion_changes[target_group]))
    change_type = 'positive' if min_sentiment_index < max_sentiment_index else change_type

    return target_group, change_type

In [30]:
target_group, change_type = biggest_opinion_change(opinion_changes)

In [31]:
target_group

(932012546, 1366061818699989002)

In [32]:
change_type

'negative'

In [33]:
def replies_with_biggest_opinion_change(multiple_replies, target_group):
    """Function that queries the multiple_replies dataset and returns a list of the actual texts that the pair of users posted.
     The user id-s of these users are passed on as input parameters (the target group).

    Args:
        replies (pandas Dataframe): the dataframe with the replies
        target_group (tuple): pair of user id-s whose posts had the biggest opinion change

    Returns:
        list: list of texts posted by the 2 users
    """    
    condition1 = multiple_replies['author_id'] == target_group[0]
    condition2 = multiple_replies['in_reply_to_tweet_id'] == target_group[1]

    return multiple_replies[condition1 & condition2].loc[:, 'text'].tolist()

In [34]:
replies_biggest_change = replies_with_biggest_opinion_change(multiple_replies, target_group)

In [35]:
replies_biggest_change

["@KATIEDOLL1201 @daulan @SandySue1958 I've been taking vaccines since I was very young..  I got the vax -my SIL was SORRY she didn't.  My mom and her sis just got both covid shots -NP.  Mom's 90, her sis it is 94.  Grandma was born in 1899, saw the 1st pandemic.. She loved vaccines. She lived to 94, maybe that's why.",
 "@KATIEDOLL1201 @daulan @SandySue1958 Sorry, they're being so hard on you, but the 'majority' of people who get Shingles say it is very painful, and the 'majority' of people who have the vaccine do not have an issue.  That is by the numbers.  And right now there is soo much misinformation being spread- It's just sad."]

In [36]:
def opinion_change_type(opinion_changes, group):
    """Function to detect what type of opinion change occured in the case of a group (pair of user ids-s) which interacted
    through replies

    Args:
        opinion_changes (dict): dictionary with opinion changes
        group (tuple): pair of user id-s that interacted through replies and the respondent changed his viewpoint w.r.t. a source tweet

    Returns:
        str: either 'positive' (if the respondent now agrees after initially disagreeing) or 'negative'
    """    
    min_sentiment_index = opinion_changes[group].index(min(opinion_changes[group]))
    max_sentiment_index = opinion_changes[group].index(max(opinion_changes[group]))
    
    change_type = 'negative'
    change_type = 'positive' if min_sentiment_index < max_sentiment_index else change_type

    return change_type

In [37]:
# Create a boolean mask indicating what type of opinion change each group has
mask = {group: opinion_change_type(opinion_changes, group) for group in opinion_changes}

In [38]:
def value_count_in_dict(dict, value_to_count):
    """Function to count the occurences of a certain value in a dictionary.

    Args:
        dict (dict): dictionary where we need to count the occurences of a value
        value_to_count (any): value to be counted

    Returns:
        int: number of occurences of value_to_count
    """    
    # Create a reverse dictionary that maps values to their frequencies
    reverse_dict = defaultdict(int)
    for value in dict.values():
        reverse_dict[value] += 1

    # Count the occurrences of the specific value
    count = reverse_dict.get(value_to_count, 0)

    return count

In [44]:
print(f"Percentage of positive opinion changes out of:")
print(f"- the interactions where one user replied multiple times to a source tweet and an opinion change was detected => {round(value_count_in_dict(mask, 'positive') / len(mask) * 100, 1)}%")

Percentage of positive opinion changes out of:
- the interactions where one user replied multiple times to a source tweet and an opinion change was detected => 46.1%


In [45]:
print(f"Percentage of negative opinion changes out of:")
print(f"- the interactions where one user replied multiple times to a source tweet and an opinion change was detected => {round(value_count_in_dict(mask, 'negative') / len(mask) * 100, 1)}%")

Percentage of negative opinion changes out of:
- the interactions where one user replied multiple times to a source tweet and an opinion change was detected => 53.9%


In order to calculate the distribution of the tweets per hour, I will parse the "created_at" column, extract the hour property and create a separate column in each dataframe. I will place it next to the "created_at" column in order to be easily verifiable. Data originates frmo the Twitter API, so it comes in a standard ISO 8601 format, which can be easily parsed using the parser module from the dateutil package.

Note: the cell below runs for approximately 2m30' on my machine (~25-30 seconds for each file).

In [96]:
# for key, day in days.items():
#     if 'hour' not in day.columns:
#         day.insert(1, 'hour', day['created_at'].apply(lambda date: parser.parse(date).hour))
#         print(f"New 'hour' column inserted in the {key} dataframe")

In [97]:
# for key, day in days.items():
#     if 'hour' not in day.columns:
#         hours = []
#         for time in day.loc[:,"created_at"]:
#             hour = parser.parse(time).hour
#             hours.append(hour)
#         day.insert(1, "hour", hours, True)
#         print(key + " - added 'hour' column")


The final distribution is made up of the sum of all individual days' distributions. I save a figure in the graphs/ folder for each day, as well as an overall distribution.

In [24]:
# final_distribution = pd.Series(0, index=days['1-3-2021'].loc[:,'hour'].sort_values(ascending=True).unique())
# for key, day in days.items():
#     hour_column_ascending = day.loc[:,"hour"].sort_values(ascending=True)
#     distribution = hour_column_ascending.value_counts()[hour_column_ascending.unique()]
#     final_distribution = final_distribution.add(distribution)
#     axes = distribution.plot(kind='bar')
#     figure_path = f"{covaxxy_longitudinal_analysis_graphs}/{key}_distribution.png"
#     axes.figure.savefig(figure_path)
#     plt.close()
# axes = final_distribution.plot(kind='bar')
# figure_path = f"{covaxxy_longitudinal_analysis_graphs}/overall_distribution.png"
# axes.figure.savefig(figure_path)
# plt.close()
