Delete all variables in the current environment (if you have already run some cells) - clean state.

In [1]:
%reset

Import all necessary packages.

In [2]:
import ast
import json
import multiprocessing
import os
import shutil
import textwrap
from collections import Counter
from collections import defaultdict

import numpy as np
import pandas as pd
from IPython.core.getipython import get_ipython
from matplotlib import pyplot as plt

In [3]:
rootdir_path = os.getcwd()

Replace with the path to the folder where the raw dataset (the initial .csv files) is stored.

In [4]:
dataset_possibilities = ['15_days', '25_days']

In [5]:
number_of_days = dataset_possibilities[1]

In [6]:
data_path = os.path.join(rootdir_path, 'data', f'covaxxy_merged_{number_of_days}.csv')

In [7]:
files_path = os.path.join(rootdir_path, 'files')

In [8]:
path_to_unique_dates = os.path.join(files_path, f'unique_dates_{number_of_days}.txt')

In [9]:
opinion_changes_path = os.path.join(files_path, f'opinion-changes-{number_of_days}')

In [10]:
graphs_path = os.path.join(rootdir_path, 'graphs')

In [11]:
covaxxy_graphs_path = os.path.join(graphs_path, 'covaxxy')

Create subfolders specific to the different types of analyses performed in the project.

Create one subfolder within the graphs/covaxxy/ folder to store the graphs showing differences in opinion changes for the covaxxy dataset. If the folder already exists (from previous runs of the project), delete it together with its contents and create an empty folder for the current graphs, which reflect the current state of the project.

In [12]:
# Start from an empty output folder so that only graphs from the current run are kept.
covaxxy_deltas_OC_graphs_path = os.path.join(covaxxy_graphs_path, f'deltas-OC-{number_of_days}')
if os.path.exists(covaxxy_deltas_OC_graphs_path):
    shutil.rmtree(covaxxy_deltas_OC_graphs_path)
os.makedirs(covaxxy_deltas_OC_graphs_path)

In [13]:
# covaxxy_longitudinal_analysis_graphs_path = os.path.join(covaxxy_graphs_path, 'longitudinal-analysis')
# if os.path.exists(covaxxy_longitudinal_analysis_graphs_path):
#    shutil.rmtree(covaxxy_longitudinal_analysis_graphs_path, ignore_errors=False, onerror=None)
# os.makedirs(covaxxy_longitudinal_analysis_graphs_path)

In [14]:
merged_days = pd.read_csv(data_path)

In [15]:
def string_to_int(reference_id):
    """Convert a reference id to an int when possible, otherwise return it unchanged.

    Args:
        reference_id (any): value to convert, e.g. a numeric string such as '1364632754042802176'
                            or a placeholder such as '#'

    Returns:
        int: the converted value, if int() accepts the input
        any: the original value, if it cannot be converted
    """
    try:
        return int(reference_id)
    except (TypeError, ValueError, OverflowError):
        # ValueError: non-numeric strings (e.g. '#') and NaN
        # TypeError: None and other types int() does not accept
        # OverflowError: infinite floats
        return reference_id

In [16]:
merged_days['reference_id'] = merged_days['reference_id'].apply(string_to_int)

In [17]:
merged_days

Unnamed: 0,created_at,tweet_id,credible,author_id,text,urls,name,username,verified,location,...,retweet_author_id,retweet_id,retweeted_screen_name,user_mentions_id,user_mentions_screen_name,in_reply_to_user_id,in_reply_to_tweet_id,in_reply_to_username,reference_type,reference_id
0,2021-02-24 18:00:10+00:00,1364636249852502018,1,107501328,RT @Maricopahealth: At one of our community po...,#,2-1-1 Arizona,211arizona,False,Arizona,...,29816986,1364632754042802176,Maricopahealth,29816986,Maricopahealth,#,#,#,retweeted,1364632754042802176
1,2021-02-24 18:00:18+00:00,1364636282664574978,1,26761523,Ready for DAY 2 of State of the Valley? Join u...,"jointventure.org,twitter.com,",Joint Venture SV,JointVentureSVN,False,"San Jose, CA",...,#,#,#,#,#,#,#,#,#,#
2,2021-02-24 18:00:30+00:00,1364636333596008449,1,1234926105234034689,RT @SteveStaeger: When #COVID19Colorado is ove...,#,Colorado Coronavirus Updates,COVIDinColorado,False,"Denver, Colorado",...,182037688,1364293582157307906,SteveStaeger,182037688,SteveStaeger,#,#,#,retweeted,1364293582157307906
3,2021-02-24 18:03:16+00:00,1364637028948709377,1,1329106574082641920,"#SD37: Starting next week, @OCHealth will star...","bit.ly,www.ocregister.com,",Senator Dave Min,SenDaveMin,True,"Irvine, CA",...,#,#,#,36069538,ochealth,#,#,#,#,#
4,2021-02-24 18:03:35+00:00,1364637110951583746,1,1363750425459970048,RT @jatinde45666597: Vaccination has been star...,#,Reena Sharma,write2reena,False,"Auckland, New Zealand",...,1295748297529884673,1364087633538859008,jatinde45666597,1295748297529884673,jatinde45666597,#,#,#,retweeted,1364087633538859008
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10723864,2021-03-20 23:59:42+00:00,1373424038111023106,1,827349983577796608,RT @altNOAA: PSA: If you're in #Oklahoma and a...,#,Howard Hudson,hwrdhdsn,False,"Medford, OR",...,824126001936474113,1373418517484220418,altNOAA,824126001936474113,altNOAA,#,#,#,retweeted,1373418517484220418
10723865,2021-03-20 23:59:42+00:00,1373424038895321101,1,828337737388285952,RT @maura_resister: Covid rates are trending i...,#,KY R.N. Votes Blue😷🦋🌊,changemustcome7,False,Blue dot in red state,...,1167775229109837825,1373420544830361602,maura_resister,1167775229109837825,maura_resister,#,#,#,retweeted,1373420544830361602
10723866,2021-03-20 23:59:43+00:00,1373424044100440072,1,126514742,RT @Best_of_PT: The tiny island of #Corvo in t...,#,Javier RG,Rxavier23,False,#,...,894905209159331840,1373313858476068869,Best_of_PT,894905209159331840,Best_of_PT,#,#,#,retweeted,1373313858476068869
10723867,2021-03-20 23:59:43+00:00,1373424044318420993,1,2918822815,@sethsliltweeter @EncoreBeachClub That’s a goo...,#,Jason Titus,eezeemonee,False,"San Jose, CA",...,#,#,#,1289682623757918209,sethsliltweeter,1289682623757918209,1373417100472033283,sethsliltweeter,replied_to,1373417100472033283


In [18]:
merged_days.columns

Index(['created_at', 'tweet_id', 'credible', 'author_id', 'text', 'urls',
       'name', 'username', 'verified', 'location', 'followers_count',
       'following_count', 'tweet_count', 'like_count', 'quote_count',
       'reply_count', 'retweet_count', 'retweet_author_id', 'retweet_id',
       'retweeted_screen_name', 'user_mentions_id',
       'user_mentions_screen_name', 'in_reply_to_user_id',
       'in_reply_to_tweet_id', 'in_reply_to_username', 'reference_type',
       'reference_id'],
      dtype='object')

In [19]:
# Persist the dataframe's column names, one per line, for later reference
col_names = merged_days.columns.to_list()

with open(os.path.join(files_path, 'columns.txt'), 'w') as f:
    f.writelines(col_name + '\n' for col_name in col_names)

EXACT DAYS IN OUR DATASET:

Note: I double checked which days were actually used in the dataset.

In [20]:
# Read the per-day summary lines (surrounding whitespace stripped) and echo them
with open(path_to_unique_dates, 'r') as f:
    exact_days_info = [line.strip() for line in f]

for day_info in exact_days_info:
    print(day_info)

24-02-2021 - 334 tweets
25-02-2021 - 933 tweets
26-02-2021 - 1110 tweets
27-02-2021 - 871 tweets
28-02-2021 - 792 tweets
01-03-2021 - 608540 tweets
02-03-2021 - 691842 tweets
03-03-2021 - 667152 tweets
04-03-2021 - 516891 tweets
05-03-2021 - 504369 tweets
06-03-2021 - 401596 tweets
07-03-2021 - 362643 tweets
08-03-2021 - 359209 tweets
09-03-2021 - 466151 tweets
10-03-2021 - 541263 tweets
11-03-2021 - 667231 tweets
12-03-2021 - 745042 tweets
13-03-2021 - 431356 tweets
14-03-2021 - 442648 tweets
15-03-2021 - 586684 tweets
16-03-2021 - 674996 tweets
17-03-2021 - 536724 tweets
18-03-2021 - 581944 tweets
19-03-2021 - 501221 tweets
20-03-2021 - 432327 tweets


REACTIONS

There are 3 types of reactions:
- replies ('replied_to')
- quotes ('quoted')
- retweets ('retweeted')

All possible combinations of reaction types that you may wish to take into account further down the line are specified in the full list below.

The reaction_types list should be equal to one of the elements of the full list.

In [21]:
reaction_types_full_list = [['quoted'], 
                            ['quoted', 'retweeted'], 
                            ['replied_to'], 
                            ['replied_to', 'quoted'], 
                            ['replied_to', 'quoted', 'retweeted'],
                            ['replied_to', 'retweeted']]

Here, you can choose what (combination of) reaction types you wish to be included in the analysis.

In [22]:
reaction_types = reaction_types_full_list[0]

In [23]:
reaction_types

['quoted']

In [24]:
def create_path_to_opinion_changes(reaction_types):
    """Build the path to the opinion changes JSON file that matches the given reaction types.

    The file name is the reaction types joined by underscores, followed by '_OC.json',
    and it is located inside the module-level `opinion_changes_path` folder.

    Args:
        reaction_types (list): list of reaction types, e.g. ['replied_to', 'quoted']

    Returns:
        str: path to the opinion changes file
    """
    file_name = "_".join(reaction_types) + '_OC.json'

    return os.path.join(opinion_changes_path, file_name)

In [25]:
def group_reactions(merged_days, reaction_types):
    """Group repeated reactions by (author, source tweet).

    Only rows whose 'reference_type' is one of `reaction_types` are considered, and only
    ('author_id', 'reference_id') pairs that occur more than once among those rows are kept,
    i.e. cases where the same author reacted several times to the same source tweet.

    Args:
        merged_days (pandas.core.frame.DataFrame): dataframe with all the data
        reaction_types (list): list of reaction types we want to consider

    Returns:
        dict: mapping from (author_id, reference_id) to the index labels of the rows
              belonging to that pair
    """
    pair_columns = ['author_id', 'reference_id']

    # keep only the requested kinds of reactions
    wanted = merged_days.loc[merged_days['reference_type'].isin(reaction_types)]

    # keep=False flags every member of a repeated pair, not only the later occurrences
    repeated = wanted.loc[wanted.duplicated(subset=pair_columns, keep=False)]

    return repeated.groupby(pair_columns).groups

In [26]:
groups_of_reactions = group_reactions(merged_days, reaction_types)

In [27]:
len(groups_of_reactions)

8692

LOAD DICTIONARY FROM JSON FILE

In [28]:
def load_opinion_changes(path_to_opinion_changes):
    """Load the opinion changes stored in a JSON file into a dictionary with tuple keys.

    JSON object keys are always strings, so every key in the file is expected to be the
    textual form of a tuple, e.g. "(25098310, 1372044288083787776)". Keys are parsed with
    ast.literal_eval, which only accepts Python literals, rather than eval, which would
    execute any code found in the file.

    Args:
        path_to_opinion_changes (str): path to the JSON file associated with the opinion changes within the reactions
                                       (e.g. /your/path/to/research-internship/files/opinion-changes-25_days/quoted_OC.json)

    Returns:
        dict: the original dictionary containing opinion changes from reactions, keyed by the parsed keys

    Raises:
        ValueError, SyntaxError: if a key in the file is not a valid Python literal
    """
    with open(path_to_opinion_changes) as f:
        opinion_changes_from_file = json.load(f)

    # Convert each string key back to the tuple it represents; values are kept as loaded
    return {ast.literal_eval(key): value for key, value in opinion_changes_from_file.items()}

In [29]:
opinion_changes = load_opinion_changes(create_path_to_opinion_changes(reaction_types))

INSIGHTS

In [30]:
print(f"Percentage of opinion changes out of the interactions where one user reacted multiple times to a source tweet:")
print(f"{round(len(opinion_changes) / len(groups_of_reactions) * 100, 1)}%.")

Percentage of opinion changes out of the interactions where one user reacted multiple times to a source tweet:
13.1%.


In [31]:
def biggest_opinion_change(opinion_changes):
    """Find the group (pair of user id - source tweet id) with the most drastic opinion change.

    The size of a group's change is the difference between its highest and lowest sentiment.
    When several groups share the largest difference, the first one in iteration order is returned.
    The change is 'positive' when the lowest sentiment appears before the highest one in the
    group's list of sentiments, and 'negative' otherwise (including when all sentiments are equal).

    Args:
        opinion_changes (dict): dictionary with opinion changes, mapping each group to its list of sentiments

    Returns:
        tuple: pair of user id - source tweet id, where the biggest opinion change occurred
        str: type of change that occurred, e.g. one user tends to agree with the source tweet after some time,
             when initially he disagreed or vice-versa

    Raises:
        ValueError: if opinion_changes is empty (previously this surfaced as an opaque `KeyError: ()`)
    """
    if not opinion_changes:
        raise ValueError("opinion_changes is empty: there is no opinion change to select.")

    def sentiment_range(group):
        sentiments = opinion_changes[group]
        return max(sentiments) - min(sentiments)

    # max() returns the first maximal element, so ties resolve to the earliest group
    target_group = max(opinion_changes, key=sentiment_range)

    sentiments = opinion_changes[target_group]
    min_sentiment_index = sentiments.index(min(sentiments))
    max_sentiment_index = sentiments.index(max(sentiments))
    change_type = 'positive' if min_sentiment_index < max_sentiment_index else 'negative'

    return target_group, change_type

In [32]:
target_group, change_type = biggest_opinion_change(opinion_changes)

In [33]:
target_group

(25098310, 1372044288083787776)

In [34]:
change_type

'negative'

In [35]:
def reactions_with_biggest_opinion_change(reactions, target_group):
    """Return the texts of all rows posted by a given author in reaction to a given source tweet.

    Args:
        reactions (pandas Dataframe): the dataframe with the reactions; must contain the
                                      'author_id', 'reference_id' and 'text' columns
        target_group (tuple): pair of user id - source tweet id, whose posts had the biggest opinion change

    Returns:
        list: values of the 'text' column for the rows matching both ids, in dataframe order
    """
    author_id, source_tweet_id = target_group[0], target_group[1]
    is_target = (reactions['author_id'] == author_id) & (reactions['reference_id'] == source_tweet_id)

    return reactions.loc[is_target, 'text'].tolist()

In [36]:
reactions_biggest_change = reactions_with_biggest_opinion_change(merged_days, target_group)

In [37]:
reactions_biggest_change

['Very much so looking forward to outdoor music, Joe Biden length hugs, and staying home cause I want to, not cause I need to. https://t.co/7VuVM02mwq',
 'Side effects: sore arm, and an overwhelming dread of having to go back to work tomorrow. https://t.co/7VuVM02mwq']

In [38]:
def biggest_opinion_change_type(opinion_changes, group):
    """Classify the opinion change of one group (pair of user id - source tweet id).

    The change is 'positive' when the first occurrence of the lowest sentiment comes before
    the first occurrence of the highest sentiment, and 'negative' in every other case.

    Args:
        opinion_changes (dict): dictionary with opinion changes
        group (tuple): pair of user id - source tweet id that interacted through reactions
                       and the respondent changed his/her viewpoint w.r.t. a source tweet

    Returns:
        str: either 'positive' (if the respondent now agrees after initially disagreeing) or 'negative'
    """
    sentiments = opinion_changes[group]
    lowest_at = sentiments.index(min(sentiments))
    highest_at = sentiments.index(max(sentiments))

    if lowest_at < highest_at:
        return 'positive'
    return 'negative'

In [39]:
# Create a boolean mask indicating what type of opinion change each group has
mask = {group: biggest_opinion_change_type(opinion_changes, group) for group in opinion_changes}

In [40]:
def value_count_in_dict(dict, value_to_count):
    """Count how many entries of a dictionary hold a given value.

    Args:
        dict (dict): dictionary whose values are inspected (values must be hashable)
        value_to_count (any): value to be counted

    Returns:
        int: number of occurrences of value_to_count among the dictionary's values (0 if absent)
    """
    # Tally every value once; a Counter reports 0 for values it has never seen
    frequencies = Counter(dict.values())

    return frequencies[value_to_count]

In [41]:
print(f"Percentage of positive opinion changes out of:")
print(f"- the interactions where one user reacted multiple times to a source tweet and an opinion change was detected => {round(value_count_in_dict(mask, 'positive') / len(mask) * 100, 1)}%")

Percentage of positive opinion changes out of:
- the interactions where one user reacted multiple times to a source tweet and an opinion change was detected => 50.7%


In [42]:
print(f"Percentage of negative opinion changes out of:")
print(f"- the interactions where one user reacted multiple times to a source tweet and an opinion change was detected => {round(value_count_in_dict(mask, 'negative') / len(mask) * 100, 1)}%")

Percentage of negative opinion changes out of:
- the interactions where one user reacted multiple times to a source tweet and an opinion change was detected => 49.3%


In [43]:
def compute_biggest_opinion_changes_deltas(path_to_opinion_changes):
    """Compute, for every group in an opinion changes file, the gap between its extreme sentiments.

    Args:
        path_to_opinion_changes (str): path to the JSON file with the opinion changes

    Returns:
        dict: mapping from each group to (highest sentiment - lowest sentiment) of that group
    """
    deltas = {}
    for group, sentiments in load_opinion_changes(path_to_opinion_changes).items():
        deltas[group] = max(sentiments) - min(sentiments)

    return deltas

In [44]:
deltas_labels = {
    2: 'minimum',
    3: 'slight',
    4: 'considerable',
    5: 'big',
    6: 'very big',
    7: 'huge',
    8: 'maximum'
}

In [45]:
# function to add value labels - adds the value of y
def add_labels_y_value(x, y):
    """Write each y value as a text label on the current plot, horizontally centred on its
    x position and sitting just above the point (x, y).

    Args:
        x (list): list of labels for x-axis of a plot
        y (list): list of values for y-axis of a plot
    """
    for position, x_label in enumerate(x):
        height = y[position]
        # the value serves both as the vertical position and as the displayed text
        plt.text(x_label, height, height, ha='center', va='bottom')

In [46]:
def plot_deltas_OC(reaction_types, deltas_labels, root_path, number_of_days):
    """Plot and save the distribution of the biggest opinion change per group as a bar chart.

    For the opinion changes file matching `reaction_types`, every group's delta
    (highest sentiment - lowest sentiment) is computed; the chart shows which percentage of
    groups falls on each delta, with bars ordered by increasing delta. The figure is written
    to '<root_path>/<types>_deltas_OC_<number_of_days>.png' and then closed.

    Args:
        reaction_types (list): list of reaction types whose opinion changes are plotted
        deltas_labels (dict): mapping from a delta to the label shown on the x-axis;
                              deltas without an entry are labelled with the delta itself
        root_path (str): folder in which the figure is saved
        number_of_days (str): dataset identifier used in the file name (e.g. '25_days')
    """
    opinion_changes_deltas = compute_biggest_opinion_changes_deltas(create_path_to_opinion_changes(reaction_types))

    # Count how many groups share each delta
    deltas_count = Counter(opinion_changes_deltas.values())
    # Computed once instead of once per bar
    total_groups = sum(deltas_count.values())

    percentages = {
        # Fall back to the raw delta so an unlabelled value does not abort the plot with a KeyError
        deltas_labels.get(delta, str(delta)): round(count / total_groups * 100, 1)
        for delta, count in sorted(deltas_count.items())
    }

    keys = list(percentages.keys())
    values = list(percentages.values())

    # Draw on a fresh figure so nothing left over on a previous pyplot figure ends up in the file;
    # the new figure becomes pyplot's current one, which add_labels_y_value relies on.
    fig, ax = plt.subplots()
    ax.bar(keys, values, edgecolor='black')
    # Add labels to the top of each bar
    add_labels_y_value(keys, values)
    ax.set_xlabel('Biggest difference in opinion')
    ax.set_ylabel('Percentage of groups')

    long_title = f'Distribution of the intensity of the biggest opinion changes for { ", ".join(reaction_types) }'
    # Wrap the title onto multiple lines
    wrapped_title = textwrap.fill(long_title, width=50)
    ax.set_title(wrapped_title, loc="center", pad=10)

    types = "_".join(reaction_types)
    path = os.path.join(root_path, f'{types}_deltas_OC_{number_of_days}.png')
    fig.savefig(path)
    plt.close(fig)

In [47]:
def plot_all_deltas_OC(reaction_types_full_list, deltas_labels, root_path, number_of_days):
    """Save one deltas bar chart per combination of reaction types.

    Args:
        reaction_types_full_list (list): list of reaction type combinations (each one a list)
        deltas_labels (dict): mapping from a delta to its x-axis label
        root_path (str): folder in which the figures are saved
        number_of_days (str): dataset identifier used in the file names
    """
    for combination in reaction_types_full_list:
        plot_deltas_OC(combination, deltas_labels, root_path, number_of_days)

In [48]:
plot_all_deltas_OC(reaction_types_full_list, deltas_labels, covaxxy_deltas_OC_graphs_path, number_of_days)

In [50]:
# TODO: ADD VISUALIZATIONS FOR ALL INSIGHTS + IMPROVE PRINT MESSAGES

NEW COLUMN ADDITION

CREATION OF REPLIES_AND_QUOTES DATAFRAME, FOR WHICH WE WANT TO SEE IF THEY SUPPORT OR NOT THE SOURCE TWEET.

In [56]:
def create_replies_and_quotes(full_dataset):
    """Extract the replies and quotes from the full dataset.

    Args:
        full_dataset (pandas Dataframe): dataframe with a 'reference_type' column

    Returns:
        pandas Dataframe: independent copy of the rows whose 'reference_type' is
                          'replied_to' or 'quoted', in their original order
    """
    is_reply_or_quote = full_dataset['reference_type'].isin(['replied_to', 'quoted'])

    return full_dataset.loc[is_reply_or_quote].copy()

In [57]:
replies_and_quotes = create_replies_and_quotes(merged_days)

HELPER FUNCTIONS TO ADD A NEW COLUMN TO THE test_replies_and_quotes DATAFRAME IN PARALLEL.

In [58]:
test_replies_and_quotes = replies_and_quotes.head(1000).copy()

In [59]:
counter = 0
progress = 0.001

In [60]:
def print_progress():
    """Count one more processed row and print a progress message when a threshold is reached.

    Works entirely through the module-level names `counter`, `progress` and
    `test_replies_and_quotes`; it takes no arguments and returns nothing.
    Raises ZeroDivisionError if `test_replies_and_quotes` is empty.
    """
    global counter
    global progress
    global test_replies_and_quotes
    ipython = get_ipython()
    if ipython is not None:
        # Re-read the current values from the IPython user namespace.
        # NOTE(review): presumably needed so the function sees the notebook's variables
        # when it runs outside the cell that defined them - confirm.
        counter = ipython.user_ns['counter']
        progress = ipython.user_ns['progress']
        test_replies_and_quotes = ipython.user_ns['test_replies_and_quotes']

    counter += 1

    # `progress` is the next fraction of processed rows at which a message is printed
    if ((counter / len(test_replies_and_quotes)) >= progress):
        print(f"{counter} / {len(test_replies_and_quotes)} replies or quotes processed.\n")
        # The threshold advances by one step per message, even if the fraction jumped further
        progress += 0.001
    # NOTE(review): the message names replies_and_quotes, but the length checked is that of
    # test_replies_and_quotes - confirm which dataframe is meant.
    if counter == len(test_replies_and_quotes):
        print("New column inserted in the replies_and_quotes dataframe.\n")

In [61]:
def supports_source_tweet(text):
    """Tell whether a text has a positive sentiment.

    NOTE(review): `senti` is not defined in any cell visible here; presumably it is a
    sentiment analyser created in a cell that is no longer part of the notebook - confirm
    and restore its import/initialisation.

    Args:
        text (any): text of a reply or quote

    Returns:
        str: '#' when `text` is not a string
        bool: otherwise, whether the first element returned by
              senti.getSentiment(text, score='scale') is greater than 0
    """
    if isinstance(text, str):
        # Progress reporting is currently disabled:
        # print_progress()
        return senti.getSentiment(text, score='scale')[0] > 0

    return '#'

In [62]:
# Define a wrapper function that applies supports_source-tweet to a chunk of data
def apply_function_to_chunk(chunk):
    """Apply supports_source_tweet to every element of a chunk.

    Args:
        chunk: object exposing `.apply` (the caller in this notebook passes slices of the 'text' column)

    Returns:
        the result of `chunk.apply(supports_source_tweet)`
    """
    chunk_result = chunk.apply(supports_source_tweet)
    return chunk_result

In [63]:
def add_support_source_tweet_column_parallel(replies_and_quotes):
    """Compute the 'supports_source_tweet' column in parallel and add it right after 'text'.

    The 'text' column is cut into contiguous chunks (at most one per CPU), each chunk is
    processed by apply_function_to_chunk in a worker process, and the partial results are
    concatenated back in their original order. The dataframe is modified in place.
    If the column already exists (e.g. the cell is run a second time), its values are
    overwritten instead of failing on a duplicate insert.

    Args:
        replies_and_quotes (pandas Dataframe): dataframe with a 'text' column

    Returns:
        pandas Dataframe: the same dataframe object, now holding the 'supports_source_tweet' column
    """
    column_name = 'supports_source_tweet'
    texts = replies_and_quotes['text']

    # Split by position with plain slices. np.array_split on a Series goes through
    # Series.swapaxes, which pandas has deprecated.
    num_chunks = multiprocessing.cpu_count()
    boundaries = [len(texts) * i // num_chunks for i in range(num_chunks + 1)]
    chunks = [texts.iloc[start:stop] for start, stop in zip(boundaries[:-1], boundaries[1:])]
    # Drop empty chunks (fewer rows than CPUs); keep one chunk so an empty frame still works
    chunks = [chunk for chunk in chunks if len(chunk) > 0] or [texts]

    # Create a multiprocessing pool and apply the function to each chunk in parallel
    with multiprocessing.Pool(processes=len(chunks)) as pool:
        results = pool.map(apply_function_to_chunk, chunks)

    # Concatenate the results back into a single Series (aligned on the dataframe's index)
    support = pd.concat(results)

    if column_name in replies_and_quotes.columns:
        replies_and_quotes[column_name] = support
    else:
        replies_and_quotes.insert(replies_and_quotes.columns.get_loc('text') + 1, column_name, support)

    return replies_and_quotes

PARALLEL EXECUTION. Benchmark tests (my machine): 1000 records => 1m11.1s

In [64]:
# test_replies_and_quotes_parallel = add_support_source_tweet_column_parallel(test_replies_and_quotes)

SEQUENTIAL EXECUTION. Benchmark tests (my machine): 1000 records => 2m30.0s

In [65]:
test_replies_and_quotes = replies_and_quotes.head(1000).copy()

In [66]:
# test_replies_and_quotes.insert(test_replies_and_quotes.columns.get_loc('text') + 1, 'supports_source_tweet', test_replies_and_quotes['text'].apply(supports_source_tweet))

CHECK IF RESULTS ARE THE SAME FOR BOTH METHODS

In [67]:
# test_replies_and_quotes_parallel.equals(test_replies_and_quotes)

CODE TO MODIFY THE ORIGINAL DATAFRAME (WITH ALL REPLIES AND QUOTES), WITH AN ADDED COLUMN NAMED 'supports_source_tweet', USING PARALLEL COMPUTATION, AS WELL AS SAVE IT TO A .CSV FILE (UNCOMMENT CELLS TO RUN)

NOTE: There are almost 1 million replies and quotes in the original dataframe, so the following statement is extremely time-consuming.

In [68]:
# replies_and_quotes = add_support_source_tweet_column_parallel(replies_and_quotes)

In [69]:
# path_to_replies_and_quotes = files_path + '/replies_and_quotes_modified.csv'

In [70]:
# # save the DataFrame to a CSV file
# replies_and_quotes.to_csv(path_to_replies_and_quotes, index=False)

In order to calculate the distribution of the tweets per hour, I will parse the "created_at" column, extract the hour property and create a separate column in each dataframe. I will place it next to the "created_at" column so that it can be easily verified. The data originates from the Twitter API, so it comes in the standard ISO 8601 format, which can be easily parsed using the parser module from the dateutil package.

Note: the cell below runs for approximately 2m30' on my machine (~25-30 seconds for each file).

In [71]:
# for key, day in days.items():
#     if 'hour' not in day.columns:
#         day.insert(1, 'hour', day['created_at'].apply(lambda date: parser.parse(date).hour))
#         print(f"New 'hour' column inserted in the {key} dataframe")

In [72]:
# for key, day in days.items():
#     if 'hour' not in day.columns:
#         hours = []
#         for time in day.loc[:,"created_at"]:
#             hour = parser.parse(time).hour
#             hours.append(hour)
#         day.insert(1, "hour", hours, True)
#         print(key + " - added 'hour' column")


The final distribution is made up of the sum of all individual days' distributions. I save a figure in the graphs/ folder for each day, as well as an overall distribution.

In [73]:
# final_distribution = pd.Series(0, index=days['1-3-2021'].loc[:,'hour'].sort_values(ascending=True).unique())
# for key, day in days.items():
#     hour_column_ascending = day.loc[:,"hour"].sort_values(ascending=True)
#     distribution = hour_column_ascending.value_counts()[hour_column_ascending.unique()]
#     final_distribution = final_distribution.add(distribution)
#     axes = distribution.plot(kind='bar')
#     figure_path = f"{covaxxy_longitudinal_analysis_graphs}/{key}_distribution.png"
#     axes.figure.savefig(figure_path)
#     plt.close()
# axes = final_distribution.plot(kind='bar')
# figure_path = f"{covaxxy_longitudinal_analysis_graphs}/overall_distribution.png"
# axes.figure.savefig(figure_path)
# plt.close()
