# Year in Pixels Visualizations

In [56]:
# imports
import string
import json
import datetime
from copy import deepcopy
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap, to_rgba, Normalize
from matplotlib.font_manager import FontProperties
from wordcloud import WordCloud, STOPWORDS
from PIL import Image
import seaborn as sns

import matplotlib
matplotlib.use('module://mplcairo.macosx')

In [2]:
# load externally defined environment variables
from dotenv import dotenv_values
env_vars = dotenv_values()

In [3]:
# load color palette from environment variables
palette_list = [env_vars["COLOR_PALETTE_{}".format(i)] for i in range(1,6)]
palette_list

['ff2700', '88220d', '151518', '213788', '00bfff']

In [4]:
# data extraction into Pandas DataFrame
year = env_vars['YEAR']

with open('./data/data_{}.json'.format(year)) as fp:
    data = json.load(fp)
data = pd.DataFrame(data)
data = data.drop(columns=['type'])

In [5]:
# data cleaning
data['scores'] = data['scores'].apply(lambda x : int(x[0])) # convert scores from lists to integers
data['date'] = data['date'] = pd.to_datetime(data['date']) # convert dates from strings to datetime objects

# make a column for each tag
tags = {}
for index, row in data.iterrows():
    row_data = {}
    tags_list = row['tags']
    for tag in tags_list:
        if tag['type'] not in row_data:
            row_data = {**row_data, tag['type'] : tag['entries']}
        else:
            row_data[tag['type']].extend(tag['entries'])
    tags = {**tags, index : row_data}

tags = pd.DataFrame(tags).T

data = pd.concat([data, tags], axis=1).drop(columns=['tags'])

data['Productivity Rating'] = data['Productivity Rating'].apply(lambda x : int(x[0]) if isinstance(x, list) else 0) # convert productivity ratings from list to int

# Filter for this year's and last year's data
def check_year(year_to_check : datetime.date, reference : int):
    return True if year_to_check.year == reference else False

data_all = data

mask_previous_year = data_all['date'].apply(lambda x : check_year(x, int(year)-1))
data_previous_year = data_all[mask_previous_year]

mask_current_year = data_all['date'].apply(lambda x : check_year(x, int(year)))
data_current_year = data_all[mask_current_year]

In [6]:
# Font setting (change at will)
font = env_vars['FONT']

# Pixels (Github-like viz)

In [7]:
def generate_colormap_from_hex_list(palette_list : list[str]) -> LinearSegmentedColormap:
    """"""
    
    rgba_palette = [to_rgba('#'+color) for color in palette_list]
    cmap = LinearSegmentedColormap.from_list("custom_palette", rgba_palette, N=1024)

    return cmap

In [8]:
from matplotlib.patches import FancyBboxPatch

# Pixels viz function (each row is one week, squares are a little spaced out and rounded, month labels are visible)
def generate_pixels_heatmap(df : pd.DataFrame, palette : list[str], output_file : str, emoji : bool = False):
    """
    Generates a GitHub-style heatmap chart.

    Parameters:
        df (pd.DataFrame): DataFrame with 'date' (datetime) and 'scores' columns.
        palette (list): List of hexadecimal color strings defining the color palette.
        output_file (str): Path to save the resulting SVG image.

    Returns:
        None
    """

    assert 'emoji' in df.columns or not emoji, "Emoji condition not specified."

    # Ensure 'date' column is datetime
    df['date'] = pd.to_datetime(df['date'])

    # Create a full year of dates for the current year
    year = df['date'].dt.year.min()
    start_date = pd.Timestamp(f'{year}-01-01')
    end_date = pd.Timestamp(f'{year}-12-31')
    all_dates = pd.date_range(start=start_date, end=end_date, freq='D')

    # Create a DataFrame for all days of the year
    all_days = pd.DataFrame({'date': all_dates})

    # Merge with the input DataFrame to align data
    df = all_days.merge(df, on='date', how='left').fillna({'rating': 0})
    data = df.copy()

    # Add week and day of the week for plotting
    df['week'] = df['date'].dt.isocalendar().week
    df['day_of_week'] = df['date'].dt.weekday

    # Handle the edge case where week 53 exists in the data
    if df['week'].max() == 53:
        df.loc[df['week'] == 53, 'week'] = 52

    # Ensure unique entries for pivot table by averaging ratings for duplicate days
    df = df.groupby(['week', 'day_of_week'], as_index=False)['scores'].mean()

    # Create a pivot table for heatmap data
    heatmap_data = df.pivot(index='day_of_week', columns='week', values='scores')

    # Create a custom colormap
    cmap = generate_colormap_from_hex_list(palette)

    # Plot the heatmap with rounded corners and spacing
    fig, ax = plt.subplots(figsize=(20, 10))
    ax.set_aspect('equal')
    
    # Define the size of each square and the spacing
    square_size = 1.0
    spacing = 0.2
    prop = FontProperties(fname='/System/Library/Fonts/Apple Color Emoji.ttc', size=10)

    # Draw each cell manually with rounded corners
    for y, (day, row) in enumerate(heatmap_data.iterrows()):
        for x, value in enumerate(row):
            if not np.isnan(value):
                color = cmap((value - 1.0) / 4.0)
                rect = FancyBboxPatch(
                    (x * (square_size + spacing), y * (square_size + spacing)),
                    square_size, square_size,
                    boxstyle=f"round,pad=0,rounding_size=0.2",
                    linewidth=0,
                    edgecolor=None,
                    facecolor=color
                )
                ax.add_patch(rect)
                that_day = data.iloc[y + x * 7, :]
                if emoji and that_day.emoji != '':
                    text = ax.annotate(that_day.emoji,
                                (x * (square_size + spacing) + 0.120, y * (square_size + spacing) + 0.9))
                    text.set(fontproperties=prop)

    # Add month labels at the top of the heatmap
    months = {'Jan' : 0,
              'Feb' : 4,
              'Mar' : 8,
              'Apr' : 13,
              'May' : 17,
              'Jun' : 21,
              'Jul' : 25,
              'Aug' : 30,
              'Sep' : 34,
              'Oct' : 38,
              'Nov' : 43,
              'Dec' : 47}
    for i, month in enumerate(months):
        ax.text(
            months[month] * (square_size + spacing),
            heatmap_data.shape[0] * (square_size + spacing) + 0.5,
            month,
            ha='left', va='bottom', color='white', fontsize=10, fontweight='bold', fontname=font
        )

    # Add weekday labels on the y-axis
    weekdays = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
    first_weekday = start_date.day_of_week
    for i, weekday in enumerate(weekdays):
        y_pos = ((first_weekday + i) % 7) * (square_size + spacing)
        if ((first_weekday + i) % 7) % 2 == 1 : continue
        ax.text(
            -0.5,
            y_pos,
            weekday,
            ha='right', va='top', color='white', fontsize=10, fontweight='bold', fontname=font
        )

    # Adjust the limits and remove axes
    ax.set_xlim(0, heatmap_data.shape[1] * (square_size + spacing))
    ax.set_ylim(0, heatmap_data.shape[0] * (square_size + spacing))
    ax.invert_yaxis()
    ax.axis('off')

    # TODO: add possibility to rotate 90° clockwise

    # Save the heatmap to a SVG file with a transparent background
    plt.savefig(output_file, dpi=600, bbox_inches='tight', pad_inches=0, transparent=True)
    plt.close()

In [9]:
# Execution (2024)
generate_pixels_heatmap(data_current_year.copy(), palette_list, 'pixels_grid.svg')

# Average Rating YoY

In [10]:
# Avg rating viz function, YoY increments if existing
def plot_average_rating(data : pd.DataFrame, palette_list : list[str], output_file : str):
    """
    Plot a colorbar representing the ratings range and draw the average rating on it.

    Parameters:
        data (pd.DataFrame): DataFrame with 'scores' column.
        palette_list (list): List of hexadecimal color strings defining the color palette.
        output_file (str): Path to save the resulting SVG image.

    Returns:
        None.
    """
    # Define the ratings range
    ratings_range = (0, 5)

    # Calculate the average rating
    average_rating = data['scores'].mean()

    # Create a LinearSegmentedColormap from the palette
    cmap = generate_colormap_from_hex_list(palette_list)

    # Create a figure and axis
    fig, ax = plt.subplots(figsize=(10, 1))

    range_x_axis = np.linspace(ratings_range[0], ratings_range[1], len(palette_list)*256)

    # Draw the colorbar
    norm = Normalize(vmin=ratings_range[0], vmax=ratings_range[1])
    thinness = 20
    aspect = 1/thinness
    cb = ax.imshow(range_x_axis.reshape((1,len(palette_list)*256)), cmap=cmap, aspect=aspect, norm=norm, extent=[ratings_range[0], ratings_range[1],0, thinness])
    ax.set_yticks([])
    ax.set_xticks([])

    # TODO: Draw the rating milestones
    # milestones = range_x_axis
    # for milestone in milestones:
    #     ax.scatter(milestone * 256 / ratings_range[1], 0.5, color=cmap(norm(milestone)), s=800, edgecolors='black', zorder=20)

    # Draw the average rating indicator
    ax.axvline(average_rating, color='white', linewidth=5, zorder=10)
    ax.text(ratings_range[0] - 0.1, 0.5, f'{average_rating:.2f}', color='white', ha='right', va='bottom', fontsize=48, fontweight='bold', fontname=font)
    fig.set_facecolor('black')

    plt.savefig(output_file, dpi=600, bbox_inches='tight', pad_inches=0, transparent=True)
    plt.close()

In [11]:
# Show the plot
plot_average_rating(data_current_year.copy(), palette_list, 'avg_rating_curr_year.svg')

In [12]:
# Line chart of avg rating per year
def plot_ratings_per_year(data_all : pd.DataFrame, label : str, palette_list : list[str], output_file : str, what : str = 'mean'):
    """
    Plot a line chart with value averages per year.

    Parameters:
        data_all (pd.DataFrame): DataFrame containing timed ratings.
        label (str): The column of ratings to be averaged.
        palette_list (list): List of hexadecimal color strings defining the color palette.
        output_file (str): Path to save the resulting SVG image.

    Returns:
        None.
    """

    if label == 'emoji':
        data_all['emoji'] = data_all['emoji'].apply(lambda x : 1.0 if x != '' else 0.0)

    years = list(range(
        data_all.loc[0,['date']].to_numpy()[0].year,
        data_all.loc[data_all.shape[0]-1, ['date']].to_numpy()[0].year+1,
        1))

    avgs = []
    for year in years:
        filter = data_all['date'].apply(lambda x : check_year(x, year)) #TODO : check_year not needed anymore, substitute with data['date'].dt.year == year
        data_filtered = data_all[filter]

        if what == 'mean':
            avgs.append(data_filtered[label].mean())
        elif what == 'sum':
           avgs.append(data_filtered[label].sum()) 
        else:
            raise AttributeError(f'{what} is not an implemented aggregation.')

    fig, ax = plt.subplots(figsize=(5,4))

    for data_point in range(len(avgs)):
        ax.annotate(f'{avgs[data_point]:.2f}', (years[data_point]+0.1,avgs[data_point]), color='white', fontname=font, fontsize=15, fontweight='bold')
        if data_point > 0 :
            yoy_diff = avgs[data_point] - avgs[data_point - 1]
            sign = '' if yoy_diff < 0 else '+'
            color = '#'+palette_list[0] if yoy_diff < 0 else '#'+palette_list[-1]
            lateral_offset = years[data_point - 1] + 0.6
            vertical_offset = (avgs[data_point] + avgs[data_point - 1]) / 2.0
            ax.annotate(f'{sign}{yoy_diff:.2f}', (lateral_offset, vertical_offset), color=color, fontname=font, fontsize=12, fontweight='bold')
        ax.axvline(years[data_point], ymin=0.0, ymax=(avgs[data_point]-min(avgs))/(max(avgs)-min(avgs)) - 0.025, color='white', linestyle='--')

    ax.plot(years, avgs, 'o-', color='white', linewidth=3.5, markersize=10) #TODO : use sns.lineplot to obtain smoother curves
    ax.set_yticks([])
    ax.set_xticks(years, labels=years, fontname=font, fontweight='bold', fontsize=17, color='white')
    fig.set_facecolor('black')
    ax.set_facecolor('black')

    ax.set_frame_on(False)

    plt.savefig(output_file, dpi=600, bbox_inches='tight', pad_inches=0, transparent=True)
    plt.close()

In [13]:
# Execution (2024)
plot_ratings_per_year(data_all.copy(), 'scores', palette_list, 'avg_ratings_per_year.svg')

# Rating Frequency

In [14]:
# Rating frequency pie chart (with YoY increments if existing) viz function
def plot_rating_frequency_pie_chart(data_current_year : pd.DataFrame, palette_list : list[str], output_file : str, data_previous_year : pd.DataFrame = None):
    """"""

    # count rating frequencies
    rating_frequencies_current_year = data_current_year.groupby('scores').count()['date'].to_numpy(dtype=np.int32)

    # if YoY, compute previous year's measures
    if data_previous_year is not None:
        rating_frequencies_previous_year = data_previous_year.groupby('scores').count()['date'].to_numpy(dtype=np.int32)
        previous_year_pcts = rating_frequencies_previous_year / 365.0 * 100.0

    # plot pie chart
    fig, ax = plt.subplots(figsize=(5,5))

    # adjust palette
    palette_list = ['#'+color for color in palette_list]

    # use emojis as labels
    prop = FontProperties(fname='/System/Library/Fonts/Apple Color Emoji.ttc', size=23)

    # set properties of percentage text labels
    textprops = {
        'fontsize' : 10,
        'fontname' : font,
        'color' : 'white'
    }

    # draw pie chart
    _, texts, autotexts = ax.pie(x=rating_frequencies_current_year,
           labeldistance=0.90,
           labels=['😭', '😢', '😐', '🙂', '😁'],
           colors=palette_list,
           autopct=f"%.2f%%",
           textprops=textprops)
    
    for text in texts:
        text.set_fontproperties(prop)

    for t in range(len(autotexts)):
        if data_previous_year is None:
            autotexts[t].set_text(autotexts[t].get_text() + f" ({rating_frequencies_current_year[t]})")
        else:
            pct_difference = float(autotexts[t].get_text()[:-1]) - (previous_year_pcts[t])
            sign = '+' if pct_difference > 0 else ''
            autotexts[t].set_text(autotexts[t].get_text() + f" ({sign}{pct_difference:.2f})")

    fig.set_facecolor('black')
    ax.set_facecolor('black')

    ax.set_frame_on(False)

    plt.savefig(output_file, dpi=600, bbox_inches='tight', pad_inches=0, transparent=True)
    plt.close()

In [15]:
# Execution (2024)
plot_rating_frequency_pie_chart(data_current_year.copy(), palette_list, 'rating_frequency_pie_chart.svg', data_previous_year)

# Avg Rating per Weekday

In [16]:
# Function to make rounded bars in bar plots
def round_bars(ax):
    """"""
    new_patches = []
    for patch in reversed(ax.patches):
        bb = patch.get_bbox()
        color = patch.get_facecolor()
        
        # Create a FancyBboxPatch with rounded corners
        p_bbox = FancyBboxPatch(
            (bb.xmin, bb.ymin),
            abs(bb.width),
            abs(bb.height),
            boxstyle="round,pad=0,rounding_size=0.020",
            ec="none", fc=color,
            mutation_aspect=4
        )
        
        # Remove the old patch and add the new one
        patch.remove()
        new_patches.append(p_bbox)

    # Add all new patches to the axes
    for patch in new_patches:
        ax.add_patch(patch)

In [17]:
# (Normalized) Avg rating/productivity level per weekday/tag viz function, with YoY increments if existing
def plot_avg_ratings_per_weekday(data_all : pd.DataFrame, label : str, palette_list : list[str], output_file : str):
    """
    Plot a bar chart with value averages per weekday.

    Parameters:
        data_all (pd.DataFrame): DataFrame containing timed ratings.
        label (str): The column of ratings to be averaged.
        palette_list (list): List of hexadecimal color strings defining the color palette.
        output_file (str): Path to save the resulting SVG image.

    Returns:
        None.
    """

    assert label in data_all.columns, f"No column named '{label}'"
    # assert data_all[label].dtype not a string and not a list

    curr_year = data_all.loc[data_all.shape[0]-1, 'date'].year

    total_avg = 0

    yearly_avgs = []
    for year in range (curr_year-1, curr_year+1):
        mask_year = data_all['date'].apply(lambda x : check_year(x, year))
        data_year = data_all[mask_year]
        data_year['weekday'] = data_year['date'].apply(lambda x : datetime.date.weekday(x))
        yearly_avgs.append(data_year.groupby('weekday', axis=0)[label].mean().to_numpy())
        if year == curr_year:
            total_avg = float(data_year[label].mean())

    fig, ax = plt.subplots(figsize=(9,4))

    yoy_diffs = [yearly_avgs[-1][i] - yearly_avgs[0][i] for i in range(7)]
    for j in range(7):
        yoy_diffs[j] = '+'+f'{yoy_diffs[j]:.2f}' if yoy_diffs[j] > 0 else f'{yoy_diffs[j]:.2f}'
        if '0.00' in yoy_diffs[j] : yoy_diffs[j] = '='

    container = ax.bar(range(7), yearly_avgs[-1], color='white', width=0.3)
    ax.bar_label(container, labels=[f"{yearly_avgs[-1][i]:.2f}({yoy_diffs[i]})" for i in range(7)], fontname=font, fontsize=11, color='white', padding=18)
    round_bars(ax)
    ax.axhline(total_avg, xmin=0.04, color='#'+palette_list[0], linestyle='--')
    ax.annotate('AVG', (-0.45, total_avg), fontname=font, fontsize=8, color='#'+palette_list[0], va='center')
    ax.set_yticks([])
    ax.set_xticks(range(7), labels=['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'], fontname=font, fontweight='bold', fontsize=17, color='white')
    ax.set_ylim(0, 5)
    fig.set_facecolor('black')
    ax.set_facecolor('black')

    ax.set_frame_on(False)

    plt.savefig(output_file, dpi=600, bbox_inches='tight', pad_inches=0, transparent=True)
    plt.close()

In [18]:
# Execution (2024, rating)
plot_avg_ratings_per_weekday(data_all.copy(), 'scores', palette_list, 'avg_ratings_per_weekday.svg')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_year['weekday'] = data_year['date'].apply(lambda x : datetime.date.weekday(x))
  yearly_avgs.append(data_year.groupby('weekday', axis=0)[label].mean().to_numpy())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_year['weekday'] = data_year['date'].apply(lambda x : datetime.date.weekday(x))
  yearly_avgs.append(data_year.groupby('weekday', axis=0)[label].mean().to_numpy())


# Emotion Frequency

In [19]:
def tag_list(data : pd.DataFrame, tag : str):
    # collect all possible tag values
    masked_data = data[data[tag].notna()] # masked DataFrame
    tag_values = masked_data[tag].apply(lambda x : [x] if not isinstance(x, list) else x).sum() # list with duplicates
    tag_values = list(dict.fromkeys(tag_values)) # remove duplicates

    return tag_values

In [20]:
# Tag frequency viz function (descending sort by frequency)
def plot_tag_frequency(data_current_year : pd.DataFrame, tag_category : str, palette_list : list[str], output_file : str):
    """
    Plot a horizontal bar chart with tags for a certain category, ordered by frequency.

    Parameters:
        data_current_year (pd.DataFrame): DataFrame containing one list of 'tag_category' tags per day.
        tag_category (str): The category of tags to count.
        palette_list (list): List of hexadecimal color strings defining the color palette.
        output_file (str): Path to save the resulting SVG image.

    Returns:
        None.
    """

    assert tag_category in data_current_year.columns, f"No tag category called {tag_category}"

    # collect all possible tag values
    tag_values = tag_list(data_current_year, tag_category)

    # count tag appearances
    tag_counters = {tag : 0 for tag in tag_values}
    for index, row in data_current_year[data_current_year[tag_category].notna()].iterrows(): # iterate over masked DataFrame to avoid null values
        tags_of_the_day = list(row[tag_category])
        for tag in tags_of_the_day:
            tag_counters[tag] += 1

    # sort for number of appearances (descending)
    tag_counters = pd.DataFrame(tag_counters.values(), index=tag_counters.keys(), columns=['count'])
    tag_counters.sort_values(by='count', ascending=True, inplace=True) # plotting function will flip the order anyways

    # draw horizontal bar chart
    fig, ax = plt.subplots(figsize=(9,4)) 

    data = tag_counters['count'].to_numpy(dtype=np.int32)

    container = ax.barh(y=range(0,len(tag_counters.index)), data=data, width=data, height=0.5, color='white')
    # round_bars(ax)
    ax.bar_label(container, data, color='white', fontname=font, fontsize=9, padding=6)

    ax.set_yticks(range(0,len(tag_counters.index)), labels=tag_counters.index, color='white', fontname=font, fontsize=11)
    ax.set_xticks([])
    fig.set_facecolor('black')
    ax.set_facecolor('black')

    ax.set_frame_on(False)

    plt.savefig(output_file, dpi=600, bbox_inches='tight', pad_inches=0, transparent=True)
    plt.close()    

In [21]:
# Execution (2024, emotions)
plot_tag_frequency(data_current_year.copy(), 'Emotions', palette_list, 'emotion_frequency.svg')

# Activity Frequency

In [22]:
# Execution (2024, activities)
plot_tag_frequency(data_current_year.copy(), 'Activities', palette_list, 'activity_frequency.svg')

# Activity-Emotion Correlation

In [23]:
# Pairwise tag-tag correlation (Pearson) heatmap function
def plot_tag_to_tag_correlation_heatmap(data_current_year : pd.DataFrame, tag_1 : str, tag_2 : str, palette_list : list[str], output_file : str):
    """"""

    # compute tag-2-tag correlation
    # collect possible tag values
    tag_keys = tag_list(data_current_year, tag_1)
    tag_columns = tag_list(data_current_year, tag_2) if tag_2 is not None else tag_keys

    tag_keys = sorted(tag_keys) # ordered alphabetically (for now)
    tag_columns = sorted(tag_columns)

    df_columns = tag_keys + tag_columns

    one_hot_encoded_rows = []
    for index, row in data_current_year.iterrows():
        tag_1_values = list(row[tag_1]) if not isinstance(row[tag_1], float) else []
        if tag_2 is not None : tag_2_values = list(row[tag_2]) if not isinstance(row[tag_2], float) else []

        tag_1_one_hot_encoding = [1 if x in tag_1_values else 0 for x in tag_keys]
        if tag_2 is not None : tag_2_one_hot_encoding = [1 if y in tag_2_values else 0 for y in tag_columns]

        if tag_2 is not None : one_hot_encoded_rows.append(tag_1_one_hot_encoding + tag_2_one_hot_encoding)
        else : one_hot_encoded_rows.append([tag_1_one_hot_encoding])

    one_hot_encoded_rows = pd.DataFrame(one_hot_encoded_rows, index=data_current_year.index, columns=df_columns)

    tag_table = pd.concat([data_current_year['date'], one_hot_encoded_rows], axis=1)

    # TODO : order rows by number of appearances
    correlations = tag_table.corr(method='pearson')

    self_correlations = correlations.loc[tag_keys, tag_keys] # could need for some heatmap in the future?
    ethero_correlations = correlations.loc[tag_keys, tag_columns]

    correlations_to_plot = ethero_correlations if tag_2 is not None else self_correlations

    # Create a LinearSegmentedColormap from the palette
    cmap = generate_colormap_from_hex_list(palette_list)

    fig, ax = plt.subplots(figsize=(10,10))
    
    # draw 2d heatmap
    ax = sns.heatmap(correlations_to_plot, center=0.0, cmap=cmap, square=True, cbar=False)

    # TODO: (all notebook) make ticks invisible, but not labels
    cbar = ax.figure.colorbar(ax.collections[0], shrink=0.63)
    cbar.set_ticks(cbar.get_ticks()[1:-1], labels=cbar.get_ticks()[1:-1], fontname=font, color='white')
    ax.set_yticks(range(0,len(tag_keys)), labels=['\n'+key for key in tag_keys], fontname=font, color='white', va='top')
    ax.set_xticks(range(0,len(tag_columns)), labels=['\n'+col for col in tag_columns], fontname=font, color='white', ha='left')
    ax.set_facecolor('black')
    fig.set_facecolor('black')

    plt.savefig(output_file, dpi=600, bbox_inches='tight', pad_inches=0, transparent=True)
    plt.close()

In [24]:
# Execution (2024, activity v. emotion)
plot_tag_to_tag_correlation_heatmap(data_current_year.copy(), 'Activities', 'Emotions', palette_list, 'activity_emotion_correlation.svg')

# Location-Emotion Correlation

In [25]:
# Execution (2024, location v. emotion)
plot_tag_to_tag_correlation_heatmap(data_current_year.copy(), 'Location', 'Emotions', palette_list, 'location_emotion_correlation.svg')

# Sick Days 🤒

In [26]:
def sick_day(row : pd.DataFrame) -> str:
    assert set(['scores', 'Symptoms', 'Medication']).issubset(set(row.index)), "Wrong columns."
    if isinstance(row['Symptoms'], float) or isinstance(row['Medication'], float) : return ''
    if len(row['Symptoms'] + row['Medication']) > 4.0:
        if row['scores'] < 2.0 : return '💀'
        else : return '🤒'
    return ''

In [27]:
# Number of symptoms + number of pharmaceuticals calendar viz function, ratings in semi-transparency, skull 💀 emoji if bad day + sick day
data_sick_days = data_all.copy()
# data_sick_days
data_sick_days['emoji'] = data_sick_days.apply(sick_day, axis=1)
data_sick_days_current_year = data_sick_days[data_sick_days['date'].dt.year == 2024]

generate_pixels_heatmap(data_sick_days_current_year, palette_list, 'sick_days_heatmap.svg', True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['date'] = pd.to_datetime(df['date'])


In [28]:
# Total number of sick (💀) days per year line chart
plot_ratings_per_year(data_sick_days, 'emoji', palette_list, 'sick_days_per_year.svg', what='sum')

# Productivity

In [29]:
# Productivity calendar viz function, ratings in semi-transparency
def productive_day(row : pd.DataFrame) -> str :
    assert set(['Productivity Rating']).issubset(set(row.index)), "Wrong columns."
    if row['Productivity Rating'] >= 4.0 : return '💪'
    elif row['Productivity Rating'] == 0.0 : return '😴'
    else : return ''

In [30]:
# Average productivity per year line chart
plot_ratings_per_year(data_all.copy(), 'Productivity Rating', palette_list, 'avg_productivity_per_year.svg')

In [31]:
# Avg productivity per weekday execution (2024)
plot_avg_ratings_per_weekday(data_all.copy(), 'Productivity Rating', palette_list, 'avg_productivity_per_weekday.svg')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_year['weekday'] = data_year['date'].apply(lambda x : datetime.date.weekday(x))
  yearly_avgs.append(data_year.groupby('weekday', axis=0)[label].mean().to_numpy())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_year['weekday'] = data_year['date'].apply(lambda x : datetime.date.weekday(x))
  yearly_avgs.append(data_year.groupby('weekday', axis=0)[label].mean().to_numpy())


In [32]:
# Execution (2024)
data_productive_days = data_current_year.copy()

data_productive_days['emoji'] = data_productive_days.apply(productive_day, axis=1)

generate_pixels_heatmap(data_productive_days, palette_list, 'productive_days_heatmap.svg', emoji=True)

# Weighted Avg Rating per Tag

In [33]:
def _in(contained, container) -> bool:
    if isinstance(container, list) : return contained in container
    else : return contained == container

In [34]:
# TODO: YoY?
# (Normalized) avg rating per tag viz function
def plot_avg_rating_per_tag(data_current_year : pd.DataFrame, label : str, palette_list : list[str], output_file : str, shrink : float = 0.0):
    """"""

    assert label in data_current_year.columns, f"No column named '{label}'"
    # assert data_all[label].dtype not a string and not a list

    curr_year = data_current_year.loc[data_all.shape[0]-1, 'date'].year
    tag_values = tag_list(data_current_year, label)

    tag_values = sorted(tag_values)

    total_avg = 0

    yearly_avgs = []
    for tag in tag_values:
        data_current_year[str(tag)] = data_current_year[label].apply(lambda x : 1 if _in(tag, x) else 0)
        data_to_avg = data_current_year[data_current_year[str(tag)] == 1]
        yearly_avgs.append(data_to_avg['scores'].sum() / (data_to_avg.shape[0] + shrink)) # shrunk mean (penalize values with few appearances)
    
    total_avg = float(data_current_year['scores'].mean())

    fig, ax = plt.subplots(figsize=(9,4))

    # TODO: maybe for the future?
    # yoy_diffs = [yearly_avgs[-1][i] - yearly_avgs[0][i] for i in range(7)]
    # for j in range(7):
    #     yoy_diffs[j] = '+'+f'{yoy_diffs[j]:.2f}' if yoy_diffs[j] > 0 else f'{yoy_diffs[j]:.2f}'
    #     if '0.00' in yoy_diffs[j] : yoy_diffs[j] = '='

    # TODO: sort? Alphabetical for now

    container = ax.bar(range(len(tag_values)), yearly_avgs, color='white', width=0.3)
    ax.bar_label(container, labels=[f"{yearly_avgs[i]:.2f}" for i in range(len(tag_values))], fontname=font, fontsize=11, color='white', padding=18) #({yoy_diffs[i]})
    round_bars(ax)
    ax.axhline(total_avg, xmin=0.04, color='#'+palette_list[0], linestyle='--')
    ax.annotate('AVG', (len(tag_values)-0.65, total_avg+0.1), fontname=font, fontsize=8, color='#'+palette_list[0], va='center')
    ax.set_yticks([])

    rotation = 90 if len(tag_values) > 6 else 0 # I don't want productivity ratings rotated

    ax.set_xticks(range(len(tag_values)), labels=tag_values, fontname=font, fontweight='bold', fontsize=17, color='white', rotation=rotation)
    ax.set_ylim(0, 5)
    fig.set_facecolor('black')
    ax.set_facecolor('black')

    ax.set_frame_on(False)

    plt.savefig(output_file, dpi=600, bbox_inches='tight', pad_inches=1.0, transparent=True)
    plt.close()

In [35]:
# (Normalized) Avg rating per productivity level execution (2024)
plot_avg_rating_per_tag(data_current_year.copy(), 'Productivity Rating', palette_list, 'avg_rating_per_productivity.svg')

In [36]:
# (Normalized) Avg rating per location execution (2024)
plot_avg_rating_per_tag(data_current_year.copy(), 'Location', palette_list, 'avg_rating_per_location.svg', shrink=0.666)

In [37]:
# (Normalized) Avg rating per emotion execution (2024)
plot_avg_rating_per_tag(data_current_year.copy(), 'Emotions', palette_list, 'avg_rating_per_emotion.svg', shrink=0.666)

In [38]:
# (Normalized) Avg rating per activity execution (2024)
plot_avg_rating_per_tag(data_current_year.copy(), 'Activities', palette_list, 'avg_rating_per_activity.svg', shrink=0.666)

In [39]:
# (Normalized) Avg rating per medication execution (2024)
plot_avg_rating_per_tag(data_current_year.copy(), 'Medication', palette_list, 'avg_rating_per_medication.svg')

In [40]:
# (Normalized) Avg rating per symptom execution (2024)
plot_avg_rating_per_tag(data_current_year.copy(), 'Symptoms', palette_list, 'avg_rating_per_symptom.svg', shrink=0.666)

# Locations Visited 🛫

In [41]:
# 3D Map with visit frequency and average rating per location viz function

In [42]:
# Execution (2024)

# Word Clouds ☁️

In [128]:
additional_stopwords = [
    "i'm",
    "it's",
    "kinda",
    "said",
    "overall",
    "anyways"
]

stopwords = set(STOPWORDS)

for word in additional_stopwords:
    stopwords.add(word)

In [124]:
def term_list(document_corpus : pd.DataFrame, label : str) -> list[str]:
    """"""

    document_corpus[label] = document_corpus[label].apply(lambda x : [y.strip(string.punctuation) for y in x.strip().lower().split()])
    terms = tag_list(document_corpus, label)
    terms.remove('')
    return terms

In [119]:
def tf_idf(document_corpus : pd.DataFrame, label : str) -> dict:
    """"""

    assert label in document_corpus.columns, f"{label} is not a valid column."
    
    # convert column dtype to string
    document_corpus[label] = document_corpus[label].apply(lambda x : str(x))

    # compute list of possible terms (avoiding STOPWORDS)
    terms = term_list(document_corpus, label)
    terms = list(set(terms) - STOPWORDS)


    # compute term frequency and document frequency for each term
    tf = []
    df = {term : 0 for term in terms}
    for index, row in document_corpus.iterrows():
        previous_df = deepcopy(df)
        for term in row[label]:
            if term in df.keys() : df[term] += 1 
        tf_row = {term : df[term] - previous_df[term] for term in df}
        tf.append(tf_row)

    tf = pd.DataFrame(tf, dtype="Sparse[int]")

    # TF-IDF = tf(i,j) * log(N/df(i))
    tf_idf = tf.apply(lambda x : x * np.log(tf.shape[0] / df[x.name]), axis=0)

    return tf_idf.mean().to_dict()
        

In [120]:
# Word cloud per daily notes (filtered by rating) viz function
def plot_wordcloud_from_daily_notes(data : pd.DataFrame, palette_list : list[str], output_file : str, ratings_range : tuple[int] = (0,6), img_path : str = './face-smile-solid.png', use_tf_idf : bool = True):
    """"""

    # Filter data to include only the ratings specified in ratings_range
    assert len(ratings_range) == 2, "Ratings range is required to have length equal to 2."
    filtered_data = data[(data['scores'] > ratings_range[0]) & (data['scores'] < ratings_range[1])]
    
    mask = np.array(Image.open(img_path))

    if len(mask.shape) == 3 : # BW channels. Need only one
        mask = np.array([[j[1] for j in i] for i in mask])
    mask = np.abs(255-mask) # black -> white, white/transparent -> black
    
    cmap = generate_colormap_from_hex_list(palette_list)

    # Create wordcloud
    wc = WordCloud(mask=mask, colormap=cmap, background_color=None, mode='RGBA', stopwords=STOPWORDS, collocations=False)
    if tf_idf: # Compute TF-IDF word importance values
        term_importances = tf_idf(filtered_data, 'notes')
        wc.generate_from_frequencies(term_importances)
    else: # generate from simple term frequencies
        text = filtered_data['notes'].sum()
        wc.generate(text)


    plt.figure(figsize=(10, 10))
    plt.imshow(wc, interpolation='bilinear')
    plt.axis("off")
    
    plt.savefig(output_file, dpi=600, bbox_inches='tight', pad_inches=1.0, transparent=True)
    plt.close()


In [129]:
# Execution (2024, all days)
plot_wordcloud_from_daily_notes(data_current_year.copy(), palette_list, 'wordcloud_all_days.svg')

In [110]:
# Execution (2024, bad days: rating 1,2)
plot_wordcloud_from_daily_notes(data_current_year.copy(), palette_list, 'wordcloud_bad_days.svg', (0,3), './face-sad-cry-solid.png')

In [99]:
# Execution (2024, good days: rating 4,5)
plot_wordcloud_from_daily_notes(data_current_year.copy(), palette_list, 'wordcloud_good_days.svg', (4,6))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  document_corpus[label] = document_corpus[label].apply(lambda x : str(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  document_corpus[label] = document_corpus[label].apply(lambda x : [y.strip(string.punctuation) for y in x.strip().split()])
