In [1]:
# third-party
import pandas as pd
import numpy as np

# local
from filepath import FILEPATH

In [2]:
overview = pd.read_excel(FILEPATH, sheet_name="overview")

# Get summary of Input Data

First we need to separate each input data entry (which may be in the form of "input1 and input2") into individual data inputs

In [3]:
df_input_by_year = pd.concat([overview["Year"], overview["Input data (description)"].str.split(" and ", expand=True)], axis=1).reindex(overview.index)
df_input_by_year


Unnamed: 0,Year,0,1,2,3,4,5,6
0,2021,EEG,seizure times,,,,,
1,2022,EEG cyclic profile,,,,,,
2,2013,EEG,,,,,,
3,2024,EEG,,,,,,
4,2022,EEG,,,,,,
5,2023,HR,,,,,,
6,2017,EEG,seizure cyclic profile,,,,,
7,2020,seizure cyclic profile,,,,,,
8,2022,IEA,seizure times,IEA cyclic profile,,,,
9,2020,EEG cyclic profile,,,,,,


From the expanded version of input data by year, we want to collapse it into a single column (from which we can get frequency counts), without losing the corresponding year information.

In [4]:
df_input_by_year_collapsed = df_input_by_year.melt(id_vars=["Year"], value_name="Input")
df_input_by_year_collapsed.dropna(inplace=True)
df_input_by_year_collapsed.drop('variable', axis=1, inplace=True)
df_input_by_year_collapsed


Unnamed: 0,Year,Input
0,2021,EEG
1,2022,EEG cyclic profile
2,2013,EEG
3,2024,EEG
4,2022,EEG
5,2023,HR
6,2017,EEG
7,2020,seizure cyclic profile
8,2022,IEA
9,2020,EEG cyclic profile


Now, we can finally count the frequency of occurrence of each input data in each year (using ".unstack(fill_value=0)" to also count zero occurrences).

In [5]:
df_input_frequency_by_year = df_input_by_year_collapsed.groupby(["Year", "Input"]).Input.count().unstack(fill_value=0).stack().reset_index()
df_input_frequency_by_year.rename(columns={0: "freq"}, inplace=True)
df_input_frequency_by_year['color'] = pd.factorize(df_input_frequency_by_year['Input'])[0]
df_input_frequency_by_year

Unnamed: 0,Year,Input,freq,color
0,2013,ACC,0,0
1,2013,BVP,0,1
2,2013,EDA,0,2
3,2013,EEG,1,3
4,2013,EEG cyclic profile,0,4
...,...,...,...,...
121,2024,seizure times,0,13
122,2024,signal quality metrics,0,14
123,2024,sleep,0,15
124,2024,sleep cyclic profile,0,16


## Bubble Chart 

In [6]:
import plotly.graph_objects as go
import plotly.express as px

fig = go.Figure()

fig.add_trace(go.Scatter(
    x=df_input_frequency_by_year["Year"], y=df_input_frequency_by_year["Input"],
    marker_size=df_input_frequency_by_year['freq'],
    marker_color=df_input_frequency_by_year['color'],
    ))

fig.update_traces(mode='markers', marker=dict(sizemode='area',
                                              sizeref= 2.*max(df_input_frequency_by_year['freq'])/(100**2), line_width=2))

fig.update_layout(
    title='Data input for seizure forecasting algorithms across time',
    xaxis=dict(
        title='Year',
        gridcolor='white',
        type='log',
        gridwidth=2,
    ),
    yaxis=dict(
        title='Input data',
        gridcolor='white',
        gridwidth=2,
    ),
    paper_bgcolor='rgb(243, 243, 243)',
    plot_bgcolor='rgb(243, 243, 243)',
)
fig.update_yaxes(type='category', tickmode='linear')


fig.show()


## Line Chart

In [15]:
fig = px.line(df_input_frequency_by_year, x="Year", y="freq", color='Input')

fig.update_traces(line=dict(width=4))

fig.update_layout(
    title='Data input for seizure forecasting algorithms across time',
    xaxis=dict(
        title='Year',
        gridcolor='white',
        type='log',
        gridwidth=2,
    ),
    yaxis=dict(
        title='#Papers using Input',
        gridcolor='white',
        gridwidth=2,
    ),
    paper_bgcolor='rgb(243, 243, 243)',
    plot_bgcolor='rgb(243, 243, 243)',
)


fig.show()

## Horizontal Bar Chart

This chart will show the proportion of times each input was used according to the years. For that, we need to transform the data into proportions.

In [89]:
df_year_frequency_by_input = df_input_by_year_collapsed.groupby(["Input", "Year"]).Input.count().unstack(fill_value=0).stack().reset_index()
df_year_frequency_by_input.rename(columns={0: "freq"}, inplace=True)

x,y, labels = [], [], []
for input in df_year_frequency_by_input["Input"].unique():

    aux = df_year_frequency_by_input[df_year_frequency_by_input["Input"]==input]
    y += [input]
    x += [aux["freq"].tolist() / aux["freq"].sum()]
    labels += [aux["Year"].tolist()]

In [91]:
fig = go.Figure()

colors = ['rgba(38, 24, 74, 0.8)', 'rgba(71, 58, 131, 0.8)',
          'rgba(122, 120, 168, 0.8)', 'rgba(164, 163, 204, 0.85)',
          'rgba(190, 192, 213, 1)', 'rgba(190, 192, 213, 1)', 'rgba(190, 192, 213, 1)']

for i in range(0, len(x[0])):
    for xd, yd in zip(x, y):
        fig.add_trace(go.Bar(
            x=[xd[i]], y=[yd],
            orientation='h',
            marker=dict(
                color=colors[i],
                line=dict(color='rgb(248, 248, 249)', width=1)
            )
        ))

fig.update_layout(
    xaxis=dict(
        showgrid=False,
        showline=False,
        showticklabels=False,
        zeroline=False,
        domain=[0.15, 1]
    ),
    yaxis=dict(
        showgrid=False,
        showline=False,
        showticklabels=False,
        zeroline=False,
    ),
    barmode='stack',
    paper_bgcolor='rgb(248, 248, 255)',
    plot_bgcolor='rgb(248, 248, 255)',
    margin=dict(l=120, r=10, t=140, b=80),
    showlegend=False,
)

annotations = []

for yd, xd, ld in zip(y, x, labels):
    # labeling the y-axis
    annotations.append(dict(xref='paper', yref='y',
                            x=0.14, y=yd,
                            xanchor='right',
                            text=str(yd),
                            font=dict(family='Arial', size=14,
                                      color='rgb(67, 67, 67)'),
                            showarrow=False, align='right'))
    # labeling the first percentage of each bar (x_axis)
    annotations.append(dict(xref='x', yref='y',
                            x=xd[0] / 2, y=yd,
                            text=str(ld),
                            font=dict(family='Arial', size=14,
                                      color='rgb(248, 248, 255)'),
                            showarrow=False))
    # labeling the first Likert scale (on the top)
    if yd == y[-1]:
        annotations.append(dict(xref='x', yref='paper',
                                x=xd[0] / 2, y=1.1,
                                text=labels[0],
                                font=dict(family='Arial', size=14,
                                          color='rgb(67, 67, 67)'),
                                showarrow=False))
    space = xd[0]
    for i in range(1, len(xd)):
            # labeling the rest of percentages for each bar (x_axis)
            annotations.append(dict(xref='x', yref='y',
                                    x=space + (xd[i]/2), y=yd,
                                    text=str(ld),
                                    font=dict(family='Arial', size=14,
                                              color='rgb(248, 248, 255)'),
                                    showarrow=False))

            space += xd[i]

fig.update_layout(annotations=annotations)

fig.show()

ValueError: 
    Invalid value of type 'builtins.list' received for the 'text' property of layout.annotation
        Received value: [2013, 2017, 2020, 2021, 2022, 2023, 2024]

    The 'text' property is a string and must be specified as:
      - A string
      - A number that will be converted to a string