In [20]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import datetime
from numpy import nan, linspace
from math import floor

In [2]:
import os

In [3]:
# Directory that holds the raw records export read by the cells below.
# NOTE(review): hard-coded absolute path on the L: drive; it must be edited to run
# this notebook on a machine where that location is not available.
path = "L:/bucket/2019_0361/raw"

In [5]:
# os.listdir returns directory entries in arbitrary order; sort them so that
# lst[0] (the file loaded in the next cell) is the same on every run and platform.
lst = sorted(os.listdir(path))
lst

['irb_2019_0361_records_raw_20211004_012929.csv']

In [6]:
# Load the first file listed in `path` into the working DataFrame.
first_csv = os.path.join(path, lst[0])
df = pd.read_csv(first_csv)

In [23]:
# Convert the form-completion flag to a numeric column in one vectorised call.
# Series.apply(pd.to_numeric, ...) would invoke to_numeric once per cell, which is slow
# and means `downcast` never sees the whole column. Unparseable values become NaN.
form_complete_col = 'yogtt009_screening_visit_data_collection_form_complete'
df[form_complete_col] = pd.to_numeric(df[form_complete_col], downcast='integer', errors='coerce')

In [9]:
def get_columns(df, substring):
    """Return the column names of ``df`` that contain ``substring``.

    Parameters
    ----------
    df : object exposing ``.columns.to_list()`` (e.g. a pandas DataFrame)
    substring : str
        Text to look for inside each column name.

    Returns
    -------
    list
        Matching column names, in their original column order.
    """
    matches = []
    for name in df.columns.to_list():
        if substring in name:
            matches.append(name)
    return matches

In [21]:
# Parse every column whose name contains 'date' into pandas datetimes, one column at a time.
date_cols = get_columns(df, 'date')
for date_col in date_cols:
    df[date_col] = pd.to_datetime(df[date_col])

In [4]:
# calculate total duration from earliest date to latest date
def get_date_duration(df, start: str, end: str):
    """Return the whole-day difference between two datetime columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both columns; they must already be datetime-typed.
    start : str
        Name of the column with the earlier date.
    end : str
        Name of the column with the later date.

    Returns
    -------
    pandas.Series
        ``(df[end] - df[start])`` expressed in days, one value per row.

    Raises
    ------
    KeyError
        If ``start`` or ``end`` is not a column of ``df``.
    """
    # Use the caller-supplied column names; the previous body ignored them and
    # always read the hard-coded 'Start' / 'Finish' columns.
    return (df[end] - df[start]).dt.days

In [20]:
# Base Gantt chart: one black bar per row of df, running from "Start" to "Finish"
# on that row's "Task" line. All default hover fields except Duration are switched off.
gantt = px.timeline(
    df,
    x_start="Start",
    x_end="Finish",
    y="Task",
    color_discrete_sequence=['black'],
    hover_data=dict(Start=False, Finish=False, Task=False, Duration=True),
)

# Thin the bars down and replace the tooltip with the duration,
# which is passed to the template through the trace's `text` values.
gantt.update_traces(
    text=df['Duration'],
    hovertemplate="Duration: %{text} days",
    width=0.025,
)

# Rebuild as a plain Figure; later cells add the visit marker traces to `fig`.
fig = go.Figure(layout=gantt.layout, data=gantt.data)

In [39]:
def add_visit_trace_gantt(fig, df, x_name: str, y_name: str, trace_name: str = None, symbol_icon: str = 'circle', symbol_color: str = 'black', symbol_size=10):
    """Add a markers-only scatter trace for one visit column and return ``fig``.

    Parameters
    ----------
    fig : figure object exposing ``add_trace``
    df : frame indexed by column name; ``df[x_name]`` and ``df[y_name]`` supply the points
    x_name, y_name : str
        Column names used for the x and y coordinates.
    trace_name : str, optional
        Legend / hover label; falls back to ``x_name`` when None.
    symbol_icon, symbol_color, symbol_size
        Marker symbol, color and size. Marker options are documented at
        https://plotly.com/python-api-reference/generated/plotly.graph_objects.scatter.html#plotly.graph_objects.scatter.Marker
    """
    label = x_name if trace_name is None else trace_name

    marker_style = dict(symbol=symbol_icon, size=symbol_size, color=symbol_color)

    # The empty <extra></extra> tag suppresses the secondary hover box that normally
    # shows the trace name (see https://plotly.com/python/hover-text-and-formatting/).
    hover = f"{label}: %{{x}}<extra></extra>"

    visit_markers = go.Scatter(
        x=df[x_name],
        y=df[y_name],
        mode='markers',
        marker=marker_style,
        hovertemplate=hover,
        name=label,
    )
    fig.add_trace(visit_markers)
    return fig

In [40]:
# Visit columns to overlay on the Gantt chart, with a marker symbol and color for each
# (the three lists are matched up by position).
visits = ['Start', 'Finish', 'Midpoint', 'Early']
symbol_icons = ['circle', 'star-square', 'triangle-up', 'hexagram']
symbol_colors = ['black', 'darkviolet', 'pink', 'lightslategrey']

for visit, icon, color in zip(visits, symbol_icons, symbol_colors):
    fig = add_visit_trace_gantt(fig, df, visit, 'Task', symbol_icon=icon, symbol_color=color)

In [42]:
# y axis: list tasks from top to bottom and draw the horizontal grid lines
fig.update_yaxes(showgrid=True, autorange="reversed")

# x axis: no vertical grid lines
fig.update_xaxes(showgrid=False)

# Centered title near the top of the figure, axis titles, and a titled legend.
# (Axis titles also accept 'font' and 'standoff' entries if they need tuning.)
fig.update_layout(
    title=dict(
        text='Enrollment Timeline',
        x=0.5,
        y=0.95,
        xanchor='center',
        yanchor='top',
    ),
    xaxis_title=dict(text='Date'),
    yaxis_title=dict(text='Task'),
    legend=dict(title=dict(text='Visits')),
    showlegend=True,
)

# Optional alternative hover behaviour, left disabled:
# fig.update_layout(hovermode='x unified')

fig.show()

In [43]:
# add a straight line marking today's date
def add_today_line(fig, orientation: str):
    """Draw a line at today's date across ``fig`` and return ``fig``.

    Parameters
    ----------
    fig : figure object exposing ``update_layout``
    orientation : str
        ``'horizontal'`` places the line at y = today and spans the full plot width
        (for figures whose y axis is the date axis). Any other value, including
        ``'vertical'``, places it at x = today and spans the full plot height, which
        is what this function always drew before the parameter was honoured.

    Returns
    -------
    The same ``fig`` object, for chaining.

    Notes
    -----
    The layout's ``shapes`` list is replaced, not appended to, so re-running the
    cell does not stack duplicate lines; any other shapes on the figure are dropped.
    """
    # `date` alone is not in scope (the notebook imports the `datetime` module),
    # so the class has to be reached through the module.
    today = datetime.date.today()

    if orientation == 'horizontal':
        today_line = dict(
            type='line',
            xref='paper', x0=0, x1=1,   # full width in paper coordinates
            yref='y', y0=today, y1=today
        )
    else:
        today_line = dict(
            type='line',
            yref='paper', y0=0, y1=1,   # full height in paper coordinates
            xref='x', x0=today, x1=today
        )

    fig.update_layout(shapes=[today_line])
    return fig

In [44]:
# Enrollment window and its length in whole days.
start = pd.Timestamp('2019-07-01')
end = pd.Timestamp('2022-06-30')
delta = (end - start).days

# Target number of subjects and the (rounded-down) number of days between enrollments.
sample_size = 55
days_between_new_subject_enrollment = delta // sample_size

# Evenly spaced dates from start to end inclusive, computed on the timestamps'
# integer `.value` and converted back to plain dates.
# NOTE(review): the point count is delta // days_between + 1, which can exceed
# sample_size because of the rounding above; confirm this is intended.
t = pd.to_datetime(
    linspace(start.value, end.value, delta // days_between_new_subject_enrollment + 1)
).date

In [55]:
# Figure for monitoring anticipated vs. real enrollment progress.
# It starts with the predicted curve only: the n-th scheduled date in `t` is plotted at a
# count of n, as a black horizontal-then-vertical step line with very small markers.
identity_plot = go.Figure(
    data=go.Scatter(
        x=t,
        y=list(range(1, len(t) + 1)),
        mode='lines+markers',
        line=dict(color='black', shape='hv'),
        marker=dict(color='black', size=1.0),
        name='Predicted Progress',
    )
)

# Enable spike lines on both axes.
identity_plot.update_xaxes(showspikes=True)
identity_plot.update_yaxes(showspikes=True)

In [47]:
def visit_count_cumsum(df, visit: str) -> pd.DataFrame:
    """Return a copy of ``df`` sorted by ``visit`` with running visit counts added.

    Two columns are added to the returned frame:

    * ``f'{visit}_Count_Bool'`` is 1 where ``df[visit]`` is present and 0 where it is missing.
    * ``f'{visit}_Count'`` is the cumulative sum of that flag in sorted order.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; it is not modified (``sort_values`` returns a new frame).
    visit : str
        Name of the column to sort by and count.

    Returns
    -------
    pandas.DataFrame
        The sorted frame with the two extra columns.
    """
    # sort on the target visit column
    ordered = df.sort_values(by=visit)

    # 1 for every row where the visit has a value, 0 where it is missing.
    # notna() gives the flag directly, with no null-mask inversion step.
    ordered[f'{visit}_Count_Bool'] = ordered[visit].notna().astype('int64')

    # running total of visits that have occurred
    ordered[f'{visit}_Count'] = ordered[f'{visit}_Count_Bool'].cumsum()

    return ordered

In [56]:
# Overlay one step line per visit type (`visits` comes from an earlier cell).
for visit in visits:
    # re-sort df by this visit and attach its running count
    df = visit_count_cumsum(df, visit)

    # cumulative number of occurrences of this visit against its date
    visit_trace = go.Scatter(
        x=df[visit],
        y=df[f'{visit}_Count'],
        mode='lines',
        line=dict(shape='hv'),
        name=visit,
    )
    identity_plot.add_trace(visit_trace)

# mark today's date, then render
identity_plot = add_today_line(identity_plot, 'vertical')
identity_plot.show()