### Tuesday MeetUp 053 - Beginners Python and Machine Learning - 31st Mar 2020 - Charting COVID-19 doubling rate with plotly

Learning objectives:
- pandas DataFrames and Series
- plotly.py

@author D Tim Cummings



In [0]:
import numpy as np
import pandas as pd
import plotly
import plotly.graph_objects as go
import plotly.express as px

In [0]:
# https://plotly.com/python/creating-and-updating-figures/

# plotly.py hands JSON-like figure descriptions to plotly.js.
# At the lowest level a figure is a plain dict with a "data" list of traces and a "layout".
up_down_trace = dict(type="scatter", x=[1, 2, 3], y=[1, 3, 2], name="up down")
climber_trace = dict(type="scatter", x=[1, 2, 4], y=[1, 2.5, 3.5], name="climber")
fig = dict(
    data=[up_down_trace, climber_trace],
    layout=dict(title=dict(text="Scatter chart constructed as a dict")),
)
# show() detects that we are in colab and uses a colab renderer to run plotly.js
# and display the interactive chart (try hovering and clicking)
plotly.io.show(fig)
# Outside interactive python, write an html file and open it in a browser instead
plotly.io.write_html(fig, "fig1.html")

In [0]:
# Challenge 1: The x and y values below sample the sigmoid function 1 / (1 + e^-x)
# at 21 evenly spaced points from -10 to 10.
# Plot the values in a scatter chart using dict
x = np.linspace(start=-10, stop=10, num=21)
exp_neg_x = np.exp(-x)
y = 1 / (1 + exp_neg_x)
for values in (x, y):
    print(values)

In [0]:
# A single trace does not get a legend by default, so ask for one explicitly
sigmoid_trace = dict(type="scatter", x=x, y=y, name="sigmoid", showlegend=True)
fig = dict(
    data=[sigmoid_trace],
    layout=dict(title=dict(text="Sigmoid function constructed as a dict")),
)
plotly.io.show(fig)

In [0]:
# Solution to challenge 1
# With only one trace in the data list the legend is hidden unless "showlegend" is set
sigmoid_trace = dict(type="scatter", x=x, y=y, name="sigmoid", showlegend=True)
sigmoid_layout = dict(title=dict(text="Sigmoid function constructed as a dict"))
fig = dict(data=[sigmoid_trace], layout=sigmoid_layout)
plotly.io.show(fig)

In [0]:
# One level up from raw dicts: plotly graph_objects, which have built-in validation
bar_trace = go.Bar(x=[1, 2, 3], y=[1, 3, 2], name="blue boxes", showlegend=True)
bar_layout = go.Layout(
    title=go.layout.Title(text="Bar chart constructed using graph objects")
)
fig = go.Figure(data=[bar_trace], layout=bar_layout)
# For those not using interactive python, Figure has a "write_html()" method
fig.write_html("fig2.html")
# In Google colab we can call the "show()" method
fig.show()

In [0]:
# print fig to see how graph_objects are converted to a dict
print(fig)

In [0]:
# use to_dict() to see the full dictionary
fig.to_dict()

In [0]:
# Challenge 2: Using x and y from Challenge 1 plot the values using graph_objects 

In [0]:
# Same sigmoid data as Challenge 1, this time built from graph_objects
sigmoid_trace = go.Scatter(x=x, y=y, name="sigmoid", showlegend=True)
sigmoid_layout = go.Layout(
    title=go.layout.Title(text="Sigmoid constructed using graph objects")
)
fig = go.Figure(data=[sigmoid_trace], layout=sigmoid_layout)
fig.show()

In [0]:
# Solution to challenge 2
# Also shows how to draw lines, markers or both, and how to choose the marker symbol
sigmoid_trace = go.Scatter(
    x=x,
    y=y,
    name="sigmoid",
    showlegend=True,
    mode="lines+markers",
    marker_symbol="hash-dot",
    marker_line_width=1,
    marker_size=15,
)
sigmoid_layout = go.Layout(
    title=go.layout.Title(text="Sigmoid function constructed from graph_object")
)
fig = go.Figure(data=[sigmoid_trace], layout=sigmoid_layout)
# Print the created dict. Notice what happens with marker_line_width
print(fig)
fig.show()

In [0]:
# How to see all markers available
# https://plotly.com/python/marker-style/
raw_symbols = plotly.validators.scatter.marker.SymbolValidator().values
# NOTE(review): the layout of `values` appears to differ between plotly releases:
# (number, name) pairs in the docs example this cell was based on, and
# (number, "number", name) triples in later docs. Picking entries by type instead
# of by a fixed stride handles both layouts.
symbols = [value for value in raw_symbols if isinstance(value, int)]
names = [value for value in raw_symbols if isinstance(value, str) and not value.isdigit()]
# Stem is the base shape ("circle"); variant is the "-open" / "-dot" suffix, if any
namestems = [name.replace("-open", "").replace("-dot", "") for name in names]
namevariants = [name[len(stem):] for name, stem in zip(names, namestems)]

# One marker per symbol: variants across the x axis, base shapes down the y axis
fig = go.Figure(go.Scatter(mode="markers", x=namevariants, y=namestems, marker_symbol=symbols,
                           marker_line_color="midnightblue", marker_color="lightskyblue", 
                           marker_line_width=2, marker_size=15, 
                           hovertemplate="name: %{y}%{x}<br>number: %{marker.symbol}<extra></extra>"))
# Reversed y range lists the shapes top to bottom in definition order
fig.update_layout(title="Mouse over symbols for name & number!",
                  xaxis_range=[-1,4], yaxis_range=[len(set(namestems)),-1], 
                  margin=dict(b=0,r=0), xaxis_side="top", height=1200, width=400)
fig.show()

In [0]:
# plotly express is a higher level api designed for data exploration.
# It works from a "long" DataFrame: one row per point, plus a column naming the group.
df1 = pd.DataFrame({"id": [1, 2, 3], "score": [1, 3, 2]}).assign(group="up-down")
df2 = pd.DataFrame({"id": [1, 2, 4], "score": [1, 2, 5]}).assign(group="climber")
df = pd.concat([df1, df2])
print(df)
# One line per distinct value in the "group" column
px.line(df, x="id", y="score", color="group")

In [0]:
# Challenge 3: Using x and y from Challenge 1 plot the values using plotly express

In [0]:
# x becomes the index, y the "sigmoid" column; the constant "fn" column gives px.line
# something to colour by, which also makes the legend appear
df = pd.DataFrame({"sigmoid": y}, index=x).assign(fn='sigmoid')
print(df.head())
px.line(df, y="sigmoid", title="Sigmoid function constructed from plotly express", color='fn')

In [0]:
# Solution to challenge 3
# Build a one-column frame indexed by x, then add a constant "fn" column to colour by
df = pd.Series(y, index=x, name="sigmoid").to_frame().assign(fn='sigmoid')
print(df.head())
# px.line and px.scatter put the dataframe index on the x axis by default
fig = px.line(df, y="sigmoid", title="Sigmoid function constructed from plotly express", color='fn')
fig  # same as fig.show() for colab and jupyter notebooks

In [0]:
# plotly express comes with some default dataframes if you want to practise
df = px.data.gapminder().query("continent=='Oceania'")
px.line(df, x="year", y="lifeExp", color='country')

In [0]:
# Load data from the covid-19 data set. NB URL has been changed on Github. Now ends in "_global"
df_confirmed = pd.read_csv('https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv')
df_confirmed.head()

In [0]:
df_confirmed['Province/State'].dropna().unique()

In [0]:
# List the states in alphabetical order.
# NaN must be dropped first because sorting a mix of str and NaN breaks
unique_states = df_confirmed['Province/State'].dropna().unique()
ar_state = np.sort(unique_states)
print(type(ar_state))
print(ar_state)

In [0]:
ar_state[::-1]

In [0]:
# Challenge 4: Print the list of countries in alphabetical order with no repeats

In [0]:
# Solution to challenge 4: dropna() is not needed because this column has no NaN.
# Leaving it in would do no harm apart from being slightly slower
ar_country = np.sort(df_confirmed['Country/Region'].unique())
print(ar_country)

In [0]:
# how to filter by country
df_confirmed[df_confirmed["Country/Region"]=="Australia"]

In [0]:
# how to filter by state
df_confirmed[df_confirmed["Province/State"] == "Queensland"]

In [0]:
# how to filter by state and country should two countries have state with same name
df_confirmed[(df_confirmed["Province/State"]=="Queensland") & (df_confirmed["Country/Region"]=="Australia")]

In [0]:
# Challenge 5: define a function which takes 3 arguments, df, country, state and 
# will return the dataframe filtered by country, by state or by country and state

In [0]:
def df_for_location(df, country=None, state=None):
  """Return the rows of df that match the given country and/or state.

  Args:
    df: DataFrame with "Country/Region" and "Province/State" columns.
    country: value to match in "Country/Region"; ignored when falsy.
    state: value to match in "Province/State"; ignored when falsy.

  Returns:
    The filtered DataFrame, or df itself when neither filter is given.
  """
  conditions = []
  if state:
    conditions.append(df["Province/State"] == state)
  if country:
    conditions.append(df["Country/Region"] == country)
  if not conditions:
    return df
  # Combine however many conditions were requested into one boolean mask
  mask = conditions[0]
  for extra in conditions[1:]:
    mask = mask & extra
  return df[mask]

df_for_location(df_confirmed, state="Queensland")

In [0]:
# Solution to challenge 5
def df_for_location(df, country=None, state=None):
  """Return the rows of df that match the given country and/or state.

  Args:
    df: DataFrame with "Country/Region" and "Province/State" columns.
    country: value to match in "Country/Region"; ignored when falsy.
    state: value to match in "Province/State"; ignored when falsy.

  Returns:
    A DataFrame holding the matching rows (all rows when no filter is given).
  """
  # Start from an all-True mask that shares df's index. A plain Python list would
  # rely on list & Series coercion, and for an empty df `df[[]]` would be read
  # as "select no columns" rather than as a row mask.
  filt = pd.Series(True, index=df.index, dtype=bool)
  if country:
    filt = filt & (df["Country/Region"] == country)
  if state:
    filt = filt & (df["Province/State"] == state)
  return df[filt]

# check it works
df_for_location(df_confirmed, country="New Zealand")

In [0]:
df_confirmed_by_location = df_for_location(df_confirmed, country="Australia")
df_confirmed_by_location

In [0]:
# Sum the rows in the dataframe and return a series
total=df_confirmed_by_location.sum(axis="index")
print(type(total))
total

In [0]:
# Don't want the first four rows so can slice
series_sum = total[4:]
series_sum

In [0]:
# index is currently str but would prefer datetime
print(series_sum.index)
series_sum.index = pd.to_datetime(series_sum.index)
print(series_sum.index)

In [0]:
# Challenge 6: define a function which returns a series of values for a given df and country and/or state 
# Index for series should be a DatetimeIndex

In [0]:
def series_sum_for_location(df, country=None, state=None):
  """Return the per-date totals for a location as a Series with a DatetimeIndex.

  Args:
    df: DataFrame whose first four columns are location metadata and whose
      remaining columns are one count per date (column label is the date).
    country: optional country filter passed to df_for_location.
    state: optional state filter passed to df_for_location.

  Returns:
    Series of column sums over the matching rows, indexed by date.
  """
  located = df_for_location(df, country=country, state=state)
  # Drop the four metadata columns BEFORE summing. Summing text columns is
  # wasted work and, when Province/State mixes text and NaN, can raise or (on
  # older pandas) silently drop that column so a later [4:] removes a real date.
  series_sum = located.iloc[:, 4:].sum(axis='index')
  series_sum.index = pd.to_datetime(series_sum.index)
  return series_sum

series_sum_for_location(df_confirmed, country="Australia")

In [0]:
# Solution to challenge 6
def series_sum_for_location(df, country=None, state=None):
  """Return the per-date integer totals for a location with a DatetimeIndex.

  Args:
    df: DataFrame whose first four columns are location metadata and whose
      remaining columns are one count per date (column label is the date).
    country: optional country filter passed to df_for_location.
    state: optional state filter passed to df_for_location.

  Returns:
    Series of int column sums over the matching rows, indexed by date.
  """
  df = df_for_location(df=df, state=state, country=country)
  # Select the date columns BEFORE summing. Summing the text columns too can
  # raise when Province/State mixes text and NaN, or (on older pandas) silently
  # drop that column so that slicing [4:] afterwards removes the first date.
  series = df.iloc[:, 4:].sum(axis="index").astype(int)
  series.index = pd.to_datetime(series.index)
  return series

# check it works
series_sum_for_location(df_confirmed, country="Australia")

In [0]:
# Challenge 7: Define a function location_name which takes country and/or state and 
# returns a name for that location
# location_name(country="Australia") should return "Australia"
# location_name(state="Queensland") should return "Queensland"
# location_name() should return "everywhere"
# location_name(country="Australia", state="Queensland") should return "Queensland - Australia"

In [0]:
def location_name(country=None, state=None):
  """Build a display name for a location.

  Returns "state - country" when both are given, the one that is given when
  only one is, and "everywhere" when neither is.
  """
  if country and state:
    return f"{state} - {country}"
  if country or state:
    # Exactly one of the two is set; `or` picks whichever it is
    return f"{country or state}"
  return "everywhere"

location_name(country="Australia", state="Queensland")

In [0]:
# Solution to challenge 7:
def location_name(country=None, state=None):
  """Build a display name for a location.

  The state (if any) comes first, then the country (if any), joined by " - ".
  With neither given the name is "everywhere".
  """
  parts = [part for part in (state, country) if part]
  if not parts:
    return "everywhere"
  return " - ".join(parts)

In [0]:
# To get the index for values greater than a starting value (in this case 100) use a filter
series_sum = series_sum_for_location(df_confirmed, country="Australia")
print(series_sum.index[series_sum>=100])

In [0]:
# To convert to dataframe and label the data column 'current'
df_sum = pd.DataFrame(series_sum, columns=['current'])
df_sum

In [0]:
# Challenge 8: Find the first date the number of confirmed cases reached 100 (>= 100) and what the count was on that day

In [0]:
# Solution to Challenge 8
idx_start = series_sum.index[series_sum>=100][0]
num_start_actual = series_sum[idx_start]
# idx_start = df_sum.index[df_sum['current']>=100][0]
# num_start_actual = df_sum.loc[idx_start, 'current']
print(idx_start, num_start_actual)

In [0]:
# Slice the DataFrame to keep only the records from the date the count first reached 100 onwards
df_plot = df_sum.loc[idx_start:]
df_plot

In [0]:
# Challenge 9: Write a function to plot those numbers after starting count reached. Default starting count=100
# Advanced: plot count on a logarithmic scale

In [0]:
# Solution to challenge 9:
def plot_for_location(df, country=None, state=None, num_start=100, description=""):
  """Plot the case count for a location from the date it first reached num_start.

  Args:
    df: DataFrame in the layout accepted by series_sum_for_location.
    country: optional country filter.
    state: optional state filter.
    num_start: the plot begins on the first date the count is >= this value.
    description: label used in the title, e.g. "Confirmed".

  Returns:
    A plotly Figure with a single line trace on a linear y axis.

  Raises:
    IndexError: if the count never reaches num_start.
  """
  series_sum = series_sum_for_location(df, country=country, state=state)
  # Use the num_start argument (previously hard-coded to 100, which ignored the
  # parameter while the title still reported it)
  idx_start = series_sum.index[series_sum>=num_start][0]
  df_sum = pd.DataFrame(series_sum, columns=['current'])
  df_plot = df_sum.loc[idx_start:]
  location = location_name(country=country, state=state)
  fig = go.Figure(
    data=[go.Scatter(x=df_plot.index, y=df_plot['current'], name=location, showlegend=True, mode="lines")],
    layout=go.Layout(
        title=go.layout.Title(text=f"{description} cases for {location} starting from {num_start}"),
    )
  )
  fig.update_yaxes(type='linear')
  return fig

plot_for_location(df_confirmed, description="Confirmed")

In [0]:
# Calculate the average doubling rate over the last 3 days
averaged_days = 3
idx_start = series_sum.index[series_sum>=100][0]
df_sum = pd.DataFrame(series_sum, columns=['current'])
df_sum['previous'] = df_sum['current'].shift(averaged_days, fill_value=0)
df_plot = df_sum.loc[idx_start:].copy()  # need .copy() or will get warning later on
df_plot.head()

In [0]:
# Calculate how many days to double
df_plot['doubling days'] = averaged_days / np.log2(df_plot['current'] / df_plot['previous']) 
df_plot

In [0]:
# Challenge 10: Plot days to double
# Advanced: Plot at same time as count using plotly subplots

In [0]:
def plot_doubling_for_location(df, country=None, state=None, num_start=100, description="", averaged_days=3):
  """Plot the number of days the case count takes to double for a location.

  Doubling time is averaged_days / log2(current / previous), where previous is
  the count averaged_days earlier.

  Args:
    df: DataFrame in the layout accepted by series_sum_for_location.
    country: optional country filter.
    state: optional state filter.
    num_start: the plot begins on the first date the count is >= this value.
    description: label used in the title, e.g. "Confirmed".
    averaged_days: number of days over which the growth is measured.

  Returns:
    A plotly Figure with a single line trace of days to double.

  Raises:
    IndexError: if the count never reaches num_start.
  """
  series_sum = series_sum_for_location(df, country=country, state=state)
  idx_start = series_sum.index[series_sum>=num_start][0]
  df_sum = pd.DataFrame(series_sum, columns=['current'])
  df_sum['previous'] = df_sum['current'].shift(averaged_days, fill_value=0)
  df_plot = df_sum.loc[idx_start:].copy()
  # A previous count of 0 (no earlier observation, or genuinely no cases) makes
  # the ratio infinite and the result 0 days, which reads as "doubling instantly".
  # Mask those rows to NaN so they are left out of the line instead.
  previous = df_plot['previous'].where(df_plot['previous'] > 0)
  df_plot['doubling days'] = averaged_days / np.log2(df_plot['current'] / previous)
  location = location_name(country=country, state=state)
  fig = go.Figure(
    data=[go.Scatter(x=df_plot.index, y=df_plot['doubling days'], name=location, showlegend=True, mode="lines")],
    layout=go.Layout(
        title=go.layout.Title(text=f"{description} cases for {location} starting from {num_start} days to double"),
    )
  )
  return fig

plot_doubling_for_location(df_confirmed, country="Australia", description="Confirmed")

In [0]:
# Solution to challenge 10
def plot_doubling_for_location(df, country=None, state=None, num_start=100, description="", averaged_days=3):
  """Plot the number of days the case count takes to double for a location.

  Doubling time is averaged_days / log2(current / previous), where previous is
  the count averaged_days earlier.

  Args:
    df: DataFrame in the layout accepted by series_sum_for_location.
    country: optional country filter.
    state: optional state filter.
    num_start: the plot begins on the first date the count is >= this value.
    description: accepted for a signature parallel to the other plot
      functions; not used in this figure's title.
    averaged_days: number of days over which the growth is measured.

  Returns:
    A plotly Figure with a single line trace of days to double.

  Raises:
    IndexError: if the count never reaches num_start.
  """
  series_sum = series_sum_for_location(df, country=country, state=state)
  idx_start = series_sum.index[series_sum>=num_start][0]
  df_sum = pd.DataFrame(series_sum, columns=['current'])
  df_sum['previous'] = df_sum['current'].shift(averaged_days, fill_value=0)
  df_plot = df_sum.loc[idx_start:].copy()
  # A previous count of 0 (no earlier observation, or genuinely no cases) makes
  # the ratio infinite and the result 0 days, which reads as "doubling instantly".
  # Mask those rows to NaN so they are left out of the line instead.
  previous = df_plot['previous'].where(df_plot['previous'] > 0)
  df_plot['doubling days'] = averaged_days / np.log2(df_plot['current'] / previous)
  location = location_name(country=country, state=state)
  fig = go.Figure(
    data=[go.Scatter(x=df_plot.index, y=df_plot['doubling days'], name=location, showlegend=True, mode="lines")],
    layout=go.Layout(
        title=go.layout.Title(text=f"Days to double averaged over last {averaged_days} days. Higher is better")
    )
  )
  return fig

plot_doubling_for_location(df_confirmed, country="Australia", description="Confirmed")

In [0]:
# Advanced solution to challenge 10
def plot_for_location(df, country=None, state=None, num_start=100, description="", averaged_days=3):
  """Plot case counts (log scale) above a days-to-double chart for a location.

  Args:
    df: DataFrame in the layout accepted by series_sum_for_location.
    country: optional country filter.
    state: optional state filter.
    num_start: the plot begins on the first date the count is >= this value.
    description: label used in the figure and subplot titles, e.g. "Confirmed".
    averaged_days: number of days over which the doubling time is measured.

  Returns:
    A plotly Figure: counts plus a "doubling every 6 days" reference line in
    the top subplot (rows 1-2), days to double in the bottom subplot (row 3).

  Raises:
    IndexError: if the count never reaches num_start.
  """
  series_sum = series_sum_for_location(df, country=country, state=state)
  idx_start = series_sum.index[series_sum>=num_start][0]
  df_sum = pd.DataFrame(series_sum, columns=['current'])
  df_sum['previous'] = df_sum['current'].shift(averaged_days, fill_value=0)
  df_plot = df_sum.loc[idx_start:].copy()
  # A previous count of 0 would give an infinite ratio and 0 "days to double";
  # mask those rows to NaN so they are left out of the line instead.
  previous = df_plot['previous'].where(df_plot['previous'] > 0)
  df_plot['doubling days'] = averaged_days / np.log2(df_plot['current'] / previous)
  location = location_name(country=country, state=state)
  # Three grid rows: the counts chart spans the first two, doubling days takes the third
  fig = plotly.subplots.make_subplots(
    rows=3, cols=1, shared_xaxes=True, 
    specs=[[{"rowspan": 2}], [None], [{}]],
    # Subplot title follows `description` instead of always saying "Confirmed"
    subplot_titles=[f"{description} cases on a logarithmic scale",  
                  f"Days to double averaged over last {averaged_days} days. Higher is better"]
  )
  fig.update_layout(
    title_text=f"{description} cases {location} starting from {num_start}", 
    height=600
  )
  fig.add_trace(
    go.Scatter(x=df_plot.index, y=df_plot['current'], mode='lines', name=location),
    row=1, col=1
  )
  fig.add_trace(
    go.Scatter(x=df_plot.index, y=df_plot['doubling days'], mode='lines', showlegend=False),
    row=3, col=1
  )
  fig.update_yaxes(title_text='Cases', type='log', row=1, col=1)
  # Reference line: where the count would be if it doubled every `doubler` days
  idx_end = df_plot.index[-1]
  duration = (idx_end - idx_start).days
  doubler = 6  # draw a line for doubling every 6 days
  num_start_actual = df_plot.loc[idx_start, 'current']
  num_end = int(num_start_actual * 2 ** (duration / doubler))
  fig.add_trace(
    go.Scatter(x=[idx_start, idx_end], y=[num_start_actual, num_end], mode='lines', name=f'every {doubler} days'),
      row=1, col=1
  )
  return fig

plot_for_location(df_confirmed, country="Australia", description="Confirmed")

In [0]:
# Challenge 11: Add lines on log chart for doubling every 2, 3, 4, 5 days

In [0]:
def plot_for_location(df, country=None, state=None, num_start=100, description="", averaged_days=3):
  """Plot case counts (log scale) above a days-to-double chart for a location.

  Args:
    df: DataFrame in the layout accepted by series_sum_for_location.
    country: optional country filter.
    state: optional state filter.
    num_start: the plot begins on the first date the count is >= this value.
    description: label used in the figure and subplot titles, e.g. "Confirmed".
    averaged_days: number of days over which the doubling time is measured.

  Returns:
    A plotly Figure: counts plus reference lines for doubling every 2 to 6
    days in the top subplot (rows 1-2), days to double in the bottom (row 3).

  Raises:
    IndexError: if the count never reaches num_start.
  """
  series_sum = series_sum_for_location(df, country=country, state=state)
  idx_start = series_sum.index[series_sum>=num_start][0]
  df_sum = pd.DataFrame(series_sum, columns=['current'])
  df_sum['previous'] = df_sum['current'].shift(averaged_days, fill_value=0)
  df_plot = df_sum.loc[idx_start:].copy()
  # A previous count of 0 would give an infinite ratio and 0 "days to double";
  # mask those rows to NaN so they are left out of the line instead.
  previous = df_plot['previous'].where(df_plot['previous'] > 0)
  df_plot['doubling days'] = averaged_days / np.log2(df_plot['current'] / previous)
  location = location_name(country=country, state=state)
  # Three grid rows: the counts chart spans the first two, doubling days takes the third
  fig = plotly.subplots.make_subplots(
    rows=3, cols=1, shared_xaxes=True, 
    specs=[[{"rowspan": 2}], [None], [{}]],
    # Subplot title follows `description` instead of always saying "Confirmed"
    subplot_titles=[f"{description} cases on a logarithmic scale",  
                  f"Days to double averaged over last {averaged_days} days. Higher is better"]
  )
  fig.update_layout(
    title_text=f"{description} cases {location} starting from {num_start}", 
    height=600
  )
  fig.add_trace(
    go.Scatter(x=df_plot.index, y=df_plot['current'], mode='lines', name=location),
    row=1, col=1
  )
  fig.add_trace(
    go.Scatter(x=df_plot.index, y=df_plot['doubling days'], mode='lines', showlegend=False),
    row=3, col=1
  )
  fig.update_yaxes(title_text='Cases', type='log', row=1, col=1)

  idx_end = df_plot.index[-1]
  duration = (idx_end - idx_start).days
  # Every reference line starts at the same point, so look it up once
  num_start_actual = df_plot.loc[idx_start, 'current']
  for doubler in (2, 3, 4, 5, 6):  # one reference line per doubling period, in days
    num_end = int(num_start_actual * 2 ** (duration / doubler))
    fig.add_trace(
      go.Scatter(x=[idx_start, idx_end], y=[num_start_actual, num_end], mode='lines', name=f'every {doubler} days'),
        row=1, col=1
    )
  return fig

plot_for_location(df_confirmed, country="Australia", description="Confirmed", num_start=1500)

In [0]:
# Solution to challenge 11
def plot_for_location(df, country=None, state=None, num_start=100, description="", averaged_days=3):
  """Plot case counts (log scale) above a days-to-double chart for a location.

  Args:
    df: DataFrame in the layout accepted by series_sum_for_location.
    country: optional country filter.
    state: optional state filter.
    num_start: the plot begins on the first date the count is >= this value.
    description: label used in the figure and subplot titles, e.g. "Confirmed".
    averaged_days: number of days over which the doubling time is measured.

  Returns:
    A plotly Figure: counts plus reference lines for doubling every 2 to 6
    days in the top subplot (rows 1-2), days to double in the bottom (row 3).

  Raises:
    IndexError: if the count never reaches num_start.
  """
  series_sum = series_sum_for_location(df, country=country, state=state)
  idx_start = series_sum.index[series_sum>=num_start][0]
  df_sum = pd.DataFrame(series_sum, columns=['current'])
  df_sum['previous'] = df_sum['current'].shift(averaged_days, fill_value=0)
  df_plot = df_sum.loc[idx_start:].copy()
  # A previous count of 0 would give an infinite ratio and 0 "days to double";
  # mask those rows to NaN so they are left out of the line instead.
  previous = df_plot['previous'].where(df_plot['previous'] > 0)
  df_plot['doubling days'] = averaged_days / np.log2(df_plot['current'] / previous)
  location = location_name(country=country, state=state)
  # Three grid rows: the counts chart spans the first two, doubling days takes the third
  fig = plotly.subplots.make_subplots(
    rows=3, cols=1, shared_xaxes=True, 
    specs=[[{"rowspan": 2}], [None], [{}]],
    # Subplot title follows `description` instead of always saying "Confirmed"
    subplot_titles=[f"{description} cases on a logarithmic scale",  
                  f"Days to double averaged over last {averaged_days} days. Higher is better"]
  )
  fig.update_layout(
    title_text=f"{description} cases {location} starting from {num_start}", 
    height=600
  )
  fig.add_trace(
    go.Scatter(x=df_plot.index, y=df_plot['current'], mode='lines', name=location),
    row=1, col=1
  )
  fig.add_trace(
    go.Scatter(x=df_plot.index, y=df_plot['doubling days'], mode='lines', showlegend=False),
    row=3, col=1
  )
  fig.update_yaxes(title_text='Cases', type='log', row=1, col=1)
  idx_end = df_plot.index[-1]
  duration = (idx_end - idx_start).days
  # Every reference line starts at the same point, so look it up once
  num_start_actual = df_plot.loc[idx_start, 'current']
  for doubler in (2, 3, 4, 5, 6):  # one reference line per doubling period, in days
    num_end = int(num_start_actual * 2 ** (duration / doubler))
    fig.add_trace(
      go.Scatter(x=[idx_start, idx_end], y=[num_start_actual, num_end], mode='lines', name=f'every {doubler} days'),
        row=1, col=1
    )
  return fig

plot_for_location(df_confirmed, country="Australia", description="Confirmed", averaged_days=5)