In [None]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import scipy.stats as stats

### Read in data: use pandas
To deal with columns of different types (the first column holds the dates as strings), we use pandas.
You could use numpy with a little extra work.

In [None]:
# Data file, resolved relative to the notebook's working directory.
filename = "flu.csv"
# Skip the first line of the file and treat single quotes as the quote character.
data = pd.read_csv(
    filename,
    skiprows=1,
    quotechar="'",
)

In [None]:
# Column labels of the loaded table, as a plain Python list.
columns = data.columns.tolist()
print(columns)

Print means and standard errors, just for kicks.

(nb: The standard error doesn't have much meaning for the time-series data here!)

In [None]:
# Report mean and standard error for every column except the first one.
first_column = columns[0]
for col in columns:
    if col != first_column:
        series = data[col]
        mean, error = np.mean(series), stats.sem(series)
        print(f"{col:9} {mean:.2f} +/- {error:.2f}")

### Select the columns of just the region data (drop first + last)

In [None]:
# Keep every column label except the first and the last.
regions = columns[1:-1]
print(regions)

### Histogram

In [None]:
# Pick one region column at random. np.random.randint excludes its upper
# bound, so the bound is len(columns) - 1: that makes every index from 1 to
# len(columns) - 2 (all columns except the first and the last) selectable.
random_col = np.random.randint(1, len(columns) - 1)
col = columns[random_col]
print(random_col)

# Histogram of the selected column; the title records which one was drawn.
plt.hist(data[col])
plt.title(col)
plt.xlabel("Fluiness")
plt.ylabel("Count")
plt.show()

### Just for the sake of it: KDE

In [None]:
# One Gaussian kernel density estimate per region, drawn as a filled curve.
for region in regions:
    values = data[region]
    # 100 evaluation points spanning the observed range of this region.
    grid = np.linspace(min(values), max(values), 100)
    density = stats.gaussian_kde(values)(grid)
    plt.plot(grid, density, linewidth=2, label=region)
    plt.fill_between(grid, density, alpha=0.4)
plt.xlabel("Fluiness")
plt.ylabel("KDE")
plt.legend()
plt.show()

### Quantile

In [None]:
# Quantile levels 0.05, 0.10, ..., 1.0, expressed as fractions.
x = np.linspace(0.05, 1.0, 20)
for region in regions:
    # Named `values` rather than `set`: assigning to `set` at cell level
    # would shadow the builtin for the rest of the kernel session.
    values = data[region]
    q = np.quantile(values, x)
    # The horizontal axis is the quantile level as a fraction; multiply x by
    # 100 (and relabel the axis) to show percentiles instead.
    plt.plot(x, q, label=region)
plt.ylabel("Fluiness")
plt.xlabel("Quantile")
plt.legend()
plt.show()

### Q-Q plot

In [None]:
# Q-Q plot: quantiles of col1 (horizontal axis) against quantiles of col2
# (vertical axis), evaluated at the same quantile levels.
col1 = "MidAtl"
col2 = "SAtl"
# Quantile levels 0.05, 0.10, ..., 1.0.
x = np.linspace(0.05, 1.0, 20)
q1 = np.quantile(data[col1], x)
q2 = np.quantile(data[col2], x)
# Common axis range covering both columns, so the plot is square.
axmin = min(min(data[col1]), min(data[col2]))
axmax = max(max(data[col1]), max(data[col2]))
# Reference line y = x: points on it have equal quantiles in both columns.
plt.axline((0.5, 0.5), slope=1, color="r")
plt.plot(q1, q2, "-", linewidth=2)
plt.plot(q1, q2, "k.")
plt.gca().set_aspect('equal')
plt.ylim(0.95*axmin, 1.05*axmax)
plt.xlim(0.95*axmin, 1.05*axmax)
# q1 (col1) is plotted along x and q2 (col2) along y, so label accordingly.
plt.xlabel(col1 + " fluiness")
plt.ylabel(col2 + " fluiness")
plt.show()

### Tukey mean-difference plot

Given two quantiles, $q_1(x)$ and $q_2(x)$, 
the Tukey mean difference plot is a plot of $Y$ vs. $X$, where:
$$ Y(x) = q_2(x) - q_1(x) $$
$$ X(x) = \frac{1}{2}\left[ q_1(x) + q_2(x) \right] $$
That is, it plots the difference of the quantiles against their average. The advantage of the Tukey mean-difference compared to the q-q plot is that it converts interpretation of the differences around a 45 degree diagonal line to interpretation of differences around a horizontal zero line. 

In [None]:
# Tukey mean-difference plot of the quantile arrays q1 and q2 computed in the
# Q-Q plot cell: difference of the quantiles against their average.
# Dedicated names are used so the quantile-level array `x` is not overwritten.
mean_q = 0.5*(q1 + q2)
diff_q = q2 - q1
plt.plot(mean_q, diff_q, "-", linewidth=2)
plt.plot(mean_q, diff_q, "k.")
# Horizontal zero line: points on it have equal quantiles in both columns.
plt.axhline(0.0, color="r")
plt.xlabel(f"Mean of {col1} and {col2} quantiles")
plt.ylabel(f"{col2} - {col1} quantile difference")
plt.show()

### Extract and convert the dates (from text to 'datetime') to use in labels, and for plotting as time-series:

(Technically time-series data, not just univariate data)

So, do this relative to the 1/1/2006 (will run from -ve to +ve)

In [None]:
from datetime import datetime

# The "Date" column holds text in month/day/four-digit-year form; convert
# each entry to a datetime object so it can be used on a time axis.
DATE_FORMAT = '%m/%d/%Y'
dates = []
for raw_date in data["Date"]:
    dates.append(datetime.strptime(raw_date, DATE_FORMAT))

# Time series of a single region.
plt.plot(dates, data["MidAtl"])
plt.show()

### Heat maps (all regions on same plot)

In [None]:
import seaborn as sns

# Sub-table holding only the region columns.
region_data = data.loc[:, regions]

# Heat map with one column per region and one row per table row.
ax = sns.heatmap(data=region_data)
plt.show()

### it would be better the other way...

In [None]:
# Same heat map with the axes swapped: one row per region.
ax = sns.heatmap(data=region_data.T)
plt.show()

### ... and what about actual labels, instead of just column numbers:

In [None]:
# One "day-Mon-yy" label per date (day without zero padding).
date_labels = [
    f"{when.day}-{when.strftime('%b')}-{when.strftime('%y')}"
    for when in dates
]

print(date_labels)

# Transposed heat map with a date label on every column.
ax = sns.heatmap(data=region_data.T, xticklabels=date_labels)
plt.show()

### Ew, let's try that again

In [None]:
# Label only every 10th date ("day-Mon-yy"); all other positions get an
# empty string so the axis stays readable.
date_labels = [
    f"{when.day}-{when.strftime('%b')}-{when.strftime('%y')}" if idx % 10 == 0 else ""
    for idx, when in enumerate(dates)
]

print(date_labels)

ax = sns.heatmap(data=region_data.T, xticklabels=date_labels)
plt.show()

### Rotate the labels, for easier reading. And change colourmap colour

In [None]:
# List the names of the available colour maps.
print([cmap_name for cmap_name in plt.colormaps])

# Transposed heat map using the "icefire" colour map.
ax = sns.heatmap(
    data=region_data.T,
    xticklabels=date_labels,
    cmap="icefire",
)
ax.set_title("Fluiness")
# Re-apply the date labels, tilted by 50 degrees.
ax.set_xticklabels(date_labels, rotation=50)
plt.show()

### Bar charts

In [None]:
# Two stacked panels: vertical bars on top, horizontal bars underneath.
fig, axs = plt.subplots(nrows=2, ncols=1)
top_panel, bottom_panel = axs[0], axs[1]

mtn_values = data["Mtn"]
ne_values = data["NE"]

# Vertical bars: "NE" is drawn half-transparent over "Mtn".
top_panel.bar(dates, mtn_values, align='center', width=5)
top_panel.bar(dates, ne_values, align='center', width=5, alpha=0.5)
top_panel.set_ylabel("Fluiness")

# Horizontal bars of the same two columns, labelled for the legend.
bottom_panel.barh(dates, mtn_values, align='center', height=5, label="Mtn")
bottom_panel.barh(dates, ne_values, align='center', height=5, alpha=0.5, label="NE")
bottom_panel.set_xlabel("Fluiness")

# pyplot's legend call targets the current axes.
plt.legend()
plt.show()

### Stairs, stem:

In [None]:
# Step-wise outline of the "Mtn" column.
mtn_series = data["Mtn"]
plt.stairs(mtn_series)
plt.show()

In [None]:
# Stem plot of the "Mtn" column.
mtn_series = data["Mtn"]
plt.stem(mtn_series)
plt.show()

### Sparklines

In [None]:
# One bare panel per region, stacked vertically.
fig, axs = plt.subplots(nrows=len(regions), ncols=1)
for i, region in enumerate(regions):
    panel = axs[i]
    colour = f"C{i}"
    panel.plot(data[region], color=colour)
    # Region name drawn inside the panel, in the colour of its line.
    panel.text(40, 1.5, region, color=colour)
    panel.axis("off")
plt.show()

### We can leave the date labels if we like
* You can probably make this look nicer!

In [None]:
# Sparklines against the parsed dates; only the bottom panel keeps an axis.
fig, axs = plt.subplots(nrows=len(regions), ncols=1)
for i, region in enumerate(regions):
    panel = axs[i]
    colour = f"C{i}"
    panel.plot(dates, data[region], color=colour)
    panel.text(dates[40], 1.35, region, color=colour)
    panel.axis("off")

# Turn the axis back on for the last panel, without y ticks.
last_panel = axs[-1]
last_panel.axis("on")
last_panel.set_yticks([])
# Hide three of the four spines on the current axes.
current_axes = plt.gca()
for pos in ('right', 'top', 'left'):
    current_axes.spines[pos].set_visible(False)
plt.show()

### Stream graph (stacked)

In [None]:
# Stacked area plot: one layer per region, over the parsed dates.
plt.stackplot(dates, region_data.transpose())
# Draw the zero line on the current axes. The variable `ax` still refers to
# the axes of an earlier heat-map figure, so it must not be used here.
plt.axhline(0, color="black", ls="--")
plt.show()

In [None]:
# Stack mirrored about zero: half of each region above the axis, half below.
plt.stackplot(dates, 0.5*region_data.transpose())
# Reset the colour cycle before drawing the lower half.
plt.gca().set_prop_cycle(None)
plt.stackplot(dates, -0.5*region_data.transpose())
# Draw the zero line on the current axes. The variable `ax` still refers to
# the axes of an earlier heat-map figure, so it must not be used here.
plt.axhline(0, color="black", ls="--")
# Remove the y ticks and all four spines.
plt.yticks([])
for pos in ['right', 'top', 'bottom', 'left']:
    plt.gca().spines[pos].set_visible(False)
plt.show()

### .. or spaced

In [None]:
# One mirrored band per region, each in its own panel.
fig, axs = plt.subplots(nrows=len(regions), ncols=1)
for i, region in enumerate(regions):
    panel = axs[i]
    colour = f"C{i}"
    # Correction of an earlier mistake: stackplot of the whole table was
    # drawn in every panel. Each panel should show only its own region,
    # filled between +half and -half of that region's values.
    half = 0.5*data[region]
    panel.fill_between(dates, half, -half, color=colour)
    panel.text(dates[40], 1.5, region, color=colour)
    panel.axis("off")

# Only the last panel keeps its axis, without y ticks.
last_panel = axs[-1]
last_panel.axis("on")
last_panel.set_yticks([])
# Hide three of the four spines on the current axes.
current_axes = plt.gca()
for pos in ('right', 'top', 'left'):
    current_axes.spines[pos].set_visible(False)
plt.show()

### Box plots

In [None]:
# Notched, filled box plots of the region columns, with the mean marked.
plt.boxplot(
    region_data,
    patch_artist=True,
    notch=True,
    showmeans=True,
)
# Boxes sit at positions 1..len(regions); label each with its region name.
tick_positions = [position for position in range(1, len(regions) + 1)]
plt.xticks(tick_positions, regions, rotation=50)
plt.ylabel("Fluiness")
plt.show()