In [None]:
import numpy as np
from matplotlib import pyplot as plt
import scipy.stats as stats
import pandas as pd

### Load the data (from a CSV file with unequal-length columns)

In [None]:
# Path of the CSV file that holds one column of heights per voice part.
filename = "heights.csv"

# names=True takes the field names from the header row, so the result is a
# structured array that is indexed by column name.
heights = np.genfromtxt(filename, names=True, delimiter=",")

# Keep the column names (spelling of `collumns` retained: later cells use it).
collumns = [name for name in heights.dtype.names]
print(collumns)

### Remove NaNs: print the mean (with standard error), mode, and median

In [None]:
# Conversion factor applied to every column.
# NOTE(review): assumes the CSV stores heights in inches - confirm.
inch_to_cm = 2.54

# One cleaned array per column: drop the NaN entries, then convert the units.
data_sets = []
for name in collumns:
    column = heights[name]
    data_sets.append(column[~np.isnan(column)] * inch_to_cm)

# Mean with its standard error (scipy.stats.sem).
print("Average heights:")
for name, values in zip(collumns, data_sets):
    mean_height = np.mean(values)
    std_error = stats.sem(values)
    print(f"{name:7}: {mean_height:.1f} +/- {std_error:.1f}")

# Most frequent value; keepdims=True keeps the result array-shaped, hence [0].
print("\nModal heights:")
for name, values in zip(collumns, data_sets):
    mode_result = stats.mode(values, keepdims=True)
    print(f"{name:7}: {mode_result.mode[0]:.0f}")

print("\nMedian heights:")
for name, values in zip(collumns, data_sets):
    print(f"{name:7}: {np.median(values):.0f}")

### Histograms - a few ways

In [None]:
# Unpack the four cleaned arrays.
# NOTE(review): assumes the CSV columns come in exactly this order - confirm.
altos, sopranos, tenors, bass = data_sets

fig, axs = plt.subplots(nrows=2, ncols=2)
fig.tight_layout(pad=2.0)  # add some space

# One panel per voice part, filled row by row. Every hist() call chooses its
# own bins and axis limits, which is what makes the panels hard to compare.
panels = {"altos": altos, "sopranos": sopranos, "tenors": tenors, "bass": bass}
for ax, (title, values) in zip(axs.flat, panels.items()):
    ax.hist(values)
    ax.set_title(title)

fig.suptitle("Not ideal: axes differ", y=1.05)
plt.show()

In [None]:
fig, axs = plt.subplots(nrows=2, ncols=2)
fig.tight_layout(pad=2.0)  # add some space

# density=True normalises each histogram to unit area, so the y-axis shows a
# probability density and must be labelled as such (not "count").
panels = {"altos": altos, "sopranos": sopranos, "tenors": tenors, "bass": bass}
for ax, (title, values) in zip(axs.flat, panels.items()):
    ax.hist(values, density=True)
    ax.set_title(title)

# Label only the outer edges of the grid: left column (y) and bottom row (x).
for ax in axs[:, 0]:
    ax.set_ylabel("Prob. density")
for ax in axs[1, :]:
    ax.set_xlabel("height (cm)")
fig.suptitle("Better: consistent axes", y=1.05)

# Overall height range across all four groups; used as the common x-range here
# and reused by the KDE cell further down.
min_height = min(np.min(values) for values in panels.values())
max_height = max(np.max(values) for values in panels.values())

# Apply identical limits to every panel so they can be compared directly.
# NOTE(review): the 0.13 upper y-limit is hard-coded, presumably tuned by eye
# for this data set - confirm if the data changes.
plt.setp(axs, ylim=(0.0, 0.13), xlim=(min_height, max_height))

plt.show()

Sometimes it is better to plot everything together... but sometimes that is just too messy.

In [None]:
# Overlay all groups on one axes: a solid outline plus a translucent fill per
# group, both drawn in the same colour.
for i, data in enumerate(data_sets):
    colour = f"C{i}"
    # Outline: colour set explicitly instead of relying on the default colour
    # cycle happening to line up with the fill colour below.
    plt.hist(data, bins='fd', histtype="step", linewidth=2, color=colour)
    # Translucent fill; only this call carries the legend label, so each group
    # appears once in the legend.
    plt.hist(data, bins='fd', histtype="stepfilled", color=colour, alpha=0.4, label=collumns[i])
    # (The former argument-less plt.fill() call drew nothing and was removed.)
plt.ylabel("count")
plt.xlabel("height (cm)")
plt.legend()
plt.show()

### KDE: often not a good idea
 * Kernel density estimation: estimates the PDF (probability density function)
 * Certainly makes it easier to read
 * BUT at the expense of correctness in this case
 * Great for very dense histograms; usually best to show both

In [None]:
# Height grid on which every KDE is evaluated. min_height / max_height come
# from the shared-axes histogram cell above.
x = np.linspace(min_height, max_height, 100)

for i, data in enumerate(data_sets):
    kde = stats.gaussian_kde(data)
    # Evaluate the estimate once and reuse it for both the line and the fill.
    density = kde(x)
    # Uncomment to overlay the normalised histogram for comparison:
    # plt.hist(data, bins="fd", histtype="step", color=f"C{i}", linestyle="--", linewidth=2, density=True)
    (line,) = plt.plot(x, density, linewidth=2, label=collumns[i])
    # Take the fill colour from the line so the two always match, rather than
    # relying on two colour cycles staying in step.
    plt.fill_between(x, density, alpha=0.4, facecolor=line.get_color())
plt.ylabel("Prob. density")
plt.xlabel("height (cm)")
plt.legend()
plt.show()

### Quantiles

In [None]:
# Quantile levels from 0 to 1 (20 evenly spaced points, end points included).
x = np.linspace(0.0, 1.0, 20)

# One quantile curve per group. The per-group arrays that used to be computed
# ahead of this loop were never used and duplicated this work, so they are gone.
for label, data in zip(collumns, data_sets):
    q = np.quantile(data, x)
    # nb: change units, just in plot quantile -> percentile
    plt.plot(x * 100, q, "x-", label=label)
plt.ylabel("Heights (cm)")
plt.xlabel("Percentile")
plt.legend()
plt.show()

### Q-Q (quantile vs. quantile) plot

In [None]:
x = np.linspace(0.05, 1.0, 20) # 20 quantiles (i.e. 5% steps)

fig, axs = plt.subplots(nrows=3, ncols=3)
fig.tight_layout(pad=1)  # add some space

# Columns of the grid (x-axis data) and rows of the grid (y-axis data).
x_datas = [altos, sopranos, tenors]
x_labels = ["Altos", "Sopranos", "Tenors"]

y_datas = [sopranos, tenors, bass]
y_labels = ["Sopranos", "Tenors", "Bass"]

# Settings shared by every visible panel.
ticks = np.arange(160, 195, 10.0)
label_font = {'fontsize': 'large', 'fontweight': 'bold'}
bottom_row = len(y_datas) - 1

for i, (datay, y_name) in enumerate(zip(y_datas, y_labels)):
    # The y quantiles depend only on the row, so compute them once per row.
    qy = np.quantile(datay, x)
    for j, (datax, x_name) in enumerate(zip(x_datas, x_labels)):
        ax = axs[i, j]
        # Only plot unique lower triangle
        if j > i:
            ax.set_visible(False)
            continue
        qx = np.quantile(datax, x)
        ax.set_visible(True)
        ax.set_yticks(ticks)
        ax.set_xticks(ticks)
        ax.set_xlim(160, 195)
        ax.set_ylim(160, 195)
        # Quantile pairs as a connected line with black point markers on top.
        ax.plot(qx, qy, "-", linewidth=2)
        ax.plot(qx, qy, "k.")
        # Red y = x reference: points on it mean the two quantiles are equal.
        ax.axline((0, 0), slope=1, color="r")
        ax.set_aspect('equal')
        # Axis labels only on the outer edge of the triangle.
        if i == bottom_row:
            ax.set_xlabel(x_name, fontdict=label_font)
        if j == 0:
            ax.set_ylabel(y_name, fontdict=label_font)
fig.suptitle("Q-Q corner plot: heights (cm)", y=1.05)
plt.show()

In [None]:
x = np.linspace(0.05, 1.0, 20) # 20 quantiles (i.e. 5% steps)
qx = np.quantile(tenors, x)
qy = np.quantile(bass, x)

# Draw on the current axes through a single explicit handle.
ax = plt.gca()

# Quantile pairs as a connected line with black point markers on top.
ax.plot(qx, qy, "-", linewidth=2)
ax.plot(qx, qy, "k.")

# Same range on both axes so the y = x reference runs along the diagonal.
ax.set_ylim(164, 196)
ax.set_xlim(164, 196)
ax.axline((0, 0), slope=1, color="r")

ax.set_ylabel("Bass height (cm)")
ax.set_xlabel("Tenor height (cm)")
ax.set_aspect('equal')
plt.show()

### Box-whisker plot

In [None]:

# One notched, filled box per group, with the mean marked as well.
box_ax = plt.gca()
box_ax.boxplot(data_sets, notch=True, showmeans=True, patch_artist=True)

# Boxes sit at positions 1..4; label them with the column names.
box_ax.set_xticks([1, 2, 3, 4])
box_ax.set_xticklabels(collumns)
box_ax.set_ylabel("Height (cm)")
plt.show()