# The Dangers of Trusting Statistics Alone: Lessons from the Datasaurus Dozen

---

In [1]:
# Mount Google Drive into the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
cd /content/drive/MyDrive/Colab Notebooks/Capstone/data

/content/drive/MyDrive/Colab Notebooks/Capstone/data


In [3]:
pip install git+https://github.com/sztal/pybdm.git

Collecting git+https://github.com/sztal/pybdm.git
  Cloning https://github.com/sztal/pybdm.git to /tmp/pip-req-build-ss7i08dx
  Running command git clone --filter=blob:none --quiet https://github.com/sztal/pybdm.git /tmp/pip-req-build-ss7i08dx
  Resolved https://github.com/sztal/pybdm.git to commit 4bedea1200998a8978ada1a708b4e1f98aacb2d8
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: pybdm
  Building wheel for pybdm (setup.py) ... done
  Created wheel for pybdm: filename=pybdm-0.1.0-py2.py3-none-any.whl size=39933846 sha256=380a665b2ab327282bbe9cd8430c98307e1ab64583a988bb193d1249b4409cac
  Stored in directory: /tmp/pip-ephem-wheel-cache-n928uhs4/wheels/67/6e/1c/64fedb780569cd521fe6933032437f99c23618d9d40ad769fd
Successfully built pybdm
Installing collected packages: pybdm
Successfully installed pybdm-0.1.0


## Minimal Algorithmic Information Loss Methods
### Sanity Check
---

In [4]:
# Datasaurus Test
# https://github.com/alyssa-adams/pymils/blob/master/experiments.py
# https://www.dropbox.com/s/1zci0df6mny650t/SameStatsDataAndImages.zip?dl=0
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from pybdm import BDM

# Load the point sets; each row has a `dataset` label and an (x, y) coordinate.
file = "DatasaurusDozen.tsv"
df = pd.read_csv(file, sep='\t')
grouped_df = df.groupby(df.dataset)

# Fixed dataset order: images[i] always corresponds to groups[i]. The order is
# spelled out because a list derived from a set has no stable order between runs.
groups = ['dino', 'slant_up', 'bullseye', 'away', 'v_lines', 'x_shape', 'star', 'dots', 'h_lines', 'slant_down', 'wide_lines', 'circle', 'high_lines']

# Fail early, naming every absent dataset, instead of hitting a bare KeyError
# part-way through the loop below.
missing_groups = sorted(set(groups) - set(df['dataset']))
if missing_groups:
    raise KeyError(f"{file} has no rows for dataset(s): {missing_groups}")

# Rasterize each dataset into a binary image: 1 where at least one point lands.
images = []

for group in groups:

    points_df = grouped_df.get_group(group)

    # Truncate the coordinates to integer pixel positions.
    x = points_df['x'].astype(int).to_numpy()
    y = points_df['y'].astype(int).to_numpy()
    max_x, max_y = x.max(), y.max()

    # Rows index y and columns index x; the canvas is just large enough to
    # hold the largest coordinate on each axis.
    image = np.zeros((max_y + 1, max_x + 1), dtype=int)

    # Flip vertically (max_y - y) so larger y values sit nearer the top row,
    # since row 0 is the top of an image.
    image[max_y - y, x] = 1

    images.append(image)

# Initialize BDM
bdm = BDM(ndim=2)

# One BDM value per dataset image; groups[i] is the name of images[i].
bdm_by_group = {
    group: bdm.bdm(image)
    for group, image in zip(groups, images)
}

# Attach each row's value by looking up its own dataset name. Filling the
# column positionally (a run of values per group, in `groups` order) is only
# correct when the file's rows happen to be sorted in that same order;
# otherwise rows are labelled with another dataset's BDM value.
df['BDM'] = df['dataset'].map(bdm_by_group)

# plot bdm values
# Apply the seaborn theme before drawing: set_theme() only affects figures
# created afterwards, so calling it after the plot makes the result depend on
# whether this cell has already been run once in the same kernel.
sns.set_theme()

bdm_fig, bdm_ax = plt.subplots()
sns.scatterplot(data=df, x="dataset", y="BDM", s=50, ax=bdm_ax)
bdm_ax.tick_params(axis="x", labelrotation=45)  # dataset names overlap when horizontal
bdm_fig.tight_layout()
sns.despine(fig=bdm_fig)
bdm_fig.savefig("datasaurus_bdm.pdf")

# Close rather than clear the figure, so no empty figure is left behind to be
# echoed in the cell output.
plt.close(bdm_fig)

# Save a 4x4 grid of the rasterized datasets, one titled panel per dataset.
fig, axs = plt.subplots(4, 4)

# axs.T.flat walks the grid column by column (top to bottom, then the next
# column to the right), so images[0..3] fill the first column, images[4..7]
# the second, and so on.
for panel_index, ax in enumerate(axs.T.flat):
    if panel_index < len(images):
        # np.invert swaps which pixels are the high and low values, so the
        # points and the background exchange colours when displayed.
        ax.imshow(np.invert(images[panel_index]))
        ax.set_title(groups[panel_index])
    else:
        # Unused trailing panels get a uniform placeholder and no title.
        ax.imshow([[0]])
    ax.set_xticks([])
    ax.set_yticks([])

fig.tight_layout()
fig.savefig("DatasaurusDozen.pdf")

# Close rather than clear the figure, so no empty figure is left behind to be
# echoed in the cell output.
plt.close(fig)

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

# References




* https://pybdm-docs.readthedocs.io/en/latest/
* https://github.com/alyssa-adams/pymils/



---
