In [None]:
from src.data import Dataset, available_datasets
from src import workflow

In [None]:
# Load IPython's autoreload extension and switch it to mode 2, which reloads
# imported modules before each cell runs, so edits under src/ are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import logging

# Grab the root logger (getLogger() with no name returns it) and configure
# logging at DEBUG level so detailed messages are shown in the notebook.
logger = logging.getLogger()
logging.basicConfig(level=logging.DEBUG)

In [None]:
# Display whatever available_datasets() (imported from src.data) returns.
# NOTE(review): presumably these are the names Dataset.load() accepts — confirm.
available_datasets()

## In this example we'll use the f-mnist training data to build an initial model

In [None]:
# Load the 'f-mnist_train' dataset; later cells read its .data and .target attributes.
ds = Dataset.load('f-mnist_train')

In [None]:
# Shape of ds.data — this is what gets passed to model.fit() further down.
ds.data.shape

In [None]:
# Shape of ds.target.
# NOTE(review): presumably one label per row of ds.data — confirm.
ds.target.shape

# Step 1: `make train`

We're going to build a visualization tool for our stylists to use. Let's use the UMAP dimension reduction algorithm for this.

https://umap-learn.readthedocs.io/en/latest/

We can pip install umap via:

`pip install umap-learn`

Time to show off your new reproducible environment skills!

Add `umap-learn` under your pip requirements in the `environment.yml` and run `make requirements`.

In [None]:
from umap import UMAP

In [None]:
# We want a 2 dimensional visualization, so ask UMAP for 2 output components.
# random_state is pinned so that re-running the fit is repeatable.
model = UMAP(n_components=2, random_state=42)

In [None]:
%%time
# Fit the model on the training data; the %%time cell magic on the first line
# reports how long this cell takes to run.
model.fit(ds.data)

### Check that UMAP has desired properties

Recall that for an algorithm to work with our built in `make train` and `make predict` scripts, it needs to:
* be a BaseEstimator
* Have a fit method
* Have either a predict or transform method

Let's check that UMAP satisfies these properties.

In [None]:
from sklearn.base import BaseEstimator

In [None]:
# Requirement: the model must be an instance of sklearn's BaseEstimator.
isinstance(model, BaseEstimator)

In [None]:
# Requirement: the model needs either predict or transform — check predict first.
hasattr(model, 'predict')

In [None]:
# ...and check transform, the other method that satisfies the requirement.
hasattr(model, 'transform')

### Exercise: Add UMAP to the available_algorithms

In [None]:
# Raises AssertionError until 'UMAP' is among workflow.available_algorithms(),
# i.e. until the exercise above has been completed.
assert 'UMAP' in workflow.available_algorithms()

### Exercise: Add UMAP on f-mnist_train to the training workflow

In [None]:
# workflow.add_model()

In [None]:
# Display what workflow.get_model_list() returns.
# NOTE(review): presumably the models that `make train` will build — confirm in src.workflow.
workflow.get_model_list()

## TODO: Remove this from the actual tutorial notebook...this is an "answer"

In [None]:
# Register UMAP on the f-mnist training data with the workflow, using the same
# parameters as the model prototyped earlier in this notebook.
workflow.add_model(
    dataset_name='f-mnist_train',
    algorithm_name='UMAP',
    algorithm_params={'n_components': 2, 'random_state': 42},
)

In [None]:
# Shell out: change to the parent directory and run `make train` there.
!cd .. && make train

In [None]:
# Display what workflow.available_models() returns.
# NOTE(review): presumably the models produced by the `make train` run above — confirm.
workflow.available_models()

# Step 2: `make predict`

Here's where we use the model to do a dimension reduction.

In this case, we'll set the flag `is_supervised` to `False`, so that the prediction algorithm uses the `transform` method of UMAP instead of `predict`.

### Exercise: Setup dimension reduction using UMAP on f-mnist_train

In [None]:
# Fill me in!
# workflow.add_prediction()

In [None]:
# Display what workflow.get_prediction_list() returns at this point.
# NOTE(review): presumably the predictions that `make predict` will run — confirm.
workflow.get_prediction_list()

## TODO: Remove next line from the tutorial notebook...it's an answer!

In [None]:
# Register a prediction that applies the trained UMAP model to the f-mnist
# training data. is_supervised is False, as explained in the text above.
workflow.add_prediction(
    dataset_name='f-mnist_train',
    model_name='UMAP_f-mnist_train_1',
    is_supervised=False,
)

In [None]:
# Display the prediction list again, now that add_prediction has been called.
workflow.get_prediction_list()

In [None]:
# Shell out: change to the parent directory and run `make predict` there.
!cd .. && make predict

In [None]:
# Display what workflow.available_predictions() returns.
# NOTE(review): presumably the predictions produced by the `make predict` run above — confirm.
workflow.available_predictions()

# Step 3: `make summary`

In this case, `make summary` will set up a dataframe that we'll use to make visualizations.



### Prototype what we want to do for our visualization

Let's visualize what happened to our data under dimension reduction. Since we have categories of our data (as the targets), let's use those to colour the data.

In [None]:
from src.models.predict import load_prediction

In [None]:
# Load the stored prediction by name.
# NOTE(review): the name appears to combine the model name ('UMAP_f-mnist_train_1')
# with an experiment suffix — confirm the naming scheme in src.models.predict.
ds_predicted = load_prediction('UMAP_f-mnist_train_1_exp_f-mnist_train_1')

In [None]:
# Here's the 2d prediction. Columns 0 and 1 of this array become the
# x and y coordinates of the summary frame built below.
ds_predicted.data.shape

In [None]:
# Here are the labels. Use these as color!
# (They are cast to int and stored as df['target'] below.)
ds_predicted.target.shape

In [None]:
import pandas as pd

In [None]:
# Summary frame for plotting: columns 0 and 1 of the prediction become 'x' and
# 'y', and the labels (cast to int) become 'target'.
df = pd.DataFrame().assign(
    x=ds_predicted.data[:, 0],
    y=ds_predicted.data[:, 1],
    target=ds_predicted.target.astype(int),
)
# For bokeh mouseover from saved images: add images to the dataset, then also add:
#df['filename'] = ds_predicted.metadata['filename']

This is probably all we need for `make summary`, but let's finish making a picture so that we can get a sense of whether we're missing anything.

In [None]:
# How many categories do we have? (so we can pick an appropriate colormap)
# numpy is imported here because the notebook's only other `import numpy as np`
# sits in a later cell; without this, a fresh-kernel "Run All" fails on this
# line with NameError: name 'np' is not defined.
import numpy as np

np.unique(df['target'])

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set_context('poster')
sns.set(style='white', rc={'figure.figsize':(12,8)})


In [None]:
# Scatter the 2d embedding, one small point per row of df, coloured by the
# integer class label with the 10-colour 'tab10' map; add a colorbar as legend.
fig, ax = plt.subplots()
points = ax.scatter(df['x'], df['y'], c=df['target'], cmap='tab10', s=1)
fig.colorbar(points, ax=ax);

## TODO: Next step, script the summary

## Aside: How to make images..

In [None]:
from PIL import Image

In [None]:
# Take the first sample of the training data and display its shape.
# NOTE(review): presumably a flat vector of 28 * 28 = 784 values, given the
# reshape in the next cell — confirm.
row = ds.data[0]
row.shape

In [None]:
import numpy as np

# Target image size; the reshape only succeeds if row holds exactly w * h values.
w, h = 28, 28
new_row = np.reshape(row, (w, h))
# Display the new shape as the cell output.
new_row.shape

In [None]:
# Build a PIL image from the reshaped array; 'L' is Pillow's 8-bit grayscale mode.
# NOTE(review): assumes new_row holds uint8 pixel values — TODO confirm the dtype of ds.data.
img = Image.fromarray(new_row, 'L')
# Image.show() hands the picture to an external viewer.
# NOTE(review): a bare `img` as the last line would render inline in the notebook instead.
img.show()

In [None]:
w, h = 28, 28

# Convert every sample of ds.data into its own w x h grayscale PIL image.
# Each image is built from the sample being iterated. Building it from
# `new_row` (the reshaped first sample from the earlier demo cell) would make
# every entry of `images` the same picture.
# A comprehension is used so the notebook-level `row` and `img` from the
# earlier cells are not rebound, keeping those cells re-runnable.
# NOTE(review): mode 'L' is 8-bit grayscale; assumes ds.data holds uint8 values — confirm.
images = [
    Image.fromarray(np.reshape(sample, (w, h)), 'L')
    for sample in ds.data
]

In [None]:
# Attach one image per row of the summary frame.
# NOTE(review): relies on `images` (built from ds.data) having the same length
# and order as the rows of df (built from ds_predicted) — confirm.
df['images'] = images

In [None]:
# Preview the first rows of the summary frame, including the new 'images' column.
df.head()