# A demo of how FacetsLabeling can be used to label and create a small ML dataset.  

* Currently there are no open source data labeling tools that can work from a notebook.  
* This labeling tool will help you visualize and improve the quality of your ML datasets.
* Directly visualize and find how your model is performing. 
* Correct mistakes in your labeled dataset and fix mislabeled examples. 
* The tool also helps you make a balanced ML dataset. 
* Train and evaluate model performance through visualization. 


### Step 1: Download and import FacetsLabeling library

In [None]:
!git clone https://github.com/jsiddique/facets_labeling.git

In [None]:
!git clone https://github.com/Geotab/facets-demo.git

In [None]:
# Make the cloned facets_labeling checkout importable from this notebook.
# `sys` is imported directly rather than through `os`, which only exposes it
# as a side effect of its own imports.
import sys

_FACETS_LABELING_PATH = './facets_labeling/'
# Avoid piling up duplicate entries when this cell is executed more than once.
if _FACETS_LABELING_PATH not in sys.path:
    sys.path.append(_FACETS_LABELING_PATH)

In [None]:
import numpy as np
import pandas as pd
from PIL import Image
from facets_labeling import colab_dive
from sklearn import metrics
import matplotlib.pyplot as plt
from matplotlib.ticker import FormatStrFormatter
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
plt.ioff()

### Step 2: Download dataset and load in dataframe.

In [None]:
# Load the anonymized cranking voltage data, order the samples by EventID and
# then by Milliseconds within each event, and renumber the rows from 0.
df = (
    pd.read_csv('facets-demo/CrankingVoltages.csv')
    .sort_values(by=['EventID', 'Milliseconds'])
    .reset_index(drop=True)
)

In [None]:
# One row per distinct EventID, renumbered from 0 so that an event can be
# addressed by its position with events.iloc[i].
events = df[['EventID']].drop_duplicates().reset_index(drop=True)

### Step 3: Aggregate and create features.

In [None]:
# Per-event aggregates of the Voltage column. Each result is a one-column
# frame indexed by EventID; rename(index=str) turns the index labels into strings.
# NOTE(review): none of the later cells visible here reference these three
# frames -- the feature columns used in Step 4 are selected from df itself. Confirm
# whether they were meant to be merged back into df.
voltage_by_event = df[['EventID', 'Voltage']].groupby(by=['EventID'])

MaxVoltage = voltage_by_event.max().rename(index=str, columns={'Voltage': 'MaxVoltage'})
MinVoltage = voltage_by_event.min().rename(index=str, columns={'Voltage': 'MinVoltage'})
AvgVoltage = voltage_by_event.mean().rename(index=str, columns={'Voltage': 'AvgVoltage'})

### Step 4: Stitch all the images in a sprite atlas
To understand the concept of sprite atlas visit [this link](https://github.com/PAIR-code/facets/tree/master/facets_dive).

In [None]:
%%capture
!mkdir image_files

In [None]:
# Rendering parameters: every per-event plot is a square figure of
# img_dim_inches x img_dim_inches rendered at img_dpi dots per inch.
num_examples = 400
img_dpi = 150
img_dim_inches = 1.5

# Accumulators filled by the plotting loop: the EventID of each example and
# an (initially empty) table holding that example's feature values.
feature_columns = [
    'MaxVoltage',
    'MinVoltage',
    'AvgVoltage',
    'EventSpan',
    'MaxMinDiff',
    'FirstMinDiff',
    'LastVoltage',
]
id_array = []
id_features = pd.DataFrame([], columns=feature_columns)

In [None]:

# Set to False to skip re-rendering the per-event images; the later cells that
# build and save the atlas are guarded by the same flag.
reRun = True

feature_columns = ['MaxVoltage', 'MinVoltage', 'AvgVoltage', 'EventSpan', 'MaxMinDiff', 'FirstMinDiff', 'LastVoltage']
# Per-event feature rows are gathered here and concatenated once after the
# loop, instead of growing id_features with a concat on every iteration.
feature_frames = []

# Iterate over num_examples (defined with the accumulators above) so the number
# of rendered images stays in step with the number the atlas cells read back.
for i in range(num_examples):
    id_array.append(events.iloc[i]['EventID'])

    # Merging on the shared EventID column keeps only the samples of event i.
    example = pd.merge(df, events.iloc[[i]])
    feature_frames.append(example[feature_columns].drop_duplicates())

    if reRun:
        fig = plt.figure(figsize=(img_dim_inches, img_dim_inches), dpi=img_dpi)
        ax = fig.add_axes([0.17, 0.03, 0.81, 0.93])
        # Red trace with black sample markers drawn on top of it.
        ax.plot(example['Milliseconds'], example['Voltage'], linewidth=2, c='red', zorder=2)
        ax.scatter(example['Milliseconds'], example['Voltage'], s=11, c='black', zorder=3)
        ax.set_xticks([])
        ax.set_ylim([8, example['Voltage'].max() + 1])
        ax.yaxis.set_major_formatter(FormatStrFormatter('%.1f'))
        ax.tick_params(axis='both', which='major', labelsize=6, pad=1)
        ax.tick_params(axis='both', which='minor', labelsize=6, pad=1)
        # File names are the zero-padded example index; the atlas cell reads them back the same way.
        fig.savefig('./image_files/' + str(i).zfill(3) + '.png', transparent=False, dpi=img_dpi)
        # Close the figure so figures do not accumulate in memory across iterations.
        plt.close(fig)

id_features = pd.concat([id_features] + feature_frames, axis=0)

In [None]:
# Number of per-event images that the atlas-stitching cell below reads and tiles.
num_examples = 400

In [None]:
# Blank canvas for the sprite atlas: a square whose side is 20 sprites long,
# with 4 channels per pixel (the sprites are read back as 4-channel images).
atlas_side = int(20*img_dim_inches*img_dpi)
img_arr = np.zeros([atlas_side, atlas_side, 4])

In [None]:
if reRun:
    # Sprites are placed row by row on a grid that is 20 sprites wide.
    grid_cols = 20
    sprite_px = int(img_dim_inches*img_dpi)
    for i in range(0, num_examples):
        rownum, colnum = divmod(i, grid_cols)
        # The context manager closes the image file once its pixels are copied out.
        with Image.open('./image_files/' + str(i).zfill(3) + '.png') as sprite:
            # getdata() is a flat pixel sequence; reshape it to rows x cols x 4 channels.
            tmp_img = np.array(sprite.getdata()).reshape(sprite_px, sprite_px, 4)
        top = rownum*sprite_px
        left = colnum*sprite_px
        img_arr[top:top + sprite_px, left:left + sprite_px, :] = tmp_img

In [None]:
if reRun:
    # Cast the float atlas array to 8-bit channels, then wrap it in a PIL image.
    atlas_pixels = img_arr.astype('uint8')
    imgout = Image.fromarray(atlas_pixels)

In [None]:
if reRun:
    # Write the stitched atlas to disk; Step 5 reads this file back and embeds it.
    imgout.save('atlas_transparent.png', format='PNG')

In [None]:
# Build the per-example table handed to Facets in Step 5: an 'Id' column with
# each example's EventID, followed by the feature columns.
# This runs regardless of reRun: id_array and id_features are collected on every
# run, and Step 5 needs id_df even when the images are not re-rendered.
# The two frames are joined by row position, so this assumes id_features holds
# exactly one row per entry of id_array.
id_features = id_features.reset_index(drop=True)
id_df = pd.DataFrame(id_array, columns=['Id'])
id_df = pd.concat([id_df, id_features], axis=1)
id_json = id_df.to_json(orient='records')

### Step 5: Create facets visualization

In [None]:
import base64

from facets_labeling import colab_dive

# Classes the user can assign to examples in the labeling tool.
labels = ['Valid', 'Invalid']

# Sprite geometry; these repeat the values used when the images were rendered.
img_dim_inches = 1.5
img_dpi = 150
num_examples = 400

# Embed the atlas image in the page as a base64 data URL.
with open("atlas_transparent.png", "rb") as image_file:
    encoded_string = base64.b64encode(image_file.read())
url_template = "data:image/png;base64,{encoded_string}"
atlas_url = url_template.format(encoded_string=encoded_string.decode("utf-8"))
sprite_size = int(img_dim_inches*img_dpi)

# Register the classes, describe the atlas, and write the standalone HTML page.
fc = colab_dive.Facets()
results = fc.create_classes(labels=labels)
fc.define_atlas(
    id_df,
    atlas_height=1000,
    sprite_width=sprite_size,
    sprite_height=sprite_size,
    atlas_url=atlas_url,
)
fc.render_html('CrankingVoltage.html')

In [None]:
from IPython.display import HTML, display

# Read the page written by fc.render_html and show it inline in the notebook.
# The context manager closes the file even if reading raises.
with open('CrankingVoltage.html', 'r') as f:
    t = f.read()
display(HTML(t))

##### Instructions on how to label
* Click reset to clear the cache. (You don't need to clear the cache if you want to keep the labels you have already made.)
* Click select class dropdown list to see the list of classes. Select the class you want to label.   
* <b><i> Use shift+click to label instance </i></b>
* Click select class dropdown list and label another class
* <img src="https://raw.githubusercontent.com/jsiddique/facets_labeling/master/FacetsLabeling.gif">

### Step 6: Download labeled data from browser cache and store them in a dictionary

In [None]:
# Empty container for the labels; it is handed to fc.create_labeled_variables in the next cell.
label_dict = {}

In [None]:
# Don't put any other code in this cell. It runs javascript code in the background.
# label_dict is passed in to receive the labels; the next cells read it as
# class name -> comma-separated string of labeled ids.
fc.create_labeled_variables(label_dict=label_dict)

In [None]:
# Display the collected labels to check them before they are used.
label_dict

In [None]:
# Invert label_dict into eventDict.
# label_dict is read as class name -> comma-separated string of ids;
# eventDict maps each individual id (as a string) to its class name.
# The loop variables are named so that the `labels` class list from Step 5 is
# not overwritten.
eventDict = {}
for class_name, id_string in label_dict.items():
    for event in id_string.split(','):
        # ''.split(',') yields [''], so skip the empty id that a class with
        # no labeled examples would otherwise contribute.
        if event:
            eventDict[event] = class_name

In [None]:
# Add the user's labels to id_df as a 'Labels' column.
# Items that have the value 'None' in the Labels column were not labeled by the user.
# NOTE(review): eventDict keys are produced by str.split, so they are strings --
# confirm that id_df.Id holds strings too, otherwise no row will match.
id_df['Labels'] = id_df['Id'].apply(lambda event_id: eventDict.get(event_id, 'None'))

### Step 7: Train a simple model using labels

In [None]:
# Count how many examples carry each label (unlabeled ones show up as 'None').
id_df['Labels'].value_counts()

In [None]:
# Build the training set from the items that were labeled by the user
# (rows whose Labels value is not the placeholder string 'None').
labeled_rows = id_df[id_df['Labels'] != 'None']
y_train = labeled_rows['Labels'].values

# Keep only the feature columns. 'Id' and 'Labels' are not model inputs, and
# 'Predictions' is present once the prediction cell has run -- Step 8 asks for
# this cell to be re-run after that, so it must be dropped when it exists.
df_train = labeled_rows.drop(['Id', 'Labels'], axis=1)
df_train = df_train.drop('Predictions', axis=1, errors='ignore')
X_train = np.array(df_train.values)

In [None]:
# Train a simple RandomForestClassifier on the user-labeled examples.
# The classifier is built with no arguments, i.e. with the library defaults.
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier()
model.fit(X_train, y_train)

In [None]:
# Make predictions for every example (labeled or not) using the trained model.
# Save the predictions in the dataframe for visualization in Step 5 again.
df_test = id_df.drop(['Id', 'Labels'], axis=1)
# A previous pass through this cell leaves a 'Predictions' column in id_df;
# it is a model output, not a feature, so remove it when it exists.
df_test = df_test.drop('Predictions', axis=1, errors='ignore')
X_test = np.array(df_test.values)
predictions = model.predict(X_test)
id_df['Predictions'] = predictions

### Step 8: Go to step 5 and create the visualization and retrain the models again. Visually validate how well the model is performing.  
* There will be a new column called Predictions in the new visualization created in step 5. 
* Select this column and check what the predictions look like. (See figure below)
* Fix mistakes by relabeling the examples.
* Retrain the model using the newly fixed labels and visualize again.
* <img src = "https://github.com/jsiddique/facets_labeling/blob/master/check_predictions.png?raw=1"> 