# Finding iris species

The problem is similar to finding the boundaries in a phase transition problem.

To make slides:

1. Run notebook

2. Save notebook

3. Execute

```
jupyter nbconvert 02-iris.ipynb --to slides --post serve
```

## Reference

https://medium.com/@mjspeck/presenting-code-using-jupyter-notebook-slides-a8a3c3b59d67


## Settings
### Notebook configuration

In [None]:
# Enable the autoreload extension; mode 2 reloads modules before each cell runs.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

### Packages

In [None]:
import os  # manage path and os interaction
import numpy as np  # manage array and math operation
import pandas as pd  # manage table

from tqdm.notebook import tqdm

import matplotlib as mpl  # plot data
import matplotlib.pyplot as plt  # plot data

In [None]:
# Default `figsize` (width, height) handed to plt.figure by the plots in this notebook.
FIGSIZE = (10, 10)

## Import data

In [None]:
# Column holding the class label used as the target throughout the notebook.
target_col = "Species"

# Read the iris table from its relative path.
data_path = "../data/iris/Iris.csv"
df = pd.read_csv(data_path)

## Explore data

In [None]:
# Preview the first three rows of the table.
df.head(n=3)

### Input and target cols

In [None]:
def plot_target_bar(df, taget_col, figsize=FIGSIZE):
    """Plot a bar chart of how many rows fall into each target class.

    Args:
        df: Table containing the target column.
        taget_col: Name of the column whose value counts are plotted.
            The misspelled parameter name is kept so that existing
            callers (positional or keyword) keep working.
        figsize: Figure size forwarded to ``plt.figure``.
    """
    plt.figure(figsize=figsize)
    plt.title("Target distribution")
    # Use the column passed by the caller; the previous body ignored the
    # argument and silently read the notebook-level `target_col` global.
    df[taget_col].value_counts().plot(kind="bar")
    plt.xticks(rotation=0)
    plt.legend()
    plt.show()

In [None]:
 plot_target_bar(df, target_col, figsize=FIGSIZE)

In [None]:
# Distinct target values, in order of first appearance in the column.
_species = df[target_col].unique()
label_0, label_1, label_2 = _species[0], _species[1], _species[2]
print(label_0, label_1, label_2)

In [None]:
# Every column except the "Id" column and the target is treated as an input.
_non_inputs = ("Id", target_col)
input_cols = list(filter(lambda col: col not in _non_inputs, df.columns))
print(input_cols)

## Scatter plots

### Sepal analysis

In [None]:
# One sub-table per species so each class can be drawn with its own marker.
df_0, df_1, df_2 = (
    df[df[target_col] == species]
    for species in (label_0, label_1, label_2)
)

In [None]:
def scatter_1(sz_inc=50):
    """Scatter the first two input columns, one marker style per species.

    Marker size (`s`) is the fourth input column multiplied by `sz_inc`.

    Args:
        sz_inc: Factor applied to the fourth input column to get marker sizes.
    """
    plt.figure(figsize=FIGSIZE)

    x_col, y_col, size_col = input_cols[0], input_cols[1], input_cols[3]
    # (rows, marker, colour, legend label) for each species.
    groups = (
        (df_0, '^', 'r', label_0),
        (df_1, 'o', 'y', label_1),
        (df_2, '*', 'b', label_2),
    )
    for rows, marker, colour, label in groups:
        plt.scatter(x=rows[x_col], y=rows[y_col], s=sz_inc * rows[size_col],
                    marker=marker, c=colour, label=label)

    plt.legend()

    plt.xlabel(x_col)
    plt.ylabel(y_col)
    # Name the column that actually drives the marker size: the title used to
    # name input_cols[2] while the sizes were taken from input_cols[3].
    plt.title(f"Iris `{target_col}` given by `{size_col}`")

    plt.show()

In [None]:
# Draw the plot with the default marker scale.
scatter_1()

### Petal analysis

In [None]:
def scatter_2(sz_inc=20):
    """Scatter the third vs. fourth input column, one marker style per species.

    Marker size (`s`) is the second input column multiplied by `sz_inc`.

    Args:
        sz_inc: Factor applied to the second input column to get marker sizes.
    """
    plt.figure(figsize=FIGSIZE)

    x_col, y_col, size_col = input_cols[2], input_cols[3], input_cols[1]
    # (rows, marker, colour, legend label) for each species.
    groups = (
        (df_0, '^', 'r', label_0),
        (df_1, 'o', 'y', label_1),
        (df_2, '*', 'b', label_2),
    )
    for rows, marker, colour, label in groups:
        # Sizes come from the same rows being plotted; the third species was
        # previously sized with df_1's values.
        plt.scatter(x=rows[x_col], y=rows[y_col], s=sz_inc * rows[size_col],
                    marker=marker, c=colour, label=label)

    plt.legend()

    plt.xlabel(x_col)
    plt.ylabel(y_col)
    plt.title(f"Iris `{target_col}` given by `{size_col}`")

    plt.show()

In [None]:
# Draw the plot with the default marker scale.
scatter_2()

### Contour plot example

#### Dummy Example

In [None]:
# Toy grid: x starts at 1 and y at 5, both sampled with step h.
h = 0.1
x_min, x_max = 1, 10
y_min, y_max = 5, 15

xx, yy = np.meshgrid(
    np.arange(x_min, x_max, h),
    np.arange(y_min, y_max, h),
)
# The surface drawn is the element-wise product z = x * y.
zz_mult = xx * yy

plt.figure(figsize=FIGSIZE)
plt.contourf(xx, yy, zz_mult, cmap=plt.cm.RdBu, alpha=.8)
plt.show()

#### Application for iris

In [None]:
# Prepare mesh
cm = plt.cm.RdYlBu

x_min, x_max = 0.9*df[input_cols[2]].min(), 1.1*df[input_cols[2]].max()
y_min, y_max = 0 * 0.7*df[input_cols[3]].min(), 1.1*df[input_cols[3]].max()

xx, yy = np.meshgrid(np.arange(x_min, x_max, (x_max - x_min) / 100 ),
                     np.arange(y_min, y_max, (y_max - y_min) / 100 ))

In [None]:
# Classify species
# Grid coordinates as broadcastable axes: x-values along columns, y-values along rows.
_x_row = xx[0][np.newaxis, :]
_y_col = yy[:, 0][:, np.newaxis]

# Hand-picked rectangular zones; the first matching zone wins, the rest is NaN.
_zone_masks = [
    (_x_row < 2.) & (_y_col < 0.7),
    (_x_row > 2.8) & (_x_row < 5) & (_y_col > 0.9) & (_y_col < 1.65),
    (_x_row > 5) & (_x_row < 7) & (_y_col > 1.6) & (_y_col < 2.55),
]
zz_eng = np.select(_zone_masks, [1.0, 2.0, 3.0], default=np.nan)

In [None]:
def scatter_3(sz_inc=20):
    """Overlay the species scatter on the hand-made zone contour (`zz_eng`).

    Points are the third vs. fourth input column; marker size (`s`) is the
    second input column multiplied by `sz_inc`.

    Args:
        sz_inc: Factor applied to the second input column to get marker sizes.
    """
    ## View results
    plt.figure(figsize=FIGSIZE)

    plt.contourf(xx, yy, zz_eng, cmap=cm, alpha=.8)

    x_col, y_col, size_col = input_cols[2], input_cols[3], input_cols[1]
    # (rows, marker, colour, legend label) for each species.
    groups = (
        (df_0, '^', 'r', label_0),
        (df_1, 'o', 'y', label_1),
        (df_2, '*', 'b', label_2),
    )
    for rows, marker, colour, label in groups:
        # Sizes come from the same rows being plotted; the third species was
        # previously sized with df_1's values.
        plt.scatter(x=rows[x_col], y=rows[y_col], s=sz_inc * rows[size_col],
                    marker=marker, c=colour, label=label)

    plt.xlabel(x_col)
    plt.ylabel(y_col)
    plt.title("Manual separation of species")
    plt.legend()
    plt.xlim(x_min, x_max)
    plt.show()

In [None]:
# Draw the manual-zone plot with the default marker scale.
scatter_3()

## Use machine learning

### Encode target

The label encoder is equivalent to a mapping from each species to its index, as defined previously.

But this method is more generic and should be preferred over tweaking things by hand.

In [None]:
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the target values (fit() returns the encoder itself).
le = LabelEncoder().fit(df[target_col].values)

# Show each class name and the integer code it is mapped to.
_encoder_classes = list(le.classes_)
print("Class of the encoder")
print(_encoder_classes)
print("")
print("Associated values")
print(le.transform(_encoder_classes))

### Split dataset

In [None]:
from sklearn.model_selection import train_test_split

## Define input and output
X = df[input_cols]
y = le.transform(df[target_col])

## Split in train and test
# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2,
)

In [None]:
def scatter_4(sz_inc=20):
    """Show which rows went to the train split and which to the test split.

    Points are the third vs. fourth input column; marker size (`s`) is the
    second input column multiplied by `sz_inc`.

    Args:
        sz_inc: Factor applied to the second input column to get marker sizes.
    """
    plt.figure(figsize=FIGSIZE)

    x_col, y_col, size_col = input_cols[2], input_cols[3], input_cols[1]
    # (rows, scatter styling) for each side of the split.
    partitions = (
        (X_train, dict(marker='d', c='m', label="train")),
        (X_test, dict(marker='s', c='c', alpha=0.7, label="test")),
    )
    for rows, styling in partitions:
        plt.scatter(x=rows[x_col], y=rows[y_col],
                    s=sz_inc * rows[size_col], **styling)

    plt.xlabel(x_col)
    plt.ylabel(y_col)
    plt.title("Iris train-test split")
    plt.legend()
    plt.show()

In [None]:
# Draw the train/test split with the default marker scale.
scatter_4()

### Train classifier

In [None]:
from catboost import CatBoostClassifier

In [None]:
# Check the parameters to optimize the fit
# NOTE(review): no eval_set is passed to fit(), so `early_stopping_rounds`
# presumably has nothing to monitor — confirm against the CatBoost docs.
# NOTE(review): clf_0 / y_pred_0 are not referenced by the later cells visible here.

clf_0 = CatBoostClassifier(learning_rate=3e-2, early_stopping_rounds=20, iterations=10000)
clf_0.fit(X_train, y_train, plot=True, logging_level='Silent')
y_pred_0 = clf_0.predict(X_test)

In [None]:
# Multi-class model with 25 iterations, depth 4 and a learning rate of 1.
clf_1 = CatBoostClassifier(
    depth=4,
    iterations=25,
    learning_rate=1,
    loss_function='MultiClass',
)
clf_1.fit(X_train, y_train, logging_level='Silent', plot=True)
# Predictions on the held-out rows.
y_pred_1 = clf_1.predict(X_test)

In [None]:
# Flatten clf_1's test predictions to 1-D so they can be used as a mask and compared with y_test.
y_pred = y_pred_1.ravel()

### View results

In [None]:
def scatter_5(sz_inc=20):
    """Compare true and predicted classes on the test split.

    Filled markers show the true class of each test row (third vs. fourth
    input column, sized by the second input column times `sz_inc`). Hollow
    markers of fixed size `sz_inc*20` show the predicted class.

    Args:
        sz_inc: Marker size factor.
    """
    plt.figure(figsize=FIGSIZE)

    x_col, y_col, size_col = input_cols[2], input_cols[3], input_cols[1]
    # (class code, marker, colour, species label).
    # NOTE(review): pairs code k with label_k — assumes the encoder's class
    # order matches the order in which label_0..label_2 were taken; confirm.
    classes = (
        (0, '^', 'r', label_0),
        (1, 'o', 'y', label_1),
        (2, '*', 'b', label_2),
    )

    # True classes first, so the legend lists them before the predictions.
    for code, marker, colour, label in classes:
        actual = X_test[y_test == code]
        plt.scatter(x=actual[x_col], y=actual[y_col],
                    s=sz_inc * actual[size_col],
                    marker=marker, c=colour, label=label)

    for code, marker, colour, label in classes:
        predicted = X_test[y_pred == code]
        # Each predicted class carries its own species name in the legend
        # (all three entries were previously labelled with label_0).
        plt.scatter(x=predicted[x_col].values, y=predicted[y_col].values,
                    s=sz_inc * 20,
                    marker=marker, edgecolors=colour, facecolors='none',
                    label=label + '_pred')

    plt.legend()

    plt.xlabel(x_col)
    plt.ylabel(y_col)
    plt.title("Prediction on test set")

    plt.show()

In [None]:
# Draw the test-set comparison with the default marker scale.
scatter_5()

### Draw contour

In [None]:
# The mesh only spans the third and fourth input columns, so the first two
# inputs are pinned to their medians over the whole table.
med_0 = df[input_cols[0]].median()
med_1 = df[input_cols[1]].median()

# Build one feature row per mesh node and classify them in a single predict()
# call, instead of calling the model once per node inside a Python double loop.
_grid_features = np.column_stack([
    np.full(xx.size, med_0),
    np.full(xx.size, med_1),
    xx.ravel(),
    yy.ravel(),
])

# ravel()/reshape() share the same row-major order, so entry [row, col] is the
# prediction for (xx[row, col], yy[row, col]). Cast to float to keep the dtype
# the previous np.zeros-based array had.
zz_pred = np.asarray(clf_1.predict(_grid_features)).reshape(xx.shape).astype(float)

In [None]:
def scatter_6(sz_inc=20):
    """Overlay the species scatter on the classifier's prediction contour (`zz_pred`).

    Points are the third vs. fourth input column; marker size (`s`) is the
    second input column multiplied by `sz_inc`.

    Args:
        sz_inc: Factor applied to the second input column to get marker sizes.
    """
    ## View results
    plt.figure(figsize=FIGSIZE)

    plt.contourf(xx, yy, zz_pred, cmap=cm, alpha=.8)

    x_col, y_col, size_col = input_cols[2], input_cols[3], input_cols[1]
    # (rows, marker, colour, legend label) for each species.
    groups = (
        (df_0, '^', 'r', label_0),
        (df_1, 'o', 'y', label_1),
        (df_2, '*', 'b', label_2),
    )
    for rows, marker, colour, label in groups:
        # Sizes come from the same rows being plotted; the third species was
        # previously sized with df_1's values.
        plt.scatter(x=rows[x_col], y=rows[y_col], s=sz_inc * rows[size_col],
                    marker=marker, c=colour, label=label)

    plt.xlabel(x_col)
    plt.ylabel(y_col)
    # The contour is the model's prediction, not the hand-made zones, so the
    # copy-pasted "Manual separation" title was misleading.
    plt.title("Classifier separation of species")
    plt.legend()
    plt.xlim(x_min, x_max)
    plt.show()

In [None]:
# Draw the scatter over the zz_pred contour (scatter_6 as defined just above).
scatter_6()

### Compute metric 

In [None]:
# clf_1's predictions on the training rows, flattened to 1-D.
y_train_pred = clf_1.predict(X_train).ravel()

In [None]:
from sklearn.metrics import accuracy_score

# Accuracy of clf_1 on the held-out rows and on the rows it was trained on.
acc_test = accuracy_score(y_test, y_pred)
acc_train = accuracy_score(y_train, y_train_pred)

In [None]:
# For each split: heading, accuracy, then whether every prediction matches.
_reports = (
    ("Results on test", acc_test, y_test, y_pred),
    ("Results on train", acc_train, y_train, y_train_pred),
)
for _pos, (_heading, _score, _truth, _guess) in enumerate(_reports):
    if _pos > 0:
        # Blank line between the two reports.
        print("")
    print(_heading)
    print(_score)
    print(all(_truth == _guess))

#### Confirm train results

In [None]:
def scatter_6(sz_inc=20):
    """Compare true and predicted classes on the train split.

    Filled markers show the true class of each training row (third vs. fourth
    input column, sized by the second input column times `sz_inc`). Hollow
    markers of fixed size `sz_inc*20` show the predicted class.

    NOTE(review): this reuses the name of an earlier `scatter_6` in the
    notebook and replaces it once this cell runs; the name is kept here so
    the call that follows keeps working.

    Args:
        sz_inc: Marker size factor.
    """
    plt.figure(figsize=FIGSIZE)

    x_col, y_col, size_col = input_cols[2], input_cols[3], input_cols[1]
    # (class code, marker, colour, species label).
    # NOTE(review): pairs code k with label_k — assumes the encoder's class
    # order matches the order in which label_0..label_2 were taken; confirm.
    classes = (
        (0, '^', 'r', label_0),
        (1, 'o', 'y', label_1),
        (2, '*', 'b', label_2),
    )

    # True classes first, so the legend lists them before the predictions.
    for code, marker, colour, label in classes:
        actual = X_train[y_train == code]
        plt.scatter(x=actual[x_col], y=actual[y_col],
                    s=sz_inc * actual[size_col],
                    marker=marker, c=colour, label=label)

    for code, marker, colour, label in classes:
        predicted = X_train[y_train_pred == code]
        # Each predicted class carries its own species name in the legend
        # (all three entries were previously labelled with label_0).
        plt.scatter(x=predicted[x_col].values, y=predicted[y_col].values,
                    s=sz_inc * 20,
                    marker=marker, edgecolors=colour, facecolors='none',
                    label=label + '_pred')

    plt.legend()
    plt.xlabel(x_col)
    plt.ylabel(y_col)
    # Only training rows are drawn, so the title says so (it used to claim
    # the full dataset).
    plt.title("Prediction on train set")
    plt.show()

In [None]:
# Calls the scatter_6 redefined just above (true vs. predicted on the train split).
scatter_6()

#### Focus

In [None]:
# Training rows inside the window 4.5 < third input < 5.25 and 1.4 < fourth input < 2.
_in_focus = (
    (X_train[input_cols[2]] > 4.5) & (X_train[input_cols[2]] < 5.25)
    & (X_train[input_cols[3]] > 1.4) & (X_train[input_cols[3]] < 2)
)

# Apply the same mask to the features, the true classes and the predictions.
X_focus = X_train[_in_focus]
y_focus = y_train[_in_focus]
y_focus_pred = y_train_pred[_in_focus]

In [None]:
def scatter_7(sz_inc=20):
    """Compare true and predicted classes for the rows in the focus window.

    Filled markers show the true class (third vs. fourth input column, sized
    by the second input column times `sz_inc`). Hollow markers of fixed size
    `sz_inc*20` show the predicted class. No legend is drawn.

    Args:
        sz_inc: Marker size factor.
    """
    plt.figure(figsize=FIGSIZE)

    x_col, y_col, size_col = input_cols[2], input_cols[3], input_cols[1]
    # (class code, marker, colour, species label).
    # NOTE(review): pairs code k with label_k — assumes the encoder's class
    # order matches the order in which label_0..label_2 were taken; confirm.
    classes = (
        (0, '^', 'r', label_0),
        (1, 'o', 'y', label_1),
        (2, '*', 'b', label_2),
    )

    for code, marker, colour, label in classes:
        actual = X_focus[y_focus == code]
        plt.scatter(x=actual[x_col], y=actual[y_col],
                    s=sz_inc * actual[size_col],
                    marker=marker, c=colour, label=label)

    for code, marker, colour, label in classes:
        predicted = X_focus[y_focus_pred == code]
        # Each predicted class carries its own species name
        # (all three were previously labelled with label_0).
        plt.scatter(x=predicted[x_col].values, y=predicted[y_col].values,
                    s=sz_inc * 20,
                    marker=marker, edgecolors=colour, facecolors='none',
                    label=label + '_pred')

    plt.xlabel(x_col)
    plt.ylabel(y_col)
    plt.title("Iris in mixed zone 1")

    plt.show()


In [None]:
# Draw the focus-window comparison with the default marker scale.
scatter_7()

In [None]:
# List the rows with the third input column in (4.75, 4.85) and the fourth in (1.75, 1.85).
df[(df[input_cols[2]] < 4.85) & (df[input_cols[2]] > 4.75)
   & (df[input_cols[3]] < 1.85) & (df[input_cols[3]] > 1.75)]

In [None]:
def scatter_8(sz_inc=20):
    """Zoom the focus-window comparison onto x in [4.75, 4.85], y in [1.75, 1.85].

    Filled markers show the true class (third vs. fourth input column, sized
    by the second input column times `sz_inc`). Hollow markers of fixed size
    `sz_inc*20` show the predicted class. No legend is drawn.

    Args:
        sz_inc: Marker size factor.
    """
    plt.figure(figsize=FIGSIZE)

    x_col, y_col, size_col = input_cols[2], input_cols[3], input_cols[1]
    # (class code, marker, colour, species label).
    # NOTE(review): pairs code k with label_k — assumes the encoder's class
    # order matches the order in which label_0..label_2 were taken; confirm.
    classes = (
        (0, '^', 'r', label_0),
        (1, 'o', 'y', label_1),
        (2, '*', 'b', label_2),
    )

    for code, marker, colour, label in classes:
        actual = X_focus[y_focus == code]
        plt.scatter(x=actual[x_col], y=actual[y_col],
                    s=sz_inc * actual[size_col],
                    marker=marker, c=colour, label=label)

    for code, marker, colour, label in classes:
        predicted = X_focus[y_focus_pred == code]
        # Each predicted class carries its own species name
        # (all three were previously labelled with label_0).
        plt.scatter(x=predicted[x_col].values, y=predicted[y_col].values,
                    s=sz_inc * 20,
                    marker=marker, edgecolors=colour, facecolors='none',
                    label=label + '_pred')

    # The legend stays disabled here, as it was before this change.
    # plt.legend()

    plt.xlabel(x_col)
    plt.ylabel(y_col)
    plt.title("Iris in mixed zone 2")

    plt.xlim(4.75, 4.85)
    plt.ylim(1.75, 1.85)

    plt.show()

In [None]:
# Draw the zoomed comparison with the default marker scale.
scatter_8()

#### Change axis

In [None]:
def scatter_9(sz_inc=20):
    """Show the focus-window rows on the first vs. second input column.

    Filled markers show the true class, sized by the second input column
    times `sz_inc`. Hollow markers of fixed size `sz_inc*20` show the
    predicted class.

    Args:
        sz_inc: Marker size factor.
    """
    plt.figure(figsize=FIGSIZE)

    x_col, y_col, size_col = input_cols[0], input_cols[1], input_cols[1]
    # (class code, marker, colour, species label).
    # NOTE(review): pairs code k with label_k — assumes the encoder's class
    # order matches the order in which label_0..label_2 were taken; confirm.
    classes = (
        (0, '^', 'r', label_0),
        (1, 'o', 'y', label_1),
        (2, '*', 'b', label_2),
    )

    # True classes first, so the legend lists them before the predictions.
    for code, marker, colour, label in classes:
        actual = X_focus[y_focus == code]
        plt.scatter(x=actual[x_col], y=actual[y_col],
                    s=sz_inc * actual[size_col],
                    marker=marker, c=colour, label=label)

    for code, marker, colour, label in classes:
        predicted = X_focus[y_focus_pred == code]
        # Each predicted class carries its own species name in the legend
        # (all three entries were previously labelled with label_0).
        plt.scatter(x=predicted[x_col].values, y=predicted[y_col].values,
                    s=sz_inc * 20,
                    marker=marker, edgecolors=colour, facecolors='none',
                    label=label + '_pred')

    plt.legend()

    plt.xlabel(x_col)
    plt.ylabel(y_col)
    plt.title("Iris in mixed zone 3")

    plt.show()

In [None]:
# Draw the focus-window rows on the other pair of axes with the default marker scale.
scatter_9()

In [None]:
# List the rows with the first input column in (6.2, 6.4) and the second in (2.4, 2.6).
df[(df[input_cols[0]] < 6.4) & (df[input_cols[0]] > 6.2) & (df[input_cols[1]] < 2.6) & (df[input_cols[1]] > 2.4)]

# Conclusion

Using all the variables, the classifier is able to detect all the cases.

This task is not obvious for the human eye.

# End of script