### `Import the Main Libraries`

In [1]:
## Main Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

## Embedded Datasets
from sklearn import datasets

## For shuffling the Dataset
from sklearn import utils

## Preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

## Algorithms
from sklearn.neighbors import KNeighborsClassifier

## Metric
from sklearn.metrics import accuracy_score

### `Load the Dataset and Look at the big Picture`

In [2]:
## Load the iris dataset from the embedded datasets provided in sklearn
iris_dataset = datasets.load_iris()

## Inspect the loaded object through its keys; each key holds either data or
## descriptive information (e.g. 'data', 'target', 'feature_names', 'DESCR')
iris_dataset.keys()           ## each key has data or information

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])

In [3]:
## Gather the features and the target into one DataFrame (one of several possible ways).
## column_stack appends the 1D target vector as the last column of the 2D feature matrix;
## because everything lives in a single NumPy array, the target is stored as float (0.0, 1.0, 2.0).
iris_values = np.column_stack((iris_dataset['data'], iris_dataset['target']))
iris_columns = [*iris_dataset['feature_names'], 'target']

df_iris = pd.DataFrame(iris_values, columns=iris_columns)

## check the head
df_iris.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0.0
1,4.9,3.0,1.4,0.2,0.0
2,4.7,3.2,1.3,0.2,0.0
3,4.6,3.1,1.5,0.2,0.0
4,5.0,3.6,1.4,0.2,0.0


In [4]:
## Shuffle the dataset because the target is ordered (0 then 1 then 2) --> Classification problem
## You can use the function (sample) provided in pandas or use (shuffle) in sklearn
## Sorting by index first restores the original row order, so re-running this cell always
## produces the same permutation instead of shuffling an already-shuffled frame.
df_iris = utils.shuffle(df_iris.sort_index(), random_state=42)   ## shuffle and overwrite

## check the head again
df_iris.head()    ## OK

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
73,6.1,2.8,4.7,1.2,1.0
18,5.7,3.8,1.7,0.3,0.0
118,7.7,2.6,6.9,2.3,2.0
78,6.0,2.9,4.5,1.5,1.0
76,6.8,2.8,4.8,1.4,1.0


* > `The dataset has 4 numeric features, so no encoding is needed; the only preprocessing to consider is standardization or normalization. The target is already numeric and needs no encoding either.`

### `Exploratory Data Analysis (EDA)`

In [5]:
## Do it Yourself

### `Preprocessing`
`The dataset is very small (a toy example), so I will not split it into train and test sets; the model is therefore evaluated on the same data it was trained on.`

In [6]:
### Separate the label column (y) from the feature columns (X)
y = df_iris['target']
X = df_iris.drop(columns='target')

In [7]:
## Numerical preprocessing, all in one Pipeline:
##   1) impute missing values with the median (kept for completeness although there are no nulls)
##   2) standardize the features
preprocessing_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
num_pipeline = Pipeline(steps=preprocessing_steps)

## No train/test split is made, so the full dataset plays the role of the training set
y_train = y.copy()   ## doesn't matter
X_train = num_pipeline.fit_transform(X)

### `Building a ML Model`
* ` ---> Classification Problem`

In [8]:
## Create a K-Nearest Neighbors (KNN) classifier configured with 5 neighbors
knn_clf = KNeighborsClassifier(n_neighbors=5)

## Fit on the preprocessed features and their labels; the fitted estimator is displayed as the cell output
knn_clf.fit(X_train, y_train)   ## fit

KNeighborsClassifier()

In [9]:
## Inference: predict the labels of X_train with the fitted KNN model
y_pred_knn = knn_clf.predict(X_train)
## Display the predicted labels
y_pred_knn

array([1., 0., 2., 1., 1., 0., 1., 2., 1., 1., 2., 0., 0., 0., 0., 1., 2.,
       1., 1., 2., 0., 2., 0., 2., 2., 2., 2., 2., 0., 0., 0., 0., 1., 0.,
       0., 2., 1., 0., 0., 0., 2., 1., 1., 0., 0., 1., 1., 2., 1., 2., 1.,
       2., 1., 0., 2., 1., 0., 0., 0., 1., 2., 0., 0., 0., 1., 0., 1., 2.,
       0., 1., 2., 0., 2., 2., 1., 1., 2., 1., 0., 1., 2., 0., 0., 1., 1.,
       0., 2., 0., 0., 2., 1., 2., 2., 1., 2., 1., 0., 0., 1., 2., 0., 0.,
       0., 2., 2., 0., 2., 2., 0., 1., 1., 2., 1., 2., 0., 2., 1., 2., 1.,
       1., 1., 0., 1., 1., 0., 1., 2., 2., 0., 1., 2., 2., 0., 2., 0., 1.,
       2., 2., 1., 2., 1., 1., 2., 2., 0., 1., 1., 0., 1., 2.])

In [10]:
## Evaluation metric --> a classification metric called (accuracy):
## accuracy = number of correctly labeled instances / total number of instances

## Put the ground truth and the KNN predictions side by side for a visual comparison
compare_columns = ['Ground Truth', 'Predicted (KNN)']
labels_side_by_side = np.column_stack((y, y_pred_knn))

df_comapre = pd.DataFrame(data=labels_side_by_side, columns=compare_columns)
df_comapre

Unnamed: 0,Ground Truth,Predicted (KNN)
0,1.0,1.0
1,0.0,0.0
2,2.0,2.0
3,1.0,1.0
4,1.0,1.0
...,...,...
145,1.0,1.0
146,2.0,1.0
147,0.0,0.0
148,1.0,1.0


In [11]:
## Compute the accuracy with the function provided by sklearn (returns a fraction in [0, 1])
knn_acc_score = accuracy_score(y, y_pred_knn)
## Report it as a percentage with two decimals
print(f'Accuracy Score using KNN --- {100*knn_acc_score:.2f} %')

Accuracy Score using KNN --- 95.33 %


In [12]:
def my_accuracy_score(y_true, y_pred):
    ''' Compute the classification accuracy, simulating the accuracy_score function provided in sklearn.

    Accuracy is the number of correctly labeled instances divided by the total number of instances.
    Labels are compared position by position, regardless of any pandas index.

    Args:
    ****
        (y_true: 1D array-like) --> ground-truth labels (NumPy 1D array, pandas Series, list or tuple)
        (y_pred: 1D array-like) --> predicted labels, same shape as y_true
    Returns:
    *******
        return the accuracy_score as a float between 0 and 1
    Raises:
    ******
        ValueError --> if y_true and y_pred have different shapes, or if they are empty
    '''
    ## Convert to plain NumPy arrays so that `==` is always element-wise and positional:
    ##   * two Python lists would be compared as whole objects (a single bool), giving a wrong score
    ##   * two pandas Series with different indexes would raise instead of being compared
    true_labels = np.asarray(y_true)
    pred_labels = np.asarray(y_pred)

    if true_labels.shape != pred_labels.shape:
        raise ValueError(
            f'y_true and y_pred must have the same shape, got {true_labels.shape} and {pred_labels.shape}'
        )
    if true_labels.size == 0:
        raise ValueError('y_true and y_pred must not be empty')

    ## The mean of the boolean matches equals (number of correct predictions) / (number of instances)
    return np.mean(true_labels == pred_labels)


## Call the hand-written function on the same ground truth and KNN predictions;
## the result is a fraction, comparable to sklearn's accuracy_score
my_accuracy_score(y_true=y, y_pred=y_pred_knn)

0.9533333333333334

### More Details Later