In [25]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import plotly.express as px
import pickle


In [31]:
def sklearn_to_df(sklearn_dataset):
    df = pd.DataFrame(sklearn_dataset.data, columns=sklearn_dataset.feature_names)
    df['target'] = pd.Series(sklearn_dataset.target)
    return df

# convert sklearn dataset to pd
iris = sklearn_to_df(datasets.load_iris())

iris.rename(columns = {'target':'species'},inplace = True)

# rename column 'species' from numbers to species names
iris['species']=iris['species'].replace([0,1,2],['setosa','versicolor','virginica'])

# make sure df prints on one line
pd.set_option('expand_frame_repr', False)
print(iris.head())

   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm) species
0                5.1               3.5                1.4               0.2  setosa
1                4.9               3.0                1.4               0.2  setosa
2                4.7               3.2                1.3               0.2  setosa
3                4.6               3.1                1.5               0.2  setosa
4                5.0               3.6                1.4               0.2  setosa


In [32]:
print(iris.describe())

       sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
count         150.000000        150.000000         150.000000        150.000000
mean            5.843333          3.057333           3.758000          1.199333
std             0.828066          0.435866           1.765298          0.762238
min             4.300000          2.000000           1.000000          0.100000
25%             5.100000          2.800000           1.600000          0.300000
50%             5.800000          3.000000           4.350000          1.300000
75%             6.400000          3.300000           5.100000          1.800000
max             7.900000          4.400000           6.900000          2.500000


In [33]:
# plot data for sepal width and length

figure = px.scatter(iris, x='sepal width (cm)', y='sepal length (cm)', color = 'species')
figure.show()

In [34]:
fig = px.scatter_matrix(iris, dimensions=["sepal width (cm)", "sepal length (cm)", "petal width (cm)", "petal length (cm)"], color="species")
fig.show()

In [35]:
fig = px.scatter(iris, x="sepal width (cm)", y="sepal length (cm)", color="species", marginal_y="violin",
           marginal_x="box", trendline="ols", template="simple_white")
fig.show()

In [36]:
# train ML model using KNN classification algorithm

# separate 'species' from the rest of data
x = iris.drop(columns = 'species')
y = iris['species']

In [37]:

# split data to train and test samples
x_train,x_test,y_train,y_test = train_test_split(x,y, test_size = 0.3,random_state = 0)

# train model
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(x_train, y_train)

KNeighborsClassifier(n_neighbors=3)

In [38]:
# test on one example
x_new = np.array([[5, 2.9, 1, 0.2]])
prediction = knn.predict(x_new)
print("Prediction: {}".format(prediction))

Prediction: ['setosa']



X does not have valid feature names, but KNeighborsClassifier was fitted with feature names



In [39]:
# model evaluation 

# predict from the test dataset
predictions = knn.predict(x_test)
accuracy_score(y_test, predictions)

0.9777777777777777

In [40]:
# detailed classification report
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        16
  versicolor       1.00      0.94      0.97        18
   virginica       0.92      1.00      0.96        11

    accuracy                           0.98        45
   macro avg       0.97      0.98      0.98        45
weighted avg       0.98      0.98      0.98        45



In [41]:
# save the model
with open('KNN.pickle', 'wb') as f:
    pickle.dump(knn, f)

# Load the model
with open('KNN.pickle', 'rb') as f:
    model = pickle.load(f)
model.predict(x_new)


X does not have valid feature names, but KNeighborsClassifier was fitted with feature names



array(['setosa'], dtype=object)