<a href="https://colab.research.google.com/github/DJCordhose/buch-machine-learning-notebooks/blob/master/kap2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# kNN with Python

We are going to use the following libraries:

- matplotlib is a library for plotting data

- numpy is a library which we use for matrix and vector based calculations

- sklearn is a library for performing machine learning

You might need to install them first (open the Anaconda prompt and type: `pip install <library_name>`)

In [None]:
import warnings
# Suppress every warning for the rest of the session (presumably to keep the
# cell outputs uncluttered). Remove this line while debugging, since it also
# hides deprecation notices.
warnings.filterwarnings('ignore')

In [None]:
%matplotlib inline
%pylab inline

In [None]:
import matplotlib.pylab as plt
import numpy as np

In [None]:
import sklearn
print(sklearn.__version__)

#### If you don't have the library on your PC, open the Anaconda prompt and run:
pip install scikit-learn

## We are going to investigate the Iris data set for classifying species of plants

You can find details here:
https://de.wikipedia.org/wiki/Portal:Statistik/Datensaetze#Iris

### Let's get an overview of what this dataset looks like

In [None]:
from sklearn.datasets import load_iris
iris = load_iris()

In [None]:
print(iris.DESCR)

In [None]:
# Here we are defining our feature matrix X (one row per sample, one column per
# feature) as well as our response vector y (one class code per sample)
X = iris.data
y = iris.target

In [None]:
# Looking at their shapes, we see that X is a matrix and y is a vector.
# We have 150 labelled samples.
X.shape, y.shape

In [None]:
# Let's have a look at what the first row of the matrix looks like
# (index 0 is the first sample; the four numbers are its four features)
X[0]

In [None]:
# Looks like we have the numbers 0, 1, and 2 as codes for the three classes
y

In [None]:
# Let's pull out the four feature columns which we are going to use for the classification
X_sepal_length = X[:, 0]  # German translation: Kelchblatt
X_sepal_width =  X[:, 1]
X_petal_length = X[:, 2]  # German translation: Blütenblatt
X_petal_width = X[:, 3]

In [None]:
X_petal_width.shape

## Create a training and a test set

### We start with a ratio of 60% train and 40% test, but please play with this parameter

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 40% of the samples for testing. random_state fixes the shuffle so the
# split is reproducible, and stratify=y keeps the class proportions the same in
# the training and the test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42, stratify=y)

In [None]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

### Train a kNN classifier

In [None]:
from sklearn import neighbors

In [None]:
clf = neighbors.KNeighborsClassifier(1)

In [None]:
clf.fit(X_train, y_train)

### Draw the decision boundary

In [None]:
# In this section we are setting up the plot. There is no need that you understand every detail of it.

import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

# Saturated red / green / blue, used as the colormap for the scatter points in plotPrediction.
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])
# Lighter tints of the same three colors, used for the background mesh in plotPrediction.
cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
# Font size shared by the axis labels and the plot title.
font_size=25

def meshGrid(x_data, y_data, h=.02, margin=1):
    """Build a regular 2-D grid of points that covers the given data.

    Along each axis the grid runs from ``min - margin`` up to (but excluding)
    ``max + margin`` and is sampled every ``h`` units.

    Parameters
    ----------
    x_data, y_data : array-like
        Values of the horizontal / vertical feature. Only their minimum and
        maximum are used.
    h : float, optional
        Step size in the mesh. Defaults to 0.02 (the previously hard-coded value).
    margin : float, optional
        Padding added on both sides of the data range. Defaults to 1
        (the previously hard-coded value).

    Returns
    -------
    tuple of two 2-D numpy arrays
        The coordinate matrices ``(xx, yy)`` produced by ``np.meshgrid``.

    Raises
    ------
    ValueError
        If ``h`` is not strictly positive.
    """
    if h <= 0:
        raise ValueError("h (mesh step size) must be > 0")

    # Accept plain Python sequences as well as numpy arrays.
    x_data = np.asarray(x_data)
    y_data = np.asarray(y_data)

    x_min, x_max = x_data.min() - margin, x_data.max() + margin
    y_min, y_max = y_data.min() - margin, y_data.max() + margin
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    return (xx, yy)
    
def plotPrediction(clf, x_data, y_data, x_label, y_label, colors, title="", mesh=True):
    """Scatter-plot two features and optionally shade the classifier's decision regions.

    Parameters
    ----------
    clf : object with a ``predict`` method
        Classifier that was fitted on exactly two features; it is asked to
        predict a two-column array of grid coordinates.
    x_data, y_data : array-like
        Values of the two features, plotted on the horizontal / vertical axis.
    x_label, y_label : str
        Axis labels.
    colors : array-like
        One value per sample, passed to ``plt.scatter`` as ``c`` and mapped
        through ``cmap_bold``.
    title : str, optional
        Plot title.
    mesh : bool, optional
        If True (default), colour the background with the class predicted at
        every grid point. If False, only the samples are drawn.
    """
    xx, yy = meshGrid(x_data, y_data)

    # Classifying the whole grid is the expensive step, so only do it when the
    # decision regions are actually going to be drawn.
    Z = None
    if mesh:
        Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
        # Bring the flat predictions back into the grid's 2-D layout
        Z = Z.reshape(xx.shape)

    plt.figure(figsize=(20, 10))
    if Z is not None:
        # Put the result into a color plot
        plt.pcolormesh(xx, yy, Z, cmap=cmap_light)
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    plt.scatter(x_data, y_data, c=colors, cmap=cmap_bold, s=80, marker='o')
    plt.xlabel(x_label, fontsize=font_size)
    plt.ylabel(y_label, fontsize=font_size)
    plt.title(title, fontsize=font_size)

In [None]:
# Keep only columns 0 and 1 (sepal length and sepal width), so that the
# classifier works on two features and can be drawn in a 2-D plot.
X_train_sepal_only = X_train[:, :2]
X_test_sepal_only = X_test[:, :2]

### kNN with k = 1 for sepal features

In [None]:
# A separate classifier that only sees the two sepal features.
# With k = 1 each prediction is the label of the single closest training sample.
clf_sepal = neighbors.KNeighborsClassifier(1)
clf_sepal.fit(X_train_sepal_only, y_train)

In [None]:
plotPrediction(clf_sepal, X_train_sepal_only[:, 0], X_train_sepal_only[:, 1], 
               'Sepal length', 'Sepal width', y_train, mesh=False,
                title="Train Data for Sepal Features")
# plt.savefig('ML_0201.png', bbox_inches='tight')

#### High classification accuracy for the training data but low accuracy for the test data. 
#### Do these values and the Decision Boundaries below look like overfitting to you?

In [None]:
clf_sepal.score(X_train_sepal_only, y_train)

In [None]:
clf_sepal.score(X_test_sepal_only, y_test)

In [None]:
plotPrediction(clf_sepal, X_train_sepal_only[:, 0], X_train_sepal_only[:, 1], 
               'Sepal length', 'Sepal width', y_train,
               title="Decision Boundaries for Training Data")
# plt.savefig('ML_0202.png', bbox_inches='tight')

In [None]:
plotPrediction(clf_sepal, X_test_sepal_only[:, 0], X_test_sepal_only[:, 1],
               'Sepal length', 'Sepal width', y_test,
               title="Decision Boundaries for Test Data")
# plt.savefig('ML_0203.png', bbox_inches='tight')

### kNN with k = 10 for sepal features

In [None]:
# k = 10 neighbours, matching the section heading and the variable name
# (the code previously used 50). Change the number here to experiment with other k.
clf_sepal_10 = neighbors.KNeighborsClassifier(10)
clf_sepal_10.fit(X_train_sepal_only, y_train)

#### Low classification accuracy for both the training data and the test data. This model might be too simple.

#### Experiment yourself. Report the best training accuracy and the corresponding k in the chat window.

In [None]:
clf_sepal_10.score(X_train_sepal_only, y_train)

In [None]:
clf_sepal_10.score(X_test_sepal_only, y_test)

In [None]:
plotPrediction(clf_sepal_10, X_train_sepal_only[:, 0], X_train_sepal_only[:, 1], 
               'Sepal length', 'Sepal width', y_train,
               title="Decision boundary for Training Data")
# plt.savefig('ML_0204.png', bbox_inches='tight')

### kNN for the Petal Features

In [None]:
# Keep only columns 2 and 3 (petal length and petal width).
X_train_petal_only = X_train[:, 2:]
X_test_petal_only = X_test[:, 2:]

In [None]:
# kNN with k = 10, trained on the two petal features only.
clf_petal_10 = neighbors.KNeighborsClassifier(10)
clf_petal_10.fit(X_train_petal_only, y_train)

In [None]:
clf_petal_10.score(X_train_petal_only, y_train)

In [None]:
clf_petal_10.score(X_test_petal_only, y_test)

In [None]:
plotPrediction(clf_petal_10, X_train_petal_only[:, 0], X_train_petal_only[:, 1], 
               'Petal length', 'Petal width', y_train,
               title="Decision boundary for Training Data")
# plt.savefig('ML_0205.png', bbox_inches='tight')

In [None]:
plotPrediction(clf_petal_10, X_test_petal_only[:, 0], X_test_petal_only[:, 1], 
               'Petal length', 'Petal width', y_test,
               title="Decision boundary for Test Data")
# plt.savefig('ML_0206.png', bbox_inches='tight')

#### Conclusion: the results are looking much better for petal than for sepal. Obviously, picking the correct feature is key to success.