# Review of ML Process Covered So Far
Using K-Nearest Neighbors (KNN)

## The data

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn.metrics

In [None]:
# GDP-per-capita vs. life-satisfaction table hosted with the course materials
DATA_URL = 'https://raw.githubusercontent.com/benjum/UCLAX-24Fall-ML/main/Data/gdp-vs-lifesatisfaction.csv'
data = pd.read_csv(DATA_URL)

In [None]:
# Show the table as loaded, before any column is modified
data

In [None]:
# Turn the numeric life-satisfaction score into a two-class label for classification.
# The column is overwritten in place, so only convert while it is still numeric:
# re-running this cell would otherwise compare the label strings against 6.5 and
# raise a TypeError.
SATISFIED_THRESHOLD = 6.5  # scores at or above this are labelled 'Satisfied'
if pd.api.types.is_numeric_dtype(data['Life satisfaction']):
    data['Life satisfaction'] = [
        'Satisfied' if score >= SATISFIED_THRESHOLD else 'Not Satisfied'
        for score in data['Life satisfaction']
    ]

In [None]:
# Show the table again; 'Life satisfaction' now holds the class labels assigned above
data

In [None]:
# Scatter of the class label against GDP per capita
# (data.plot.scatter(x='GDP per capita', y='Life satisfaction') is an equivalent spelling)
data.plot(x='GDP per capita', y='Life satisfaction', kind='scatter')

<img src="https://scikit-learn.org/stable/_static/scikit-learn-logo-small.png" width=200>

https://scikit-learn.org/stable/index.html
<br>
https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html

## K-Nearest Neighbors (for classification, not regression)

In [None]:
import sklearn.neighbors

In [None]:
# Classifier using 3 nearest neighbors
# (only constructed here; it is trained in a later cell with model.fit)
model = sklearn.neighbors.KNeighborsClassifier(n_neighbors=3)

In [None]:
# scikit-learn expects the features as a 2D table and the target as a 1D vector.
# In pandas terms: a dataframe for the features and a series for the target.
# Selecting with a *list* of column labels keeps the result a dataframe;
# selecting with a single label gives back a series.

feature_columns = ['GDP per capita']
x = data.loc[:, feature_columns]
y = data.loc[:, 'Life satisfaction']

In [None]:
# Confirm that x is a one-column dataframe rather than a series
x

In [None]:
# Train the model on the single GDP feature (x) and the
# 'Satisfied' / 'Not Satisfied' labels (y)
model.fit(x,y)

In [None]:
# Make a prediction for a single GDP-per-capita value.
# The model was trained on a dataframe, so the query is wrapped in a dataframe
# that uses the same column name.
x_test = pd.DataFrame([[45000]], columns=['GDP per capita'])
model.predict(x_test)

In [None]:
# Overlay the model's predicted class on the data by sweeping GDP across the plotted range

ax = data.plot.scatter(x='GDP per capita', y='Life satisfaction')

gdp_grid = np.linspace(8000, 58000, 10000)
x_new = pd.DataFrame({'GDP per capita': gdp_grid})
y_pred = model.predict(x_new)
ax.plot(x_new, y_pred)

plt.show()

## Ascertaining the "goodness" of the model fit

In [None]:
# Score the model on the same x, y it was trained on (not on held-out data)
model.score(x, y)

In [None]:
# If the model correctly classifies i points and misclassifies j points out of k total
# the score should be i/k
# NOTE(review): 28 and 29 are typed in by hand (28 correct out of 29 rows) —
# confirm they still match the score shown above
28/29

In [None]:
# Note that when calculating the precision and recall here, if your classes are not 0/1
# you will need to specify what class is positive vs negative (the "pos_label")

# Predict once and reuse the result for all three metrics instead of re-running
# the classifier for every print.
y_fitted = model.predict(x)

print(f"Accuracy: {sklearn.metrics.accuracy_score(y, y_fitted):.2%}")
print(f"Precision: {sklearn.metrics.precision_score(y, y_fitted, pos_label='Satisfied'):.2%}")
print(f"Recall: {sklearn.metrics.recall_score(y, y_fitted, pos_label='Satisfied'):.2%}")

In [None]:
from sklearn.metrics import confusion_matrix

You can get more information on the accuracy of the model with a confusion matrix. 

In the case of binary classification, the confusion matrix shows true negatives, true positives, false positives, and false negatives.

In [None]:
# Confusion matrix for predictions on the training data itself.
# NOTE(review): per the scikit-learn docs, rows are true classes and columns are
# predicted classes, in sorted label order — confirm for the installed version
confusion_matrix(y, model.predict(x))

In [None]:
from sklearn.metrics import classification_report

In [None]:
# Per-class precision / recall / f1-score / support plus the averaged rows,
# computed on the training data
print(classification_report(y, model.predict(x)))

What are these numbers?
* precision and recall for each of "Not Satisfied" and "Satisfied"
  * "Not Satisfied": precision 12/13 = 0.92, recall 12/12 = 1.00
  * "Satisfied": precision 16/16 = 1.00, recall 16/17 = 0.94
* f1-score is the harmonic mean
  * 2 / (1/precision + 1/recall)
* support shows the number of data points
  * 12 and 17
* accuracy: total_correct/total -> no difference in considering the two classes
* macro avg: unweighted average of each metric over the two classes (each class counts equally, regardless of its support)
  * precision: 0.5*(12/13) + 0.5*(16/16) = 0.96
  * recall: 0.5*(12/12) + 0.5*(16/17) = 0.97
* weighted avg: average when weighting the classification metrics by the support 
  * precision: (12*(12/13) + 17*(16/16)) / 29 = 0.97
  * recall: (12*(12/12) + 17*(16/17)) / 29 = 0.97
* micro avg: add up the contributions of positives from each class before dividing by totals
  * precision: (12+16) / (12+1+0+16)
  * recall: (12+16) / (12+0+1+16)
  * for multi-class classification where data points have only one class, this is essentially the same as accuracy score


## Train/Test Split, Cross-Validation, and Hyperparameters

To review these concepts covered in the online materials, we'll repeat the classification process but include new elements:
* a new dataset: [Wisconsin breast cancer dataset](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_breast_cancer.html)
* a test/train split
* cross-validation to find an optimum hyperparameter value, namely the optimum number of neighbors



In [None]:
import sklearn.datasets
import sklearn.model_selection

In [None]:
# Load the features into x and the target into y
# (this rebinds the x and y used for the GDP example earlier in the notebook)
x,y = sklearn.datasets.load_breast_cancer(return_X_y=True, as_frame=True)

`x` and `y` contain the feature and target values:

In [None]:
# Feature table
x

In [None]:
# Target values
y

We'll consider using only the `mean radius` column of `x`:

In [None]:
# Keep only the 'mean radius' feature; the list selector keeps it a one-column dataframe
rad = x.loc[:, ['mean radius']]

In [None]:
# check by executing this cell and comparing against the 'mean radius' column
# of the dataframe output above
rad

Let's do an initial exploratory data viz:

In [None]:
# Target class against mean radius: black circle markers, no connecting line
plt.plot(rad, y, color='k', marker='o', linestyle='None')

Before training our model, we'll hold out a set of data to use later for testing our model's performance.

In [None]:
# Hold out part of the data for testing; random_state is fixed at 0
split = sklearn.model_selection.train_test_split(rad, y, random_state=0)
X_train, X_test, y_train, y_test = split

Initialize our k-nearest neighbors classifier and start with n_neighbors = 3.

In [None]:
# (sklearn.neighbors was already imported earlier in the notebook)
import sklearn.neighbors
# Rebinds `model`, replacing the classifier that was fitted on the GDP data
model = sklearn.neighbors.KNeighborsClassifier(n_neighbors=3)

Train the model.

In [None]:
# Fit on the training split only; X_test / y_test stay held out for evaluation
model.fit(X_train, y_train)

Look at the plot above, choose a value for radius, and let's see which class our model predicts for that value.
* doing this the pandas way rather than the numpy way requires that we set up a dataframe for our feature values, rather than simply passing values or numpy arrays into the predict function.

In [None]:
# One query point (mean radius = 20), wrapped in a dataframe that uses the
# same column name the model was trained with
x_test = pd.DataFrame([[20]], columns=['mean radius'])
model.predict(x_test)

Now let's remake the plot from above, and on top of it, plot a line curve showing the predictions of our model over the plotted horizontal range.
* you may find the following useful:
  * `np.linspace(a,b,c)` will make a numpy array with `c` elements starting at the value `a` and going to `b`
  * remember that sklearn's predict method must have a 2D-like array input, and in the context of Pandas, this will be a dataframe

In [None]:
import numpy as np

In [None]:
# Visualize what the predictions are for this model:
# training points in black, held-out test points in blue, prediction curve on top

fig, ax = plt.subplots()
ax.plot(X_train, y_train, 'ko')
ax.plot(X_test, y_test, 'bo')

radius_grid = np.linspace(0, 30, 1000)
x_new = pd.DataFrame({'mean radius': radius_grid})
y_pred = model.predict(x_new)
ax.plot(x_new, y_pred)

plt.show()

We can now assess the performance of our trained model on data that it has not seen yet, our test data.

In [None]:
# Evaluate on the held-out test split. Predict once and reuse the result for
# all three metrics instead of re-running the classifier for every print.
y_test_pred = model.predict(X_test)

print(f"Accuracy: {sklearn.metrics.accuracy_score(y_test, y_test_pred):.2%}")
print(f"Precision: {sklearn.metrics.precision_score(y_test, y_test_pred):.2%}")
print(f"Recall: {sklearn.metrics.recall_score(y_test, y_test_pred):.2%}")

The above was performed for a specific value of n_neighbors.  Can we tell beforehand which value of n_neighbors will be a good one to use?  No.

What we can do instead is to use part of our training set to train for different values of hyperparameters, like n_neighbors.

Here we will use cross validation to identify the performance for different values of n_neighbors.

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
# candidate n_neighbors values: 1 through 199 (range() excludes its upper bound)
k_range = range(1, 200)

# mean cross-validated accuracy for each candidate, in the same order as k_range
k_scores = []

# for every value of n_neighbors, perform 5-fold cross-validation on the
# training split only (the test split is not touched here),
# using knn as the model and accuracy as the performance metric
for k in k_range:
    knn = sklearn.neighbors.KNeighborsClassifier(n_neighbors=k)
    acc = cross_val_score(knn,
                          X_train,
                          y_train,
                          cv=5,
                          scoring='accuracy')
    k_scores.append(acc.mean())

# plot the accuracy scores to see which n_neighbors gives the highest accuracy
plt.scatter(k_range, k_scores)
plt.xlabel('Value of K for KNN')
plt.ylabel('Cross-Validated Accuracy')
plt.show()

# explicitly print out the n_neighbors value that gives the best performance.
# Look the winner up in k_range instead of assuming "index + 1 == k", so this
# stays correct if the start or step of k_range is changed.
# np.argmax returns the first maximum, so ties go to the smallest k.
best_k = k_range[int(np.argmax(k_scores))]
print('Max k = ', best_k)

Initialize another k-nearest neighbors classifier with the best n_neighbors.

In [None]:
# Use the n_neighbors value that scored best in the cross-validation above,
# rather than a number copied by hand from one particular run.
best_k = k_range[int(np.argmax(k_scores))]
model = sklearn.neighbors.KNeighborsClassifier(n_neighbors=best_k)

Train the model.

In [None]:
# Fit the new classifier on the training split only
model.fit(X_train,y_train)

Remake the plot that has the training points and the curve of your new trained model's predictions.

In [None]:
# Training points in black with the retrained model's prediction curve on top

fig, ax = plt.subplots()
ax.plot(X_train, y_train, 'ko')

radius_grid = np.linspace(0, 30, 1000)
x_new = pd.DataFrame({'mean radius': radius_grid})
y_pred = model.predict(x_new)
ax.plot(x_new, y_pred)

plt.show()

Print out the accuracy, precision, and recall assessed via the test set.

In [None]:
# Evaluate the retrained model on the held-out test split. Predict once and
# reuse the result for all three metrics instead of re-running the classifier
# for every print.
y_test_pred = model.predict(X_test)

print(f"Accuracy: {sklearn.metrics.accuracy_score(y_test, y_test_pred):.2%}")
print(f"Precision: {sklearn.metrics.precision_score(y_test, y_test_pred):.2%}")
print(f"Recall: {sklearn.metrics.recall_score(y_test, y_test_pred):.2%}")

Print the confusion matrix.

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
# Confusion matrix for predictions on the held-out test split.
# NOTE(review): per the scikit-learn docs, rows are true classes and columns are
# predicted classes, in sorted label order — confirm for the installed version
confusion_matrix(y_test, model.predict(X_test))

Print the classification report.

In [None]:
from sklearn.metrics import classification_report

In [None]:
# Per-class precision / recall / f1-score / support plus the averaged rows,
# computed on the held-out test split
print(classification_report(y_test, model.predict(X_test)))