# ML - Classification

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from pathlib import Path
from sklearn import datasets

# Set seaborn's global plotting theme to the "whitegrid" style.
sns.set_theme(style="whitegrid")

## Motivation

Space Shuttle Challenger Disaster

<p><img src="https://upload.wikimedia.org/wikipedia/commons/9/9f/Challenger_explosion.jpg" alt="Challenger explosion.jpg" height="720" width="889"><br>By Kennedy Space Center</p>

In [None]:
# Data source. Swap in the commented line to read a local copy instead.
# filepath = Path().resolve().parent / "data" / "challenger.txt"  # If you are running locally
filepath = "https://raw.githubusercontent.com/aoguedao/gmu_casbbi_data_science/main/data/Challenger.txt"

# Skip the first row of the file, then cast every value to integer.
raw_values = np.loadtxt(filepath, skiprows=1)
challenger = pd.DataFrame(
    data=raw_values.astype(int),
    columns=["temp_f", "nm_bad_rings"],
)
challenger.head()

In [None]:
# Scatter plot of `nm_bad_rings` against `temp_f` from the challenger frame.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    x="temp_f",
    y="nm_bad_rings",
    data=challenger,
    ax=ax
)
ax.set_title("Number of bad rings vs temperature")
# Use plt.show() rather than fig.show(): Figure.show() requires a GUI backend
# and only emits a warning under the notebook's inline backend.
plt.show()


In [None]:
# Derive the classification target from the ring count:
#   failure    -> True when `nm_bad_rings` is non-zero
#   is_failure -> the same flag encoded as an integer (0 / 1)
challenger = challenger.assign(
    failure=challenger["nm_bad_rings"] != 0,
).assign(
    is_failure=lambda frame: frame["failure"].astype(int),
)

In [None]:
# Scatter plot of the binary `is_failure` indicator against `temp_f`,
# coloured by the boolean `failure` column.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    x="temp_f",
    y="is_failure",
    hue="failure",
    data=challenger,
    ax=ax
)
# The y-axis here is the 0/1 failure indicator, not the ring count, so the
# title is changed from the one used for the previous figure.
ax.set_title("Failure indicator vs temperature")
# Use plt.show() rather than fig.show(): Figure.show() requires a GUI backend
# and only emits a warning under the notebook's inline backend.
plt.show()

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Binary target and a one-column feature frame (selected with a list so it
# stays a 2-D DataFrame).
y = challenger["is_failure"]
X = challenger.loc[:, ["temp_f"]]

# Fit a logistic regression with default hyperparameters.
model = LogisticRegression()
model.fit(X=X, y=y)

In [None]:
# Fitted coefficient array of the model, transposed for display.
model.coef_.T

In [None]:
%%timeit
# Time how long it takes to construct and fit a fresh LogisticRegression on X, y.
LogisticRegression().fit(X, y)

In [None]:
# Mean accuracy of the model, evaluated on the same X, y it was fit on
# (this is a training score, not a held-out estimate).
model.score(X, y)

In [None]:
# Predict a class label for every row of X with the fitted model and display
# the resulting array.
y_pred = model.predict(X)
y_pred

## Multi-Class Classification

In [None]:
# Load the digits dataset as pandas objects: features and target separately.
digits_X, digits_y = datasets.load_digits(return_X_y=True, as_frame=True)

# Place the target next to the features, column-wise, for inspection.
digits = pd.concat(objs=[digits_X, digits_y], axis="columns")
digits.head()

In [None]:
# Switch seaborn's global axes style to "white" for the figures that follow.
sns.set_style("white")

In [None]:
# Index of the sample to display; `i` is also used by a later prediction cell.
i = 42

# Image arrays of the digits dataset, one entry per sample.
digit_images = datasets.load_digits().images
sample_image = digit_images[i]
plt.imshow(sample_image, cmap=plt.cm.gray)

In [None]:
# NOTE: this rebinds `model`; from here on it refers to the digits classifier,
# not the one fit on the Challenger data above.
# max_iter is raised to 1000 -- presumably so the solver converges on this
# dataset; confirm before changing it.
model = LogisticRegression(max_iter=1000)
model.fit(digits_X, digits_y)

In [None]:
# Predict the label of the single sample with index label `i`; selecting with
# a list ([i]) keeps the input a one-row DataFrame.
model.predict(digits_X.loc[[i], :])

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Predict a label for every digit, then tabulate true labels against
# predictions, ordered by the classes the model knows.
y_pred = model.predict(digits_X)
confusion_matrix(
    y_true=digits_y,
    y_pred=y_pred,
    labels=model.classes_,
)

In [None]:
# Plot the confusion matrix of `model` on the digits features and labels.
ConfusionMatrixDisplay.from_estimator(model, digits_X, digits_y)

In [None]:
from sklearn.metrics import classification_report

# True labels as an array, and the model's predictions on the same samples.
y_true = digits_y.values
y_pred = model.predict(digits_X)

# The report expects string display names, so stringify the class labels.
class_names = list(map(str, model.classes_))
report = classification_report(y_true, y_pred, target_names=class_names)

# The report is a multi-line string, so print it rather than echoing its repr.
print(report)

## K Nearest Neighbors

K Nearest Neighbors (kNN) is a non-parametric algorithm. Once the hyperparameter $k$ has been fixed, there are no further parameters to learn. The idea is simple: the output label is the most common label among the $k$ nearest neighbors. In the following example, if $k=3$ the green circle is labeled as red, but if $k=5$ it is labeled as blue.

![KNN](../images/KnnClassification.png)

[Source](https://commons.wikimedia.org/wiki/File:KnnClassification.svg)

The algorithm is really simple. The training phase consists only of storing the feature matrix and its labels.

For the prediction phase, we need to compute the distance to every training vector and then find the nearest neighbors.

![kNN Algorithm](../images/knn_algorithm.jpg)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Number of neighbours that vote on each prediction.
k = 5

# Build the classifier and fit it on the digits features and labels.
knn = KNeighborsClassifier(n_neighbors=k)
knn.fit(X=digits_X, y=digits_y)

In [None]:
# Plot the confusion matrix of the kNN classifier on the digits features and labels.
ConfusionMatrixDisplay.from_estimator(knn, digits_X, digits_y)