# Revisiting our Classification Example from Week01

## The data

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

Side note:
* I will be posting data files and shared notebooks in a GitHub repository: https://github.com/benjum/UCLAX-25Summer-ML

In [None]:
data = pd.read_csv('https://raw.githubusercontent.com/benjum/UCLAX-25Summer-ML/main/Data/gdp-vs-lifesatisfaction.csv')

In [None]:
data

In [None]:
data = data.to_numpy()

In [None]:
data

In [None]:
# Pull the feature (column 1) and the target (column 2) out of the array
x, y = data[:, 1], data[:, 2]

In [None]:
# Scatter plot of y against x, drawn as black circles
plt.plot(x, y, 'ko')
plt.show()

## Scikit-learn

* Scikit-Learn docs: https://scikit-learn.org/stable/index.html
* K-Nearest Neighbors Classifier: https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html
* Logistic Regression: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

# Classification

Classification is appropriate when the target variable has a discrete set of values, not a continuous set.  In order to illustrate classification with this data, I am going to convert it into True/False (1/0) values.  

In [None]:
y

In [None]:
y >= 6.5

In [None]:
# Binarize the target: 1 where the value is at least 6.5, otherwise 0
y = [int(value >= 6.5) for value in y]

In [None]:
y

## K-Nearest Neighbors Classification

In [None]:
import sklearn.neighbors

In [None]:
# Classifier
model = sklearn.neighbors.KNeighborsClassifier(n_neighbors=3)

In [None]:
# Reshape our data to be 2D rather than 1D
# (only needed if our feature array starts out as 1D)
x = x.reshape(-1,1)

In [None]:
# Train the model
model.fit(x,y)

In [None]:
# Predict the class of one new sample; the nested list keeps the input 2D,
# the same layout as the reshaped feature array used for fitting
x_test = [[25_000]]
model.predict(x_test)

In [None]:
# Overlay the classifier's predicted class on the data it was fitted to

# Data points as black circles
plt.plot(x, y, 'ko')

# Evaluate the model on a dense, evenly spaced grid of feature values,
# reshaped into a single column like the feature array used for fitting
x_new = np.linspace(8000, 58000, 100000).reshape(-1, 1)
y_pred = model.predict(x_new)
plt.plot(x_new, y_pred)

plt.show()

## Classification's performance metrics

In [None]:
model.score(x, y)

In [None]:
# If the model correctly classifies i points and misclassifies j points out of k total
# (so k = i + j), the score should be i/k
# NOTE(review): 28 and 29 are hard-coded -- presumably the number of correctly
# classified points and the total number of points; confirm against model.score(x, y)
28/29

The above is termed the accuracy.  By default, each algorithm's "score" method may be a little different:
* [`score` for KNeighborsClassifier](https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html#sklearn.neighbors.KNeighborsClassifier.score)

In [None]:
# Accuracy, precision and recall of the fitted model, evaluated on the same
# x and y it was trained with.
# Note that when calculating the precision and recall here, if your classes are not 0/1
# you will need to specify what class is positive vs negative (the "pos_label")

# Import the metrics submodule explicitly instead of relying on another
# scikit-learn import having loaded it as a side effect
import sklearn.metrics

# Predict once and reuse the result for all three metrics
y_fit = model.predict(x)

print(f"Accuracy: {sklearn.metrics.accuracy_score(y, y_fit):.2%}")
print(f"Precision: {sklearn.metrics.precision_score(y, y_fit, pos_label=1):.2%}")
print(f"Recall: {sklearn.metrics.recall_score(y, y_fit, pos_label=1):.2%}")

You can get more information on the model performance with a confusion matrix. 

In the case of binary classification, the confusion matrix shows true negatives, true positives, false positives, and false negatives.

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
confusion_matrix(y, model.predict(x))

And with the classification report, we can see the precision and recall (and other scores) broken down according to class.

In [None]:
from sklearn.metrics import classification_report

In [None]:
print(classification_report(y, model.predict(x)))

We discussed precision and recall for binary classification, but for multi-class classification problems, these metrics can be computed in slightly different ways depending on how one does averaging. 

A macro-average will compute the metric independently for each class and then take the average (hence treating all classes equally), a weighted average will compute the metric independently for each class but then additionally take into account the support when calculating the overall average, and a micro-average will aggregate the contributions of all classes to compute the average metric. 

In a multi-class classification setup, micro-average is preferable if you suspect there might be class imbalance (i.e., you may have many more examples of one class than of other classes).

# Classification based on Logistic Regression

In [None]:
import sklearn.linear_model

In [None]:
# Classifier with logistic regression
model = sklearn.linear_model.LogisticRegression()

In [None]:
# Train the model
model.fit(x,y)

In [None]:
# Overlay the logistic-regression model's predicted class on the data

# Data points as black circles
plt.plot(x, y, 'ko')

# Predicted class over a dense, evenly spaced grid of feature values
# (one column, matching the 2D layout used when fitting)
x_new = np.linspace(8000, 58000, 100000).reshape(-1, 1)
y_pred = model.predict(x_new)
plt.plot(x_new, y_pred)

plt.show()

Note what is different about the above prediction curve relative to that for KNN Classification.  Can you make them more similar?

## Learned model

With logistic regression, the optimal values for coefficients of a specific model equation have been learned.

In [None]:
# For this model, we can actually retrieve parameters for our model equation
print(model.coef_, model.intercept_)

In [None]:
model.classes_

What's the `intercept_` and `coef_` for a logistic model?

$$f(x) = \frac{1}{1+e^{-(a_0 + a_1 x)}}$$

In [None]:
# Plot the learned logistic curve on top of the data

plt.plot(x, y, 'ko')

x_new = np.linspace(8000, 58000, 100000).reshape(-1, 1)

# Evaluate f(x) = 1 / (1 + exp(-(a0 + a1 * x))) by hand from the fitted
# intercept_ (a0) and coef_ (a1), rather than calling the predict function
z = model.intercept_ + model.coef_ * x_new
y_model = 1 / (1 + np.exp(-z))

plt.plot(x_new, y_model)

plt.show()

The learned model $f(x)$ gives us a probability of belonging to the "positive" class, and we can take $f(x) > 0.5$, for example, to be a threshold for classifying as one class vs another. 

In [None]:
# Show the logistic curve, the 0.5 threshold, and the resulting class predictions together

plt.plot(x, y, 'ko')

# One shared grid of feature values for both curves
x_new = np.linspace(8000, 58000, 100000).reshape(-1, 1)

# Model equation evaluated directly from the fitted intercept and coefficient,
# not from results of calling the predict function
z = model.intercept_ + model.coef_ * x_new
y_model = 1 / (1 + np.exp(-z))
plt.plot(x_new, y_model)

# Dashed red horizontal line at 0.5
plt.axhline(0.5, color='r', linestyle='--')

# Class labels returned by predict over the same grid
y_pred = model.predict(x_new)
plt.plot(x_new, y_pred)

plt.show()

## Classification's performance metrics

In [None]:
model.score(x, y)

In [None]:
# If the model correctly classifies i points and misclassifies j points out of k total
# (so k = i + j), the score should be i/k
# NOTE(review): 27 and 29 are hard-coded -- presumably the number of correctly
# classified points and the total number of points; confirm against model.score(x, y)
27/29

The above is termed the accuracy.  By default, each algorithm's "score" method may be a little different:
* [`score` for LogisticRegression](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html#sklearn.linear_model.LogisticRegression.score)

In [None]:
# Accuracy, precision and recall of the fitted model, evaluated on the same
# x and y it was trained with.
# Note that when calculating the precision and recall here, if your classes are not 0/1
# you will need to specify what class is positive vs negative (the "pos_label")

# Import the metrics submodule explicitly so this cell does not depend on an
# earlier import having made sklearn.metrics available
import sklearn.metrics

# Predict once and reuse the result for all three metrics
y_fit = model.predict(x)

print(f"Accuracy: {sklearn.metrics.accuracy_score(y, y_fit):.2%}")
print(f"Precision: {sklearn.metrics.precision_score(y, y_fit, pos_label=1):.2%}")
print(f"Recall: {sklearn.metrics.recall_score(y, y_fit, pos_label=1):.2%}")

You can get more information on the model performance with a confusion matrix. 

In the case of binary classification, the confusion matrix shows true negatives, true positives, false positives, and false negatives.

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
confusion_matrix(y, model.predict(x))

And with the classification report, we can see the precision and recall (and other scores) broken down according to class.

In [None]:
from sklearn.metrics import classification_report

In [None]:
print(classification_report(y, model.predict(x)))

## Bringing in last week's content:  Test/Train Split

With logistic regression, the optimal values for coefficients of a specific model equation have been learned.

## Train/test split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
x_train, x_test, y_train, y_test = train_test_split(x, 
                                                    y, 
                                                    random_state=0)

In [None]:
# Fit a logistic-regression classifier on the training split only
model = sklearn.linear_model.LogisticRegression()
model.fit(x_train, y_train)

# Training points in black, test points in blue
plt.plot(x_train, y_train, 'ko')
plt.plot(x_test, y_test, 'bo')

# Predicted class over a dense, evenly spaced grid of feature values
x_new = np.linspace(8000, 58000, 100000).reshape(-1, 1)
y_pred = model.predict(x_new)
plt.plot(x_new, y_pred)

plt.show()

# Classification report computed on the test split
print(classification_report(y_test, model.predict(x_test)))

## Hyperparameters

In [None]:
# Logistic regression with non-default hyperparameters: the 'newton-cholesky'
# solver and a very small C (in scikit-learn, C is the inverse of the
# regularization strength)
model = sklearn.linear_model.LogisticRegression(
    solver='newton-cholesky',
    C=1e-10,
)

# Fit on the training split only
model.fit(x_train, y_train)

# Training points in black, test points in blue
plt.plot(x_train, y_train, 'ko')
plt.plot(x_test, y_test, 'bo')

# Predicted class over a dense, evenly spaced grid of feature values
x_new = np.linspace(8000, 58000, 100000).reshape(-1, 1)
y_pred = model.predict(x_new)
plt.plot(x_new, y_pred)

plt.show()

# Classification report computed on the test split
print(classification_report(y_test, model.predict(x_test)))

## Visualizing the effect of regularization

In [None]:
import ipywidgets

In [None]:
def plot2models(c=0):
    """Plot a default logistic fit next to one trained with C = 10**(-c).

    Two LogisticRegression models are fitted on the module-level
    x_train / y_train: one with default settings and one using the
    'newton-cholesky' solver with C = 10**(-c). The figure shows the
    training points (black), the test points (blue), the sigmoid curve of
    the default model (red), the sigmoid curve of the second model (green),
    a dashed red line at 0.5, and the class predictions of the second model.

    Parameters
    ----------
    c : int or float, default 0
        Exponent used to build the second model's C value, 10**(-c).
        In scikit-learn C is the inverse regularization strength, so a
        larger c means a smaller C.

    Returns
    -------
    None
        The figure is displayed with plt.show().
    """
    baseline = sklearn.linear_model.LogisticRegression()
    adjusted = sklearn.linear_model.LogisticRegression(
        solver='newton-cholesky',
        C=10**(-c),
    )

    baseline.fit(x_train, y_train)
    adjusted.fit(x_train, y_train)

    # Training points in black, test points in blue
    plt.plot(x_train, y_train, 'ko')
    plt.plot(x_test, y_test, 'bo')

    # Dense grid of feature values shared by every curve below
    grid = np.linspace(8000, 58000, 100000).reshape(-1, 1)

    def sigmoid_curve(fitted):
        # Evaluate the model equation from the fitted intercept and
        # coefficient instead of calling the predict function
        return 1 / (1 + np.exp(-(fitted.intercept_ + fitted.coef_ * grid)))

    plt.plot(grid, sigmoid_curve(baseline), 'r')
    plt.plot(grid, sigmoid_curve(adjusted), 'g')

    # Dashed red horizontal line at 0.5
    plt.axhline(0.5, color='r', linestyle='--')

    # Class labels predicted by the second model over the same grid
    plt.plot(grid, adjusted.predict(grid))

    plt.show()

# Hook plot2models up to an interactive control for c, given the range (1, 10)
# NOTE(review): the trailing semicolon presumably suppresses the notebook's
# display of the call's return value -- confirm
ipywidgets.interact(plot2models, c=(1,10));