# Classification based on Logistic Regression

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [None]:
# Read the GDP-vs-life-satisfaction table (with class labels) from the course repository
DATA_URL = 'https://raw.githubusercontent.com/benjum/UCLAX-23W-ML/main/Weeks/Week03/data/gdp-vs-lifesatisfaction-classes.csv'
data = pd.read_csv(DATA_URL)

In [None]:
# A bare expression on the last line of a cell is rendered as the cell's output,
# so this shows the loaded table
data

In [None]:
# Scatter plot of the 'Life satisfied' label against 'GDP per capita'
data.plot(x='GDP per capita', y='Life satisfied', kind='scatter')
# or equally well, through the plot accessor
# data.plot.scatter(x='GDP per capita', y='Life satisfied')

## Scikit-learn

<img src="images/scikit-learn.png" width=500>

https://scikit-learn.org/stable/index.html
<br>
https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

## Logistic regression

In [None]:
import sklearn.linear_model

In [None]:
# Before, with linear regression, the model was created like this:
# model = sklearn.linear_model.LinearRegression()

# Now a classifier: a logistic regression estimator with default settings.
# Nothing is learned yet -- training happens when fit() is called.
model = sklearn.linear_model.LogisticRegression()

In [None]:
# scikit-learn expects the features as a 2D table and the labels as a 1D sequence.
# Selecting with a *list* of column names keeps x a DataFrame (2D);
# selecting with a single column name gives y as a Series (1D).

x = data.loc[:, ['GDP per capita']]
y = data.loc[:, 'Life satisfied']

In [None]:
# Fit the classifier: learn the parameters from the features x and labels y
model.fit(X=x, y=y)

In [None]:
# Make a prediction for a single GDP-per-capita value.
# The model was fitted on a DataFrame with a named column, so the query is passed
# as a one-row DataFrame with the same column name. Passing a bare nested list
# ([[25000]]) gives the same prediction but makes scikit-learn warn that the
# input "does not have valid feature names".
x_test = pd.DataFrame({'GDP per capita': [25000]})
model.predict(x_test)

In [None]:
# Visualize what the predictions are for this model

# Keep the Axes returned by pandas so the prediction line is drawn on the same plot
ax = data.plot.scatter(x='GDP per capita', y='Life satisfied')

# Dense grid of GDP values, shaped as a single-feature 2D array (one row per value)
x_new = np.linspace(8000,58000,100000)
x_new = x_new.reshape(-1,1)

# The model was fitted on a DataFrame, so give the grid the same column name for
# predict(); a plain ndarray works too but triggers scikit-learn's
# "X does not have valid feature names" warning.
y_pred = model.predict(pd.DataFrame(x_new, columns=['GDP per capita']))
ax.plot(x_new, y_pred)

plt.show()

## Learned model

With logistic regression, let's get a little bit more clear on what the learned model is.

In [None]:
# Class labels the fitted classifier knows about
model.classes_

In [None]:
# For this model, we can actually retrieve parameters for our model equation:
# coef_ is the slope multiplying x, intercept_ is the constant term
print(model.coef_, model.intercept_)

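What are the intercept and slope for a logistic model? In the equation below, $a_0$ is the intercept (`model.intercept_`) and $a_1$ is the slope (`model.coef_`):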

$$f(x) = \frac{1}{1+e^{-(a_0 + a_1 x)}}$$

In [None]:
# Draw the learned logistic curve on top of the raw observations

plt.plot(x,y,'ko')

# Grid of GDP values as a column vector
x_new = np.linspace(8000,58000,100000).reshape(-1,1)

# Evaluate the model equation by hand (no call to predict):
# first the linear part a_0 + a_1 * x, then the logistic function of it
linear_term = model.intercept_ + model.coef_ * x_new
y_model = 1 / (1 + np.exp(-linear_term))

plt.plot(x_new, y_model)

plt.show()

The learned model $f(x)$ gives us a probability of belonging to the "positive" class, and we can take $f(x) > 0.5$, for example, to be a threshold for classifying as one class vs another. 

In [None]:
# Visualize the probability curve, the 0.5 threshold, and the resulting class predictions

# Keep the Axes returned by pandas so every layer is drawn on the same plot
ax = data.plot.scatter(x='GDP per capita', y='Life satisfied')

# One grid of GDP values (column vector), shared by both curves below
x_new = np.linspace(8000,58000,100000).reshape(-1,1)

# the predicted y values are now from a model equation,
# not from results of calling the predict function
y_model = 1 / (1 + np.exp(-(model.intercept_ + model.coef_ * x_new)))

ax.plot(x_new, y_model)

# decision threshold on the probability
ax.axhline(0.5,color='r',linestyle='--')

# Class predictions from predict(). The model was fitted on a DataFrame, so the grid
# is wrapped with the same column name to avoid scikit-learn's
# "X does not have valid feature names" warning.
y_pred = model.predict(pd.DataFrame(x_new, columns=['GDP per capita']))
ax.plot(x_new, y_pred)

plt.show()

## Ascertaining the "goodness" of the model fit

In [None]:
# Score the fitted model on x and y -- the same data it was trained on
model.score(x, y)

In [None]:
# If the model correctly classifies i points and misclassifies j points out of k total
# (so i + j = k), the score should be i/k -- here i = 27 and k = 29
27/29

What is the above termed?
1. accuracy
2. precision
3. recall
4. actually, none of these

In [None]:
# Note that when calculating the precision and recall here
# you will need to specify what class is positive vs negative (the "pos_label")

# Import the metrics submodule explicitly. Without this, `sklearn.metrics` only
# resolves if some other sklearn import happened to load it as a side effect.
import sklearn.metrics

# Predict once on the training features and reuse the result for all three metrics
y_fit_pred = model.predict(x)

print(f"Accuracy: {sklearn.metrics.accuracy_score(y, y_fit_pred):.2%}")
print(f"Precision: {sklearn.metrics.precision_score(y, y_fit_pred, pos_label='Satisfied'):.2%}")
print(f"Recall: {sklearn.metrics.recall_score(y, y_fit_pred, pos_label='Satisfied'):.2%}")

In [None]:
from sklearn.metrics import confusion_matrix

You can get more information on the accuracy of the model with a confusion matrix. 

In the case of binary classification, the confusion matrix shows true negatives, true positives, false negatives, and false positives.

In [None]:
# Confusion matrix: actual labels y versus the model's predictions on the same x
confusion_matrix(y, model.predict(x))

## If we take "Not Satisfied" as our negative, which of the above are the 
* true negatives? -- 
* true positives? -- 
* false negatives? -- 
* false positives? -- 

In [None]:
# Build the confusion matrix with rows/columns in the order of model.classes_,
# so the tick labels below are guaranteed to line up with the cells.
class_names = list(model.classes_)
confmat = confusion_matrix(y, model.predict(x), labels=class_names)

fig, ax = plt.subplots(figsize=(5, 5))
ax.imshow(confmat)

# the below just sets the axis labels, tick marks, and text inside the boxes.
# The classes in this dataset are strings (e.g. 'Satisfied'), not 0/1, so the ticks
# are labelled with the actual class names instead of "0s"/"1s".
tick_positions = range(len(class_names))
ax.xaxis.set(ticks=tick_positions, ticklabels=[f'Predicted {c}' for c in class_names])
ax.yaxis.set(ticks=tick_positions, ticklabels=[f'Actual {c}' for c in class_names])
for i in tick_positions:
    for j in tick_positions:
        # row i = actual class, column j = predicted class; text x-coordinate is the column
        ax.text(j, i, confmat[i,j], ha='center', va='center', color='red')

plt.show()

In [None]:
from sklearn.metrics import classification_report

In [None]:
# Text report comparing the actual labels y with the predictions on the same x
print(classification_report(y, model.predict(x)))

## Breast cancer dataset

In [None]:
import sklearn.datasets
import sklearn.model_selection

In [None]:
# Load the breast cancer dataset bundled with scikit-learn
cancer = sklearn.datasets.load_breast_cancer()

In [None]:
# Print the dataset's built-in description text
print(cancer.DESCR)

In [None]:
# Features and class labels of the cancer dataset.
# Note: this rebinds x and y, which held the GDP data earlier in the notebook.
x, y = cancer.data, cancer.target

In [None]:
# Class label against the first feature column (column 0), as black dots
plt.plot(x[:,0],y,'ko')

In [None]:
# Split features and labels into training and test portions;
# random_state is fixed so the split is the same on every run
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
    x,
    y,
    random_state=0,
)

In [None]:
# Before for regression:
# model = sklearn.neighbors.KNeighborsRegressor(n_neighbors=1)

# Classifier
# A fresh, unfitted estimator: this rebinds `model`, replacing the one
# fitted on the GDP data earlier in the notebook
model = sklearn.linear_model.LogisticRegression()

In [None]:
# Fit on the training split using only feature column 0.
# Indexing with a list ([0]) keeps the result 2D (one column), as fit() expects.
model.fit(X_train[:, [0]], y_train)

In [None]:
# Make a prediction for one sample whose single feature (column 0) has the value 20.
# Note: lowercase x_test is this one-off query; it is distinct from the X_test split.
x_test = [[20]]
model.predict(x_test)

In [None]:
# Class predictions over a grid of feature-0 values, drawn over the raw labels

plt.plot(x[:,0],y,'ko')

# Grid of feature values as a column vector (one row per value)
x_new = np.linspace(0,30,1000).reshape(-1,1)
y_pred = model.predict(x_new)
plt.plot(x_new, y_pred)

plt.show()

In [None]:
# Note that when calculating the precision and recall here
# you will need to specify what class is positive vs negative (the "pos_label")

# Import the metrics submodule explicitly. Without this, `sklearn.metrics` only
# resolves if some other sklearn import happened to load it as a side effect.
import sklearn.metrics

# The model was trained on feature column 0 only, so evaluate on the same column
# of the held-out split; predict once and reuse the result for all three metrics.
y_test_pred = model.predict(X_test[:,0].reshape(-1,1))

print(f"Accuracy: {sklearn.metrics.accuracy_score(y_test, y_test_pred):.2%}")
print(f"Precision: {sklearn.metrics.precision_score(y_test, y_test_pred, pos_label=1):.2%}")
print(f"Recall: {sklearn.metrics.recall_score(y_test, y_test_pred, pos_label=1):.2%}")

In [None]:
from sklearn.metrics import confusion_matrix

You can get more information on the accuracy of the model with a confusion matrix. 

In the case of binary classification, the confusion matrix shows true negatives, true positives, false negatives, and false positives.

In [None]:
# Confusion matrix on the held-out test split, using feature column 0 only
# (the same single column the model was trained on)
confusion_matrix(y_test, model.predict(X_test[:,0].reshape(-1,1)))

## If we take class `0` (malignant) as our negative, which of the above are the 
* true negatives? -- 
* true positives? -- 
* false negatives? -- 
* false positives? -- 

In [None]:
# Confusion matrix for the test split (model uses feature column 0 only)
test_predictions = model.predict(X_test[:, [0]])
confmat = confusion_matrix(y_test, test_predictions)

fig, ax = plt.subplots(figsize=(5, 5))
ax.imshow(confmat)

# Cosmetics only: tick positions/labels and the count written inside each box
ax.xaxis.set(ticks=(0, 1), ticklabels=('Predicted 0s', 'Predicted 1s'))
ax.yaxis.set(ticks=(0, 1), ticklabels=('Actual 0s', 'Actual 1s'))
for (row, col), count in np.ndenumerate(confmat):
    # text x-coordinate is the column index, y-coordinate is the row index
    ax.text(col, row, count, ha='center', va='center', color='red')

plt.show()

In [None]:
from sklearn.metrics import classification_report

In [None]:
# Text report for the held-out test split, using feature column 0 only
print(classification_report(y_test, model.predict(X_test[:,0].reshape(-1,1))))

In [None]:
# Class labels the fitted classifier knows about
model.classes_

In [None]:
# For this model, we can actually retrieve parameters for our model equation:
# coef_ is the slope multiplying x, intercept_ is the constant term
print(model.coef_, model.intercept_)

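What are the intercept and slope for a logistic model? In the equation below, $a_0$ is the intercept (`model.intercept_`) and $a_1$ is the slope (`model.coef_`):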

$$f(x) = \frac{1}{1+e^{-(a_0 + a_1 x)}}$$

In [None]:
# Draw the learned logistic curve on top of the raw labels (feature column 0)

plt.plot(x[:,0],y,'ko')

# Grid of feature values as a column vector
x_new = np.linspace(0,30,1000).reshape(-1,1)

# Evaluate the model equation by hand (no call to predict):
# first the linear part a_0 + a_1 * x, then the logistic function of it
linear_term = model.intercept_ + model.coef_ * x_new
y_model = 1 / (1 + np.exp(-linear_term))

plt.plot(x_new, y_model)

plt.show()

The learned model $f(x)$ gives us a probability of belonging to the "positive" class, and we can take $f(x) > 0.5$, for example, to be a threshold for classifying as one class vs another. 

In [None]:
# Probability curve, the 0.5 threshold, and the resulting class predictions together

plt.plot(x[:,0],y,'ko')

# One grid of feature values (column vector), shared by both curves below
x_new = np.linspace(0,30,1000).reshape(-1,1)

# Probability from the model equation itself, not from calling predict
linear_term = model.intercept_ + model.coef_ * x_new
y_model = 1 / (1 + np.exp(-linear_term))
plt.plot(x_new, y_model)

# decision threshold on the probability
plt.axhline(0.5,color='r',linestyle='--')

# Class predictions from predict() over the same grid
y_pred = model.predict(x_new)
plt.plot(x_new, y_pred)

plt.show()