## Classification with wine quality dataset

In [None]:
import pandas as pd

# Red-wine quality CSV served from the UCI archive.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"

# The file uses ';' rather than ',' as its field delimiter.
df = pd.read_csv(url, delimiter=';')

# Preview the first ten rows.
df.head(n=10)

In [None]:
# Binary target Q derived from the original `quality` score:
#   Q = 1 -> quality > 5   (considered high quality)
#   Q = 0 -> quality <= 5  (considered low quality)
df['Q'] = (df['quality'] > 5).astype('int64')

# The raw score is dropped so that only the binary target remains.
df.drop(columns=['quality'], inplace=True)

df.head(n=20)

In [None]:
# Pairwise correlation between all columns, including the binary target Q
df.corr()

In [None]:
# Let's choose the two features most strongly correlated with Q: alcohol and volatile acidity

## Logistic Regression
Despite its name, logistic regression is not a regression algorithm. It is a classification algorithm based on the logistic function

L(s) = 1 / (1 + e^(-s))

In [None]:
import numpy as np

def logistic(s):
    """Evaluate the logistic (sigmoid) function 1 / (1 + e^(-s)) element-wise.

    Args:
        s: A real scalar or array-like (NumPy array, list or tuple).

    Returns:
        A NumPy scalar for scalar input, otherwise an ndarray with the same
        shape as ``s``; every value lies in the interval [0, 1].
    """
    s = np.asarray(s)
    # logaddexp(0, -s) equals log(1 + e^(-s)) but is evaluated without forming
    # e^(-s) directly, so large negative inputs do not overflow inside np.exp.
    return np.exp(-np.logaddexp(0.0, -s))

In [None]:
# matplotlib must be imported here: this cell runs before the later plotting cells
import matplotlib.pyplot as plt
import numpy as np

# Sample the logistic curve on [-10, 10) in steps of 0.1
xs = np.arange(-10, 10, 0.1)
plt.plot(xs, logistic(xs));

Therefore, whatever number you pass to the logistic function, it squashes it into the range between 0 and 1.
In the case of a binary classification, you can interpret this behavior as follows:

- output of 0: negative
- output of 1: positive
- in between: the probability of positive

In Logistic Regression, the s is typically a linear combination of features:

s = b + w0 X0 + w1 X1 + ...

loss function: https://developers.google.com/machine-learning/crash-course/logistic-regression/model-training

Take the data set below:

In [None]:
# Build a random dataset with two features and a binary class target via scikit-learn's datasets module
from sklearn.datasets import make_classification

X, y = make_classification(
    n_samples=500,
    n_classes=2,
    n_clusters_per_class=1,
    # Both features are informative; none are redundant or repeated.
    n_features=2,
    n_informative=2,
    n_redundant=0,
    n_repeated=0,
    class_sep=1.1,
    # Per-feature shift and scale applied to the generated points.
    shift=[10, 10],
    scale=[1, 2],
    random_state=6,
)

# print(X[:5])
# print(y[:5])

In [None]:
import matplotlib.pyplot as plt

# Split the samples by class label so each class is drawn in its own colour.
p = X[y == 1]
n = X[y == 0]

for points, colour, name in ((p, 'blue', 'Positive'), (n, 'orange', 'Negative')):
    plt.scatter(points[:, 0], points[:, 1], color=colour, label=name)

plt.xlabel('X0')
plt.ylabel('X1')
plt.legend();

In [None]:
# train_test_split and linear_model are used below and must be imported explicitly
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Train test splitting: 20% of the samples are held out, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=46)

# Using a scaler object to bring all features in the same range.
# It is fitted on the training split only and then applied to both splits.
scaler = StandardScaler().fit(X_train)

# Try plotting the data points above with this scaled version of data
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Creating and training the classifier
clf = linear_model.LogisticRegression().fit(X_train_scaled, y_train)

In [None]:
## Classifier training score: the fraction (between 0 and 1) of training datapoints classified correctly
clf.score(X_train_scaled, y_train)

In [None]:
## Classifier test score: the same measure, computed on the held-out test split
clf.score(X_test_scaled, y_test)

In [None]:
# Learned parameters: feature weights W and intercept b
W, b = clf.coef_[0], clf.intercept_[0]
print(f'W is {W}', f'b is {b}', sep='\n')

### Decision boundary
The decision boundary is the line separating the two regions: on it the logistic output is exactly 0.5, which means s = 0

0 = b + W[0] * X[0] + W[1] * X[1]

in terms of the plot:
X[1] = -(b + W[0] * X[0]) / W[1]

In [None]:
# Decision Boundary line equation as a named function

def DBLine(x0):
    """Return the X1 coordinate of the decision boundary at the given X0.

    Solves 0 = b + W[0] * x0 + W[1] * x1 for x1, reading the module-level
    ``W`` and ``b`` at call time.
    """
    numerator = b + W[0] * x0
    return -numerator / W[1]

In [None]:
# Scatter the scaled training data, one colour per class
p = X_train_scaled[y_train == 1]
n = X_train_scaled[y_train == 0]
for points, colour, name in ((p, 'blue', 'Positive'), (n, 'orange', 'Negative')):
    plt.scatter(points[:, 0], points[:, 1], color=colour, label=name)

# Overlay the decision boundary, evaluated at integer X0 values from -4 to 3
x0Range = np.arange(-4, 4, 1)
plt.plot(x0Range, DBLine(x0Range), color='red')

plt.xlabel('X0')
plt.ylabel('X1')
plt.legend();

## Nonlinear relationships
Similar to linear regression, with logistic regression we can produce curved decision boundaries by adding more pseudo-features. To be concrete, we can add second-order terms of the features such as x0^2, x1^2 and x0*x1

Take the circular dataset below. It is not possible to effectively separate the two classes with a line
- Try rerunning the cells above with the X and y below to see how the scores and the line turn out
- Because plain logistic regression is too biased here, it performs poorly; we need to add variance by adding more features

In [None]:
from sklearn.datasets import make_circles

# Noisy circular dataset with a fixed seed
X, y = make_circles(n_samples=500, factor=.3, noise=0.2, shuffle=True, random_state=0)

# Split the samples by class label so each class is drawn in its own colour.
p = X[y == 1]
n = X[y == 0]
for points, colour, name in ((p, 'blue', 'Positive'), (n, 'orange', 'Negative')):
    plt.scatter(points[:, 0], points[:, 1], color=colour, label=name)

plt.xlabel('X0')
plt.ylabel('X1')
plt.legend();

In [None]:
# Let's add higher order pseudo-features to the data set, one column per term:
# x0, x1, x0^2, x1^2 and x0*x1

x0, x1 = X[:, 0], X[:, 1]
X = np.column_stack((x0, x1, x0**2, x1**2, x0 * x1))

X[:3]

In [None]:
# train_test_split and linear_model are used below and must be imported explicitly
from sklearn import linear_model
from sklearn.model_selection import train_test_split

# Train test splitting: 20% of the samples are held out, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=46)

# Creating and training the classifier (no scaling is applied in this cell)
clf = linear_model.LogisticRegression().fit(X_train, y_train)

In [None]:
# Classifier score on the held-out test split
clf.score(X_test, y_test)

In [None]:
# Learned parameters: feature weights W and intercept b
W, b = clf.coef_[0], clf.intercept_[0]
print(f'W is {W}', f'b is {b}', sep='\n')

In [None]:
# Scatter the test samples by class, using only the first two (original) features
p = X_test[y_test == 1]
n = X_test[y_test == 0]

plt.scatter(p[:, 0], p[:, 1], color='blue', label='Positive')
plt.scatter(n[:, 0], n[:, 1], color='orange', label='Negative')

# Plotting the polynomial equation using contours
delta = 0.025
xrange = np.arange(-2, 2, delta)
yrange = np.arange(-2, 2, delta)
# The grid is named xx / yy so that it does not overwrite the label array `y`
xx, yy = np.meshgrid(xrange, yrange)
equation = b + W[0]*xx + W[1]*yy + W[2]*xx**2 + W[3]*yy**2 + W[4]*xx*yy
# The decision boundary is the zero level set of the equation (s = 0)
plt.contour(xx, yy, equation, levels=[0])

plt.xlabel('X0');plt.ylabel('X1');plt.legend();

## Confusion matrix
Used for analyzing classifier prediction results

In [None]:
# True labels of the test split
y_true = y_test
y_true

In [None]:
# Labels predicted by the classifier for the same test samples
y_pred = clf.predict(X_test)
y_pred

In [None]:
from sklearn.metrics import confusion_matrix

# Rows correspond to the true class and columns to the predicted class
cm = confusion_matrix(y_true, y_pred)
cm

In [None]:
import seaborn as sn
import pandas as pd
import matplotlib.pyplot as plt

# Wrap the matrix in a DataFrame so both heatmap axes carry the class ids
class_ids = [0, 1]
df_cm = pd.DataFrame(cm, index=class_ids, columns=class_ids)

plt.figure(figsize=(5, 4))
sn.set(font_scale=1.4)  # for label size
# annot_kws controls the font size of the numbers written in each cell
sn.heatmap(df_cm, annot=True, annot_kws={"size": 16}, cmap="Blues")
plt.xlabel('Predicted')
plt.ylabel('True value');

The link below explains the confusion matrix and alternative measures for evaluation

https://medium.com/analytics-vidhya/what-is-a-confusion-matrix-d1c0f8feda5

In [None]:
# Unpack the 2x2 matrix: first row is the true-negative class, second row the true-positive class
[[TN, FP], [FN, TP]] = cm

# FN is a false negative (a positive sample predicted as negative)
print(
f'''
True Negative: {TN}
True Positive: {TP}

False Positive: {FP} (TYPE I error)
False Negative: {FN} (TYPE II error)
'''
)

### Common evaluation metrics
Accuracy is simply the percentage of correct predictions. This may not be a good value to evaluate a model, especially if the data is not balanced

In [None]:
# Accuracy: (TP + TN) / All -> fraction of all predictions that are correct
acc = (TP + TN) / (TP + TN + FP + FN)
acc

In [None]:
# Precision: TP / All predicted positive (TP + FP)
# Application: email spam or fraudulent transaction detection
precision = TP / (TP + FP)
precision

In [None]:
# Recall: TP / All actual positives (TP + FN) -> important for e.g. medical tests
recall = TP / (TP + FN)
recall

In [None]:
# F1 score: 2 * precision * recall / (precision + recall), i.e. the harmonic mean of precision and recall
F1 = 2 * precision * recall / (precision + recall)
F1

In [None]:
# There's always a trade-off between precision and recall and you can adjust your hyperparameters to move towards one side