In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import numpy as np
from IPython.display import Image

In [None]:
# Load the red and white wine-quality tables (semicolon-separated CSV files).
df_red = pd.read_csv('../data/winequality-red.csv', delimiter=';')
df_white = pd.read_csv('../data/winequality-white.csv', delimiter=';')

# Tag every row with its wine colour so the origin survives the concatenation.
df_red['type'] = 'red'
df_white['type'] = 'white'

print(f'{df_red.shape[0]} red wines.')
print(f'{df_white.shape[0]} white wines.')

# ignore_index=True gives the stacked frame a fresh 0..n-1 index. Without it
# both tables keep their own 0-based row labels, so labels repeat and
# label-based lookups on the combined frame become ambiguous.
df = pd.concat([df_red, df_white], ignore_index=True)

df.head()

In [None]:
# Feature tables for the two models. Both are new frames derived from `df`,
# so `df` itself is never modified and this cell can be re-run safely.
type_model_data = df.drop(columns=['quality', 'type'])   # predictors for red-vs-white
quality_model_data = df.drop(columns='quality')          # predictors for quality (keeps 'type')

# Targets: 1 = red wine for the type model; the raw quality score and a
# binary "quality of 7 or more" flag for the quality model.
y_type = (df['type'] == 'red') * 1
y_quality = df['quality']
y_quality_7 = (y_quality >= 7) * 1

# 75/25 split for the wine-type model, seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(type_model_data, y_type,
                                                    test_size=0.25,
                                                    random_state=42)



In [None]:
# Logistic regression for the red-vs-white target, with no penalty term and
# an iteration cap of 5000.
# NOTE(review): the string 'none' is only accepted by older scikit-learn
# releases; newer ones expect penalty=None -- confirm the target version.
lr = LogisticRegression(max_iter=5000, penalty='none')
lr.fit(X_train, y_train)

In [None]:
from sklearn.metrics import confusion_matrix

# Hard class predictions of the wine-type model on both splits.
y_pred = lr.predict(X_train)
y_pred_test = lr.predict(X_test)

# Print one confusion matrix per split (rows: actual class, columns: predicted class).
for split_label, actual, predicted in (('--Train--', y_train, y_pred),
                                       ('--Test--', y_test, y_pred_test)):
    print(split_label)
    print(confusion_matrix(actual, predicted))

In [None]:
# Encode wine colour as a 0/1 integer column (1 = red), the same encoding
# used for the type target. The result goes into a NEW frame: overwriting
# quality_model_data['type'] in place would make a second run of this cell
# compare already-encoded values against 'red' and silently produce all zeros.
quality_features = quality_model_data.assign(
    type=(quality_model_data['type'] == 'red') * 1
)

# 75/25 split for the "quality >= 7" target. This replaces the
# X_train/X_test/y_train/y_test of the wine-type model.
X_train, X_test, y_train, y_test = train_test_split(quality_features, y_quality_7,
                                                    test_size=0.25,
                                                    random_state=42)

# Logistic regression with no penalty term; `lr` now refers to the quality model.
lr = LogisticRegression(penalty='none', max_iter=5000)
lr.fit(X_train, y_train)

In [None]:
# Hard class predictions of the quality model on both splits.
y_pred, y_pred_test = lr.predict(X_train), lr.predict(X_test)

# Confusion matrix on the training split.
train_matrix = confusion_matrix(y_train, y_pred)
print(' --Train--\n')
print(train_matrix)

# Confusion matrix on the held-out split.
test_matrix = confusion_matrix(y_test, y_pred_test)
print(' --Test-- \n')
print(test_matrix)

### Accuracy

Accuracy is the number of correctly predicted observations divided by the total number of observations the model made a prediction for.

In [None]:
from sklearn.metrics import accuracy_score

# Fraction of held-out rows whose predicted class matches the true class.
test_accuracy = accuracy_score(y_test, y_pred_test)
print(test_accuracy)

Is this a useful measure here? Why or why not?

In [None]:
# Baseline for comparison: the accuracy obtained by predicting class 0 for
# every test row.
always_zero = np.zeros(len(y_test), dtype=int)
print(accuracy_score(y_test, always_zero))

### Precision and Recall

With class imbalance, accuracy is not always a useful metric to evaluate the quality of a model. Consider the extreme case of credit card fraud. The most accurate model may be just to say nothing is fraud, but such a model is useless because it never catches a single fraudulent transaction.

**Recall** refers to the percentage of a class that was correctly classified as that class. $\frac{\text{True Positives}}{\text{True Positives} + \text{False Negatives}}$

**Precision** refers to the percentage of observations classified as a class that actually belong to that class. $\frac{\text{True Positives}}{\text{True Positives} + \text{False Positives}}$

**F1-Score** F1 is the harmonic mean of precision and recall. A harmonic mean is a more meaningful average for rates.

In [None]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 on the held-out split.
test_report = classification_report(y_test, y_pred_test)
print(test_report)

### Thresholding

In binary classification we have the flexibility to change the classification threshold. The default says if p >= .5, then call it a one. But we can experiment with that to try to get better performance from a model.

In [None]:
# Class-membership probabilities for both splits; column 1 holds the
# probability of class 1.
y_prob = lr.predict_proba(X_train)
y_prob_test = lr.predict_proba(X_test)

# One cut-off shared by both splits so their predictions are comparable.
# NOTE(review): the test-split line originally read `> .` (a syntax error);
# .85 is taken from the train-split line -- adjust when experimenting.
threshold = .85

# A row is labelled 1 only when its class-1 probability exceeds the cut-off.
y_pred_thresh = (y_prob[:, 1] > threshold) * 1
y_pred_test_thresh = (y_prob_test[:, 1] > threshold) * 1

In [None]:
# Precision/recall/F1 on the held-out split using the thresholded predictions.
thresholded_report = classification_report(y_test, y_pred_test_thresh)
print(thresholded_report)

In [None]:
from sklearn.metrics import PrecisionRecallDisplay

# Precision-recall curve of the fitted model on the held-out split.
# `plot_precision_recall_curve` no longer exists in current scikit-learn;
# `PrecisionRecallDisplay.from_estimator` is its replacement. The fallback
# keeps the cell working on releases that predate `from_estimator`.
if hasattr(PrecisionRecallDisplay, 'from_estimator'):
    PrecisionRecallDisplay.from_estimator(lr, X_test, y_test)
else:
    from sklearn.metrics import plot_precision_recall_curve
    plot_precision_recall_curve(lr, X_test, y_test)

### AUC-ROC Curve

The area under the receiver operating characteristic curve (AUC-ROC) is another highly important metric for evaluating your model. This value provides information about how well the model can distinguish between classes, so a high value denotes a good model. There is one ROC curve per class, but averages of them can be taken to consolidate them into a single score.

In [None]:
# Render the ROC/AUC illustration stored alongside the notebook.
roc_diagram_path = '../images/ROCAUC.png'
Image(roc_diagram_path)

In [None]:
from sklearn.metrics import roc_curve, auc

# ROC curve for class 1: score each test row by its class-1 probability.
fpr, tpr, _ = roc_curve(y_test, y_prob_test[:, 1])

# ROC curve for class 0: score by the class-0 probability AND declare 0 the
# positive label. Without pos_label=0 the class-0 scores would be evaluated
# against class 1 as the positive class, which yields a mirrored curve rather
# than the curve for class 0.
fpr0, tpr0, _ = roc_curve(y_test, y_prob_test[:, 0], pos_label=0)

# Area under the class-1 ROC curve.
print(auc(fpr, tpr))

In [None]:
import matplotlib.pyplot as plt

lw = 2  # line width shared by the ROC curve and the diagonal reference line

fig, ax = plt.subplots()

# Model ROC curve, with its area shown in the legend entry.
ax.plot(fpr, tpr, color='darkorange',
        lw=lw, label='ROC curve (area = %0.2f)' % auc(fpr, tpr))
# Dashed diagonal from (0, 0) to (1, 1) as a visual reference.
ax.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')

ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver operating characteristic example')
ax.legend(loc="lower right")
plt.show()

### Methods for Working with Unbalanced Data

There are many different techniques for handling an imbalance in the data. Three common techniques are class weighting, oversampling, and undersampling.

**Class weighting** lets you change how much misclassification of that class impacts the loss function. In binary classification, it's common practice to make the majority class weight 1 and the minority class weight = 1/(% in class).

**Oversampling** involves increasing the prevalence of minority class(es) so that the dataset has roughly balanced classes. While this can be effective, it does reduce the variability of the minority class(es) and can lead to overfitting if not done carefully. SMOTE is a common approach to this.

**Undersampling** is where a random selection of the majority class(es) is taken with size roughly equal to the minority class.  If there isn't enough data available this can prove difficult.


### Class Weights

In [None]:
# Same model as before, but with class_weight='balanced' so the classes are
# re-weighted during fitting.
lr_balanced = LogisticRegression(
    penalty='none',
    class_weight='balanced',
    max_iter=5000,
)
lr_balanced.fit(X_train, y_train)

# Predictions of the class-weighted model on both splits.
y_pred, y_pred_test = lr_balanced.predict(X_train), lr_balanced.predict(X_test)

balanced_report = classification_report(y_test, y_pred_test)
print(balanced_report)

### Oversampling

In [None]:
# Class sizes in the training split (class 1 is treated as the minority class).
n_minority = int(y_train.sum())
n_majority = len(y_train) - n_minority

# Number of extra minority rows needed to bring class 1 up to the size of
# class 0. The previous value, len(y_train) - sum(y_train), was the majority
# count itself, which pushed the minority class past the majority.
delta = n_majority - n_minority
print(delta)

# Draw the extra minority rows with replacement (delta may exceed the number
# of distinct minority rows), seeded for reproducibility.
df_over_sampled = X_train[y_train == 1].sample(delta, random_state=42, replace=True)

X_train_over = pd.concat([X_train, df_over_sampled])
# Labels for the added rows carry the index of the rows they belong to, so
# features and labels stay aligned by label as well as by position.
y_train_over = pd.concat([y_train,
                          pd.Series(1, index=df_over_sampled.index, name=y_train.name)])

In [None]:
# Fit on the oversampled training data.
lr_over = LogisticRegression(
    penalty='none',
    class_weight='balanced',
    max_iter=5000,
)
lr_over.fit(X_train_over, y_train_over)

# Evaluate against the original (non-resampled) train and test splits.
y_pred, y_pred_test = lr_over.predict(X_train), lr_over.predict(X_test)

oversampled_report = classification_report(y_test, y_pred_test)
print(oversampled_report)

### Undersampling

In [None]:
# Number of class-1 (minority) rows in the training split.
num_minority = int(y_train.sum())

# Random subset of the majority class, the same size as the minority class.
# Sampled WITHOUT replacement: the majority class has more rows than are
# needed, and drawing with replacement would only introduce duplicate rows.
df_under_sampled = X_train[y_train == 0].sample(num_minority, random_state=42, replace=False)
df_under_sampled_min = X_train[y_train == 1]

X_train_under = pd.concat([df_under_sampled, df_under_sampled_min])
# Build the labels on the same index as the rows they describe so features
# and labels stay aligned by label as well as by position.
y_train_under = pd.concat([
    pd.Series(0, index=df_under_sampled.index, name=y_train.name),
    pd.Series(1, index=df_under_sampled_min.index, name=y_train.name),
])

In [None]:
# Fit on the undersampled training data. The model gets its own name,
# `lr_under`, so it no longer overwrites the oversampled model `lr_over`.
lr_under = LogisticRegression(penalty='none', class_weight='balanced', max_iter=5000)

lr_under.fit(X_train_under,
             y_train_under)

# Evaluate against the original (non-resampled) train and test splits.
y_pred = lr_under.predict(X_train)
y_pred_test = lr_under.predict(X_test)

print(classification_report(y_test, y_pred_test))