In [10]:
# import modules
import numpy as np
import matplotlib.pyplot as plt
from scipy import special
from sklearn.linear_model import LogisticRegression
import math


# Load the data sets.
# Both files are parsed with ';' as the field separator and their first line is skipped.
# The file names are handed straight to np.loadtxt so NumPy opens and closes the files
# itself; wrapping them in a bare open() left the file handles unclosed.
D_red   = np.loadtxt("winequality-red.csv", delimiter = ';', skiprows = 1)
D_white = np.loadtxt("winequality-white.csv", delimiter = ';', skiprows = 1)


# Roughly 75% of each table trains and the remaining 25% tests.
# Rows are taken in file order: the first 1200 red / 3675 white rows form the
# training part, everything after that index forms the test part.
D_red_train, D_red_test = np.split(D_red, [1200])

D_white_train, D_white_test = np.split(D_white, [3675])


# Pull the quality score (column index 11) out of every table.
# Each right-hand side is evaluated before anything is rebound, so the y_* vector is
# taken from the table that still contains column 11, and the D_* name is then
# rebound to a copy of that table without the column.
y_red_train, D_red_train = D_red_train[:, 11], np.delete(D_red_train, 11, axis=1)
y_red_test, D_red_test = D_red_test[:, 11], np.delete(D_red_test, 11, axis=1)

y_white_train, D_white_train = D_white_train[:, 11], np.delete(D_white_train, 11, axis=1)
y_white_test, D_white_test = D_white_test[:, 11], np.delete(D_white_test, 11, axis=1)


# Change quality score to a bucket (currently disabled)   ** note: the observed qualities range from 3 to 9, although the scale allows 0-10
# 1: 0 <= quality <= 5
# 2: 6 <= quality <= 7
# 3: 8 <= quality <= 10
# def make_buckets(y):
#     for i in range(y.shape[0]):
#         if y[i] <= 5:
#             y[i] = 1
#         elif y[i] <= 7:
#             y[i] = 2
#         else:
#             y[i] = 3
#     return y

# y_red_train   = make_buckets(y_red_train)
# y_red_test    = make_buckets(y_red_test)
# y_white_train = make_buckets(y_white_train)
# y_white_test  = make_buckets(y_white_test)


# Prepend a column of ones to each feature matrix (presumably an intercept term; currently disabled)
# D_red_train = np.append(np.ones((D_red_train.shape[0], 1)), D_red_train, axis = 1)
# D_red_test  = np.append(np.ones((D_red_test.shape[0], 1)), D_red_test, axis = 1)

# D_white_train = np.append(np.ones((D_white_train.shape[0], 1)), D_white_train, axis = 1)
# D_white_test  = np.append(np.ones((D_white_test.shape[0], 1)), D_white_test, axis = 1)


# Sanity check: report the shape of every feature matrix and label vector.
# The labels carry their own trailing padding so the printed shapes line up.
for _label, _array in (
    ("Red train D:  ", D_red_train),
    ("Red train y:  ", y_red_train),
    ("Red test D:   ", D_red_test),
    ("Red test y:   ", y_red_test),
    ("White train D:", D_white_train),
    ("White train y:", y_white_train),
    ("White test D: ", D_white_test),
    ("White test y: ", y_white_test),
):
    print(_label, _array.shape)

Red train D:   (1200, 11)
Red train y:   (1200,)
Red test D:    (399, 11)
Red test y:    (399,)
White train D: (3675, 11)
White train y: (3675,)
White test D:  (1223, 11)
White test y:  (1223,)


In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler

# Standardize the features for both datasets (helps improve model performance).
# Each wine type gets its own scaler, fit on that type's training rows only and then
# applied unchanged to the matching test rows. Previously a single scaler was refit on
# the white data, which discarded the red statistics and made the result depend on the
# exact order of the statements.
scaler_red = StandardScaler()
X_red_train = scaler_red.fit_transform(D_red_train)
X_red_test = scaler_red.transform(D_red_test)

scaler_white = StandardScaler()
X_white_train = scaler_white.fit_transform(D_white_train)
X_white_test = scaler_white.transform(D_white_test)

# `scaler` used to end up fit on the white training data; keep the name bound to the
# equivalent object for any later code that still refers to it.
scaler = scaler_white

# Turn the quality score into a binary target: 1 when quality is above 7, otherwise 0.
# Red wine
y_red_train_binary, y_red_test_binary = (
    (scores > 7).astype(int) for scores in (y_red_train, y_red_test)
)

# White wine
y_white_train_binary, y_white_test_binary = (
    (scores > 7).astype(int) for scores in (y_white_train, y_white_test)
)

# Build and train one binary logistic-regression classifier per wine type.
# fit() hands back the estimator itself, so construction and training are chained;
# max_iter=500 is the iteration limit passed to each model.
model_red_binary = LogisticRegression(max_iter=500).fit(X_red_train, y_red_train_binary)
model_white_binary = LogisticRegression(max_iter=500).fit(X_white_train, y_white_train_binary)

# Predict the binary label for every held-out test row.
y_red_pred_binary = model_red_binary.predict(X_red_test)
y_white_pred_binary = model_white_binary.predict(X_white_test)

# Calculate and display performance metrics for binary classification.
red_accuracy_binary = accuracy_score(y_red_test_binary, y_red_pred_binary)
white_accuracy_binary = accuracy_score(y_white_test_binary, y_white_pred_binary)

# zero_division=0 states explicitly that a metric with a zero denominator (for example
# the precision of a class the model never predicts) is reported as 0.00, instead of
# relying on the default warning path. A 0.00 row therefore still needs to be read as
# "this class was never predicted / never recovered", not as a rounding artifact.
red_report_binary = classification_report(
    y_red_test_binary, y_red_pred_binary, zero_division=0
)
white_report_binary = classification_report(
    y_white_test_binary, y_white_pred_binary, zero_division=0
)

print("Red wine binary classification accuracy: ", red_accuracy_binary)
print("White wine binary classification accuracy: ", white_accuracy_binary)
# Each report starts on its own line so its column header stays aligned with the rows;
# printing it right after the label shifted the header to the right.
print("Red wine binary classification report:\n" + red_report_binary)
print("White wine binary classification report:\n" + white_report_binary)

Red wine binary classification accuracy:  0.9874686716791979
White wine binary classification accuracy:  0.9787408013082584
Red wine binary classification report:                precision    recall  f1-score   support

           0       0.99      1.00      0.99       394
           1       0.00      0.00      0.00         5

    accuracy                           0.99       399
   macro avg       0.49      0.50      0.50       399
weighted avg       0.98      0.99      0.98       399

White wine binary classification report:                precision    recall  f1-score   support

           0       0.98      1.00      0.99      1199
           1       0.00      0.00      0.00        24

    accuracy                           0.98      1223
   macro avg       0.49      0.50      0.49      1223
weighted avg       0.96      0.98      0.97      1223



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# Conclusion

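Both binary classifiers reach about 98% test accuracy (98.7% for red, 97.9% for white). This figure mostly reflects class imbalance, however: only 5 of the 399 red and 24 of the 1223 white test wines have quality above 7, and the classification reports show precision and recall of 0.00 for that class, so neither model correctly identifies a single high-quality wine.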