In [2]:
# Import modules
import numpy as np
import matplotlib.pyplot as plt
from scipy import special
from sklearn.linear_model import LogisticRegression
import math


# Load the data sets
D_red   = np.loadtxt(open("winequality-red.csv"), delimiter = ";", skiprows = 1)
D_white = np.loadtxt(open("winequality-white.csv"), delimiter = ";", skiprows = 1)


red_cols = open("winequality-red.csv", "r").readline().replace("\n", "").replace('"', '').split(";")
white_cols = open("winequality-white.csv", "r").readline().replace("\n", "").replace('"', '').split(";")


# Shuffle the datasets
np.random.shuffle(D_red)
np.random.shuffle(D_white)


# 75% train, 25% test
D_red_train   = D_red[:1200]
D_red_test    = D_red[1200:]

D_white_train = D_white[:3675]
D_white_test  = D_white[3675:]


# Separate features and actual quality
y_red_train = D_red_train[:, 11]
D_red_train = np.delete(D_red_train, 11, 1)
y_red_test  = D_red_test[:, 11]
D_red_test  = np.delete(D_red_test, 11, 1)

y_white_train = D_white_train[:, 11]
D_white_train = np.delete(D_white_train, 11, 1)
y_white_test  = D_white_test[:, 11]
D_white_test  = np.delete(D_white_test, 11, 1)

# Check shapes of data frames
print("Red train D:  ", D_red_train.shape)
print("Red train y:  ", y_red_train.shape)
print("Red test D:   ", D_red_test.shape)
print("Red test y:   ", y_red_test.shape)

print("White train D:", D_white_train.shape)
print("White train y:", y_white_train.shape)
print("White test D: ", D_white_test.shape)
print("White test y: ", y_white_test.shape)

print("Column headers:", red_cols)

Red train D:   (1200, 11)
Red train y:   (1200,)
Red test D:    (399, 11)
Red test y:    (399,)
White train D: (3675, 11)
White train y: (3675,)
White test D:  (1223, 11)
White test y:  (1223,)
Column headers: ['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol', 'quality']


In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler

# Standardize the features for both datasets (helps improve model performance)
scaler = StandardScaler()
X_red_train = scaler.fit_transform(D_red_train)
X_red_test = scaler.transform(D_red_test)
X_white_train = scaler.fit_transform(D_white_train)
X_white_test = scaler.transform(D_white_test)

# Convert quality ratings to binary classification (0 = 7 or below, 1 = above 7)
# For red wine
y_red_train_binary = (y_red_train > 6).astype(int)
y_red_test_binary = (y_red_test > 6).astype(int)

# For white wine
y_white_train_binary = (y_white_train > 6).astype(int)
y_white_test_binary = (y_white_test > 6).astype(int)

# Create new logistic regression models for binary classification
model_red_binary = LogisticRegression(max_iter=500)
model_white_binary = LogisticRegression(max_iter=500)

# Train the binary models
model_red_binary.fit(X_red_train, y_red_train_binary)
model_white_binary.fit(X_white_train, y_white_train_binary)

# Make predictions on the test set for binary classification
y_red_pred_binary = model_red_binary.predict(X_red_test)
y_white_pred_binary = model_white_binary.predict(X_white_test)

In [4]:
# Calculate and display performance metrics for binary classification
red_accuracy_binary = accuracy_score(y_red_test_binary, y_red_pred_binary)
white_accuracy_binary = accuracy_score(y_white_test_binary, y_white_pred_binary)
red_report_binary = classification_report(y_red_test_binary, y_red_pred_binary)
white_report_binary = classification_report(y_white_test_binary, y_white_pred_binary)

red_attr_weights = {}
white_attr_weights = {}
avg_attr_weights = {}
for i in range(len(red_cols) - 1):
    red_attr_weights[red_cols[i]] = round(model_red_binary.coef_[0][i], 3)
    white_attr_weights[white_cols[i]] = round(model_white_binary.coef_[0][i], 3)    
    avg_attr_weights[red_cols[i]] = (red_attr_weights[red_cols[i]] + white_attr_weights[red_cols[i]]) / 2

print("RED WINE BINARY CLASSIFICATION")
print("    Accuracy:", red_accuracy_binary)
print("    Report:\n" + str(red_report_binary))
print("    Confusion matrix:\n" + str(confusion_matrix(y_red_test_binary, y_red_pred_binary)))
print("    Attribute weights:")
for i in sorted(red_attr_weights.items(), key=lambda x: -(x[1])):
    print("        " + str(i[1]), "\t-", i[0])

print()
print("WHITE WINE BINARY CLASSIFICATION")
print("    Accuracy:", white_accuracy_binary)
print("    Report:\n" + str(white_report_binary))
print("    Confusion matrix:\n" + str(confusion_matrix(y_white_test_binary, y_white_pred_binary)))
print("    Attribute weights:")
for i in sorted(white_attr_weights.items(), key=lambda x: -(x[1])):
    print("        " + str(i[1]), "\t-", i[0])

print()
print("AVG ATTRIBUTE WEIGHTS")
for i in sorted(avg_attr_weights.items(), key=lambda x: -x[1]):
    print("        " + str(round(i[1], 3)), "\t-", i[0])

RED WINE BINARY CLASSIFICATION
    Accuracy: 0.8596491228070176
    Report:
              precision    recall  f1-score   support

           0       0.87      0.97      0.92       335
           1       0.65      0.27      0.38        64

    accuracy                           0.86       399
   macro avg       0.76      0.62      0.65       399
weighted avg       0.84      0.86      0.83       399

    Confusion matrix:
[[326   9]
 [ 47  17]]
    Attribute weights:
        0.752 	- alcohol
        0.704 	- sulphates
        0.617 	- fixed acidity
        0.329 	- residual sugar
        0.188 	- pH
        0.173 	- citric acid
        0.17 	- free sulfur dioxide
        -0.344 	- chlorides
        -0.406 	- volatile acidity
        -0.591 	- density
        -0.666 	- total sulfur dioxide

WHITE WINE BINARY CLASSIFICATION
    Accuracy: 0.8086672117743254
    Report:
              precision    recall  f1-score   support

           0       0.84      0.94      0.89       985
           1 

# Initial Questions
### 1. Which attributes affect the quality rating more than others?



### 2. Given its attributes, is it possible to predict if a wine will be "high" quality (>7)?

Originally, we wanted to see if we could predict the exact rating of a wine, but this quickly proved to be tough. Our model wasn't yielding high accuracy (~40%). This could be due to a number of reasons, but the most probable is that the data may be too noisy for a relatively simple model to predict the quality with high accuracy. 

We then tried putting the ratings in "buckets" and have the model try to predict the score within a range. The buckets were quality scores of 5 and below, 6 or 7, and 8 and above. This is due to the fact that the largest concentration of quality scores in the dataset are in the 6-8 range. This approach yielded slightly higher accuracy than the first idea, getting closer to 60%.

Next, we decided to try to predict if a quality score will be higher than a 7. This would simplify the work the model has to do by giving it a binary classification problem. Using this approach, our model was able to reach 98.7% accuracy with the red wine dataset (1,200 training points, 399 testing points), and 97.9% accuracy with the white wine dataset (3,675 training points, 1,223 testing points). The only problem, was that the model wasn't actually doing any guessing. Due to the nature of the dataset, it just so happens that 98% of the data points have a quality rating of 7 or less, so the model was simply picking 7 or less for everything and managing to get 98% accurate. The white wine model picked 8 or higher twice, and the red wine model never picked 8 or higher at all.

Finally, to have a model that actually made decisions, we lowered the bar for binary classification by having the model predict if the quality score would be higher than a 6. This lowered the accuracy from the previous model (**90.0%** for red wine, **78.5%** for white wine), but at least the model was actually trying to predict when the quality rating was 7 or greater.

# Conclusion

Final accuracy on testing data: **90.0%** for red wine, **78.5%** for white wine

Given the features of red wine, we are able to accurately predict if the quality rating of the wine is going greater than a 6. 

Given the features of white wine, we are able to *relatively* accurately predict if the quality rating of the wine is going greater than a 6. 

This can be used to identify common characteristics of quality wine, and to give a sense of if a new wine will be good before tasting it.

In [19]:
# !pip install imblearn
# !pip install xgboost

# FOR MAC USERS
# !brew install libomp

In [10]:
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

# Step 1: Apply SMOTE to balance the training datasets
smote = SMOTE(random_state=42)

# For red wine
X_red_train_balanced, y_red_train_balanced = smote.fit_resample(X_red_train, y_red_train_binary)

# For white wine
X_white_train_balanced, y_white_train_balanced = smote.fit_resample(X_white_train, y_white_train_binary)

# Step 2: Train ensemble classifiers
# Random Forest for red wine
rf_model_red = RandomForestClassifier(random_state=42)
rf_model_red.fit(X_red_train_balanced, y_red_train_balanced)

# Random Forest for white wine
rf_model_white = RandomForestClassifier(random_state=42)
rf_model_white.fit(X_white_train_balanced, y_white_train_balanced)

# Predictions on the test set
y_red_pred_rf = rf_model_red.predict(X_red_test)
y_white_pred_rf = rf_model_white.predict(X_white_test)

# Calculate accuracy and classification reports for the Random Forest models
red_rf_accuracy = accuracy_score(y_red_test_binary, y_red_pred_rf)
white_rf_accuracy = accuracy_score(y_white_test_binary, y_white_pred_rf)
red_rf_report = classification_report(y_red_test_binary, y_red_pred_rf)
white_rf_report = classification_report(y_white_test_binary, y_white_pred_rf)

print("RED WINE RANDOM FOREST", red_rf_accuracy)
print(red_rf_report)
print("WHITE WINE RANDOM FOREST", white_rf_accuracy)
print(white_rf_report)

RED WINE RANDOM FOREST 0.8847117794486216
              precision    recall  f1-score   support

           0       0.92      0.95      0.93       335
           1       0.67      0.56      0.61        64

    accuracy                           0.88       399
   macro avg       0.79      0.75      0.77       399
weighted avg       0.88      0.88      0.88       399

WHITE WINE RANDOM FOREST 0.8626328699918234
              precision    recall  f1-score   support

           0       0.93      0.90      0.91       985
           1       0.63      0.71      0.67       238

    accuracy                           0.86      1223
   macro avg       0.78      0.81      0.79      1223
weighted avg       0.87      0.86      0.87      1223



In [11]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid for hyperparameter tuning
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Set up GridSearchCV for the red wine Random Forest model
grid_search_red = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    cv=3,  # 3-fold cross-validation
    scoring='f1',  # Focus on F1 score for balanced evaluation
    n_jobs=-1
)

# Fit the model for red wine
grid_search_red.fit(X_red_train_balanced, y_red_train_balanced)

# Set up GridSearchCV for the white wine Random Forest model
grid_search_white = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    cv=3,
    scoring='f1',
    n_jobs=-1
)

# Fit the model for white wine
grid_search_white.fit(X_white_train_balanced, y_white_train_balanced)

# Best parameters and results
print("Best parameters for red wine:", grid_search_red.best_params_)
print("Best F1 score for red wine:", grid_search_red.best_score_)

print("Best parameters for white wine:", grid_search_white.best_params_)
print("Best F1 score for white wine:", grid_search_white.best_score_)

# Evaluate the best models on the test set
best_rf_red = grid_search_red.best_estimator_
best_rf_white = grid_search_white.best_estimator_

y_red_pred_best_rf = best_rf_red.predict(X_red_test)
y_white_pred_best_rf = best_rf_white.predict(X_white_test)

# Print performance metrics
print("RED WINE BEST RANDOM FOREST", accuracy_score(y_red_test_binary, y_red_pred_best_rf))
print(classification_report(y_red_test_binary, y_red_pred_best_rf))

print("WHITE WINE BEST RANDOM FOREST", accuracy_score(y_white_test_binary, y_white_pred_best_rf))
print(classification_report(y_white_test_binary, y_white_pred_best_rf))


Best parameters for red wine: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}
Best F1 score for red wine: 0.9329177079559511
Best parameters for white wine: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}
Best F1 score for white wine: 0.9111301398792945
RED WINE BEST RANDOM FOREST 0.8822055137844611
              precision    recall  f1-score   support

           0       0.92      0.95      0.93       335
           1       0.66      0.55      0.60        64

    accuracy                           0.88       399
   macro avg       0.79      0.75      0.76       399
weighted avg       0.88      0.88      0.88       399

WHITE WINE BEST RANDOM FOREST 0.8626328699918234
              precision    recall  f1-score   support

           0       0.93      0.90      0.91       985
           1       0.63      0.73      0.67       238

    accuracy                           0.86      1223
   macro avg       0.78  

In [20]:
# Import necessary libraries
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report

# Define the parameter grid for XGBoost
param_grid_xgb = {
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'n_estimators': [100, 200, 300],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'subsample': [0.7, 0.9, 1.0]
}

# Step 1: Set up and run GridSearchCV for red wine XGBoost model
grid_search_xgb_red = GridSearchCV(
    estimator=XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
    param_grid=param_grid_xgb,
    cv=3,  # 3-fold cross-validation
    scoring='f1',
    n_jobs=-1
)

# Fit the model for red wine
grid_search_xgb_red.fit(X_red_train_balanced, y_red_train_balanced)

# Step 2: Set up and run GridSearchCV for white wine XGBoost model
grid_search_xgb_white = GridSearchCV(
    estimator=XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
    param_grid=param_grid_xgb,
    cv=3,
    scoring='f1',
    n_jobs=-1
)

# Fit the model for white wine
grid_search_xgb_white.fit(X_white_train_balanced, y_white_train_balanced)

# Step 3: Get the best parameters and results
print("Best parameters for red wine:", grid_search_xgb_red.best_params_)
print("Best F1 score for red wine:", grid_search_xgb_red.best_score_)

print("Best parameters for white wine:", grid_search_xgb_white.best_params_)
print("Best F1 score for white wine:", grid_search_xgb_white.best_score_)

# Step 4: Evaluate the best models on the test set
best_xgb_red = grid_search_xgb_red.best_estimator_
best_xgb_white = grid_search_xgb_white.best_estimator_

# Make predictions
y_red_pred_best_xgb = best_xgb_red.predict(X_red_test)
y_white_pred_best_xgb = best_xgb_white.predict(X_white_test)

# Calculate and print accuracy and classification reports
print("RED WINE BEST XGBOOST MODEL ACCURACY:", accuracy_score(y_red_test_binary, y_red_pred_best_xgb))
print(classification_report(y_red_test_binary, y_red_pred_best_xgb))

print("WHITE WINE BEST XGBOOST MODEL ACCURACY:", accuracy_score(y_white_test_binary, y_white_pred_best_xgb))
print(classification_report(y_white_test_binary, y_white_pred_best_xgb))


KeyboardInterrupt: 