# Wine Quality Prediction Using Machine Learning
### Students:

Simon Lindqvist, siln22@student.bth.se

Abdalrahman Mohammed, abmm22@student.bth.se

### Imports and setup

In [1]:
# --------- Imports ---------
import sklearn as sk
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import imblearn as imb

from sklearn.model_selection import train_test_split, cross_val_score, RepeatedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE  # or use RandomOverSampler
# --------- Load Data ---------
red_wine_df = pd.read_csv('winequality-red.csv', sep=';')

### 1. Datasetp inspection

In [None]:
red_wine_df.shape

(1599, 12)

In [3]:
print("---Pandas info of dataset---")
red_wine_df.info()

---Pandas info of dataset---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1599 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 150.0 KB


In [4]:
red_wine_df.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,8.319637,0.527821,0.270976,2.538806,0.087467,15.874922,46.467792,0.996747,3.311113,0.658149,10.422983,5.636023
std,1.741096,0.17906,0.194801,1.409928,0.047065,10.460157,32.895324,0.001887,0.154386,0.169507,1.065668,0.807569
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,3.0
25%,7.1,0.39,0.09,1.9,0.07,7.0,22.0,0.9956,3.21,0.55,9.5,5.0
50%,7.9,0.52,0.26,2.2,0.079,14.0,38.0,0.99675,3.31,0.62,10.2,6.0
75%,9.2,0.64,0.42,2.6,0.09,21.0,62.0,0.997835,3.4,0.73,11.1,6.0
max,15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.00369,4.01,2.0,14.9,8.0


In [5]:
print("---Unique values in each column---")
for column in red_wine_df.columns:
    print(f"Unique values in {column}: {red_wine_df[column].nunique()} of {red_wine_df.shape[0]}.    Ratio: {red_wine_df[column].nunique() / red_wine_df.shape[0]}\n")

---Unique values in each column---
Unique values in fixed acidity: 96 of 1599.    Ratio: 0.0600375234521576

Unique values in volatile acidity: 143 of 1599.    Ratio: 0.08943089430894309

Unique values in citric acid: 80 of 1599.    Ratio: 0.050031269543464665

Unique values in residual sugar: 91 of 1599.    Ratio: 0.056910569105691054

Unique values in chlorides: 153 of 1599.    Ratio: 0.09568480300187618

Unique values in free sulfur dioxide: 60 of 1599.    Ratio: 0.0375234521575985

Unique values in total sulfur dioxide: 144 of 1599.    Ratio: 0.0900562851782364

Unique values in density: 436 of 1599.    Ratio: 0.27267041901188244

Unique values in pH: 89 of 1599.    Ratio: 0.05565978736710444

Unique values in sulphates: 96 of 1599.    Ratio: 0.0600375234521576

Unique values in alcohol: 65 of 1599.    Ratio: 0.04065040650406504

Unique values in quality: 6 of 1599.    Ratio: 0.00375234521575985



### 2. Test and train data split

In [6]:
# Will use 80% of the data for training and 20% for testing
train_set = red_wine_df.sample(frac=0.8, random_state=7)
test_set = red_wine_df.drop(train_set.index)

# Will use quality as the target variable
train_set_x = train_set.drop(columns="quality")
train_set_y = train_set["quality"]

test_set_x = test_set.drop(columns="quality")
test_set_y = test_set["quality"]

### 3. Scaling of data

In [7]:
# Use MinMaxScaler to scale the data
scaler_model = MinMaxScaler()

# Fit the scaler to the training data and transform the training and testing data
scaled_train_set_x = scaler_model.fit_transform(train_set_x)
scaled_test_set_x = scaler_model.transform(test_set_x)

# Make the scaled data into a DataFrame again for easier manipulation
scaled_train_set_x = pd.DataFrame(scaled_train_set_x, columns=red_wine_df.columns[:-1])
scaled_test_set_x = pd.DataFrame(scaled_test_set_x, columns=red_wine_df.columns[:-1])

# Use describe to check if the scaling worked
scaled_train_set_x.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
count,1279.0,1279.0,1279.0,1279.0,1279.0,1279.0,1279.0,1279.0,1279.0,1279.0,1279.0
mean,0.326567,0.279898,0.272627,0.110893,0.125953,0.209632,0.143738,0.491536,0.446498,0.195961,0.31029
std,0.155974,0.1236,0.195936,0.09511,0.079328,0.147116,0.117914,0.139932,0.1205,0.099572,0.163738
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.214286,0.184932,0.095,0.068493,0.096828,0.084507,0.056537,0.406021,0.366142,0.131737,0.169231
50%,0.285714,0.273973,0.26,0.089041,0.111853,0.183099,0.113074,0.488253,0.440945,0.173653,0.261538
75%,0.410714,0.356164,0.43,0.116438,0.130217,0.288732,0.201413,0.571953,0.519685,0.239521,0.415385
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


### 4. Validation using repeated k-Fold cross-validation

In [8]:
# Create k fold cross validation object
k_fold = RepeatedKFold(n_splits=3, n_repeats=10)

# Will test random forest and suport vector machine
forest_model = RandomForestClassifier(random_state=7)
svm_model = SVC(random_state=7)

# Will use accuracy as the metric is non-binary
forest_scores = cross_val_score(forest_model, scaled_train_set_x, train_set_y, cv=k_fold, scoring="accuracy")
svm_scores = cross_val_score(svm_model, scaled_train_set_x, train_set_y, cv=k_fold, scoring="accuracy")

print(f"Random Forest, mean: {forest_scores.mean()}, std: {forest_scores.std()}")
print(f"SVM: mean, {svm_scores.mean()}, std: {svm_scores.std()}")

Random Forest, mean: 0.6496488951926496, std: 0.020975241465709015
SVM: mean, 0.5946119338984727, std: 0.018643579096376695


### 5. Build the final model

In [34]:
# Will use random forest since it has higher mean accuracy and comparable standard deviation compared to SVM in previous step
forest_model.fit(train_set_x, train_set_y)

### 6. Test model performance

In [35]:
# Use test set to evaluate the model
score = forest_model.score(scaled_test_set_x, test_set_y)

# Print the score of the model
print(f"Model score: {score}")

Model score: 0.478125


### 7. Balance scaled train set using imbalanced learn

In [20]:
# Balance the scaled data using imblearn
oversample_model = imb.over_sampling.SMOTE(sampling_strategy='auto', random_state=7)
scaled_train_set_x_balanced, train_set_y_balanced = oversample_model.fit_resample(scaled_train_set_x, train_set_y)

### 8. Perform steps 4 and 5 using balanced scaled train data

In [21]:
# Create k fold cross validation object
k_fold = RepeatedKFold(n_splits=3, n_repeats=10)

# Will test random forest and suport vector machine
forest_model_2 = RandomForestClassifier(random_state=7)
svm_model_2 = SVC(random_state=7)

# Will use accuracy as the metric is non-binary
forest_scores = cross_val_score(forest_model_2, scaled_train_set_x_balanced, train_set_y_balanced, cv=k_fold, scoring="accuracy")
svm_scores = cross_val_score(svm_model_2, scaled_train_set_x_balanced, train_set_y_balanced, cv=k_fold, scoring="accuracy")

print(f"Random Forest, mean: {forest_scores.mean()}, std: {forest_scores.std()}")
print(f"SVM: mean, {svm_scores.mean()}, std: {svm_scores.std()}")


# Build model with balanced data
forest_model_2.fit(scaled_train_set_x_balanced, train_set_y_balanced)

Random Forest, mean: 0.8563314711359403, std: 0.009170731342453437
SVM: mean, 0.708255741775295, std: 0.011358725680439921


### 9. Test balanced model performance

In [33]:
# Use test set to evaluate the balanced model
score = forest_model_2.score(test_set_x, test_set_y)

# Print score
print(f"Model score: {score}")

Model score: 0.36875


### 10. Conclusion

However, SMOTE can sometimes lead to worse performance in some cases because:
It may generate synthetic examples that do not reflect the true distribution of the data, especially when there is a complex or noisy class distribution.
The performance drop could be due to the fact that the classifier might overfit on the generated data and fail to generalize well on the test data.
