In [29]:
import pandas as pd 
import numpy as np
import warnings
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, confusion_matrix, roc_auc_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('/kaggle/input/wine-quality-dataset/winequality-dataset_updated.csv')
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.3,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.2,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [3]:
df.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,1999.0,1999.0,1999.0,1999.0,1999.0,1999.0,1999.0,1999.0,1999.0,1999.0,1999.0,1999.0
mean,8.670335,0.541773,0.246668,3.69909,0.075858,20.191096,52.617809,0.996477,3.29014,0.949465,10.671161,5.637819
std,2.240023,0.180381,0.181348,3.290201,0.048373,15.642224,37.051121,0.00211,0.274297,0.780523,1.369932,1.255574
min,4.6,0.12,0.0,0.9,0.01,1.0,6.0,0.99007,2.34,0.33,8.4,2.0
25%,7.1,0.4,0.11,2.0,0.056,9.0,24.0,0.995265,3.18,0.56,9.5,5.0
50%,8.0,0.53,0.2,2.3,0.075,16.0,42.0,0.9966,3.3,0.65,10.4,6.0
75%,9.9,0.66,0.385,3.46,0.086,27.0,73.0,0.9978,3.42,0.84,11.4,6.0
max,15.9,1.58,1.0,15.99,0.611,72.0,289.0,1.00369,4.16,3.99,15.0,9.0


In [4]:
df.nunique()

fixed acidity            96
volatile acidity        144
citric acid              80
residual sugar          424
chlorides               180
free sulfur dioxide      75
total sulfur dioxide    154
density                 605
pH                      172
sulphates               271
alcohol                  79
quality                   8
dtype: int64

In [5]:
print(df.isnull().values.any())
print(df.isna().sum())

False
fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64


In [67]:
quality_unique = df['quality'].unique()
print(sorted(quality_unique))

[2, 3, 4, 5, 6, 7, 8, 9]


In [7]:
X, y = df.drop(columns='quality', axis=1), df['quality']

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [9]:
df.select_dtypes(include=['object']).sum()

Series([], dtype: float64)

## Model Building 

In [20]:
def compute_predictions(y_true, y_pred, y_proba):
    precision = precision_score(y_true, y_pred, average='weighted')
    recall = recall_score(y_true, y_pred, average='weighted')
    f1score = f1_score(y_true, y_pred, average='weighted')
    acc = accuracy_score(y_true, y_pred)     
    roc_auc = roc_auc_score(y_true, y_proba, multi_class='ovr')
    
    return precision, recall, f1score, acc, roc_auc

### Logistic Regression

In [80]:
pipeline = Pipeline([
    ('scaler', StandardScaler()), 
    ('model', LogisticRegression())  
])
pipeline.fit(X_train, y_train)
y_proba = pipeline.predict_proba(X_test)
print(y_proba[0], y_test.iloc[0])

[0.0011175  0.00261432 0.01605915 0.38983117 0.46933545 0.11319504
 0.00591245 0.00193491] 5


### Random Forest

In [89]:
# le = LabelEncoder()
# le.fit(y_train)

# y_train_enc = le.transform(y_train)
# y_test_enc = le.transform(y_test)

pipeline = Pipeline([
    ('scaler', StandardScaler()), 
    ('model', RandomForestClassifier())
])
pipeline.fit(X_train, y_train)
preds = pipeline.predict(X_test)
proba_scores = pipeline.predict_proba(X_test)

In [90]:
precision, recall, f1score, acc, roc_auc = compute_predictions(y_test, preds, proba_scores)
print(precision, recall, f1score, acc, roc_auc)

0.5024400430885722 0.535 0.5145914769614209 0.535 0.8294296540034414


In [22]:
confusion = confusion_matrix(y_test, preds)
confusion

array([[  0,   2,   1,   2,   1,   7,   0,   1],
       [  0,   1,   0,   6,   1,   6,   0,   2],
       [  2,   1,   0,  13,   4,   7,   2,   3],
       [  1,   1,   1, 151,  57,  11,   0,   1],
       [  1,   3,   1,  53, 133,  17,   2,   0],
       [  2,   1,   2,   3,  32,  25,   2,   3],
       [  1,   0,   0,   1,   4,   7,   1,   3],
       [  1,   2,   0,   3,   0,   7,   1,   4]])

### Gradient Boosting Classifier

In [26]:
pipeline = Pipeline([
    ('scaler', StandardScaler()), 
    ('model', GradientBoostingClassifier(n_estimators=100))
])

pipeline.fit(X_train, y_train)
preds = pipeline.predict(X_test)
proba_scores = pipeline.predict_proba(X_test)

In [27]:
precision, recall, f1score, acc, roc_auc = compute_predictions(y_test, preds, proba_scores)
print(precision, recall, f1score, acc, roc_auc)

0.5065057618150839 0.5233333333333333 0.513046347491549 0.5233333333333333 0.8111180238135981


### XGBoosting 

In [36]:
from xgboost import XGBClassifier

le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)
y_test_enc = le.transform(y_test)

pipeline = Pipeline([
    ('scaler', StandardScaler()), 
    ('model', XGBClassifier(n_estimators=100))
])

pipeline.fit(X_train, y_train_enc)
preds = pipeline.predict(X_test)
proba_scores = pipeline.predict_proba(X_test)

In [37]:
precision, recall, f1score, acc, roc_auc = compute_predictions(y_test, preds, proba_scores)
print(precision, recall, f1score, acc, roc_auc)

0.06387585897968479 0.028333333333333332 0.023765348747646152 0.028333333333333332 0.821224974631048


### NN 

In [132]:
import torch 
import torch.nn as nn 
import torch.optim as optim 
import torch.nn.functional as F
from torch.utils.data import Dataset, TensorDataset
from torch.nn.utils import clip_grad_norm_

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)
y_test_enc = le.transform(y_test)

X_train_arr, X_test_arr, y_train_arr, y_test_arr = np.array(X_train_scaled), np.array(X_test_scaled), np.array(y_train_enc), np.array(y_test_enc)

X_train_tensor = torch.tensor(X_train_arr, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test_arr, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train_arr, dtype=torch.long)
y_test_tensor = torch.tensor(y_test_arr, dtype=torch.long)

dataset = TensorDataset(X_train_tensor, y_train_tensor)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)


class NN(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(11, 128)
        self.batchnorm1 = nn.BatchNorm1d(128)
        self.dropout1 = nn.Dropout(p=0.2)
        self.fc2 = nn.Linear(128, 64)
        self.batchnorm2 = nn.BatchNorm1d(64)
        self.fc3 = nn.Linear(64, 8)

    def forward(self, x):
        x = self.fc1(x)
        x = self.batchnorm1(x)
        x = F.relu(x)
        x = self.dropout1(x)
        x = self.fc2(x)
        x = self.batchnorm2(x)
        x = F.relu(x)
        x = self.fc3(x)
        return x 
    
model = NN()
floss = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=0.001, weight_decay=0)
num_epochs = 20 

for epoch in range(num_epochs):
    model.train()
    for x_batch, y_batch in dataloader:
        outputs = model(x_batch)
        loss = floss(outputs, y_batch)
        loss.backward()
        clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
    print(f'Epoch[{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}')

model.eval()
with torch.no_grad():
    test_outputs = model(X_test_tensor)
    output_loss = floss(test_outputs, y_test_tensor)
    print(output_loss.item())
    _, predicted = torch.max(test_outputs, 1)
    accuracy = (predicted == y_test_tensor).float().mean()
    print(f'Test Accuracy: {accuracy:.4f}')
    
    
    #bad accuracy. need to tweak hyperparams. 

Epoch[1/20], Loss: 1.4228
Epoch[2/20], Loss: 1.3640
Epoch[3/20], Loss: 1.0042
Epoch[4/20], Loss: 1.3438
Epoch[5/20], Loss: 1.2431
Epoch[6/20], Loss: 1.3182
Epoch[7/20], Loss: 1.1154
Epoch[8/20], Loss: 1.1283
Epoch[9/20], Loss: 1.1286
Epoch[10/20], Loss: 1.1580
Epoch[11/20], Loss: 1.1932
Epoch[12/20], Loss: 1.1108
Epoch[13/20], Loss: 0.8569
Epoch[14/20], Loss: 1.0591
Epoch[15/20], Loss: 1.0283
Epoch[16/20], Loss: 0.8475
Epoch[17/20], Loss: 0.8877
Epoch[18/20], Loss: 1.1530
Epoch[19/20], Loss: 1.1050
Epoch[20/20], Loss: 1.1535
1.264755129814148
Test Accuracy: 0.4783
