# ML Assignment — Wine Classification (Restructured)

> **Objective:** Classify wine quality into **low / medium / high** and compare models.

This notebook follows the same flow as the reference sample (data creation → EDA → binning → split → preprocessing → models → metrics/plots) but restructures the code: preprocessing lives in a `Pipeline` + `ColumnTransformer`, the models are fitted in a different order, and a few hyperparameters are adjusted.

## 1) Setup & Dataset Creation

In [None]:

# 1. Imports
import pandas as pd
import numpy as np

from urllib.request import urlopen
from io import StringIO

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Model & preprocessing
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Estimators & metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay

# Reproducibility
# Single seed passed as `random_state` to the train/test split and to both classifiers.
RANDOM_STATE = 24


In [None]:

# 2. Source locations of the red and white wine-quality CSVs on the UCI archive
red_url, white_url = (
    'https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv',
    'https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv',
)

def read_uci_csv(url: str, sep=';', timeout: float = 30.0):
    """Download a delimited text file and parse it into a DataFrame.

    Parameters
    ----------
    url : str
        Location of the file; anything ``urlopen`` accepts.
    sep : str, default ';'
        Field delimiter handed to ``pd.read_csv``.
    timeout : float, default 30.0
        Seconds to wait on a blocking network operation before giving up,
        so a stalled connection cannot hang the notebook indefinitely.

    Returns
    -------
    pandas.DataFrame
        The parsed table.

    Raises
    ------
    urllib.error.URLError
        If the resource cannot be fetched.
    UnicodeDecodeError
        If the payload is not valid UTF-8.
    """
    # The context manager closes the response even if reading or decoding fails.
    with urlopen(url, timeout=timeout) as response:
        data = response.read().decode('utf-8')
    return pd.read_csv(StringIO(data), sep=sep)

# Fetch each colour variant and tag every row with its origin
red_df = read_uci_csv(red_url).assign(wine_type='red')
white_df = read_uci_csv(white_url).assign(wine_type='white')

# Stack red above white under one fresh integer index
wine_df = pd.concat([red_df, white_df], ignore_index=True)

# Keep a local copy of the combined table next to the notebook
wine_df.to_csv('winequality.csv', index=False)
print('Dataset ready:', wine_df.shape)
wine_df.head()


## 2) Quick EDA

In [None]:

# Number of missing entries in each column
wine_df.isna().sum()


In [None]:

# Summary statistics
# For a mixed-dtype frame describe() reports numeric columns only, so the
# string column `wine_type` does not appear in this table.
wine_df.describe()


In [None]:

# Bar chart of how many wines received each integer quality score
fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(data=wine_df, x='quality', color='#4C72B0', ax=ax)
ax.set_title('Wine Quality Distribution')
plt.show()


## 3) Target Engineering (low / medium / high)

In [None]:

# Bucket the integer score into three bands: low (<= 5), medium (== 6), high (>= 7)
quality = wine_df['quality']
band_rules = {
    'low': quality <= 5,
    'medium': quality == 6,
    'high': quality >= 7,
}
# Anything matching none of the masks falls back to 'medium'
wine_df['quality_category'] = np.select(
    list(band_rules.values()),
    list(band_rules.keys()),
    default='medium',
)

# The raw score is no longer needed once the band label exists
wine_df = wine_df.drop('quality', axis=1)
wine_df.head()


## 4) Split & Preprocessing Pipeline

In [None]:

# Separate the label column from the predictors
y = wine_df['quality_category']
X = wine_df.drop(columns=['quality_category'])

# Column groups: the one categorical predictor, and every float64/int64 predictor
cat_cols = ['wine_type']
num_cols = list(X.select_dtypes(include=['float64', 'int64']).columns)

# Hold out 20% for testing, keeping class proportions equal in both parts
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    stratify=y,
    random_state=RANDOM_STATE,
)

# Standardise numeric columns and one-hot encode the categorical one;
# columns in neither group are dropped
preprocessor = ColumnTransformer(
    [
        ('num', StandardScaler(), num_cols),
        ('cat', OneHotEncoder(drop='if_binary'), cat_cols),
    ],
    remainder='drop',
)


## 5) Model A — Decision Tree

In [None]:

# Size-constrained decision tree on top of the shared preprocessing step
dt_model = Pipeline([
    ('prep', preprocessor),
    ('clf', DecisionTreeClassifier(
        criterion='gini',
        max_depth=9,
        min_samples_leaf=6,
        min_samples_split=12,
        random_state=RANDOM_STATE,
    )),
])

# fit() returns the pipeline itself, so training and prediction can be chained
y_pred_dt = dt_model.fit(X_train, y_train).predict(X_test)

print('=' * 55, 'DECISION TREE — RESULTS', '=' * 55, sep='\n')
print(f'Accuracy: {accuracy_score(y_test, y_pred_dt):.4f}')
print(classification_report(y_test, y_pred_dt))

# Pin the label order so the matrix reads low → medium → high
cm_dt = confusion_matrix(y_test, y_pred_dt, labels=['low', 'medium', 'high'])
ConfusionMatrixDisplay(
    confusion_matrix=cm_dt,
    display_labels=['low', 'medium', 'high'],
).plot(cmap='Blues')
plt.title('Decision Tree — Confusion Matrix')
plt.show()


## 6) Model B — Multinomial Logistic Regression

In [None]:

# Multinomial logistic regression on top of the shared preprocessing step.
# `multi_class` is deliberately not passed: scikit-learn deprecated the argument
# in 1.5, and with the lbfgs solver a target with three classes is already
# fitted as a multinomial model, so the result is unchanged.
log_reg_model = Pipeline(steps=[
    ('prep', preprocessor),
    ('clf', LogisticRegression(
        solver='lbfgs',
        max_iter=1000,  # above the library default of 100 iterations
        random_state=RANDOM_STATE
    ))
])

log_reg_model.fit(X_train, y_train)
y_pred_lr = log_reg_model.predict(X_test)

print('='*55)
print('LOGISTIC REGRESSION — RESULTS')
print('='*55)
print(f'Accuracy: {accuracy_score(y_test, y_pred_lr):.4f}')
print(classification_report(y_test, y_pred_lr))

# Pin the label order so the matrix reads low → medium → high
cm_lr = confusion_matrix(y_test, y_pred_lr, labels=['low','medium','high'])
ConfusionMatrixDisplay(cm_lr, display_labels=['low','medium','high']).plot(cmap='Greens')
plt.title('Logistic Regression — Confusion Matrix')
plt.show()


## 7) Feature Signal (LR coefficients after transform)

In [None]:

# Inspect LR coefficients for the strongest signals per class
ohe = log_reg_model.named_steps['prep'].named_transformers_['cat']
ohe_features = ohe.get_feature_names_out(['wine_type'])
# Transformed columns follow the ColumnTransformer order: scaled numerics first,
# then the one-hot columns
feature_names = np.r_[num_cols, ohe_features]

lr_clf = log_reg_model.named_steps['clf']
coef = lr_clf.coef_
# Rows of coef_ follow lr_clf.classes_ (labels in sorted order: high, low, medium),
# not the low/medium/high display order, so each class's row is looked up by label.
row_for_class = {label: row for row, label in enumerate(lr_clf.classes_)}

for cls in ['low', 'medium', 'high']:
    weights = coef[row_for_class[cls]]
    # Largest absolute coefficients first, at most ten per class
    top_idx = np.argsort(np.abs(weights))[::-1][:10]
    print(f"\nTop signals for class '{cls}':")
    for j in top_idx:
        print(f"  {feature_names[j]:<25} {weights[j]: .4f}")


## 8) Side-by-side Accuracy

In [None]:

# Test-set accuracy of each fitted model
acc_dt = accuracy_score(y_test, y_pred_dt)
acc_lr = accuracy_score(y_test, y_pred_lr)

# One row per model, best accuracy on top
summary = pd.DataFrame(
    [
        ('Decision Tree', acc_dt),
        ('Logistic Regression', acc_lr),
    ],
    columns=['Model', 'Accuracy'],
)
summary = summary.sort_values(by='Accuracy', ascending=False)
summary



---
**Requirements**: `pandas`, `numpy`, `matplotlib`, `seaborn`, `scikit-learn` (latest).

You can run this notebook as-is, provided it has network access: it downloads the UCI red/white wine CSVs at runtime, builds the combined dataset, and saves a copy as `winequality.csv` in the working directory.
