# Day 10: Capstone Project - Wine Quality Prediction


### Project Overview
We'll build a complete ML pipeline that predicts wine quality (Low, Medium, or High) from chemical properties, using a synthetic dataset generated in Step 1.

### The ML Pipeline:
1. Data Loading & Understanding
2. Exploratory Data Analysis (EDA)
3. Data Preprocessing
4. Model Training & Comparison
5. Hyperparameter Tuning
6. Final Evaluation & Insights

In [None]:
# Import all libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                             confusion_matrix, classification_report)

import warnings
# Suppress every warning category for the rest of the session so cell output stays uncluttered.
# NOTE(review): this also hides any warnings raised by the libraries used below.
warnings.filterwarnings('ignore')
# Plot theme. NOTE(review): the 'seaborn-v0_8-*' style names depend on the installed Matplotlib version — confirm.
plt.style.use('seaborn-v0_8-whitegrid')
# Seed NumPy's global random generator so the randomly generated data is reproducible.
np.random.seed(42)

# Notebook banner
print(" CAPSTONE PROJECT: WINE QUALITY PREDICTION")
print("="*50)

---
## Step 1: Data Loading & Understanding

In [None]:
# Build a synthetic wine-quality dataset (nothing is loaded from disk)
np.random.seed(42)
n = 1500


def _clipped_normal(mean, std, low, high):
    """Draw ``n`` normally distributed samples and clamp them to [low, high]."""
    return np.clip(np.random.normal(mean, std, n), low, high)


# Chemical features. The draw order matters: it fixes the random stream,
# so the statements below must stay in this sequence.
fixed_acidity = _clipped_normal(8.0, 1.5, 4, 15)
volatile_acidity = _clipped_normal(0.5, 0.2, 0.1, 1.5)
citric_acid = _clipped_normal(0.3, 0.15, 0, 1)
residual_sugar = np.clip(np.random.exponential(2.5, n), 0.5, 15)
chlorides = _clipped_normal(0.08, 0.03, 0.01, 0.3)
free_sulfur = _clipped_normal(15, 10, 1, 70)
# Total sulfur is the free part plus a clipped, strictly positive extra amount
total_sulfur = free_sulfur + _clipped_normal(30, 20, 5, 200)
density = _clipped_normal(0.996, 0.002, 0.99, 1.01)
pH = _clipped_normal(3.3, 0.2, 2.8, 4.0)
sulphates = _clipped_normal(0.6, 0.2, 0.2, 1.5)
alcohol = _clipped_normal(10.5, 1.5, 8, 15)

# Latent quality score: a weighted mix of five features plus Gaussian noise.
# Terms are accumulated one at a time, in a fixed order, to keep the
# floating-point result stable.
quality_score = -volatile_acidity * 2
quality_score = quality_score + citric_acid * 1.5
quality_score = quality_score + alcohol * 0.3
quality_score = quality_score + sulphates * 1
quality_score = quality_score - chlorides * 5
quality_score = quality_score + np.random.normal(0, 0.5, n)

# Split the score range into three equal-width bins -> Low / Medium / High
quality = pd.cut(quality_score, bins=3, labels=['Low', 'Medium', 'High'])

# (column name, values, decimals kept) in final column order
_feature_spec = [
    ('fixed_acidity', fixed_acidity, 2),
    ('volatile_acidity', volatile_acidity, 3),
    ('citric_acid', citric_acid, 3),
    ('residual_sugar', residual_sugar, 2),
    ('chlorides', chlorides, 4),
    ('free_sulfur_dioxide', free_sulfur, 1),
    ('total_sulfur_dioxide', total_sulfur, 1),
    ('density', density, 5),
    ('pH', pH, 2),
    ('sulphates', sulphates, 3),
    ('alcohol', alcohol, 2),
]
wine = pd.DataFrame({
    column: values.round(decimals)
    for column, values, decimals in _feature_spec
})
wine['quality'] = quality  # target goes last

print(f"Dataset Shape: {wine.shape}")
print("\nFirst 5 rows:")
wine.head()

In [None]:
# Dataset overview: size, class balance, missing values and column dtypes
n_samples = len(wine)
n_features = wine.shape[1] - 1  # every column except the 'quality' target
class_counts = wine['quality'].value_counts()
n_missing = wine.isnull().sum().sum()

print(" DATASET OVERVIEW")
print("="*50)
print(f"\nSamples: {n_samples}")
print(f"Features: {n_features}")
print("\nTarget Distribution:")
print(class_counts)
print(f"\nMissing Values: {n_missing}")
print(f"\nData Types:\n{wine.dtypes}")

In [None]:
# Statistical Summary: describe() statistics for the dataset, rounded to 3 decimals.
# Kept as the last expression so the resulting table is the cell's value.
wine.describe().round(3)

---
## Step 2: Exploratory Data Analysis (EDA)

In [None]:
# Target Distribution
# Count each class once and put the counts in a fixed Low -> Medium -> High
# order. value_counts() on its own sorts by frequency, which would pair the
# red / orange / green palette with whichever classes happen to be largest
# instead of with Low / Medium / High.
quality_order = ['Low', 'Medium', 'High']
quality_colors = ['#e74c3c', '#f39c12', '#2ecc71']  # Low, Medium, High
quality_counts = wine['quality'].value_counts().reindex(quality_order)

fig, axes = plt.subplots(1, 2, figsize=(12, 4))

# Count plot
quality_counts.plot(kind='bar', ax=axes[0], color=quality_colors)
axes[0].set_title('Wine Quality Distribution', fontweight='bold')
axes[0].set_xlabel('Quality')
axes[0].set_ylabel('Count')
axes[0].tick_params(axis='x', rotation=0)

# Pie chart (same ordering, so slice colours match the bars)
quality_counts.plot(kind='pie', ax=axes[1], autopct='%1.1f%%',
                    colors=quality_colors)
axes[1].set_title('Quality Percentage', fontweight='bold')
axes[1].set_ylabel('')  # drop the default series-name label next to the pie

plt.tight_layout()
plt.show()

In [None]:
# Histogram of every feature column (everything except the trailing 'quality' target)
features = wine.columns[:-1]
fig, axes = plt.subplots(3, 4, figsize=(16, 10))

# zip stops at the shorter sequence, so only as many panels as features are drawn
for feature, panel in zip(features, axes.ravel()):
    wine[feature].hist(ax=panel, bins=30, color='#3498db', edgecolor='white', alpha=0.7)
    panel.set_title(feature, fontsize=10)
    panel.set_xlabel('')

# The bottom-right panel of the 3x4 grid is left over; hide it
axes[-1, -1].set_visible(False)

plt.suptitle('Feature Distributions', fontsize=14, fontweight='bold', y=1.02)
plt.tight_layout()
plt.show()

In [None]:
# Correlation Heatmap
plt.figure(figsize=(12, 8))

# Encode quality for correlation.
# The codes must be ordinal (Low=0, Medium=1, High=2) for a linear correlation
# with the features to mean anything. LabelEncoder sorts labels alphabetically
# (High=0, Low=1, Medium=2), which scrambles that order, so an ordered
# categorical is used instead.
quality_levels = ['Low', 'Medium', 'High']
wine_corr = wine.copy()
wine_corr['quality_encoded'] = pd.Categorical(
    wine['quality'], categories=quality_levels, ordered=True
).codes
corr = wine_corr.drop('quality', axis=1).corr()

# Mask the upper triangle (including the diagonal) so each pair is shown once
mask = np.triu(np.ones_like(corr, dtype=bool))
sns.heatmap(corr, mask=mask, annot=True, fmt='.2f', cmap='RdYlBu_r',
            center=0, square=True, linewidths=0.5)
plt.title('Feature Correlation Matrix', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

In [None]:
# Box plots of six selected features, grouped by quality class
key_features = ['alcohol', 'volatile_acidity', 'sulphates', 'citric_acid', 'chlorides', 'pH']
fig, axes = plt.subplots(2, 3, figsize=(14, 8))

for feature, panel in zip(key_features, axes.ravel()):
    wine.boxplot(column=feature, by='quality', ax=panel)
    # Set the title and x label after boxplot() has drawn the panel
    panel.set_title(feature, fontweight='bold')
    panel.set_xlabel('Quality')

plt.suptitle('Key Features by Quality Level', fontsize=14, fontweight='bold', y=1.02)
plt.tight_layout()
plt.show()

---
## Step 3: Data Preprocessing

In [None]:
# Prepare features and target
X = wine.drop('quality', axis=1)

# LabelEncoder numbers the classes in sorted (alphabetical) label order:
# High=0, Low=1, Medium=2. The fitted encoder is kept so the mapping can be
# inspected and codes can be turned back into names with
# label_encoder.inverse_transform(...).
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(wine['quality'])
class_mapping = {str(label): code for code, label in enumerate(label_encoder.classes_)}

print(f"Features shape: {X.shape}")
print(f"Target classes: {np.unique(y)}")
print(f"Class mapping: {class_mapping}")
print(f"Class distribution: {np.bincount(y)}")

In [None]:
# Hold out 20% of the rows for testing; stratify on y so both parts keep the class proportions
split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts

n_train, n_test = len(X_train), len(X_test)
print(f"Training set: {n_train} samples")
print(f"Test set: {n_test} samples")

In [None]:
# Standardise features: learn mean/std on the training rows only, then apply
# the same transformation to both splits (the test set is never used for fitting)
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Sanity check: overall mean and std of the scaled training matrix
mean_before = X_train.mean().mean()
mean_after = X_train_scaled.mean()
std_after = X_train_scaled.std()

print("Features scaled!")
print(f"Mean (before): {mean_before:.3f}")
print(f"Mean (after): {mean_after:.6f}")
print(f"Std (after): {std_after:.3f}")

---
## Step 4: Model Training & Comparison

In [None]:
# Candidate classifiers, keyed by display name.
# Insertion order is the order in which they are trained and reported.
models = {}
models['Logistic Regression'] = LogisticRegression(max_iter=1000, random_state=42)
models['Decision Tree'] = DecisionTreeClassifier(random_state=42)
models['Random Forest'] = RandomForestClassifier(n_estimators=100, random_state=42)
models['Gradient Boosting'] = GradientBoostingClassifier(random_state=42)
models['SVM'] = SVC(random_state=42)

print(" MODEL TRAINING & CROSS-VALIDATION")
print("="*55)

In [None]:
# Cross-validate, fit and score every candidate model
results = []

# These two models are fed the standardised features; the others get the raw ones
uses_scaled_features = ('Logistic Regression', 'SVM')
weighted_scorers = (('Precision', precision_score),
                    ('Recall', recall_score),
                    ('F1', f1_score))

for name, model in models.items():
    scaled = name in uses_scaled_features
    X_tr = X_train_scaled if scaled else X_train
    X_te = X_test_scaled if scaled else X_test

    # 5-fold cross-validated accuracy on the training split
    cv_scores = cross_val_score(model, X_tr, y_train, cv=5, scoring='accuracy')
    cv_mean, cv_std = cv_scores.mean(), cv_scores.std()

    # Fit on the full training split, then predict the held-out test split
    y_pred = model.fit(X_tr, y_train).predict(X_te)

    # Test-set metrics (precision / recall / F1 are class-weighted averages)
    acc = accuracy_score(y_test, y_pred)
    weighted = {label: scorer(y_test, y_pred, average='weighted')
                for label, scorer in weighted_scorers}

    results.append({
        'Model': name,
        'CV_Mean': cv_mean,
        'CV_Std': cv_std,
        'Test_Accuracy': acc,
        **weighted,
    })

    print(f"\n{name}:")
    print(f"  CV Score: {cv_mean:.4f} (+/- {cv_std*2:.4f})")
    print(f"  Test Accuracy: {acc:.4f}")

# One row per model, best test accuracy first
results_df = pd.DataFrame(results).sort_values('Test_Accuracy', ascending=False)
print("\n" + "="*55)

In [None]:
# Compact table of the headline metrics, one row per model
summary_columns = ['Model', 'CV_Mean', 'Test_Accuracy', 'F1']
print(" MODEL COMPARISON")
print(results_df.loc[:, summary_columns].to_string(index=False))

In [None]:
# Two-panel figure: test accuracy per model (left) and all test metrics (right)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax_acc, ax_all = axes

# Left: horizontal accuracy bars, each annotated with its value
test_accuracy = results_df['Test_Accuracy']
colors = plt.cm.viridis(np.linspace(0.3, 0.9, len(results_df)))
bars = ax_acc.barh(results_df['Model'], test_accuracy, color=colors)
ax_acc.set_xlabel('Accuracy')
ax_acc.set_title('Model Accuracy Comparison', fontweight='bold')
ax_acc.set_xlim(0.5, 1.0)
for bar, score in zip(bars, test_accuracy):
    label_x = bar.get_width() + 0.01          # just past the end of the bar
    label_y = bar.get_y() + bar.get_height()/2  # vertical centre of the bar
    ax_acc.text(label_x, label_y, f'{score:.3f}', va='center')

# Right: grouped bars for accuracy / precision / recall / F1
metric_columns = ['Test_Accuracy', 'Precision', 'Recall', 'F1']
metrics_df = results_df.set_index('Model')[metric_columns]
metrics_df.plot.bar(ax=ax_all, width=0.8)
ax_all.set_title('All Metrics by Model', fontweight='bold')
ax_all.set_ylabel('Score')
ax_all.set_ylim(0.5, 1.0)
ax_all.legend(loc='lower right')
ax_all.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

---
## Step 5: Hyperparameter Tuning (Random Forest)

In [None]:
# Grid-search Random Forest hyperparameters.
# Random Forest is tuned here regardless of which model ranked first above.
print(" HYPERPARAMETER TUNING: Random Forest")
print("="*50)

# 3 * 4 * 3 * 3 = 108 parameter combinations, each evaluated with 5-fold CV
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    cv=5,
    scoring='f1_weighted',  # model selection is driven by weighted F1
    n_jobs=-1,
    verbose=1,
)

# Trees are fitted on the unscaled features, as in the comparison step
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
best_cv_score = grid_search.best_score_
print(f"\nBest Parameters: {best_params}")
print(f"Best CV Score: {best_cv_score:.4f}")

In [None]:
# Default vs tuned Random Forest on the held-out test set
default_rf = models['Random Forest']      # already fitted in the comparison step
tuned_rf = grid_search.best_estimator_    # best estimator found by the grid search

y_pred_default = default_rf.predict(X_test)
y_pred_tuned = tuned_rf.predict(X_test)


def _weighted_f1(y_true, y_hat):
    """Class-weighted F1, matching the metric reported for the other models."""
    return f1_score(y_true, y_hat, average='weighted')


print(" DEFAULT vs TUNED RANDOM FOREST")
print("="*50)
print(f"\n{'Metric':<15} {'Default':>12} {'Tuned':>12} {'Improvement':>12}")
print("-"*52)

comparison_metrics = {'Accuracy': accuracy_score, 'F1 Score': _weighted_f1}
for metric_name, metric_fn in comparison_metrics.items():
    default_score = metric_fn(y_test, y_pred_default)
    tuned_score = metric_fn(y_test, y_pred_tuned)
    improvement = tuned_score - default_score  # positive means tuning helped
    print(f"{metric_name:<15} {default_score:>12.4f} {tuned_score:>12.4f} {improvement:>+12.4f}")

---
## Step 6: Final Evaluation & Insights

In [None]:
# Final evaluation of the tuned Random Forest on the test set
best_model = tuned_rf
y_pred_final = best_model.predict(X_test)

# Display names for the encoded classes 0, 1, 2, in that order
class_names = ['High', 'Low', 'Medium']

fig, axes = plt.subplots(1, 2, figsize=(12, 5))
ax_cm, ax_imp = axes

# Left: confusion matrix (rows = actual class, columns = predicted class)
cm = confusion_matrix(y_test, y_pred_final)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax_cm,
            xticklabels=class_names, yticklabels=class_names)
ax_cm.set_title('Confusion Matrix (Tuned RF)', fontweight='bold')
ax_cm.set_xlabel('Predicted')
ax_cm.set_ylabel('Actual')

# Right: feature importances, sorted ascending so the largest bar is drawn at the top
importance = pd.DataFrame({
    'Feature': X.columns,
    'Importance': best_model.feature_importances_,
})
importance = importance.sort_values('Importance', ascending=True)

colors = plt.cm.RdYlGn(np.linspace(0.2, 0.8, len(importance)))
ax_imp.barh(importance['Feature'], importance['Importance'], color=colors)
ax_imp.set_title('Feature Importance', fontweight='bold')
ax_imp.set_xlabel('Importance')

plt.tight_layout()
plt.show()

In [None]:
# Per-class precision / recall / F1 for the final model's test predictions
final_report = classification_report(
    y_test, y_pred_final, target_names=['High', 'Low', 'Medium']
)
print(" FINAL CLASSIFICATION REPORT")
print("="*50)
print(final_report)

In [None]:
# Business Insights
# Headline numbers for the final model on the held-out test set
final_accuracy = accuracy_score(y_test, y_pred_final)
final_f1 = f1_score(y_test, y_pred_final, average='weighted')

# 'importance' is sorted ascending, so read it backwards to get the five
# strongest features, most important first
top_features = importance['Feature'].iloc[::-1].head(5).tolist()

print("\n" + "="*60)
print(" KEY INSIGHTS & RECOMMENDATIONS")
print("="*60)

# NOTE: the recommendations below are fixed text; they are not derived from
# the computed feature importances.
print(f"""
 MODEL PERFORMANCE:
   Best Model: Tuned Random Forest
   Accuracy: {final_accuracy:.1%}
   F1 Score: {final_f1:.1%}

 TOP 5 QUALITY PREDICTORS:
   1. {top_features[0]}
   2. {top_features[1]}
   3. {top_features[2]}
   4. {top_features[3]}
   5. {top_features[4]}

 BUSINESS RECOMMENDATIONS:
   - Focus quality control on volatile acidity levels
   - Higher alcohol content correlates with better quality
   - Sulphate levels significantly impact wine quality
   - Use this model for automated quality assessment
""")

In [None]:
# Predict on new wine samples
print(" PREDICT NEW WINE SAMPLES")
print("="*50)

# Three hand-written samples; the columns are listed in the same order as the
# training features.
new_wines = pd.DataFrame({
    'fixed_acidity': [7.5, 8.5, 6.8],
    'volatile_acidity': [0.3, 0.7, 0.4],
    'citric_acid': [0.4, 0.2, 0.35],
    'residual_sugar': [2.0, 3.5, 1.8],
    'chlorides': [0.07, 0.09, 0.065],
    'free_sulfur_dioxide': [15, 12, 18],
    'total_sulfur_dioxide': [45, 60, 40],
    'density': [0.995, 0.997, 0.994],
    'pH': [3.3, 3.4, 3.25],
    'sulphates': [0.7, 0.5, 0.8],
    'alcohol': [11.5, 9.5, 12.0]
})

predictions = best_model.predict(new_wines)
# Encoded class -> display name (LabelEncoder sorts labels alphabetically)
quality_map = {0: 'High', 1: 'Low', 2: 'Medium'}

for i, pred in enumerate(predictions):
    # A dedicated name is used here so the loop does not overwrite the
    # notebook-level `quality` categorical created in Step 1.
    predicted_label = quality_map[pred]
    print(f"\nWine {i+1}: {predicted_label} Quality")
    print(f"  Alcohol: {new_wines.iloc[i]['alcohol']}%")
    print(f"  Volatile Acidity: {new_wines.iloc[i]['volatile_acidity']}")

In [None]:
# Closing banner and course recap (prints only)
banner = "="*70

print("\n" + banner)
print(" CONGRATULATIONS! YOU'VE COMPLETED THE 10-DAY ML JOURNEY!")
print(banner)

recap = """
 WHAT YOU'VE LEARNED:

 Day 1-2: Python Fundamentals & Data Structures
 Day 3: NumPy - Numerical Computing
 Day 4: Pandas - Data Manipulation
 Day 5: Data Visualization (Matplotlib & Seaborn)
 Day 6: Linear Regression
 Day 7: Classification (Logistic, Trees, Random Forest)
 Day 8: Unsupervised Learning (K-Means, PCA)
 Day 9: Model Evaluation & Hyperparameter Tuning
 Day 10: Complete ML Pipeline (This Capstone!)

 YOUR ML TOOLKIT:
   - Data Loading & Cleaning (Pandas)
   - EDA & Visualization (Matplotlib, Seaborn)
   - Preprocessing (Scaling, Encoding)
   - Model Training (Scikit-Learn)
   - Evaluation (Metrics, Cross-Validation)
   - Optimization (GridSearchCV)

 NEXT STEPS:
   1. Practice with Kaggle competitions
   2. Learn Deep Learning (TensorFlow/PyTorch)
   3. Explore NLP and Computer Vision
   4. Build real-world projects
   5. Never stop learning!
"""
print(recap)

print(banner)
print(" Happy Machine Learning! ")
print(banner)