# Wine Quality Dataset - Data Preprocessing

This notebook handles data cleaning, feature engineering, and preprocessing for the wine quality prediction system.

In [None]:
# Import required libraries
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
warnings.filterwarnings('ignore')

# Import custom processor
from data_processor import WineQualityProcessor

# Notebook banner: title followed by a 50-character rule
banner_title = "Wine Quality Dataset - Data Preprocessing"
print(banner_title, "=" * 50, sep="\n")

## 1. Load Raw Data

In [None]:
# Build the project's processor; random_state=42 is forwarded to it
# (presumably used as its RNG seed -- confirm in data_processor).
processor = WineQualityProcessor(random_state=42)

# Fetch the untouched dataset through the processor
raw_data = processor.load_data()

# Report dimensions and column names, then preview the top of the frame
data_shape = raw_data.shape
column_names = list(raw_data.columns)
print(f"Raw data shape: {data_shape}")
print(f"Features: {column_names}")
print("\nFirst 5 rows:")
display(raw_data.head())

## 2. Data Quality Assessment

In [None]:
# Check data quality
print("DATA QUALITY ASSESSMENT")
print("=" * 30)

# Missing values: show only the columns that actually contain nulls
print("\nMissing Values:")
missing_values = raw_data.isnull().sum()
columns_with_missing = missing_values[missing_values > 0]
if columns_with_missing.empty:
    print("No missing values found ✓")
else:
    print(columns_with_missing)

# Duplicates: count rows that exactly repeat an earlier row
duplicates = raw_data.duplicated().sum()
print(f"\nDuplicate rows: {duplicates}")

# Data types
print("\nData types:")
print(raw_data.dtypes)

# Basic statistics
print("\nBasic statistics:")
display(raw_data.describe())

## 3. Outlier Detection and Analysis

In [None]:
# Function to detect outliers using IQR method
def detect_outliers(df, column, multiplier=1.5):
    """Count the values of ``df[column]`` that fall outside the IQR fences.

    The fences are ``Q1 - multiplier * IQR`` and ``Q3 + multiplier * IQR``,
    where Q1/Q3 are the 25th/75th percentiles of the column and
    ``IQR = Q3 - Q1``. Values exactly on a fence are not counted.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to inspect.
    column : str
        Name of a numeric column in ``df``.
    multiplier : float, default 1.5
        How many IQRs beyond the quartiles a value may lie before it is
        counted as an outlier. Must be non-negative.

    Returns
    -------
    tuple
        ``(outlier_count, lower_bound, upper_bound)`` where ``outlier_count``
        is a plain ``int``.

    Raises
    ------
    ValueError
        If ``multiplier`` is negative.
    """
    if multiplier < 0:
        raise ValueError("multiplier must be non-negative")

    values = df[column]
    Q1 = values.quantile(0.25)
    Q3 = values.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - multiplier * IQR
    upper_bound = Q3 + multiplier * IQR

    # Count straight from the boolean mask; NaN compares False on both sides,
    # so missing values are never counted as outliers.
    is_outlier = (values < lower_bound) | (values > upper_bound)
    return int(is_outlier.sum()), lower_bound, upper_bound

# Gather the numeric columns, leaving the 'quality' target out of the analysis
numeric_columns = raw_data.select_dtypes(include=[np.number]).columns
numerical_cols = [name for name in numeric_columns if name != 'quality']

print("OUTLIER ANALYSIS (IQR Method)")
print("=" * 30)

# One summary row per feature: count, share of rows, and the two IQR fences
total_rows = len(raw_data)
outlier_info = []
for col in numerical_cols:
    n_outliers, fence_low, fence_high = detect_outliers(raw_data, col)
    share = (n_outliers / total_rows) * 100
    summary_row = {
        'Feature': col,
        'Outliers': n_outliers,
        'Percentage': format(share, ".1f") + "%",
        'Lower_Bound': format(fence_low, ".3f"),
        'Upper_Bound': format(fence_high, ".3f"),
    }
    outlier_info.append(summary_row)

outlier_df = pd.DataFrame(outlier_info)
display(outlier_df)

In [None]:
# Visualize outliers: one box plot per numeric feature.
# The grid is sized from the number of features so that none are silently
# dropped (the earlier fixed 3x4 grid capped the display at 12 features) and
# so the cleanup below does not depend on a loop variable that is undefined
# when there are no numeric features.
n_plots = len(numerical_cols)
n_grid_cols = 4
n_grid_rows = max(1, int(np.ceil(n_plots / n_grid_cols)))

# squeeze=False always yields a 2-D axes array, so ravel() is safe for any grid
fig, axes = plt.subplots(
    n_grid_rows, n_grid_cols,
    figsize=(20, 5 * n_grid_rows),  # 5 inches per row; 3 rows -> (20, 15)
    squeeze=False,
)
axes = axes.ravel()

for ax, col in zip(axes, numerical_cols):
    raw_data.boxplot(column=col, ax=ax)
    ax.set_title(f'{col} - Outliers')
    ax.tick_params(axis='x', rotation=45)

# Remove empty subplots
for unused_ax in axes[n_plots:]:
    unused_ax.remove()

plt.tight_layout()
plt.show()

## 4. Data Cleaning and Preprocessing

In [None]:
# Clean the data (the actual cleaning rules live in processor.clean_data)
cleaned_data = processor.clean_data(raw_data)

rows_removed = len(raw_data) - len(cleaned_data)
print(f"Data shape after cleaning: {cleaned_data.shape}")
print(f"Rows removed: {rows_removed}")

# Show cleaning results
# NOTE(review): these lines are status messages only; nothing in this cell
# verifies them. They presumably mirror what clean_data does -- confirm.
print("\nData cleaning completed:")
print("✓ Missing values handled")
print("✓ Duplicates removed")
print("✓ Data types verified")

## 5. Feature Engineering

In [None]:
# Create additional features on a copy so cleaned_data stays untouched
featured_data = cleaned_data.copy()

# Small constant added to denominators to avoid division by zero
RATIO_EPSILON = 0.001

# Quality categories for analysis: [0, 4] -> low, (4, 6] -> medium, (6, 10] -> high
featured_data['quality_category'] = pd.cut(
    featured_data['quality'],
    bins=[0, 4, 6, 10],
    labels=['low', 'medium', 'high'],
    include_lowest=True
)

# Alcohol strength categories: [0, 10] -> low, (10, 12] -> medium, (12, 20] -> high
# (values outside 0-20 fall in no bin and become NaN)
featured_data['alcohol_strength'] = pd.cut(
    featured_data['alcohol'],
    bins=[0, 10, 12, 20],
    labels=['low', 'medium', 'high'],
    include_lowest=True
)

# Acidity ratio: fixed acidity relative to volatile acidity
featured_data['acidity_ratio'] = (
    featured_data['fixed acidity'] / (featured_data['volatile acidity'] + RATIO_EPSILON)
)

# Sulfur dioxide ratio: free SO2 as a share of total SO2
featured_data['sulfur_ratio'] = (
    featured_data['free sulfur dioxide'] / (featured_data['total sulfur dioxide'] + RATIO_EPSILON)
)

new_features = ['quality_category', 'alcohol_strength', 'acidity_ratio', 'sulfur_ratio']
n_original = len(cleaned_data.columns)
n_engineered = len(featured_data.columns)

print("Feature Engineering Results:")
print(f"Original features: {n_original}")
print(f"Total features after engineering: {n_engineered}")
print(f"New features created: {n_engineered - n_original}")

print("\nNew features:")
for feature in new_features:
    print(f"✓ {feature}")

display(featured_data[new_features].head())

## 6. Feature Selection and Encoding

In [None]:
# Separate features and target
target = featured_data['quality'].copy()
exclude_cols = ['quality', 'quality_category', 'alcohol_strength']  # Exclude categorical analysis features
features = featured_data.drop(columns=exclude_cols)

print(f"Features for modeling: {list(features.columns)}")
print("Target variable: quality")

# Handle any remaining categorical variables (e.g. wine_type, if present)
categorical_cols = features.select_dtypes(include=['object', 'category']).columns
print(f"\nCategorical features: {list(categorical_cols)}")

# Keep one fitted encoder per column so each mapping can be reused or
# inverted later; a single shared encoder would only remember the last column.
label_encoders = {}
for col in categorical_cols:
    label_encoder = LabelEncoder()
    features[col] = label_encoder.fit_transform(features[col])
    label_encoders[col] = label_encoder
    # LabelEncoder assigns codes by position in classes_, so enumerate()
    # reproduces the mapping with plain Python ints for a clean printout.
    mapping = {cls: code for code, cls in enumerate(label_encoder.classes_)}
    print(f"✓ Encoded {col}: {mapping}")

# Display feature statistics before scaling
print("\nFeature statistics before scaling:")
display(features.describe())

## 7. Train-Test Split

In [None]:
# Two-stage stratified split: hold out 40% first, then halve that holdout
# into validation and test, which gives a 60/20/20 partition.
X_train, X_temp, y_train, y_temp = train_test_split(
    features,
    target,
    test_size=0.4,
    random_state=42,
    stratify=target,
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=42,
    stratify=y_temp,
)

# Report each partition's size and its share of the full feature table
n_total = len(features)
print("Data Split Results:")
print(f"Training set: {len(X_train)} samples ({len(X_train)/n_total*100:.1f}%)")
print(f"Validation set: {len(X_val)} samples ({len(X_val)/n_total*100:.1f}%)")
print(f"Test set: {len(X_test)} samples ({len(X_test)/n_total*100:.1f}%)")
print(f"Total: {n_total} samples")

# Show the per-class counts in every partition to confirm stratification
print("\nTarget distribution:")
split_targets = (
    ("Training set:", y_train),
    ("\nValidation set:", y_val),
    ("\nTest set:", y_test),
)
for heading, y_split in split_targets:
    print(heading)
    print(y_split.value_counts().sort_index())

## 8. Feature Scaling

In [None]:
# Scale features
scaler = StandardScaler()


def _to_scaled_frame(frame, transform):
    """Apply ``transform`` to ``frame`` and re-wrap the array with the frame's columns and index."""
    return pd.DataFrame(transform(frame), columns=frame.columns, index=frame.index)


# Fit on the training data only, then apply the same fitted scaler to the
# validation and test sets so their statistics never influence the scaling.
X_train_scaled = _to_scaled_frame(X_train, scaler.fit_transform)
X_val_scaled = _to_scaled_frame(X_val, scaler.transform)
X_test_scaled = _to_scaled_frame(X_test, scaler.transform)

print("Feature Scaling Results:")
print(f"✓ Training set scaled: {X_train_scaled.shape}")
print(f"✓ Validation set scaled: {X_val_scaled.shape}")
print(f"✓ Test set scaled: {X_test_scaled.shape}")

print("\nScaled feature statistics (training set):")
display(X_train_scaled.describe())

## 9. Final Dataset Summary

In [None]:
# Recap of every preprocessing stage, using the variables built in earlier cells
print("FINAL PREPROCESSING SUMMARY")
print("=" * 40)

print(f"✓ Data loaded: {raw_data.shape[0]} samples")
print(f"✓ Data cleaned: {len(raw_data) - len(cleaned_data)} rows removed")
print(f"✓ Features engineered: {len(featured_data.columns) - len(cleaned_data.columns)} new features")
print(f"✓ Categorical variables encoded: {len(categorical_cols)} features")
print(f"✓ Data split: {len(X_train)}/{len(X_val)}/{len(X_test)} (train/val/test)")
print("✓ Features scaled: StandardScaler applied")

print("\nFinal dataset characteristics:")
print(f"• Features: {X_train_scaled.shape[1]}")
print(f"• Samples: {len(features)}")
print(f"• Target classes: {sorted(target.unique())}")
print(f"• Feature names: {list(X_train_scaled.columns)}")

print("\n🎯 Data is ready for model training!")

## 10. Save Processed Data (Optional)

In [None]:
# Save processed data for model training
output_dir = 'data/processed'

# Create data directory (no error if it already exists)
os.makedirs(output_dir, exist_ok=True)

# Save processed datasets: one CSV per split, named after the split.
# index=False drops the original row labels; features and targets of a split
# stay aligned because they are written in the same row order.
processed_datasets = {
    'X_train': X_train_scaled,
    'X_val': X_val_scaled,
    'X_test': X_test_scaled,
    'y_train': y_train,
    'y_val': y_val,
    'y_test': y_test,
}
for dataset_name, dataset in processed_datasets.items():
    dataset.to_csv(f'{output_dir}/{dataset_name}.csv', index=False)

print("✓ Processed datasets saved to 'data/processed/' directory")
print("✓ Ready for model training phase")