# Wine Quality Dataset - Exploratory Data Analysis
# ===============================================

This notebook explores the wine quality dataset to understand the data structure, distributions, and relationships between features.

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
# Suppress all warnings so they do not clutter the cell output
warnings.filterwarnings(action='ignore')

# Plot appearance: apply the matplotlib style sheet first, then the seaborn
# palette (order kept, since both touch the colour cycle)
plt.style.use('seaborn-v0_8')
sns.set_palette(palette="husl")

# Notebook banner: title followed by a 50-character rule
banner_title = "Wine Quality Dataset - Exploratory Data Analysis"
print(banner_title, "=" * 50, sep="\n")

## 1. Data Loading

In [None]:
# Load the wine quality datasets
# Both files are downloaded from the UCI repository and parsed as
# semicolon-separated CSVs.
red_wine_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
white_wine_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv"

# Keep the try body limited to the calls that can actually fail.
try:
    red_wine = pd.read_csv(red_wine_url, sep=';')
    white_wine = pd.read_csv(white_wine_url, sep=';')
except Exception as e:
    # Report the problem, then re-raise: every later step needs both frames,
    # and carrying on would only fail below with an unrelated NameError on
    # red_wine / white_wine.
    print(f"Error loading data: {e}")
    print("Please ensure you have internet connection")
    raise

print(f"✓ Red wine data loaded: {red_wine.shape}")
print(f"✓ White wine data loaded: {white_wine.shape}")

# Add wine type column so the origin of each row survives the concatenation
red_wine['wine_type'] = 'red'
white_wine['wine_type'] = 'white'

# Combine datasets (ignore_index gives the result a fresh 0..n-1 index)
wine_data = pd.concat([red_wine, white_wine], ignore_index=True)
print(f"✓ Combined dataset shape: {wine_data.shape}")

## 2. Basic Dataset Information

In [None]:
# Display basic information
print("Dataset Info:")
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line.
wine_data.info()

# Preview of the first rows
print("\nFirst 5 rows:")
display(wine_data.head())

# Summary statistics as produced by DataFrame.describe()
print("\nBasic Statistics:")
display(wine_data.describe())

In [None]:
# Null count for every column
print("Missing Values:")
missing_per_column = wine_data.isnull().sum()
print(missing_per_column)

# Distinct quality scores, listed in ascending order
quality_scores = wine_data['quality']
print(f"\nUnique Quality Scores: {sorted(quality_scores.unique())}")

# Number of rows at each quality score, ordered by score
print("Quality Score Distribution:")
print(quality_scores.value_counts().sort_index())

## 3. Data Visualization

In [None]:
# Quality distribution and its relation to wine type, alcohol and pH (2x2 grid)
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# Overall quality distribution
# One unit-wide bin centred on each score; an arbitrary bin count leaves empty
# gaps and puts bars off the score ticks. Assumes quality holds integer scores.
quality_min = wine_data['quality'].min()
quality_max = wine_data['quality'].max()
score_bins = np.arange(quality_min - 0.5, quality_max + 1.5, 1)
wine_data['quality'].hist(bins=score_bins, edgecolor='black', alpha=0.7, ax=axes[0,0])
axes[0,0].set_title('Distribution of Wine Quality Scores')
axes[0,0].set_xlabel('Quality Score')
axes[0,0].set_ylabel('Frequency')

# Quality by wine type
wine_data.boxplot(column='quality', by='wine_type', ax=axes[0,1])
axes[0,1].set_title('Quality Distribution by Wine Type')
axes[0,1].set_xlabel('Wine Type')
axes[0,1].set_ylabel('Quality Score')
# pandas adds a figure-level "Boxplot grouped by ..." suptitle when `by` is
# used; clear it so it does not read as the title of the whole grid.
fig.suptitle('')

# Alcohol vs Quality
axes[1,0].scatter(wine_data['alcohol'], wine_data['quality'], alpha=0.5)
axes[1,0].set_xlabel('Alcohol Content')
axes[1,0].set_ylabel('Quality')
axes[1,0].set_title('Alcohol vs Quality')

# pH vs Quality
axes[1,1].scatter(wine_data['pH'], wine_data['quality'], alpha=0.5)
axes[1,1].set_xlabel('pH')
axes[1,1].set_ylabel('Quality')
axes[1,1].set_title('pH vs Quality')

plt.tight_layout()
plt.show()

In [None]:
# Correlation matrix
# Only numeric columns take part; the string-valued wine_type column is excluded.
numerical_cols = wine_data.select_dtypes(include=[np.number]).columns
correlation_matrix = wine_data[numerical_cols].corr()

plt.figure(figsize=(12, 10))
# fmt='.2f' keeps every annotation at two decimals so the cells line up.
sns.heatmap(correlation_matrix, annot=True, fmt='.2f', cmap='coolwarm', center=0,
            square=True, linewidths=0.5)
plt.title('Feature Correlation Matrix')
plt.show()

# Features most correlated with quality
# Ranked by absolute correlation, so the sign (direction) is not shown here.
quality_corr = correlation_matrix['quality'].abs().sort_values(ascending=False)
print(f"\nFeatures most correlated with quality:")
# Leave out quality's correlation with itself from the printed ranking;
# quality_corr is left unchanged for any later cell that reads it.
print(quality_corr.drop('quality').head(10))

## 4. Key Insights

In [None]:
# Summary statistics
total_samples = len(wine_data)
red_samples = int((wine_data['wine_type'] == 'red').sum())
white_samples = int((wine_data['wine_type'] == 'white').sum())
quality_min = wine_data['quality'].min()
quality_max = wine_data['quality'].max()

print("KEY INSIGHTS")
print("=" * 50)
print("Dataset Summary:")
print(f"  • Total samples: {total_samples:,}")
print(f"  • Red wine samples: {red_samples:,} ({red_samples/total_samples*100:.1f}%)")
print(f"  • White wine samples: {white_samples:,} ({white_samples/total_samples*100:.1f}%)")
print(f"  • Quality range: {quality_min}-{quality_max}")
print(f"  • Average quality: {wine_data['quality'].mean():.2f}")

print("\nTop Quality Predictors:")
# Remove the target's own entry explicitly and rank from 1, rather than
# relying on 'quality' happening to sit at position 0 of the sorted series.
top_predictors = quality_corr.drop('quality', errors='ignore').head(5)
for rank, (feature, corr) in enumerate(top_predictors.items(), start=1):
    print(f"  {rank}. {feature}: {corr:.3f}")

print("\nRecommendations for Modeling:")
# The scale is taken from the loaded data so it cannot drift from the
# "Quality range" line printed above.
print(f"  • Target variable: Wine quality ({quality_min}-{quality_max} scale)")
print("  • Consider both regression and classification approaches")
print("  • Feature scaling recommended due to different ranges")
print("  • Wine type is important categorical feature")
print("  • Strong correlations suggest good predictive potential")