In [1]:
from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# loading the datasets
# DATA_DIR is the single place to repoint the notebook at another copy of the
# data; both files are read with ';' as the field separator.
DATA_DIR = Path(r"C:\Users\ADMIN\Downloads\wine+quality")
red_wine = pd.read_csv(DATA_DIR / "winequality-red.csv", sep=';')
white_wine = pd.read_csv(DATA_DIR / "winequality-white.csv", sep=';')

In [3]:
# renaming the columns
# Both frames receive the identical list of names, so it is written once.
WINE_COLUMNS = [
    'fixed_acidity', 'volatile_acidity', 'citric_acid', 'residual_sugar',
    'chlorides', 'free_sulfur_dioxide', 'total_sulfur_dioxide', 'density',
    'pH', 'sulphates', 'alcohol', 'quality',
]

for wine_df in (white_wine, red_wine):
    # Assign a fresh copy of the list to each frame.
    wine_df.columns = list(WINE_COLUMNS)

In [4]:
# Function to generate detailed univariate summary with quantiles
def univariate_summary(df):
    """Build a table of descriptive statistics, one row per numeric column.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to summarise. Only numeric columns are used; any other column
        (text, dates, ...) is skipped so the numeric reducers below cannot
        fail on it.

    Returns
    -------
    pandas.DataFrame
        Indexed by column name, with the columns 'Mean', 'Std', 'Min',
        '25% (Q1)', '50% (Median)', '75% (Q3)', 'Max', 'Skewness' and
        'Kurtosis', every value rounded to 3 decimal places.
    """
    # Restrict to numeric columns up front: mean/std/quantile/skew are not
    # defined for text columns and raise on recent pandas versions.
    numeric = df.select_dtypes(include="number")

    summary = pd.DataFrame({
        'Mean': numeric.mean(),
        'Std': numeric.std(),
        'Min': numeric.min(),
        '25% (Q1)': numeric.quantile(0.25),
        '50% (Median)': numeric.median(),
        '75% (Q3)': numeric.quantile(0.75),
        'Max': numeric.max(),
        'Skewness': numeric.skew(),
        # pandas reports excess kurtosis (a normal distribution scores 0).
        'Kurtosis': numeric.kurtosis()
    })
    return summary.round(3)  # Rounded for neatness

# Build one summary table per wine colour
red_summary, white_summary = (
    univariate_summary(wine_df) for wine_df in (red_wine, white_wine)
)

# Only the red-wine table is shown here: the heading is printed and the
# trailing bare expression is rendered as the cell's output.
print("\n--- Red Wine Univariate Summary ---\n")
red_summary



--- Red Wine Univariate Summary ---



Unnamed: 0,Mean,Std,Min,25% (Q1),50% (Median),75% (Q3),Max,Skewness,Kurtosis
fixed_acidity,8.32,1.741,4.6,7.1,7.9,9.2,15.9,0.983,1.132
volatile_acidity,0.528,0.179,0.12,0.39,0.52,0.64,1.58,0.672,1.226
citric_acid,0.271,0.195,0.0,0.09,0.26,0.42,1.0,0.318,-0.789
residual_sugar,2.539,1.41,0.9,1.9,2.2,2.6,15.5,4.541,28.618
chlorides,0.087,0.047,0.012,0.07,0.079,0.09,0.611,5.68,41.716
free_sulfur_dioxide,15.875,10.46,1.0,7.0,14.0,21.0,72.0,1.251,2.024
total_sulfur_dioxide,46.468,32.895,6.0,22.0,38.0,62.0,289.0,1.516,3.81
density,0.997,0.002,0.99,0.996,0.997,0.998,1.004,0.071,0.934
pH,3.311,0.154,2.74,3.21,3.31,3.4,4.01,0.194,0.807
sulphates,0.658,0.17,0.33,0.55,0.62,0.73,2.0,2.429,11.72


In [None]:
# distribution plots (histogram + KDE overlay) for every feature of each wine type
for wine_df, wine_label in ((red_wine, "Red"), (white_wine, "White")):
    # Iterate over the columns of the frame actually being plotted.
    for col in wine_df.columns:
        # Open a figure per feature: once plt.show() has rendered a figure the
        # next plot starts a new default-sized one, so a single plt.figure()
        # before the loop would size only the first plot.
        plt.figure(figsize=(10, 6))
        sns.histplot(wine_df[col], kde=True, bins=15)
        plt.suptitle(f"Distribution of {col} in {wine_label} Wine")
        plt.show()

In [None]:
# distribution kdeplots for every feature, white wine first and then red wine
for wine_df, wine_label in ((white_wine, "White Wine"), (red_wine, "red Wine")):
    for col in wine_df.columns:
        # Open a figure per feature so each plot gets the requested size; a
        # single plt.figure() before the loop would size only the first plot.
        plt.figure(figsize=(10, 6))
        sns.kdeplot(wine_df[col], fill=True, color='orange', bw_adjust=1.0)
        plt.suptitle(f"Distribution of {col} in {wine_label}")
        plt.show()

In [None]:
# boxplots for every feature of each wine type
for wine_df, wine_label in ((red_wine, "Red"), (white_wine, "White")):
    for col in wine_df.columns:
        # Open a figure per feature so each boxplot gets the requested size; a
        # single plt.figure() before the loop would size only the first plot.
        plt.figure(figsize=(10, 6))
        # The values are passed as `y`, so the box is drawn vertically. The
        # former orient='h' conflicted with that and has been dropped.
        sns.boxplot(y=col, data=wine_df, color='orange')
        # The title names the frame being plotted (the white-wine plots were
        # previously labelled "Red Wine").
        plt.title(f"Boxplot of {col} in {wine_label} Wine")
        plt.show()

In [None]:
# bar plots of the quality rating counts, red wine first and then white wine
for wine_df, wine_label in ((red_wine, "Red"), (white_wine, "White")):
    sns.countplot(x='quality', data=wine_df, color='orange')
    plt.title(f'Count of Quality Ratings in {wine_label} Wine')
    plt.show()

In [None]:
# correlation coefficients between all features, one annotated heatmap per wine type
for wine_df, wine_label in ((red_wine, "Red"), (white_wine, "White")):
    # `correlation` ends up holding the white-wine matrix, as it did before.
    correlation = wine_df.corr()
    plt.figure(figsize=(10, 6))
    sns.heatmap(correlation, annot=True, cmap='coolwarm', fmt='.2f')
    # Without a title the two heatmaps cannot be told apart.
    plt.title(f"Feature Correlation Heatmap for {wine_label} Wine")
    plt.show()

In [None]:
# boxplots of every white-wine feature grouped by the target rating
target_column = 'quality'  # Change this to your target column if different

for col in white_wine.columns:
    if col == target_column:  # Skip plotting the target against itself
        continue
    # Open a figure per feature so each plot gets the requested size; a single
    # plt.figure() before the loop would size only the first plot.
    plt.figure(figsize=(10, 6))
    sns.boxplot(y=white_wine[col], x=white_wine[target_column], color='orange')
    # This is a boxplot, not a scatter plot; the axis labels follow the data
    # actually placed on each axis (target on x, feature on y).
    plt.title(f"Boxplot of {col} by {target_column} in White Wine")
    plt.xlabel(target_column)
    plt.ylabel(col)
    plt.show()

# Key EDA Insights:
- White wines are generally sweeter, less dense, more acidic, and preserved with more sulfur dioxide
- Red wines tend to have higher fixed acidity, higher volatile acidity, more chlorides (salt), and higher sulphates
- Both wines show extreme outliers, especially in residual sugar, chlorides, and sulphates.
- Quality ratings are similar, but white wines are slightly better rated on average

# Summary Business Insights by Wine Type:
**White Wine**
- Alcohol and Citric Acid positively impact quality.
- Slightly higher fixed acidity improves perceived quality — aim for a crisp, refreshing profile.
- Residual sugar has minimal effect — slight sweetness acceptable.
- Sulphates help improve microbial stability without harming quality.

**Red Wine**
- Alcohol still boosts quality, but high volatile acidity, chlorides, and density hurt quality more significantly.
- Fixed acidity and citric acid don’t help as much — aim for smoothness and lower sharpness.
- Excessive sulfur dioxide is penalized more strongly.
- Consider lowering density through better sugar fermentation.

