In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy import stats

In [2]:
# Load the two UCI wine-quality files; both are semicolon-delimited CSVs.
red_wine_original = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv',
    sep=';',
)
white_wine_original = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv',
    sep=';',
)


In [3]:
def _add_quality_label(df):
    """Return a copy of ``df`` with a categorical ``quality_label`` column.

    The numeric ``quality`` score is bucketed into three labels:
    ``quality <= 5`` -> 'low', ``5 < quality <= 7`` -> 'medium',
    ``quality > 7`` -> 'high'. The result is an unordered pandas categorical
    with categories ['low', 'medium', 'high'].

    A missing ``quality`` value stays missing in ``quality_label`` instead of
    falling through to 'high', which is what a chained ``<=`` comparison does.
    """
    labelled = df.copy()
    labelled['quality_label'] = pd.cut(
        labelled['quality'],
        bins=[-np.inf, 5, 7, np.inf],   # right-inclusive edges: (-inf, 5], (5, 7], (7, inf]
        labels=['low', 'medium', 'high'],
        ordered=False,
    )
    return labelled


# Work on labelled copies so the freshly loaded frames stay untouched.
white_wine = _add_quality_label(white_wine_original)
red_wine = _add_quality_label(red_wine_original)

# Tag each frame with its colour before stacking them into one data set.
white_wine_to_concat = white_wine.copy()
white_wine_to_concat['color'] = 'white'
red_wine_to_concat = red_wine.copy()
red_wine_to_concat['color'] = 'red'

frames = [red_wine_to_concat, white_wine_to_concat]

# NOTE: the index is not reset, so `wines` keeps each source frame's own row
# labels and the same label can appear once for red and once for white.
wines = pd.concat(frames)


In [4]:


# One-way ANOVA on red wine: compare the alcohol values of the three
# quality_label groups.
alcohol_by_quality = [
    red_wine.loc[red_wine['quality_label'] == level, 'alcohol']
    for level in ('low', 'medium', 'high')
]
F, p = stats.f_oneway(*alcohol_by_quality)
print('ANOVA test for mean alcohol levels across wine samples with different quality ratings')
print('F Statistic:', F, '\tp-value:', p)

ANOVA test for mean alcohol levels across wine samples with different quality ratings
F Statistic: 205.23448800361507 	p-value: 4.776936653588672e-80


# Anova Test

   * F-test score: It calculates the variation between sample group means divided by variation within the sample group.
   * P-value: The probability of observing an F statistic at least this large if all group means were actually equal. It tells us whether the obtained result is statistically significant or not.

We use a significance level of 0.05 (i.e. a 95% confidence level). A p-value above 0.05 means we cannot conclude that the group means differ. A p-value below 0.05 means that at least one group mean differs from the others in a statistically significant way.

## White wine

In [5]:
def anova_quality(df):
    """Print a one-way ANOVA report for every numeric column of ``df``.

    For each column, the rows are split into the 'low', 'medium' and 'high'
    groups of ``df['quality_label']`` and the three samples are compared with
    ``scipy.stats.f_oneway``.

    The ``quality_label`` column itself is skipped, and so is any non-numeric
    column (for example a string ``color`` column), because ``f_oneway``
    cannot be computed on it.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``quality_label`` column holding the values 'low',
        'medium' and 'high'.

    Returns
    -------
    None
        The report is written to standard output. The p-value is printed
        rounded to 4 decimals, so very small p-values show up as ``0.0``.
    """
    labels = df['quality_label']
    for col in df:
        if col == 'quality_label' or not pd.api.types.is_numeric_dtype(df[col]):
            continue
        # One sample per quality level, always in the order low, medium, high.
        groups = [df.loc[labels == level, col] for level in ('low', 'medium', 'high')]
        F, p = stats.f_oneway(*groups)
        print(col.upper())
        print(f'ANOVA test for mean {col} levels across wine samples with different quality ratings')
        print('F Statistic:', F, '\tp-value:', round(p,4), '\n')
        
# Print the per-column ANOVA report for the white wine frame.
anova_quality(white_wine)

FIXED ACIDITY
ANOVA test for mean fixed acidity levels across wine samples with different quality ratings
F Statistic: 21.923026494425756 	p-value: 0.0 

VOLATILE ACIDITY
ANOVA test for mean volatile acidity levels across wine samples with different quality ratings
F Statistic: 133.6668084396585 	p-value: 0.0 

CITRIC ACID
ANOVA test for mean citric acid levels across wine samples with different quality ratings
F Statistic: 0.23254245986965996 	p-value: 0.7925 

RESIDUAL SUGAR
ANOVA test for mean residual sugar levels across wine samples with different quality ratings
F Statistic: 21.934773835107606 	p-value: 0.0 

CHLORIDES
ANOVA test for mean chlorides levels across wine samples with different quality ratings
F Statistic: 90.87064418602546 	p-value: 0.0 

FREE SULFUR DIOXIDE
ANOVA test for mean free sulfur dioxide levels across wine samples with different quality ratings
F Statistic: 0.5909833608324854 	p-value: 0.5538 

TOTAL SULFUR DIOXIDE
ANOVA test for mean total sulfur dioxide l

## Red Wine

In [6]:
# Print the per-column ANOVA report for the red wine frame.
anova_quality(red_wine)

FIXED ACIDITY
ANOVA test for mean fixed acidity levels across wine samples with different quality ratings
F Statistic: 7.30844888128248 	p-value: 0.0007 

VOLATILE ACIDITY
ANOVA test for mean volatile acidity levels across wine samples with different quality ratings
F Statistic: 92.87437900766831 	p-value: 0.0 

CITRIC ACID
ANOVA test for mean citric acid levels across wine samples with different quality ratings
F Statistic: 22.85735545283018 	p-value: 0.0 

RESIDUAL SUGAR
ANOVA test for mean residual sugar levels across wine samples with different quality ratings
F Statistic: 0.011800296751587022 	p-value: 0.9883 

CHLORIDES
ANOVA test for mean chlorides levels across wine samples with different quality ratings
F Statistic: 10.542381423023661 	p-value: 0.0 

FREE SULFUR DIOXIDE
ANOVA test for mean free sulfur dioxide levels across wine samples with different quality ratings
F Statistic: 3.3917571885448745 	p-value: 0.0339 

TOTAL SULFUR DIOXIDE
ANOVA test for mean total sulfur dioxide

### Statistically significant values

In [11]:
def _anova_by_quality(df):
    """Yield ``(column, F, p)`` for each numeric column of ``df``.

    Every column is split into the 'low', 'medium' and 'high' groups of
    ``df['quality_label']`` and tested with ``scipy.stats.f_oneway``.
    ``quality_label`` itself and non-numeric columns are skipped.
    """
    labels = df['quality_label']
    for col in df:
        if col == 'quality_label' or not pd.api.types.is_numeric_dtype(df[col]):
            continue
        groups = [df.loc[labels == level, col] for level in ('low', 'medium', 'high')]
        F, p = stats.f_oneway(*groups)
        yield col, F, p


def _print_anova_result(col, F, p):
    """Print the three-line report entry for one column."""
    print(col.upper())
    print(f'ANOVA test for mean {col} levels across wine samples with different quality ratings')
    print('F Statistic:', F, '\tp-value:', round(p,4), '\n')


def anova_quality_significant(df, alpha=0.05):
    """Print the ANOVA report only for columns whose p-value is below ``alpha``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``quality_label`` column holding the values 'low',
        'medium' and 'high'. Non-numeric columns are ignored.
    alpha : float, default 0.05
        Significance level. A column is reported when ``p < alpha``.

    Returns
    -------
    None
        The report is written to standard output.

    Notes
    -----
    The decision uses the unrounded p-value. Rounding is applied only to the
    printed number, so a p-value just under the threshold (e.g. 0.04996) is
    not rounded up to it and dropped.
    """
    for col, F, p in _anova_by_quality(df):
        if p < alpha:
            _print_anova_result(col, F, p)


def anova_quality_not_significant(df, alpha=0.05):
    """Print the ANOVA report only for columns whose p-value is at least ``alpha``.

    This is the complement of ``anova_quality_significant``: for the same
    ``alpha``, every column with a defined p-value appears in exactly one of
    the two reports. A column whose p-value is NaN appears in neither.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``quality_label`` column holding the values 'low',
        'medium' and 'high'. Non-numeric columns are ignored.
    alpha : float, default 0.05
        Significance level. A column is reported when ``p >= alpha``.

    Returns
    -------
    None
        The report is written to standard output.
    """
    for col, F, p in _anova_by_quality(df):
        if p >= alpha:
            _print_anova_result(col, F, p)
                
# Build a copy of the combined frame without the 'color' column (if it is
# present), then report the significant columns for red and white together.
wine_mx = wines.drop(columns=['color'], errors='ignore')

anova_quality_significant(wine_mx)


FIXED ACIDITY
ANOVA test for mean fixed acidity levels across wine samples with different quality ratings
F Statistic: 20.38163849837651 	p-value: 0.0 

VOLATILE ACIDITY
ANOVA test for mean volatile acidity levels across wine samples with different quality ratings
F Statistic: 250.33914475322013 	p-value: 0.0 

CITRIC ACID
ANOVA test for mean citric acid levels across wine samples with different quality ratings
F Statistic: 18.968901422737094 	p-value: 0.0 

RESIDUAL SUGAR
ANOVA test for mean residual sugar levels across wine samples with different quality ratings
F Statistic: 3.432938987382213 	p-value: 0.0324 

CHLORIDES
ANOVA test for mean chlorides levels across wine samples with different quality ratings
F Statistic: 120.96013685200977 	p-value: 0.0 

FREE SULFUR DIOXIDE
ANOVA test for mean free sulfur dioxide levels across wine samples with different quality ratings
F Statistic: 10.312540552928873 	p-value: 0.0 

TOTAL SULFUR DIOXIDE
ANOVA test for mean total sulfur dioxide level

## White

In [8]:
# White wine: report only the columns that pass the significance check.
anova_quality_significant(white_wine)

FIXED ACIDITY
ANOVA test for mean fixed acidity levels across wine samples with different quality ratings
F Statistic: 21.923026494425756 	p-value: 0.0 

VOLATILE ACIDITY
ANOVA test for mean volatile acidity levels across wine samples with different quality ratings
F Statistic: 133.6668084396585 	p-value: 0.0 

RESIDUAL SUGAR
ANOVA test for mean residual sugar levels across wine samples with different quality ratings
F Statistic: 21.934773835107606 	p-value: 0.0 

CHLORIDES
ANOVA test for mean chlorides levels across wine samples with different quality ratings
F Statistic: 90.87064418602546 	p-value: 0.0 

TOTAL SULFUR DIOXIDE
ANOVA test for mean total sulfur dioxide levels across wine samples with different quality ratings
F Statistic: 76.66028143448855 	p-value: 0.0 

DENSITY
ANOVA test for mean density levels across wine samples with different quality ratings
F Statistic: 209.69457693191376 	p-value: 0.0 

PH
ANOVA test for mean pH levels across wine samples with different quality r

In [12]:
# White wine: report only the columns that do not pass the significance check.
anova_quality_not_significant(white_wine)

CITRIC ACID
ANOVA test for mean citric acid levels across wine samples with different quality ratings
F Statistic: 0.23254245986965996 	p-value: 0.7925 

FREE SULFUR DIOXIDE
ANOVA test for mean free sulfur dioxide levels across wine samples with different quality ratings
F Statistic: 0.5909833608324854 	p-value: 0.5538 



## Red

In [9]:
# Red wine: report only the columns that pass the significance check.
anova_quality_significant(red_wine)

FIXED ACIDITY
ANOVA test for mean fixed acidity levels across wine samples with different quality ratings
F Statistic: 7.30844888128248 	p-value: 0.0007 

VOLATILE ACIDITY
ANOVA test for mean volatile acidity levels across wine samples with different quality ratings
F Statistic: 92.87437900766831 	p-value: 0.0 

CITRIC ACID
ANOVA test for mean citric acid levels across wine samples with different quality ratings
F Statistic: 22.85735545283018 	p-value: 0.0 

CHLORIDES
ANOVA test for mean chlorides levels across wine samples with different quality ratings
F Statistic: 10.542381423023661 	p-value: 0.0 

FREE SULFUR DIOXIDE
ANOVA test for mean free sulfur dioxide levels across wine samples with different quality ratings
F Statistic: 3.3917571885448745 	p-value: 0.0339 

TOTAL SULFUR DIOXIDE
ANOVA test for mean total sulfur dioxide levels across wine samples with different quality ratings
F Statistic: 45.71048039843774 	p-value: 0.0 

DENSITY
ANOVA test for mean density levels across wine 

In [13]:
# Red wine: report only the columns that do not pass the significance check.
anova_quality_not_significant(red_wine)


RESIDUAL SUGAR
ANOVA test for mean residual sugar levels across wine samples with different quality ratings
F Statistic: 0.011800296751587022 	p-value: 0.9883 

PH
ANOVA test for mean pH levels across wine samples with different quality ratings
F Statistic: 0.7354879827479522 	p-value: 0.4794 

