In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from scipy.stats import zscore

In [2]:
# Read the red-wine quality CSV from the ../input directory into a DataFrame.
csv_path = "../input/red-wine-quality-cortez-et-al-2009/winequality-red.csv"
df = pd.read_csv(filepath_or_buffer=csv_path)
df

In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column.
df.describe()

In [4]:
# Show the dtype pandas inferred for each column when the CSV was parsed.
df.dtypes

In [5]:
# Keep only the first occurrence of every fully identical row.
is_repeat = df.duplicated()
df = df[~is_repeat]
df

We're missing data about wines whose quality is 1, 2, 9, or 10. This indicates that we might have problems predicting wines with these quality values (i.e., for examples not from the dataset).

In [6]:
# Distinct quality scores present in the data, in ascending order.
qualities = np.sort(df["quality"].unique())
qualities

The data is also extremely imbalanced: we have many wines whose quality is 5-6, but only a very small number of wines whose quality is 3 or 8.

In [7]:
# Bar chart of how many wines fall into each quality score.
count_ax = sns.countplot(data=df, x="quality")
count_ax.set_title("Wine distribution by quality")

In [8]:
# Share of each quality score, expressed as a percentage of all rows.
df["quality"].value_counts(normalize=True).mul(100)

In [9]:
# Enlarge the default figure so the grid of per-column histograms stays legible.
hist_figsize = (28, 15)
sns.set(rc={"figure.figsize": hist_figsize})
df.hist()

In [10]:
sns.set(rc={"figure.figsize": (10, 5)})

# Rank the features by the absolute strength of their correlation with "quality"
# (strongest first; "quality" itself ends up on top with a correlation of 1).
corr = df.corr().sort_values(by="quality", key=lambda s: s.abs(), ascending=False)

# sort_values only reorders the rows. Apply the same order to the columns so the
# matrix stays symmetric and the self-correlations sit on the heatmap's diagonal.
corr = corr.loc[:, corr.index]

print("Correlation to 'quality' value:")
print(corr["quality"])

ax = sns.heatmap(
    corr,
    vmin=-1, vmax=1, center=0,
    cmap=sns.diverging_palette(20, 220, n=200),
    square=True,
)
# Slant the x tick labels so the long feature names do not overlap.
# The trailing semicolon keeps the list of Text objects out of the cell output.
ax.set_xticklabels(
    ax.get_xticklabels(),
    rotation=45,
    horizontalalignment="right",
);

From the heatmap we can learn that high-quality wines tend to have:
1. **High** alcohol percentage
2. **Low** volatile acidity
3. **High** sulphates
4. **High** citric acid

In [11]:
sns.set(rc={"figure.figsize": (20, 30)})

# One panel per feature singled out in the correlation analysis:
# (column plotted on the y axis, panel title).
panels = [
    ("alcohol", "Alcohol percentage impact on wine quality"),
    ("volatile acidity", "Volatile acidity value impact on wine quality"),
    ("sulphates", "Sulphates value impact on wine quality"),
    ("citric acid", "Citric acid value impact on wine quality"),
]

# A single loop replaces four copy-pasted barplot calls; each bar is seaborn's
# default aggregate of the feature for the wines with that quality score.
fig, axes = plt.subplots(len(panels))
for panel_ax, (feature, title) in zip(axes, panels):
    sns.barplot(x="quality", y=feature, data=df, ax=panel_ax)
    panel_ax.set_title(title)

We should also consider removing the columns that have low correlation with the quality column:
1. pH
2. free sulfur dioxide
3. residual sugar

In [12]:
# Columns judged to have too little correlation with "quality" to keep.
low_corr_columns = ["pH", "free sulfur dioxide", "residual sugar"]
df = df.drop(low_corr_columns, axis="columns")
df

Because most of the columns have a normal distribution, and there might be a linear relationship between the values and the quality of the wine, logistic regression should be a reasonable choice for this classification problem. I also used a standard scaler to rescale the numeric values.

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report, accuracy_score

# Separate the features from the target. The target column must NOT stay in X:
# passing the whole frame would let the model read the answer directly
# (target leakage) and make the reported scores meaningless.
X = df.drop(columns=["quality"])
y = df["quality"]

# Fixed seed so the split, and therefore the reported metrics, is reproducible
# under Restart & Run All.
SEED = 42

# Stratify on the target so the rare quality classes appear in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    shuffle=True,
    stratify=y,
    random_state=SEED,
)

# Standardise the features, then fit a multinomial logistic regression.
pipeline = make_pipeline(
    StandardScaler(),
    LogisticRegression(multi_class='multinomial', solver='lbfgs', penalty="l2", C=5),
)
pipeline.fit(X_train, y_train)

predictions = pipeline.predict(X_test)
print(classification_report(y_test, predictions))
print(f"Accuracy: {accuracy_score(y_test, predictions)}")