In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
df = pd.read_csv('winequality.csv')

df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,11.6,0.58,0.66,2.2,0.074,10.0,47.0,1.0008,3.25,0.57,9.0,1
1,10.4,0.61,0.49,2.1,0.2,5.0,16.0,0.9994,3.16,0.63,8.4,1
2,7.4,1.185,0.0,4.25,0.097,5.0,14.0,0.9966,3.63,0.54,10.7,1
3,10.4,0.44,0.42,1.5,0.145,34.0,48.0,0.99832,3.38,0.86,9.9,1
4,8.3,1.02,0.02,3.4,0.084,6.0,11.0,0.99892,3.48,0.49,11.0,1


In [6]:
df['quality'].value_counts()

1    744
2    638
3    217
Name: quality, dtype: int64

## Multi-Class Statistics Approach

In [None]:
import statsmodels.api as sm

In [9]:
y = df['quality']
X = df.drop(['quality','pH'],axis = 1)

In [12]:
Xc = sm.add_constant(X)
model = sm.MNLogit(y,Xc).fit()
model.summary()

Optimization terminated successfully.
         Current function value: 0.758777
         Iterations 9


0,1,2,3
Dep. Variable:,quality,No. Observations:,1599.0
Model:,MNLogit,Df Residuals:,1577.0
Method:,MLE,Df Model:,20.0
Date:,"Wed, 17 Feb 2021",Pseudo R-squ.:,0.2364
Time:,12:52:28,Log-Likelihood:,-1213.3
converged:,True,LL-Null:,-1588.8
Covariance Type:,nonrobust,LLR p-value:,3.385e-146

quality=2,coef,std err,z,P>|z|,[0.025,0.975]
const,22.7379,67.894,0.335,0.738,-110.332,155.808
fixed acidity,0.1361,0.068,1.987,0.047,0.002,0.270
volatile acidity,-3.0705,0.494,-6.220,0.000,-4.038,-2.103
citric acid,-1.4245,0.573,-2.488,0.013,-2.547,-0.302
residual sugar,0.0258,0.054,0.482,0.630,-0.079,0.131
chlorides,-2.7593,1.529,-1.805,0.071,-5.756,0.237
free sulfur dioxide,0.0223,0.008,2.694,0.007,0.006,0.039
total sulfur dioxide,-0.0151,0.003,-5.225,0.000,-0.021,-0.009
density,-30.8676,68.094,-0.453,0.650,-164.330,102.594
sulphates,2.3981,0.464,5.164,0.000,1.488,3.308


In [19]:
y_pred_prob = model.predict(Xc)
y_pred_prob.columns=['1','2','3']
y_pred_prob.head()

Unnamed: 0,1,2,3
0,0.844999,0.148639,0.006362
1,0.880592,0.117523,0.001885
2,0.812346,0.180316,0.007338
3,0.321525,0.604699,0.073776
4,0.679494,0.309228,0.011278


In [21]:
y_pred_prob['pred'] = y_pred_prob[['1','2','3']].idxmax(axis=1)
y_pred_prob.head()

Unnamed: 0,1,2,3,pred
0,0.844999,0.148639,0.006362,1
1,0.880592,0.117523,0.001885,1
2,0.812346,0.180316,0.007338,1
3,0.321525,0.604699,0.073776,2
4,0.679494,0.309228,0.011278,1


In [26]:
y_pred_prob['pred']=y_pred_prob['pred'].astype(int)

In [35]:
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

In [28]:
print(classification_report(y,y_pred_prob['pred']))

              precision    recall  f1-score   support

           1       0.71      0.78      0.74       744
           2       0.56      0.57      0.56       638
           3       0.57      0.36      0.44       217

    accuracy                           0.64      1599
   macro avg       0.61      0.57      0.58      1599
weighted avg       0.63      0.64      0.63      1599



## Ml Approach

**ML - OVR (One v/s Rest)**

In [29]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(solver='liblinear')

In [30]:
lr.fit(X,y)

LogisticRegression(solver='liblinear')

In [31]:
y_pred = lr.predict(X)
y_prob = lr.predict_proba(X)

In [43]:
print(confusion_matrix(y,y_pred))
print('\nAccuracy Score',accuracy_score(y,y_pred))

[[582 157   5]
 [235 361  42]
 [ 14 132  71]]

Accuracy Score 0.6341463414634146


**ML - Multinomial**

In [38]:
lr = LogisticRegression(solver='lbfgs',multi_class='multinomial',max_iter=5000,n_jobs=-1)
lr.fit(X,y)

LogisticRegression(max_iter=5000, multi_class='multinomial', n_jobs=-1)

In [39]:
y_pred = lr.predict(X)
y_prob = lr.predict_proba(X)

In [42]:
print(confusion_matrix(y,y_pred))
print('\nAccuracy Score',accuracy_score(y,y_pred))

[[582 157   5]
 [235 361  42]
 [ 14 132  71]]

Accuracy Score 0.6341463414634146
