In [6]:
import numpy as np
import pandas as pd
import matplotlib as plt
import seaborn as sns
import plotly.express as px

In [7]:
#reading dataset
df=pd.read_csv('datasets_4458_8204_winequality-red.csv')

In [8]:
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [9]:
print(df.isna().sum())

fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64


In [17]:
correlations = df.corr()['quality'].drop('quality')
print(correlations)

fixed acidity           0.124052
volatile acidity       -0.390558
citric acid             0.226373
residual sugar          0.013732
chlorides              -0.128907
free sulfur dioxide    -0.050656
total sulfur dioxide   -0.185100
density                -0.174919
pH                     -0.057731
sulphates               0.251397
alcohol                 0.476166
Name: quality, dtype: float64


In [20]:
# A classification Problem
df['goodquality'] = [1 if x >= 7 else 0 for x in df['quality']]

In [21]:
df['goodquality'].value_counts()

0    1382
1     217
Name: goodquality, dtype: int64

In [23]:
X = df.drop(['quality','goodquality'], axis = 1)
y = df['goodquality']

In [24]:
# Normalize feature variables
from sklearn.preprocessing import StandardScaler
X_features = X
X = StandardScaler().fit_transform(X)

In [25]:
# Splitting the data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25, random_state=0)

In [26]:
#Decision tree
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier

In [27]:
model1 = DecisionTreeClassifier(random_state=1)
model1.fit(X_train, y_train)
y_pred1 = model1.predict(X_test)

In [30]:
print(classification_report(y_test, y_pred1))

              precision    recall  f1-score   support

           0       0.96      0.92      0.94       355
           1       0.53      0.73      0.62        45

    accuracy                           0.90       400
   macro avg       0.75      0.83      0.78       400
weighted avg       0.92      0.90      0.90       400



In [31]:
#AdaBoost
from sklearn.ensemble import AdaBoostClassifier

In [32]:
model3 = AdaBoostClassifier(random_state=1)
model3.fit(X_train, y_train)
y_pred3 = model3.predict(X_test)

In [33]:
print(classification_report(y_test, y_pred3))

              precision    recall  f1-score   support

           0       0.94      0.94      0.94       355
           1       0.51      0.49      0.50        45

    accuracy                           0.89       400
   macro avg       0.72      0.71      0.72       400
weighted avg       0.89      0.89      0.89       400



In [34]:
#Gradient boosting
from sklearn.ensemble import GradientBoostingClassifier

In [35]:
model4 = GradientBoostingClassifier(random_state=1)
model4.fit(X_train, y_train)
y_pred4 = model4.predict(X_test)

In [36]:
print(classification_report(y_test, y_pred4))

              precision    recall  f1-score   support

           0       0.94      0.94      0.94       355
           1       0.52      0.51      0.52        45

    accuracy                           0.89       400
   macro avg       0.73      0.73      0.73       400
weighted avg       0.89      0.89      0.89       400



In [39]:
#Good vs bad
# Filtering df for only good quality
df_temp = df[df['goodquality']==1]
df_temp.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,goodquality
count,217.0,217.0,217.0,217.0,217.0,217.0,217.0,217.0,217.0,217.0,217.0,217.0,217.0
mean,8.847005,0.40553,0.376498,2.708756,0.075912,13.981567,34.889401,0.99603,3.288802,0.743456,11.518049,7.082949,1.0
std,1.999977,0.144963,0.194438,1.363026,0.02848,10.234615,32.572238,0.002201,0.154478,0.134038,0.998153,0.276443,0.0
min,4.9,0.12,0.0,1.2,0.012,3.0,7.0,0.99064,2.88,0.39,9.2,7.0,1.0
25%,7.4,0.3,0.3,2.0,0.062,6.0,17.0,0.9947,3.2,0.65,10.8,7.0,1.0
50%,8.7,0.37,0.4,2.3,0.073,11.0,27.0,0.99572,3.27,0.74,11.6,7.0,1.0
75%,10.1,0.49,0.49,2.7,0.085,18.0,43.0,0.99735,3.38,0.82,12.2,7.0,1.0
max,15.6,0.915,0.76,8.9,0.358,54.0,289.0,1.0032,3.78,1.36,14.0,8.0,1.0


In [40]:
# Filtering df for only bad quality
df_temp2 = df[df['goodquality']==0]
df_temp2.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,goodquality
count,1382.0,1382.0,1382.0,1382.0,1382.0,1382.0,1382.0,1382.0,1382.0,1382.0,1382.0,1382.0,1382.0
mean,8.236831,0.547022,0.254407,2.51212,0.089281,16.172214,48.285818,0.996859,3.314616,0.644754,10.251037,5.408828,0.0
std,1.682726,0.176337,0.189665,1.415778,0.049113,10.467685,32.585604,0.001808,0.154135,0.170629,0.969664,0.601719,0.0
min,4.6,0.16,0.0,0.9,0.034,1.0,6.0,0.99007,2.74,0.33,8.4,3.0,0.0
25%,7.1,0.42,0.0825,1.9,0.071,8.0,23.0,0.995785,3.21,0.54,9.5,5.0,0.0
50%,7.8,0.54,0.24,2.2,0.08,14.0,39.5,0.9968,3.31,0.6,10.0,5.0,0.0
75%,9.1,0.65,0.4,2.6,0.091,22.0,65.0,0.9979,3.41,0.7,10.9,6.0,0.0
max,15.9,1.58,1.0,15.5,0.611,72.0,165.0,1.00369,4.01,2.0,14.9,6.0,0.0


In [47]:
#Data pre-processing
# dividing the dataset into dependent and independent variables
x = df.iloc[:,:11]
y = df.iloc[:,11]

# determining the shape of x and y.
print(x.shape)
print(y.shape)

(1599, 11)
(1599,)


In [50]:
# dividing the dataset in training and testing set
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.25, random_state = 44)

In [51]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.fit_transform(x_test)

In [52]:
#Logistic regression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

from sklearn.model_selection import GridSearchCV, cross_val_score



# creating the model
model = LogisticRegression()

# feeding the training set into the model
model.fit(x_train, y_train)

# predicting the results for the test set
y_pred = model.predict(x_test)

# calculating the training and testing accuracies
print("Training accuracy :", model.score(x_train, y_train))
print("Testing accuracy :", model.score(x_test, y_test))

# classification report
print(classification_report(y_test, y_pred))

# confusion matrix
print(confusion_matrix(y_test, y_pred))

Training accuracy : 0.6163469557964971
Testing accuracy : 0.5725
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         3
           4       0.00      0.00      0.00        15
           5       0.69      0.70      0.70       186
           6       0.49      0.59      0.54       147
           7       0.39      0.27      0.32        44
           8       0.00      0.00      0.00         5

    accuracy                           0.57       400
   macro avg       0.26      0.26      0.26       400
weighted avg       0.54      0.57      0.56       400

[[  0   1   1   1   0   0]
 [  0   0   8   7   0   0]
 [  0   2 130  53   1   0]
 [  0   0  44  87  16   0]
 [  0   0   5  27  12   0]
 [  0   0   0   3   2   0]]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  _warn_prf(average, modifier, msg_start, len(result))


In [53]:
#Support vector machine
from sklearn.svm import SVC

# creating the model
model = SVC()

# feeding the training set into the model
model.fit(x_train, y_train)

# predicting the results for the test set
y_pred = model.predict(x_test)

# calculating the training and testing accuracies
print("Training accuracy :", model.score(x_train, y_train))
print("Testing accuracy :", model.score(x_test, y_test))

Training accuracy : 0.6822351959966639
Testing accuracy : 0.595


In [54]:
# finding the best parameters
param = {
    'C': [0.8,0.9,1,1.1,1.2,1.3,1.4],
    'kernel':['linear', 'rbf'],
    'gamma' :[0.1,0.8,0.9,1,1.1,1.2,1.3,1.4]
}
grid_svc = GridSearchCV(model, param_grid = param, scoring = 'accuracy', cv = 10)

In [56]:
grid_svc.fit(x_train, y_train)



GridSearchCV(cv=10, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [0.8, 0.9, 1, 1.1, 1.2, 1.3, 1.4],
                         'gamma': [0.1, 0.8, 0.9, 1, 1.1, 1.2, 1.3, 1.4],
                         'kernel': ['linear', 'rbf']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=0)

In [57]:
grid_svc.best_params_

{'C': 1.4, 'gamma': 0.8, 'kernel': 'rbf'}

In [58]:
model2 = SVC(C = 1.4, gamma = 0.1, kernel = 'rbf')
model2.fit(x_train, y_train)
y_pred = model2.predict(x_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           3       0.00      0.00      0.00         3
           4       0.00      0.00      0.00        15
           5       0.68      0.72      0.70       186
           6       0.52      0.64      0.57       147
           7       0.57      0.30      0.39        44
           8       0.00      0.00      0.00         5

    accuracy                           0.60       400
   macro avg       0.29      0.28      0.28       400
weighted avg       0.57      0.60      0.58       400



  _warn_prf(average, modifier, msg_start, len(result))


In [59]:
#Multi Layer Perceptron
from sklearn.neural_network import MLPClassifier

# creating the model
model = MLPClassifier(hidden_layer_sizes = (100, 100), max_iter = 150)

# feeding the training data to the model
model.fit(x_train, y_train)

# calculating the accuracies
print("training accuracy :", model.score(x_train, y_train))
print("testing accuracy :", model.score(x_test, y_test))

training accuracy : 0.8582151793160967
testing accuracy : 0.6


