### Importing Libraries

In [1]:
# import the important libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import mean_squared_error, accuracy_score

In [2]:
# read the csv file
df = pd.read_csv('WineQT.csv')

In [3]:
# show the first five rows
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,Id
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,0
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,1
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,2
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,3
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,4


### EDA

In [4]:
# change the id of the dataframe
df.index = df.Id

In [5]:
# drop unwanted columns
df.drop('Id', axis=1, inplace=True)

In [6]:
# display the information of the dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1143 entries, 0 to 1597
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1143 non-null   float64
 1   volatile acidity      1143 non-null   float64
 2   citric acid           1143 non-null   float64
 3   residual sugar        1143 non-null   float64
 4   chlorides             1143 non-null   float64
 5   free sulfur dioxide   1143 non-null   float64
 6   total sulfur dioxide  1143 non-null   float64
 7   density               1143 non-null   float64
 8   pH                    1143 non-null   float64
 9   sulphates             1143 non-null   float64
 10  alcohol               1143 non-null   float64
 11  quality               1143 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 116.1 KB


In [7]:
# describe the data --> some statistical measurements
df.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,1143.0,1143.0,1143.0,1143.0,1143.0,1143.0,1143.0,1143.0,1143.0,1143.0,1143.0,1143.0
mean,8.311111,0.531339,0.268364,2.532152,0.086933,15.615486,45.914698,0.99673,3.311015,0.657708,10.442111,5.657043
std,1.747595,0.179633,0.196686,1.355917,0.047267,10.250486,32.78213,0.001925,0.156664,0.170399,1.082196,0.805824
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,3.0
25%,7.1,0.3925,0.09,1.9,0.07,7.0,21.0,0.99557,3.205,0.55,9.5,5.0
50%,7.9,0.52,0.25,2.2,0.079,13.0,37.0,0.99668,3.31,0.62,10.2,6.0
75%,9.1,0.64,0.42,2.6,0.09,21.0,61.0,0.997845,3.4,0.73,11.1,6.0
max,15.9,1.58,1.0,15.5,0.611,68.0,289.0,1.00369,4.01,2.0,14.9,8.0


In [8]:
df.quality.unique()

array([5, 6, 7, 4, 8, 3], dtype=int64)

In [9]:
df.shape

(1143, 12)

In [10]:
# Separate the data and target
X = df.drop("quality", axis=1)
y = df.quality

In [11]:
# split X and y into X_train, X_test, y_train, y_test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Building the model

In [12]:
# initialization of the model
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [13]:
# predict the new values
y_pred_logreg = logreg.predict(X_test)

In [14]:
# evaluate the model
print('Mean Squared Error for LogisticRegression: ', mean_squared_error(y_test, y_pred_logreg))

Mean Squared Error for LogisticRegression:  0.4672489082969432


In [15]:
data = pd.DataFrame({"Actual": y_test, "Prediction": y_pred_logreg, "Loss":abs(y_pred_logreg-y_test)})
data.head(30)

Unnamed: 0_level_0,Actual,Prediction,Loss
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
222,5,5,0
1514,6,5,1
417,5,5,0
754,6,6,0
516,6,6,0
1120,8,7,1
180,5,5,0
82,5,5,0
632,6,6,0
592,5,5,0


In [16]:
# init the model
rfc = RandomForestClassifier(n_estimators=200)

In [17]:
# fit the model to the train data
rfc.fit(X_train, y_train)

RandomForestClassifier(n_estimators=200)

In [18]:
# predict the test data
y_pred_rfc = rfc.predict(X_test)

In [19]:
print('Mean Squared Error RandomForestClassifier: ', mean_squared_error(y_test, y_pred_rfc))

Mean Squared Error RandomForestClassifier:  0.40611353711790393


In [20]:
data = pd.DataFrame({"Actual": y_test, "Prediction": y_pred_rfc, "Loss":abs(y_pred_rfc-y_test)})
data.head(30)

Unnamed: 0_level_0,Actual,Prediction,Loss
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
222,5,5,0
1514,6,6,0
417,5,5,0
754,6,5,1
516,6,5,1
1120,8,6,2
180,5,5,0
82,5,5,0
632,6,5,1
592,5,5,0


In [21]:
df.columns

Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'quality'],
      dtype='object')