                                     Logistic Regression Model

In [117]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [118]:
# extracting the wine_quality dataset that has to be processed using read_csv from pandas
data_set = pd.read_csv(r"winequality_red.csv")

In [119]:
data_set.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [120]:
# copying the dataset to other
data_copy = data_set.copy()

Finding the min and max for quality of data_set, so that the quality can be categorised into good(1) or bad(0).

In [121]:
max(data_copy["quality"])

8

In [122]:
min(data_copy["quality"])

3

As the min quality is 3 and max is 8, Hence we divide into two equal sets, such as 3 to 5 is 0 and 6 to 8 is 1

In [123]:
data_copy["quality"] = np.where(data_copy["quality"]>5, 1, 0) # categorizing the quality based on range between 3 and 8
#i.e 3 to 5 = 0, 6 to 8 = 1

In [124]:
data_copy.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,0
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,0
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,0
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,1
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,0


Here we choose our dependent variable as quality

In [125]:
Y = data_copy["quality"] # dependent 

In [126]:
Y.head()

0    0
1    0
2    0
3    1
4    0
Name: quality, dtype: int32

Now choosing all other independent variables or features that affects wine by dropping off the quality columnn data

In [127]:
X = data_copy.drop(columns = ["quality"]) # dropping off the dependent variable column

In [128]:
X.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4


As we can see in the above data, feature values differ from others. Hence normalising the data using min max feature scaling

In [129]:
for column in X:
    X[column] = (X[column] - X[column].min())/(X[column].max() - X[column].min())
X.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,0.247788,0.39726,0.0,0.068493,0.106845,0.140845,0.09894,0.567548,0.606299,0.137725,0.153846
1,0.283186,0.520548,0.0,0.116438,0.143573,0.338028,0.215548,0.494126,0.362205,0.209581,0.215385
2,0.283186,0.438356,0.04,0.09589,0.133556,0.197183,0.169611,0.508811,0.409449,0.191617,0.215385
3,0.584071,0.109589,0.56,0.068493,0.105175,0.225352,0.190813,0.582232,0.330709,0.149701,0.215385
4,0.247788,0.39726,0.0,0.068493,0.106845,0.140845,0.09894,0.567548,0.606299,0.137725,0.153846


In [130]:
from sklearn.model_selection import train_test_split

In [131]:
# splitting dataset into 80% for training and 20% testing data
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)

In [132]:
len(x_train)

1279

In [133]:
len(x_test)

320

In [134]:
from sklearn.linear_model import LogisticRegression

Here using lbfgs solver, which is Limited memory Broyden Fletcher Goldfarb shanno. Approximates second derivative matrix updates with gradient evaluations.

In [135]:
model = LogisticRegression(solver='lbfgs', random_state=0)

In [147]:
model.fit(x_train, y_train) # training the model using logistic regression

LogisticRegression(random_state=0)

In [137]:
model.coef_

array([[ 1.58478834, -3.19136729, -0.30444188,  0.50735387, -1.31452232,
         0.84321816, -2.92954584, -1.185047  , -0.3102203 ,  2.87532968,
         4.90306791]])

In [138]:
y_pred = model.predict(x_test) # predict the label of data_set

In [139]:
y_pred

array([0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0,
       1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1,
       0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1,
       0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1,
       0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1,
       1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0,
       1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1,
       1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0,

T=> TRUE <br>P=> POSITIVE <br>
F=> FALSE <br> N=> NEGATIVE

In [140]:
from sklearn.metrics import confusion_matrix # using confusion matrix for finding TP FP FN TN

In [141]:
confusion_mtx = confusion_matrix(y_test, y_pred)
confusion_mtx

array([[104,  42],
       [ 51, 123]], dtype=int64)

TP = confusion_mtx[0][0] <br>
TN = confusion_mtx[1][1] <br>
FP = confusion_mtx[0][1] <br>
FN = confusion_mtx[1][0]

In [142]:
TP, TN = confusion_mtx[0][0], confusion_mtx[1][1]
FP, FN = confusion_mtx[0][1], confusion_mtx[1][0]

In [143]:
print(f"TRUE POSITIVE: {TP}\nTRUE NEGATIVE: {TN}\nFALSE POSITIVE: {FP}\nFALSE NEGATIVE: {FN}")

TRUE POSITIVE: 104
TRUE NEGATIVE: 123
FALSE POSITIVE: 42
FALSE NEGATIVE: 51


In [144]:
accuracy_1 = (TP+TN)/(TP+TN+FP+FN)
accuracy_1

0.709375

In [145]:
from sklearn.metrics import accuracy_score # using accuracy_score

In [146]:
accuracy_2 = accuracy_score(y_test, y_pred)
accuracy_2

0.709375