# Importing libraries

In [24]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

In [25]:
df = pd.read_csv('red_wine_quality_data\wine_quality_training.csv')
df.columns

Index(['Unnamed: 0', 'fixed acidity', 'volatile acidity', 'citric acid',
       'residual sugar', 'chlorides', 'free sulfur dioxide',
       'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol',
       'quality_labels'],
      dtype='object')

In [26]:
df.drop(['Unnamed: 0'],axis=1,inplace=True)

In [27]:
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality_labels
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,0
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,0
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,0
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,1
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,0


In [47]:
df[df['quality_labels']==1]

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality_labels
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,1
7,7.3,0.65,0.00,1.2,0.065,15.0,21.0,0.99460,3.39,0.47,10.0,1
8,7.8,0.58,0.02,2.0,0.073,9.0,18.0,0.99680,3.36,0.57,9.5,1
16,8.5,0.28,0.56,1.8,0.092,35.0,103.0,0.99690,3.30,0.75,10.5,1
19,7.9,0.32,0.51,1.8,0.341,17.0,56.0,0.99690,3.04,1.08,9.2,1
...,...,...,...,...,...,...,...,...,...,...,...,...
1592,6.3,0.51,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,1
1593,6.8,0.62,0.08,1.9,0.068,28.0,38.0,0.99651,3.42,0.82,9.5,1
1595,5.9,0.55,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,1
1596,6.3,0.51,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,1


# Feature importance using Chi-square test

In [28]:
# dividing the data into predictors and target

X = df.iloc[:,:-1]
y = df['quality_labels']

In [29]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

best_ft = SelectKBest(score_func = chi2, k = 10)
fit = best_ft.fit(X,y)
feat = pd.DataFrame(fit.scores_)
score = pd.DataFrame(X.columns)
new_df = pd.concat([score,feat], axis = 1)
new_df.columns = ['FEATURES','SCORE']
new_df.sort_values(by='SCORE',ascending=False)

Unnamed: 0,FEATURES,SCORE
6,total sulfur dioxide,2002.3059
5,free sulfur dioxide,42.005907
10,alcohol,32.908633
1,volatile acidity,10.02971
2,citric acid,5.666703
0,fixed acidity,5.265257
9,sulphates,3.317617
4,chlorides,0.485196
3,residual sugar,0.00584
7,density,0.000145


In [30]:
# splitting the data into training and testing sets

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=.25,random_state=42)

# Using select few classification algorithms such as -

* Logistic regression

* Decision tree

* Random forest 

* Bernoulli Naive-Bayes 

* k-Nearest Neighbor 

In [31]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

log_reg = LogisticRegression()
dtc = DecisionTreeClassifier()
rfc = RandomForestClassifier()
bnb = BernoulliNB()
knn = KNeighborsClassifier(n_neighbors=5)

In [32]:
def metrics_evaluator(alg):
    alg.fit(X_train,y_train)
    y_pred_test = alg.predict(X_test)
    y_pred_train = alg.predict(X_train)
    print(classification_report(y_pred_test,y_test)) 
    print('Testing Accuracy:', accuracy_score(y_pred_test,y_test))
    print('Training Accuracy:', accuracy_score(y_pred_train,y_train))

In [33]:
metrics_evaluator(log_reg)

              precision    recall  f1-score   support

           0       0.71      0.69      0.70       183
           1       0.74      0.76      0.75       217

    accuracy                           0.73       400
   macro avg       0.73      0.72      0.72       400
weighted avg       0.73      0.73      0.73       400

Testing Accuracy: 0.7275
Training Accuracy: 0.7547956630525438


In [34]:
metrics_evaluator(dtc)

              precision    recall  f1-score   support

           0       0.74      0.71      0.72       184
           1       0.76      0.78      0.77       216

    accuracy                           0.75       400
   macro avg       0.75      0.75      0.75       400
weighted avg       0.75      0.75      0.75       400

Testing Accuracy: 0.75
Training Accuracy: 1.0


In [35]:
metrics_evaluator(rfc)

              precision    recall  f1-score   support

           0       0.75      0.77      0.76       172
           1       0.82      0.80      0.81       228

    accuracy                           0.79       400
   macro avg       0.79      0.79      0.79       400
weighted avg       0.79      0.79      0.79       400

Testing Accuracy: 0.79
Training Accuracy: 1.0


In [36]:
metrics_evaluator(bnb)

              precision    recall  f1-score   support

           0       0.11      0.56      0.18        34
           1       0.93      0.57      0.70       366

    accuracy                           0.56       400
   macro avg       0.52      0.56      0.44       400
weighted avg       0.86      0.56      0.66       400

Testing Accuracy: 0.565
Training Accuracy: 0.5312760633861552


In [37]:
metrics_evaluator(knn)

              precision    recall  f1-score   support

           0       0.58      0.57      0.58       182
           1       0.65      0.66      0.65       218

    accuracy                           0.62       400
   macro avg       0.62      0.62      0.62       400
weighted avg       0.62      0.62      0.62       400

Testing Accuracy: 0.62
Training Accuracy: 0.7731442869057548


Random forest gives us the most accurate values at 79%. But the training data is overfitting so we choose the <b>Logistic Regression</b> because it generalises well.

Looking for the best parameters for tuning.

In [39]:
from sklearn.model_selection import GridSearchCV
 
# Creating the hyperparameter grid
c_space = np.logspace(-5, 8, 15)
param_grid = {'C': c_space}
 
# Instantiating logistic regression classifier
logreg_test = LogisticRegression()
 
# Instantiating the GridSearchCV object
logreg_cv = GridSearchCV(logreg_test, param_grid, cv = 5)
 
logreg_cv.fit(X_train, y_train)
 
# Print the tuned parameters and score
print("Tuned Logistic Regression Parameters: {}".format(logreg_cv.best_params_))
print("Best score is {}".format(logreg_cv.best_score_))

Tuned Logistic Regression Parameters: {'C': 3.727593720314938}
Best score is 0.7506834030683402


In [41]:
logreg_final = LogisticRegression(C= 3.727593720314938)
metrics_evaluator(logreg_final)

              precision    recall  f1-score   support

           0       0.72      0.70      0.71       183
           1       0.75      0.77      0.76       217

    accuracy                           0.74       400
   macro avg       0.74      0.73      0.73       400
weighted avg       0.74      0.74      0.74       400

Testing Accuracy: 0.7375
Training Accuracy: 0.7522935779816514


The tuned Logistic Regression model seems to have generalised even better however, the accuracy hasn't improved significantly. We will go ahead with the Logistic Regression algorithm solely based on its ability to generalise as opposed to its other classification counterparts.

# Dumping the tuned Log Reg model

In [42]:
logreg_final.fit(X_train,y_train)

LogisticRegression(C=3.727593720314938)

In [43]:
import pickle

# dumping the random forest model:

#writing the pickle file
with open("model.pkl","wb") as f:
    pickle.dump(logreg_final,f)

In [44]:
# testing the model

with open("model.pkl","rb") as f:
    pred = pickle.load(f)

In [45]:
pred.predict([[7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4]])

array([0], dtype=int64)