In [83]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [84]:
df = pd.read_csv("data/red_wine_quality.csv")

In [85]:
#From wine quality, it is can be classified as good & bad on the basis
# if it is <5.5 --> bad
# if it is >5.5 --> good

df['quality'] = df['quality'].apply(lambda x: 'good' if x > 5.5 else 'bad')

In [86]:
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,bad
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,bad
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,bad
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,good
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,bad


In [87]:
#Removing duplicates
df.drop_duplicates(inplace=True)

In [88]:
#checking shape
df.shape

(1359, 12)

In [89]:
#Separating Dependent & Independent Data

X = df.drop(labels = ["quality"], axis = 1)
Y = df[["quality"]]

In [90]:
#Defiinig the numerical cols for preprocessing
num_cols = ["fixed acidity", "volatile acidity", "citric acid", "residual sugar", "chlorides", "free sulfur dioxide", "total sulfur dioxide", "density", "pH", "sulphates", "alcohol"]
target_cols = ["quality"]


In [91]:
#Importing libs for simple imputer for missing values & Standard scaler for preprocessing

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

In [92]:
#For pipeline
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer

In [93]:
#Numerical Pipeline
num_pipeline = Pipeline(
    steps = [
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler())
         
    ]
)

In [94]:
#Label pipeline 
lab_pipeline = Pipeline(
    steps = [
        ("imputer", SimpleImputer(strategy = "most_frequent")),
        ("labelencoder", LabelEncoder())
    ]
) 

In [95]:
Y

Unnamed: 0,quality
0,bad
1,bad
2,bad
3,good
5,bad
...,...
1593,good
1594,bad
1595,good
1597,bad


In [96]:
Y["quality"] = LabelEncoder().fit_transform(Y["quality"])

In [97]:
Y

Unnamed: 0,quality
0,0
1,0
2,0
3,1
5,0
...,...
1593,1
1594,0
1595,1
1597,0


In [98]:
Y["quality"].value_counts()

quality
1    719
0    640
Name: count, dtype: int64

In [99]:
preprocessor = ColumnTransformer([
    ("num_pipeline", num_pipeline, num_cols)
])

In [100]:
#Training Dataset
from sklearn.model_selection import train_test_split

In [101]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.3, random_state=100)

In [102]:
#Transforming the data

X_train = pd.DataFrame(preprocessor.fit_transform(X_train), columns=preprocessor.get_feature_names_out())
X_test = pd.DataFrame(preprocessor.transform(X_test), columns=preprocessor.get_feature_names_out())

In [103]:
#LabelEncoder 

In [104]:
X_train.head()

Unnamed: 0,num_pipeline__fixed acidity,num_pipeline__volatile acidity,num_pipeline__citric acid,num_pipeline__residual sugar,num_pipeline__chlorides,num_pipeline__free sulfur dioxide,num_pipeline__total sulfur dioxide,num_pipeline__density,num_pipeline__pH,num_pipeline__sulphates,num_pipeline__alcohol
0,-0.814164,0.831393,-1.389797,-0.011751,0.276543,0.551176,-0.311659,-0.050228,0.987656,-0.508962,0.162165
1,-0.238627,-1.572634,0.624678,-0.702978,-0.616997,-0.501659,-0.667325,0.0034,0.087919,1.12441,-1.613689
2,-0.065966,-1.194473,0.624678,-0.24216,-0.580526,-0.980221,-1.111907,-0.715224,0.023652,0.089941,0.722961
3,-0.583949,2.046913,-0.886178,-0.549372,-0.288758,0.359751,0.251478,0.014126,0.28072,-0.563407,-1.146359
4,-0.75661,0.372197,-0.785455,-0.24216,-0.124638,-0.310235,-0.578409,-0.050228,1.373258,-0.236733,-0.211699


In [105]:
## Model Training 

from sklearn.linear_model import LogisticRegression, RidgeClassifier, LogisticRegressionCV, LassoCV, ElasticNetCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [106]:
#Model Evaluation
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [133]:
def evaluate_model(true, pred):
    acc = accuracy_score(true, pred)
    report = classification_report(true, pred)
    matrix = confusion_matrix(true, pred)
    return f"{acc*100}%", report, matrix

In [134]:
#Model Trainer 

models = {
    "LogisticRegression" : LogisticRegression(),
    "RidgeClassifier" : RidgeClassifier(),
    "LogisticRegressionCV" : LogisticRegressionCV(),
    "DecisionTree" : DecisionTreeClassifier(),
    "SVC" : SVC(),
    "RandomForest" : RandomForestClassifier(),
    "GradientClassifier" : GradientBoostingClassifier()
}

In [135]:
model_list = []
r2_list = []

In [136]:
list(models.values())

[LogisticRegression(),
 RidgeClassifier(),
 LogisticRegressionCV(),
 DecisionTreeClassifier(),
 SVC(),
 RandomForestClassifier(),
 GradientBoostingClassifier()]

In [137]:
for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_test)
    
    acc, report, matrix = evaluate_model(y_test, y_pred)
    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])
    
    print("Model Training Performance")
    print("accuracy", acc*100, "%")
    print("Report", report)
    print("matrix", matrix)
    
    r2_list.append(matrix)
    
    print("*"*100)

LogisticRegression
Model Training Performance
accuracy 73.77450980392157%73.77450980392157%73.77450980392157%73.77450980392157%73.77450980392157%73.77450980392157%73.77450980392157%73.77450980392157%73.77450980392157%73.77450980392157%73.77450980392157%73.77450980392157%73.77450980392157%73.77450980392157%73.77450980392157%73.77450980392157%73.77450980392157%73.77450980392157%73.77450980392157%73.77450980392157%73.77450980392157%73.77450980392157%73.77450980392157%73.77450980392157%73.77450980392157%73.77450980392157%73.77450980392157%73.77450980392157%73.77450980392157%73.77450980392157%73.77450980392157%73.77450980392157%73.77450980392157%73.77450980392157%73.77450980392157%73.77450980392157%73.77450980392157%73.77450980392157%73.77450980392157%73.77450980392157%73.77450980392157%73.77450980392157%73.77450980392157%73.77450980392157%73.77450980392157%73.77450980392157%73.77450980392157%73.77450980392157%73.77450980392157%73.77450980392157%73.77450980392157%73.77450980392157%73.774509

In [None]:
model_list

['LogisticRegression',
 'RidgeClassifier',
 'DecisionTree',
 'SVC',
 'RandomForest',
 'GradientClassifier']

In [138]:
#Model list
models = [LogisticRegression(),
 RidgeClassifier(),
 LogisticRegressionCV(),
 DecisionTreeClassifier(),
 SVC(),
 RandomForestClassifier(),
 GradientBoostingClassifier()]

In [140]:
for i in range(len(models)):
    model = models[i]
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc, report, matrix = evaluate_model(y_test, y_pred)
    print(f"{models[i]}")
    print(acc)
    print(matrix)
    print("*"*100)
    
    

LogisticRegression()
73.77450980392157%
[[146  58]
 [ 49 155]]
****************************************************************************************************
RidgeClassifier()
74.26470588235294%
[[149  55]
 [ 50 154]]
****************************************************************************************************
LogisticRegressionCV()
72.79411764705883%
[[142  62]
 [ 49 155]]
****************************************************************************************************
DecisionTreeClassifier()
70.58823529411765%
[[133  71]
 [ 49 155]]
****************************************************************************************************
SVC()
76.7156862745098%
[[147  57]
 [ 38 166]]
****************************************************************************************************
RandomForestClassifier()
75.98039215686273%
[[143  61]
 [ 37 167]]
****************************************************************************************************
GradientBoostingClassifie