In [89]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import PolynomialFeatures
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

In [68]:
df_white_wine = pd.read_csv("white_wine.csv")
df_white_wine.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6.0
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6.0
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6.0
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6.0
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6.0


In [69]:
df_white_wine['quality'] = np.where(df_white_wine['quality'] > 6.0, 1, 0)
df_white_wine = df_white_wine.dropna()
df_white_wine.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,0
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,0
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,0
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,0
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,0


In [70]:
data_classification = df_white_wine[['density', 'alcohol', 'quality']].copy()
data_classification

Unnamed: 0,density,alcohol,quality
0,1.0010,8.8,0
1,0.9940,9.5,0
2,0.9951,10.1,0
3,0.9956,9.9,0
4,0.9956,9.9,0
...,...,...,...
514,1.0002,10.3,1
515,0.9926,10.4,0
516,0.9934,9.1,0
517,0.9920,10.4,0


In [71]:
# Melihat Summary Numericalnya
jumlah_label_0 = data_classification[data_classification['quality'] == 0]['quality'].count()
jumlah_label_1 = data_classification[data_classification['quality'] == 1]['quality'].count()
jumlah_total_data = data_classification['quality'].count()
print(jumlah_label_0, jumlah_label_1, jumlah_total_data)


421 98 519


In [72]:
X = data_classification[['density', 'alcohol']].copy()
y = data_classification['quality'].copy()

In [73]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=2020, test_size=0.2)

In [74]:
logistic_regression_model = LogisticRegression().fit(X=X_train, y=y_train)

In [75]:
y_pred = logistic_regression_model.predict(X_test)

In [76]:
print("Logistic Regression Classification Report:\n", classification_report(y_test, y_pred))

Logistic Regression Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.96      0.92        84
           1       0.73      0.40      0.52        20

    accuracy                           0.86       104
   macro avg       0.80      0.68      0.72       104
weighted avg       0.84      0.86      0.84       104



In [77]:
polynomial_transform = PolynomialFeatures(
    degree = 3,
    interaction_only = False,
)

In [78]:
polynomial_transform.fit(X_train)

In [79]:
X_train_poly = polynomial_transform.transform(X_train)
X_test_poly = polynomial_transform.transform(X_test)

In [80]:
logistic_regression_model_polynomial = LogisticRegression(max_iter=1000).fit(X=X_train_poly, y=y_train)

In [81]:
y_pred_polynomial = logistic_regression_model_polynomial.predict(X_test_poly)

In [82]:
print("Logistic Regression Classification Report with Polynomial:\n", classification_report(y_test, y_pred_polynomial))

Logistic Regression Classification Report with Polynomial:
               precision    recall  f1-score   support

           0       0.94      1.00      0.97        84
           1       1.00      0.75      0.86        20

    accuracy                           0.95       104
   macro avg       0.97      0.88      0.91       104
weighted avg       0.95      0.95      0.95       104



Imbalanced Classification: Undersampling

In [83]:
rus = RandomUnderSampler(random_state=2020)
X_undersampled, y_undersampled = rus.fit_resample(X, y)

In [84]:
print(X_undersampled.count(), y_undersampled.count())

density    196
alcohol    196
dtype: int64 196


In [85]:
X_train_undersampled, X_test_undersampled, y_train_undersampled, y_test_undersampled = train_test_split(X_undersampled, y_undersampled, test_size=0.2, random_state=2020)

In [86]:
logistic_regression_undersampled_model = LogisticRegression().fit(X=X_train_undersampled, y=y_train_undersampled)

In [87]:
y_pred_logistic_regression_undersampled = logistic_regression_undersampled_model.predict(X=X_test_undersampled)

In [88]:
print("Logistic Regression Classification Report with undersampled:\n", classification_report(y_true=y_test_undersampled, y_pred=y_pred_logistic_regression_undersampled))

Logistic Regression Classification Report with undersampled:
               precision    recall  f1-score   support

           0       0.62      0.89      0.73        18
           1       0.86      0.55      0.67        22

    accuracy                           0.70        40
   macro avg       0.74      0.72      0.70        40
weighted avg       0.75      0.70      0.69        40



Imbalanced Classification: Oversampling

In [91]:
ros = RandomOverSampler(random_state=2020)
X_oversampled, y_oversampled = ros.fit_resample(X,y)

In [92]:
print(X_oversampled.count(), y_oversampled.count())

density    842
alcohol    842
dtype: int64 842


In [93]:
X_train_oversampled, X_test_oversampled, y_train_oversampled, y_test_oversampled = train_test_split(X_oversampled, y_oversampled, test_size=0.2, random_state=2020)

In [94]:
logistic_regression_oversampled_model = LogisticRegression().fit(X=X_train_oversampled, y=y_train_oversampled)

In [95]:
y_pred_logistic_regression_oversampled = logistic_regression_oversampled_model.predict(X=X_test_oversampled)

In [96]:
print("Logistic Regression Classification Report with undersampled:\n", classification_report(y_true=y_test_oversampled, y_pred=y_pred_logistic_regression_oversampled))

Logistic Regression Classification Report with undersampled:
               precision    recall  f1-score   support

           0       0.68      0.80      0.74        82
           1       0.78      0.64      0.70        87

    accuracy                           0.72       169
   macro avg       0.73      0.72      0.72       169
weighted avg       0.73      0.72      0.72       169

