In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb


In [2]:
red_data = pd.read_csv("winequality-red.csv", sep=";")
white_data = pd.read_csv("winequality-white.csv", sep=";")


In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

X_r_train, X_r_test, y_r_train, y_r_test = train_test_split(
    red_data.drop(["quality"], axis=1), red_data["quality"], random_state=100
)
X_w_train, X_w_test, y_w_train, y_w_test = train_test_split(
    white_data.drop(["quality"], axis=1), white_data["quality"], random_state=100
)

pipe = Pipeline(
    [
        ("scaler", StandardScaler()),
        ("logreg", LogisticRegression(random_state=100, max_iter=500)),
    ]
)
pipe.fit(X_r_train, y_r_train)
print(
    "Score for LogisticRegression red wine dataset  ",
    round(pipe.score(X_r_test, y_r_test), 4),
)

pipe = Pipeline(
    [
        ("scaler", StandardScaler()),
        ("logreg", LogisticRegression(random_state=100, max_iter=500)),
    ]
)
pipe.fit(X_w_train, y_w_train)
print(
    "Score for LogisticRegression white wine dataset",
    round(pipe.score(X_w_test, y_w_test), 4),
)


Score for LogisticRegression red wine dataset   0.6125
Score for LogisticRegression white wine dataset 0.5404


# Part 2


In [4]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
test_label = pd.read_csv("test_label.csv")


In [5]:
train.info()
train.sample(2)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12870 entries, 0 to 12869
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   ID         12870 non-null  int64 
 1   age        12870 non-null  int64 
 2   job        12870 non-null  object
 3   marital    12870 non-null  object
 4   education  12870 non-null  object
 5   default    12870 non-null  object
 6   balance    12870 non-null  int64 
 7   housing    12870 non-null  object
 8   loan       12870 non-null  object
 9   contact    12870 non-null  object
 10  day        12870 non-null  int64 
 11  month      12870 non-null  object
 12  campaign   12870 non-null  int64 
 13  pdays      12870 non-null  int64 
 14  previous   12870 non-null  int64 
 15  poutcome   12870 non-null  object
 16  y          12870 non-null  object
dtypes: int64(7), object(10)
memory usage: 1.7+ MB


Unnamed: 0,ID,age,job,marital,education,default,balance,housing,loan,contact,day,month,campaign,pdays,previous,poutcome,y
3235,20049,33,management,single,tertiary,no,275,no,no,cellular,18,nov,1,-1,0,unknown,no
3540,16819,25,services,married,secondary,no,-97,yes,no,cellular,6,may,1,-1,0,unknown,no


In [6]:
# List of categorical columns
catetgorical = [
    "job",
    "marital",
    "education",
    "default",
    "housing",
    "loan",
    "contact",
    "day",
    "month",
    "campaign",
    "pdays",
    "previous",
    "poutcome",
]
# List of numerical columns
numerical = ["age", "balance", "campaign", "pdays"]

# Column with wrong dtype
train["day"] = train["day"].astype(object)
test["day"] = test["day"].astype(object)

# ID is an unnecessary column for the model
train = train.drop(["ID"], axis=1)
test = test.drop(["ID"], axis=1)


In [101]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from typing import Union, List, Dict
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.utils._testing import ignore_warnings
from sklearn.metrics import accuracy_score, f1_score, precision_recall_fscore_support

X_train = train.drop(["y"], axis=1)
y_train = train["y"]
X_test = test
y_test = test_label.drop(["ID"], axis=1)


def get_scaler_and_ohe(
    classifier: Union[LogisticRegression, LinearSVC, KNeighborsClassifier],
    scaler: Union[Union[StandardScaler, MinMaxScaler], None] = None,
) -> Pipeline:
    """Creates a pipeline, that one hot encoeds the categorical variables and scales the variables (if wanted). The names in the pipe are the same as the parameter names. I.e. scaler and classifier.

    Args:
        classifier (Union[LogisticRegression, LinearSVC, KNeighborsClassifier]): classifier
        scaler (Union[Union[StandardScaler, MinMaxScaler], None], optional): scaler to use. Defaults to None.

    Returns:
        Pipeline: Appropiate pipeline for the given parameters/models.
    """
    if scaler:
        ct = ColumnTransformer(
            [
                ("scaling", scaler, numerical),
                ("onehot", OneHotEncoder(handle_unknown="ignore"), catetgorical),
            ]
        )
        return Pipeline([("scaler", ct), ("classifier", classifier)])
    ct = ColumnTransformer(
        [("onehot", OneHotEncoder(handle_unknown="ignore"), catetgorical)]
    )
    return Pipeline([("scaler", ct), ("classifier", classifier)])


@ignore_warnings()
def grid_search_and_test(model: Pipeline, params: Dict[str, List[float]]) -> None:
    """Prints the best parameters found for the model, and the score on the test data"""
    gsc = GridSearchCV(model, param_grid=params, n_jobs=-1)
    gsc.fit(X_train, y_train)
    print("Best parameters found are:", gsc.best_params_)
    model.set_params(**gsc.best_params_)
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    macro_p_r_f1_score = precision_recall_fscore_support(y_test, pred, average="macro")
    micro_p_r_f1_score = precision_recall_fscore_support(y_test, pred, average="micro")
    scores = {
        "Accuracy": [accuracy_score(y_test, pred)],
        "Macro Averaged Precision": [macro_p_r_f1_score[0]],
        "Macro Average Recall": [macro_p_r_f1_score[1]],
        "F1 Macro": [macro_p_r_f1_score[2]],
        "Micro Average Precision": [micro_p_r_f1_score[0]],
        "Micro Average Recall": [micro_p_r_f1_score[1]],
        "F1 Micro": [micro_p_r_f1_score[2]],
    }
    score_df = pd.DataFrame(scores)
    display(round(score_df,5))


def test_scalers(
    model: Union[LogisticRegression, LinearSVC, KNeighborsClassifier],
    params: Dict[str, List[float]],
    name: str = "",
) -> None:
    """Test the model on Standard Scaler, Min Max Scaler and no scaler, and report the appropiate scores."""
    print(f"Standard Scaler, {name}")
    pipe1 = get_scaler_and_ohe(scaler=StandardScaler(), classifier=model)
    grid_search_and_test(pipe1, params=params)

    print("~" * 60)
    print(f"MinMax Scaler, {name}")
    pipe2 = get_scaler_and_ohe(scaler=MinMaxScaler(), classifier=model)
    grid_search_and_test(pipe2, params=params)

    print("~" * 60)
    print(f"No Scaler, {name}")
    pipe3 = get_scaler_and_ohe(model)
    grid_search_and_test(pipe3, params=params)


In [102]:
test_scalers(LogisticRegression(max_iter=500), params={"classifier__C": np.linspace(0,4)}, name="LogisticRegression")

Standard Scaler, Logistic Regression
Best parameters found are: {'classifier__C': 0.4081632653061224}


Unnamed: 0,Accuracy,Macro Averaged Precision,Macro Average Recall,F1 Macro,Micro Average Precision,Micro Average Recall,F1 Micro
0,0.77931,0.76639,0.68232,0.69983,0.77931,0.77931,0.77931


~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
MinMax Scaler, Logistic Regression
Best parameters found are: {'classifier__C': 0.5714285714285714}


Unnamed: 0,Accuracy,Macro Averaged Precision,Macro Average Recall,F1 Macro,Micro Average Precision,Micro Average Recall,F1 Micro
0,0.77954,0.76619,0.68312,0.70063,0.77954,0.77954,0.77954


~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
No Scaler, Logistic Regression
Best parameters found are: {'classifier__C': 0.4897959183673469}


Unnamed: 0,Accuracy,Macro Averaged Precision,Macro Average Recall,F1 Macro,Micro Average Precision,Micro Average Recall,F1 Micro
0,0.77931,0.76639,0.68232,0.69983,0.77931,0.77931,0.77931


In [104]:
test_scalers(LinearSVC(max_iter=1500), params={"classifier__C": np.linspace(1e-5,5)},name="LinearSVC")

Standard Scaler, LinearSVC
Best parameters found are: {'classifier__C': 0.10205061224489796}


Unnamed: 0,Accuracy,Macro Averaged Precision,Macro Average Recall,F1 Macro,Micro Average Precision,Micro Average Recall,F1 Micro
0,0.77977,0.7719,0.67909,0.69687,0.77977,0.77977,0.77977


~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
MinMax Scaler, LinearSVC
Best parameters found are: {'classifier__C': 0.10205061224489796}


Unnamed: 0,Accuracy,Macro Averaged Precision,Macro Average Recall,F1 Macro,Micro Average Precision,Micro Average Recall,F1 Micro
0,0.78117,0.77467,0.68052,0.69859,0.78117,0.78117,0.78117


~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
No Scaler, LinearSVC
Best parameters found are: {'classifier__C': 0.10205061224489796}


Unnamed: 0,Accuracy,Macro Averaged Precision,Macro Average Recall,F1 Macro,Micro Average Precision,Micro Average Recall,F1 Micro
0,0.77837,0.76996,0.67703,0.69454,0.77837,0.77837,0.77837


In [106]:
test_scalers(KNeighborsClassifier(), params={"classifier__n_neighbors": [i for i in range(1,10)] + [i for i in range(10,30,4)]}, name="k-nearest neighbors")

Standard Scaler, k-nearest neighbors
Best parameters found are: {'classifier__n_neighbors': 26}


Unnamed: 0,Accuracy,Macro Averaged Precision,Macro Average Recall,F1 Macro,Micro Average Precision,Micro Average Recall,F1 Micro
0,0.76649,0.75753,0.65607,0.67062,0.76649,0.76649,0.76649


~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
MinMax Scaler, k-nearest neighbors
Best parameters found are: {'classifier__n_neighbors': 26}


Unnamed: 0,Accuracy,Macro Averaged Precision,Macro Average Recall,F1 Macro,Micro Average Precision,Micro Average Recall,F1 Micro
0,0.76416,0.75217,0.65396,0.66804,0.76416,0.76416,0.76416


~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
No Scaler, k-nearest neighbors
Best parameters found are: {'classifier__n_neighbors': 26}


Unnamed: 0,Accuracy,Macro Averaged Precision,Macro Average Recall,F1 Macro,Micro Average Precision,Micro Average Recall,F1 Micro
0,0.76789,0.76244,0.65624,0.67095,0.76789,0.76789,0.76789
