In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb


In [23]:
red_data = pd.read_csv("winequality-red.csv", sep=";")
white_data = pd.read_csv("winequality-white.csv", sep=";")


In [55]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

X_r_train, X_r_test, y_r_train, y_r_test = train_test_split(
    red_data.drop(["quality"], axis=1), red_data["quality"], random_state=100
)
X_w_train, X_w_test, y_w_train, y_w_test = train_test_split(
    white_data.drop(["quality"], axis=1), white_data["quality"], random_state=100
)

pipe = Pipeline(
    [
        ("scaler", StandardScaler()),
        ("logreg", LogisticRegression(random_state=100, max_iter=500)),
    ]
)
pipe.fit(X_r_train, y_r_train)
print(
    "Score for LogisticRegression red wine dataset  ",
    round(pipe.score(X_r_test, y_r_test), 4),
)

pipe = Pipeline(
    [
        ("scaler", StandardScaler()),
        ("logreg", LogisticRegression(random_state=100, max_iter=500)),
    ]
)
pipe.fit(X_w_train, y_w_train)
print(
    "Score for LogisticRegression white wine dataset",
    round(pipe.score(X_w_test, y_w_test), 4),
)

print("Set of possible labels for both datasets:", (set(y_w_train) | set(y_r_train)))

Score for LogisticRegression red wine dataset   0.6125
Score for LogisticRegression white wine dataset 0.5404
Set of possible labels for both datasets {3, 4, 5, 6, 7, 8, 9}


The main advantage of multi-class is that it much clearer, since the quality is actually discrete, and so a quality of 6.482234 would never exists in the real world. The disadvantage is that it cannot extrapolate, i.e. the multi-class model can't predict outside of the existing label set: $\{3,4,5,6,7,8,9\}$.

# Part 2


In [25]:
# Load data
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
test_label = pd.read_csv("test_label.csv")


In [26]:
# See non null count, and type of data
train.info()

# See two random entries
train.sample(2)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12870 entries, 0 to 12869
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   ID         12870 non-null  int64 
 1   age        12870 non-null  int64 
 2   job        12870 non-null  object
 3   marital    12870 non-null  object
 4   education  12870 non-null  object
 5   default    12870 non-null  object
 6   balance    12870 non-null  int64 
 7   housing    12870 non-null  object
 8   loan       12870 non-null  object
 9   contact    12870 non-null  object
 10  day        12870 non-null  int64 
 11  month      12870 non-null  object
 12  campaign   12870 non-null  int64 
 13  pdays      12870 non-null  int64 
 14  previous   12870 non-null  int64 
 15  poutcome   12870 non-null  object
 16  y          12870 non-null  object
dtypes: int64(7), object(10)
memory usage: 1.7+ MB


Unnamed: 0,ID,age,job,marital,education,default,balance,housing,loan,contact,day,month,campaign,pdays,previous,poutcome,y
3291,11495,45,services,married,secondary,no,4971,no,no,cellular,18,aug,9,-1,0,unknown,no
5283,19574,52,services,married,primary,no,2618,yes,no,cellular,21,nov,2,-1,0,unknown,no


In [27]:
# List of categorical columns
catetgorical = [
    "job",
    "marital",
    "education",
    "default",
    "housing",
    "loan",
    "contact",
    "month",
    "poutcome",
]
# List of numerical columns
numerical = ["age", "balance", "campaign", "pdays", "day", "previous"]

# Column with wrong dtype
train["day"] = train["day"].astype(object)
test["day"] = test["day"].astype(object)

# ID is an unnecessary column for the model
train = train.drop(["ID"], axis=1)
test = test.drop(["ID"], axis=1)


In [69]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from typing import Union, List, Dict
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.utils._testing import ignore_warnings
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

X_train = train.drop(["y"], axis=1)
y_train = train["y"]
X_test = test
y_test = test_label.drop(["ID"], axis=1)


def get_scaler_and_ohe(
    classifier: Union[LogisticRegression, LinearSVC, KNeighborsClassifier],
    scaler: Union[Union[StandardScaler, MinMaxScaler], None] = None,
) -> Pipeline:
    """Creates a pipeline, that one hot encoeds the categorical variables and scales the variables (if wanted). The names in the pipe are the same as the parameter names. I.e. scaler and classifier.

    Args:
        classifier (Union[LogisticRegression, LinearSVC, KNeighborsClassifier]): classifier
        scaler (Union[Union[StandardScaler, MinMaxScaler], None], optional): scaler to use. Defaults to None.

    Returns:
        Pipeline: Appropiate pipeline for the given parameters/models.
    """
    if scaler:
        ct = ColumnTransformer(
            [
                ("scaling", scaler, numerical),
                ("onehot", OneHotEncoder(handle_unknown="ignore"), catetgorical),
            ]
        )
        return Pipeline([("scaler", ct), ("classifier", classifier)])
    ct = ColumnTransformer(
        [("onehot", OneHotEncoder(handle_unknown="ignore"), catetgorical)]
    )
    return Pipeline([("scaler", ct), ("classifier", classifier)])


@ignore_warnings()
def grid_search_and_test(model: Pipeline, params: Dict[str, List[float]]) -> None:
    """Prints the best parameters found for the model, and the score on the test data"""
    gsc = GridSearchCV(model, param_grid=params, n_jobs=4, error_score=0.0)
    gsc.fit(X_train, y_train)

    print("Best parameters found are:", gsc.best_params_)
    model.set_params(**gsc.best_params_)
    model.fit(X_train, y_train)

    pred = model.predict(X_test)
    
    scores = {
        "Accuracy": [accuracy_score(y_test, pred)],
        "Macro Averaged Precision": [precision_score(y_test, pred, average="macro")],
        "Macro Average Recall": [recall_score(y_test, pred, average="macro")],
        "F1 Macro": [f1_score(y_test, pred, average="macro")],
        "Micro Average Precision": [precision_score(y_test, pred, average="micro")],
        "Micro Average Recall": [recall_score(y_test, pred, average="micro")],
        "F1 Micro": [f1_score(y_test, pred, average="micro")],
    }
    score_df = pd.DataFrame(scores)
    
    return score_df


def test_scalers(
    model: Union[LogisticRegression, LinearSVC, KNeighborsClassifier],
    params: Dict[str, List[float]],
    name: str = "",
) -> None:
    """Test the model on Standard Scaler, Min Max Scaler and no scaler, and report the appropiate scores."""
    print(f"No Scaler, {name}")
    pipe1 = get_scaler_and_ohe(model)
    df_none = grid_search_and_test(pipe1, params=params)

    print("~" * 60)
    print(f"Standard Scaler, {name}")
    pipe2 = get_scaler_and_ohe(scaler=StandardScaler(), classifier=model)
    df_std = grid_search_and_test(pipe2, params=params)

    print("~" * 60)
    print(f"MinMax Scaler, {name}")
    pipe3 = get_scaler_and_ohe(scaler=MinMaxScaler(), classifier=model)
    df_min_max = grid_search_and_test(pipe3, params=params)

    df_std["Scaler"] = ["Standard Scaler"]
    df_min_max["Scaler"] = ["Min Max Scaler"]
    df_none["Scaler"] = ["No Scaler"]

    df_comb = pd.concat([df_none, df_std, df_min_max])
    df_comb = df_comb.set_index("Scaler")

    return df_comb.transpose()

def df_diff_maker(data: pd.DataFrame) -> pd.DataFrame:
    """Function that gets the difference between no scaler and the two different type of scalers used. The difference is also scaled up to make it easier to comprehend."""
    data["diff_std"] = (data["Standard Scaler"] - data["No Scaler"])*1000
    data["diff_min_max"] = (data["Min Max Scaler"] - data["No Scaler"])*1000

    return data


In [70]:
df = test_scalers(
    LinearSVC(max_iter=1500, random_state=100),
    params={"classifier__C": np.logspace(-5, 1)},
    name="LinearSVC",
)
df_diff_maker(df)

No Scaler, LinearSVC
Best parameters found are: {'classifier__C': 0.14563484775012445}
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Standard Scaler, LinearSVC
Best parameters found are: {'classifier__C': 3.2374575428176398}
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
MinMax Scaler, LinearSVC
Best parameters found are: {'classifier__C': 0.7906043210907702}


Scaler,No Scaler,Standard Scaler,Min Max Scaler,diff_std,diff_min_max
Accuracy,0.773712,0.77581,0.776276,2.097413,2.563505
Macro Averaged Precision,0.773394,0.775228,0.776474,1.834119,3.079773
Macro Average Recall,0.663385,0.667209,0.667546,3.823549,4.160363
F1 Macro,0.679555,0.68399,0.684424,4.43584,4.869213
Micro Average Precision,0.773712,0.77581,0.776276,2.097413,2.563505
Micro Average Recall,0.773712,0.77581,0.776276,2.097413,2.563505
F1 Micro,0.773712,0.77581,0.776276,2.097413,2.563505


In [71]:
df = test_scalers(
    LogisticRegression(max_iter=500, random_state=100),
    params={"classifier__C": np.logspace(-4, 1, num=70)},
    name="LogisticRegression",
)

df_diff_maker(df)

No Scaler, LogisticRegression
Best parameters found are: {'classifier__C': 0.8185467307069029}
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Standard Scaler, LogisticRegression
Best parameters found are: {'classifier__C': 0.8185467307069029}
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
MinMax Scaler, LogisticRegression
Best parameters found are: {'classifier__C': 0.9671798642975443}


Scaler,No Scaler,Standard Scaler,Min Max Scaler,diff_std,diff_min_max
Accuracy,0.777208,0.779539,0.779539,2.330459,2.330459
Macro Averaged Precision,0.772669,0.773333,0.774518,0.663437,1.848531
Macro Average Recall,0.672206,0.677457,0.676617,5.250804,4.411572
F1 Macro,0.689456,0.695217,0.694387,5.761248,4.931145
Micro Average Precision,0.777208,0.779539,0.779539,2.330459,2.330459
Micro Average Recall,0.777208,0.779539,0.779539,2.330459,2.330459
F1 Micro,0.777208,0.779539,0.779539,2.330459,2.330459


In [72]:
df = test_scalers(
    KNeighborsClassifier(),
    params={
        "classifier__n_neighbors": np.linspace(1, 50, dtype=int),
        "classifier__weights": ["uniform", "distance"],
    },
    name="k-nearest neighbors",
)
df_diff_maker(df)

No Scaler, k-nearest neighbors
Best parameters found are: {'classifier__n_neighbors': 35, 'classifier__weights': 'uniform'}
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Standard Scaler, k-nearest neighbors
Best parameters found are: {'classifier__n_neighbors': 19, 'classifier__weights': 'uniform'}
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
MinMax Scaler, k-nearest neighbors
Best parameters found are: {'classifier__n_neighbors': 21, 'classifier__weights': 'uniform'}


Scaler,No Scaler,Standard Scaler,Min Max Scaler,diff_std,diff_min_max
Accuracy,0.759497,0.768585,0.776742,9.08879,17.245397
Macro Averaged Precision,0.744538,0.752215,0.764553,7.67738,20.015172
Macro Average Recall,0.647448,0.665765,0.677114,18.317113,29.666361
F1 Macro,0.660273,0.68126,0.694235,20.987268,33.961829
Micro Average Precision,0.759497,0.768585,0.776742,9.08879,17.245397
Micro Average Recall,0.759497,0.768585,0.776742,9.08879,17.245397
F1 Micro,0.759497,0.768585,0.776742,9.08879,17.245397
