# Author
Noah Call - A02361280

### Notes:
This notebook assumes the data is located at "../data/wine-quality-white.csv". The path can be changed by editing `data_path` in the first code cell (Config & Imports).

## Config & Imports

In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# Relative path to the wine-quality CSV; it is read with ';' as the separator further down
data_path = "../data/wine-quality-white.csv"
# Marker line confirming that this setup cell ran to the end
print("Config and imports compleated without issue. Proceed.")

Config and imports compleated without issue. Proceed.


## 1.

In [18]:
# Load the dataset (the file uses ';' as its field separator)
df = pd.read_csv(data_path, sep=';')

# Create the binary target variable: 0 when quality <= 5, 1 when it is higher.
# A vectorized comparison replaces the per-row Python lambda; for the integer
# quality scores it produces exactly the same 0/1 labels.
df['y'] = (df['quality'] > 5).astype(int)

# Check the result
df[['quality', 'y']].head()

Unnamed: 0,quality,y
0,6,1
1,6,1
2,6,1
3,6,1
4,6,1


## 2.

In [19]:
# Relative frequency of each target label (normalize=True returns fractions, not counts)
class_counts = df['y'].value_counts(normalize=True)

# Look each share up by its label value
class_counts_0 = class_counts.loc[0]  # fraction of rows labelled 0
class_counts_1 = class_counts.loc[1]  # fraction of rows labelled 1

print(f"Class 0 proportion: {class_counts_0}")
print(f"Class 1 proportion: {class_counts_1}")

Class 0 proportion: 0.33483054307880766
Class 1 proportion: 0.6651694569211923


## 3.

In [20]:
# Step 1: hold out 20% of all rows as the test set, stratified on the label
train_val, test = train_test_split(
    df,
    test_size=0.2,
    stratify=df['y'],
    random_state=42,
)

# Step 2: split the remaining 80% as 75/25, which gives 60% train and
# 20% validation relative to the full dataset
train, val = train_test_split(
    train_val,
    test_size=0.25,
    stratify=train_val['y'],
    random_state=42,
)

# Report how many rows ended up in each split
print(f"Train set size: {train.shape[0]}")
print(f"Validation set size: {val.shape[0]}")
print(f"Test set size: {test.shape[0]}")

Train set size: 2938
Validation set size: 980
Test set size: 980


## 4.

In [21]:
# Feature columns: everything except the raw score ('quality') and the derived label ('y').
# Excluding them by name instead of slicing off the last two columns keeps the
# targets out of the feature matrix even if the column order changes, and
# raises a KeyError right here if either column is missing.
features = df.columns.drop(['quality', 'y'])

# Initialize the scaler and fit it on the training split only, so that no
# statistics from the validation or test rows enter the transformation
scaler = StandardScaler()
scaler.fit(train[features])

# Work on copies so the unscaled splits stay available
train_scaled = train.copy()
val_scaled = val.copy()
test_scaled = test.copy()

# Apply the train-fitted scaler to all three splits
train_scaled[features] = scaler.transform(train[features])
val_scaled[features] = scaler.transform(val[features])
test_scaled[features] = scaler.transform(test[features])

# Check the result
train_scaled.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,y
4755,-0.323928,0.125337,3.317193,-0.814913,-0.856259,-0.814745,-1.048715,-1.33309,-0.444481,-0.104704,1.514126,6,1
4003,-0.084927,-1.383225,0.143173,-0.95317,0.075184,0.281572,-0.470623,-0.963156,-0.779307,1.950671,0.458833,6,1
2351,1.229578,-0.276946,0.059646,1.002178,-0.250821,1.031683,1.494889,1.026485,-0.511447,0.666062,-0.758812,6,1
2434,1.70758,0.024766,-0.525042,2.206988,-0.01796,0.743179,2.674196,2.046301,-1.114133,0.580421,-1.164694,6,1
299,-0.204428,-1.081512,1.396076,-0.834664,-0.111104,-0.46854,-0.355005,-0.566561,-0.243586,-0.361626,-0.190577,6,1


## 5.

In [22]:
def evaluate_model(model, X_val, y_val):
    """Score a fitted classifier on a labelled dataset.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``
    X_val : feature matrix passed to ``model.predict``
    y_val : true labels compared against the predictions

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, f1)``, each computed by the matching
        scikit-learn scorer called with its default arguments. Nothing is printed.
    """
    predictions = model.predict(X_val)
    # Order matters: callers unpack the tuple positionally
    scorers = (accuracy_score, precision_score, recall_score, f1_score)
    return tuple(scorer(y_val, predictions) for scorer in scorers)

# Prepare features and target for training and validation
X_train = train_scaled[features]
y_train = train_scaled['y']
X_val = val_scaled[features]
y_val = val_scaled['y']

# One tuple per (classifier, hyper-parameter) combination. The list is rebuilt
# from scratch here so re-running the cell does not duplicate rows.
results = []

# 1. kNN Classifier
for n_neighbors in [1, 3, 5]:
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn.fit(X_train, y_train)
    accuracy, precision, recall, f1 = evaluate_model(knn, X_val, y_val)
    results.append(('kNN', n_neighbors, accuracy, precision, recall, f1))

# 2. SVM Classifier
for kernel in ['rbf', 'linear', 'poly']:
    svm = SVC(kernel=kernel)
    svm.fit(X_train, y_train)
    accuracy, precision, recall, f1 = evaluate_model(svm, X_val, y_val)
    results.append(('SVM', kernel, accuracy, precision, recall, f1))

# 3. Decision Tree Classifier
# random_state is fixed because the tree permutes features at every split;
# without a seed the validation scores (and so the model selection) can
# change from one run to the next.
for criterion in ['gini', 'entropy']:
    tree = DecisionTreeClassifier(criterion=criterion, random_state=42)
    tree.fit(X_train, y_train)
    accuracy, precision, recall, f1 = evaluate_model(tree, X_val, y_val)
    results.append(('Decision Tree', criterion, accuracy, precision, recall, f1))

# 4. Logistic Regression
for penalty in ['l1', 'l2']:
    # The L1 penalty is fitted with liblinear, L2 with lbfgs
    solver = 'liblinear' if penalty == 'l1' else 'lbfgs'
    # random_state only affects liblinear (it shuffles the data); lbfgs ignores it
    logreg = LogisticRegression(penalty=penalty, solver=solver, random_state=42)
    logreg.fit(X_train, y_train)
    accuracy, precision, recall, f1 = evaluate_model(logreg, X_val, y_val)
    results.append(('Logistic Regression', penalty, accuracy, precision, recall, f1))

# Create a DataFrame for the results
results_df = pd.DataFrame(results, columns=['Classifier', 'Parameter', 'Accuracy', 'Precision', 'Recall', 'F1'])
results_df

Unnamed: 0,Classifier,Parameter,Accuracy,Precision,Recall,F1
0,kNN,1,0.771429,0.812865,0.852761,0.832335
1,kNN,3,0.753061,0.799708,0.838957,0.818862
2,kNN,5,0.758163,0.802038,0.845092,0.823002
3,SVM,rbf,0.776531,0.81241,0.863497,0.837175
4,SVM,linear,0.755102,0.773936,0.892638,0.82906
5,SVM,poly,0.738776,0.744444,0.924847,0.824897
6,Decision Tree,gini,0.765306,0.80848,0.84816,0.827844
7,Decision Tree,entropy,0.781633,0.830816,0.843558,0.837139
8,Logistic Regression,l1,0.75102,0.780992,0.869632,0.822932
9,Logistic Regression,l2,0.75102,0.780992,0.869632,0.822932


## 6.

In [23]:
def get_best_models(results_df, metric='F1'):
    """Return the row(s) of ``results_df`` with the highest value of ``metric``.

    Parameters
    ----------
    results_df : pandas.DataFrame
        Table with one row per evaluated model and a column named ``metric``.
    metric : str, default 'F1'
        Name of the column to maximise.

    Returns
    -------
    pandas.DataFrame
        Every row whose ``metric`` equals the column maximum, keeping the
        original index; ties therefore produce more than one row.
    """
    top_score = results_df[metric].max()
    is_top = results_df[metric].eq(top_score)
    return results_df.loc[is_top]

# Get the best model(s) based on validation F1 score; rows tied for the
# maximum are all kept, so this frame can hold more than one row
best_model_df = get_best_models(results_df, metric='F1')

# Display the best model(s): a plain-text label first, then the frame as the cell's output
print("selected model(s): ")
best_model_df

selected model(s): 


Unnamed: 0,Classifier,Parameter,Accuracy,Precision,Recall,F1
3,SVM,rbf,0.776531,0.81241,0.863497,0.837175


## 7.

In [24]:
def _build_model(classifier, parameter):
    """Create an unfitted estimator for one (classifier, parameter) row of the results table."""
    if classifier == 'kNN':
        # Cast so that a NumPy integer read back from the table is accepted as well
        return KNeighborsClassifier(n_neighbors=int(parameter))
    if classifier == 'SVM':
        return SVC(kernel=parameter)
    if classifier == 'Decision Tree':
        # Fixed seed so repeated runs rebuild the same tree
        return DecisionTreeClassifier(criterion=parameter, random_state=42)
    if classifier == 'Logistic Regression':
        # The L1 penalty is fitted with liblinear, everything else with lbfgs
        solver = 'liblinear' if parameter == 'l1' else 'lbfgs'
        return LogisticRegression(penalty=parameter, solver=solver, random_state=42)
    # Previously an unknown name fell through and failed later with an
    # UnboundLocalError on `model`; fail early with a clear message instead.
    raise ValueError(f"Unknown classifier {classifier!r}; cannot rebuild the model")


def evaluate_on_test(best_model_df, X_test, y_test, X_train=None, y_train=None):
    """Refit the selected model on the training split and score it on test data.

    Parameters
    ----------
    best_model_df : pandas.DataFrame
        Selection table with 'Classifier' and 'Parameter' columns. Only the
        first row is used; any further (tied) rows are ignored.
    X_test, y_test :
        Features and true labels of the evaluation set.
    X_train, y_train : optional
        Data used to fit the model. When omitted, the notebook-level
        ``train_scaled[features]`` and ``train_scaled['y']`` are used, i.e. the
        training split only (the validation rows are not added).

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        A two-column 'Metric'/'Score' table with accuracy, precision, recall
        and F1, and a 2x2 confusion matrix labelled 'Actual 0/1' by
        'Predicted 0/1'.

    Raises
    ------
    ValueError
        If the 'Classifier' value is not one of the four known model names.
    """
    best_model_info = best_model_df.iloc[0]

    # Re-initialize the model based on the best one
    model = _build_model(best_model_info['Classifier'], best_model_info['Parameter'])

    # Fall back to the notebook-level scaled training split when none is given
    if X_train is None:
        X_train = train_scaled[features]
    if y_train is None:
        y_train = train_scaled['y']

    model.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = model.predict(X_test)

    # Store the evaluation metrics in a DataFrame (named metrics_df locally so
    # it does not shadow the notebook-level results_df)
    metrics_df = pd.DataFrame({
        'Metric': ['Accuracy', 'Precision', 'Recall', 'F1 Score'],
        'Score': [
            accuracy_score(y_test, y_pred),
            precision_score(y_test, y_pred),
            recall_score(y_test, y_pred),
            f1_score(y_test, y_pred),
        ],
    })

    # labels=[0, 1] guarantees a 2x2 matrix even when a class is absent from
    # both y_test and y_pred, so the fixed row/column names below always fit
    conf_matrix = confusion_matrix(y_test, y_pred, labels=[0, 1])
    conf_matrix_df = pd.DataFrame(conf_matrix, index=['Actual 0', 'Actual 1'], columns=['Predicted 0', 'Predicted 1'])

    return metrics_df, conf_matrix_df


# Score the selected model on the held-out (scaled) test split
metrics_df, confusion_matrix_df = evaluate_on_test(
    best_model_df,
    X_test=test_scaled[features],
    y_test=test_scaled['y'],
)

# Show the scalar metrics first
print("Metrics: \n", metrics_df)

# Then the confusion matrix under its own heading (sep puts the table on a new line)
print("\nConfusion Matrix:", confusion_matrix_df, sep="\n")

Metrics: 
       Metric     Score
0   Accuracy  0.765306
1  Precision  0.806686
2     Recall  0.851227
3   F1 Score  0.828358

Confusion Matrix:
          Predicted 0  Predicted 1
Actual 0          195          133
Actual 1           97          555
