# Lab 04: Ames Housing

---
author: Your Name Here
date: October 5, 2024
embed-resources: true
---

## Introduction

## Methods

In [162]:
# imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# machine learning
from sklearn.datasets import make_friedman1
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.metrics import accuracy_score, root_mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.dummy import DummyRegressor, DummyClassifier
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import HistGradientBoostingRegressor

### Data

In [1]:
# load data: the train and test splits are published as two separate CSV files.
# pandas (`pd`) is imported in the imports cell at the top of the notebook, so
# this cell no longer carries its own import.
ames_train = pd.read_csv("https://cs307.org/lab-04/data/ames-train.csv")
ames_test = pd.read_csv("https://cs307.org/lab-04/data/ames-test.csv")

In [2]:
# summary statistics


In [None]:
# visualizations

### Models

In [3]:
# process data for ML: separate each dataset into a feature frame (X) and the
# "SalePrice" target column (y)
X_train, y_train = ames_train.drop(columns="SalePrice"), ames_train["SalePrice"]

# same split for the test dataset
X_test, y_test = ames_test.drop(columns="SalePrice"), ames_test["SalePrice"]

In [138]:
def columns_missing_above_threshold(df, threshold):
    """Return the labels of columns whose share of missing values exceeds `threshold`.

    Parameters
    ----------
    df : DataFrame whose columns are inspected.
    threshold : proportion (missing cells / rows) a column must strictly
        exceed to be reported.

    Returns
    -------
    list of column labels, in the frame's column order.
    """
    # the mean of the boolean null mask is the per-column fraction of missing cells
    null_fraction = df.isna().mean()
    exceeds = null_fraction > threshold
    return null_fraction.index[exceeds].tolist()

# Columns to leave out of modelling: "Order", "PID", and every column whose
# missing proportion exceeds 0.6. These are defined here, before their first
# use, so the cell runs on a fresh kernel (Restart Kernel -> Run All) instead
# of relying on a value assigned by a later cell.
many_missing_columns = columns_missing_above_threshold(X_train, 0.6)
exclude_columns = ["Order", "PID"] + many_missing_columns

# find columns with string data type (will be considered categorical)
string_columns = [
    col
    for col in X_train.select_dtypes(include=["object"]).columns.tolist()
    if col not in exclude_columns
]

# find columns with numeric data type
numeric_columns = [
    col
    for col in X_train.select_dtypes(include=["int64", "float64"]).columns.tolist()
    if col not in exclude_columns
]

In [172]:
# train models

# column groups by dtype; note the column transformer below is wired to
# string_columns / numeric_columns rather than to these two lists
numeric_features = X_train.select_dtypes(include=["int64", "float64"]).columns.tolist()
categorical_features = X_train.select_dtypes(include=["object"]).columns.tolist()

# columns dropped outright: "Order", "PID", and any column whose missing
# proportion exceeds 0.6
many_missing_columns = columns_missing_above_threshold(X_train, 0.6)
exclude_columns = ["Order", "PID"] + many_missing_columns

# pipeline for numeric columns: fill gaps with the median, then standardize
numeric_preprocessor = Pipeline(
    [
        ("MedianImputer", SimpleImputer(strategy="median")),
        ("Standardize", StandardScaler()),
    ]
)

# pipeline for categorical columns: fill gaps with the most frequent value,
# then one-hot encode with the number of output categories capped at 5
categorical_preprocessor = Pipeline(
    [
        ("ModalImputer", SimpleImputer(strategy="most_frequent")),
        (
            "OneHotEncoder",
            OneHotEncoder(max_categories=5, handle_unknown="infrequent_if_exist"),
        ),
    ]
)

# column transformer: route each column group to its preprocessor; anything
# not listed is dropped as well
preprocessor = ColumnTransformer(
    [
        ("ExcludeColumns", "drop", exclude_columns),
        ("CategoricalPreprocessing", categorical_preprocessor, string_columns),
        ("NumericProcessing", numeric_preprocessor, numeric_columns),
    ],
    remainder="drop",
)

# full pipeline: preprocessing followed by a placeholder regressor; the
# "Regressor" step is the one swapped out by the parameter grid
pipeline = Pipeline(
    [
        ("Preprocessor", preprocessor),
        ("Regressor", DummyRegressor()),
    ]
)

In [173]:
# display the assembled preprocessing + regressor pipeline as the cell output
pipeline

In [174]:
# Search space for the "Regressor" pipeline step. Each dict is one model
# family plus the hyperparameter values to try for it (29 candidates in total).
param_grid = [
    # baselines with nothing to tune
    {"Regressor": [DummyRegressor()]},
    {"Regressor": [LinearRegression()]},
    # k-nearest neighbours: vary the neighbourhood size
    {
        "Regressor": [KNeighborsRegressor()],
        "Regressor__n_neighbors": [1, 3, 5, 7, 9, 11, 15],
    },
    # single decision tree: vary the depth limit (None = unlimited)
    {
        "Regressor": [DecisionTreeRegressor()],
        "Regressor__max_depth": [1, 3, 5, 7, 9, 11, 15, None],
    },
    # histogram gradient boosting: learning rate x depth x L2 penalty,
    # with the number of boosting iterations fixed at 1000
    {
        "Regressor": [HistGradientBoostingRegressor()],
        "Regressor__learning_rate": [0.1, 0.01, 0.001],
        "Regressor__max_iter": [1000],
        "Regressor__max_depth": [None, 3],
        "Regressor__l2_regularization": [0.1, 1.0],
    },
]

In [175]:
# 5-fold cross-validated grid search over every candidate in param_grid.
# Candidates are ranked by negated mean absolute percentage error, so the
# best score is the one closest to zero.
mod = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring="neg_mean_absolute_percentage_error",
    cv=5,
    n_jobs=-1,
    verbose=2,
)

In [176]:
# run the grid search on the training features and target
mod.fit(X_train, y_train)

Fitting 5 folds for each of 29 candidates, totalling 145 fits


In [177]:
# Summarise the grid search: the winning hyperparameters and their
# cross-validation score. The search is scored with
# "neg_mean_absolute_percentage_error", so this value is a negated MAPE
# (closer to 0 is better), not an accuracy.
print(f"Best parameters: {mod.best_params_}")
print(f"Best cross-validation score (negative MAPE): {mod.best_score_}")

Best parameters: {'Regressor': HistGradientBoostingRegressor(), 'Regressor__l2_regularization': 1.0, 'Regressor__learning_rate': 0.01, 'Regressor__max_depth': None, 'Regressor__max_iter': 1000}
Best cross-validation accuracy: -0.09323748138117231


In [179]:
# predict on the held-out test set and compute the mean absolute percentage
# error, scaled by 100 so it reads as a percentage
y_pred = mod.predict(X_test)
mean_absolute_percentage_error = 100 * np.mean((y_test - y_pred).abs() / y_test)
mean_absolute_percentage_error

8.164649164771303

In [180]:
# NOTE(review): this import sits here rather than in the imports cell at the top
from joblib import dump
# write the grid-search object `mod` to the relative path "ames-housing.joblib"
dump(mod, "ames-housing.joblib")

['ames-housing.joblib']

## Results

In [None]:
# report model metrics

## Discussion

### Conclusion