In [5]:
import statsmodels.api as sm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from ucimlrepo import fetch_ucirepo

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

In [6]:
# Get Dataset
wine_quality = fetch_ucirepo(id=186)

# Load data into pd dataframe
df = wine_quality.data.original
df

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality,color
0,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,red
1,7.8,0.88,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5,red
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5,red
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6,red
4,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,red
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6492,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,6,white
6493,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,5,white
6494,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,6,white
6495,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,7,white


In [7]:
X = wine_quality.data.features 
y = wine_quality.data.targets 

scaler = StandardScaler()

X_scaled = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_scaled,y.values.ravel(),test_size=0.2,random_state=42)

### Decision Trees

In [8]:
from sklearn.ensemble import RandomForestClassifier

param_grid = {
    'n_estimators': [10, 50, 100],
    'max_depth': [None, 10, 20, 30],
    'criterion': ['gini', 'entropy'],
    'max_features': ['log2', 'sqrt'],
}

rf = RandomForestClassifier(n_jobs=-1, random_state=42)
rf_grid = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)

rf_grid.fit(X_train, y_train)
print("Best parameters found: ", rf_grid.best_params_)
print("Best cross-validation score: ", rf_grid.best_score_)
# Predict on the test set
y_pred = rf_grid.predict(X_test)
# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Test set accuracy: ", accuracy)

Fitting 5 folds for each of 48 candidates, totalling 240 fits




[CV] END criterion=gini, max_depth=None, max_features=log2, n_estimators=10; total time=   0.1s
[CV] END criterion=gini, max_depth=None, max_features=log2, n_estimators=10; total time=   0.1s
[CV] END criterion=gini, max_depth=None, max_features=log2, n_estimators=10; total time=   0.1s
[CV] END criterion=gini, max_depth=None, max_features=log2, n_estimators=10; total time=   0.1s
[CV] END criterion=gini, max_depth=None, max_features=sqrt, n_estimators=10; total time=   0.2s
[CV] END criterion=gini, max_depth=None, max_features=log2, n_estimators=10; total time=   0.3s
[CV] END criterion=gini, max_depth=None, max_features=sqrt, n_estimators=10; total time=   0.2s
[CV] END criterion=gini, max_depth=None, max_features=sqrt, n_estimators=10; total time=   0.2s
[CV] END criterion=gini, max_depth=None, max_features=log2, n_estimators=50; total time=   0.3s
[CV] END criterion=gini, max_depth=None, max_features=log2, n_estimators=50; total time=   0.3s
[CV] END criterion=gini, max_depth=None,

### SVM

In [9]:
from sklearn.svm import SVC
import os
import pickle
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler


# Define base parameter grid
param_grid = {
    'svc__C': [.1, 1, 10, 100],
    'svc__gamma': [.001, .01, .1, 1, 10, 100],
    'svc__kernel': ['linear', 'rbf'],
    'svc__class_weight': [None, 'balanced'],
}

# Define two pipelines: with and without PCA
pipelines = {
    'without_pca': Pipeline([
        ('scaler', StandardScaler()),
        ('svc', SVC(cache_size=32768))
    ]),
    'with_pca': Pipeline([
        ('scaler', StandardScaler()),
        ('pca', PCA(n_components=0.95)),
        ('svc', SVC(cache_size=32768))
    ])
}

results = {}

# Loop over both pipelines
for name, pipeline in pipelines.items():
    model_filename = f'grid_search_{name}.pkl'
    
    if os.path.exists(model_filename):
        with open(model_filename, 'rb') as file:
            grid = pickle.load(file)
        print(f"Loaded cached grid for: {name}")
    else:
        print(f"Running grid search for: {name}...")
        grid = GridSearchCV(pipeline, param_grid, cv=4, verbose=2, n_jobs=-1)
        grid.fit(X_train, y_train)
        with open(model_filename, 'wb') as file:
            pickle.dump(grid, file)

    # Evaluate best model
    best_model = grid.best_estimator_
    y_pred = best_model.predict(X_test)
    test_acc = accuracy_score(y_test, y_pred)

    results[name] = {
        'best_params': grid.best_params_,
        'cv_score': grid.best_score_,
        'test_score': test_acc
    }

# Display side-by-side comparison
comparison_df = pd.DataFrame(results).T
print("\nSVM with and without PCA Comparison:")
print(comparison_df)

Running grid search for: without_pca...
Fitting 4 folds for each of 96 candidates, totalling 384 fits
[CV] END svc__C=0.1, svc__class_weight=None, svc__gamma=0.01, svc__kernel=linear; total time=   0.3s
[CV] END svc__C=0.1, svc__class_weight=None, svc__gamma=0.01, svc__kernel=linear; total time=   0.3s
[CV] END svc__C=0.1, svc__class_weight=None, svc__gamma=0.001, svc__kernel=linear; total time=   0.3s
[CV] END svc__C=0.1, svc__class_weight=None, svc__gamma=0.01, svc__kernel=linear; total time=   0.3s
[CV] END svc__C=0.1, svc__class_weight=None, svc__gamma=0.1, svc__kernel=linear; total time=   0.3s
[CV] END svc__C=0.1, svc__class_weight=None, svc__gamma=0.001, svc__kernel=linear; total time=   0.3s
[CV] END svc__C=0.1, svc__class_weight=None, svc__gamma=0.001, svc__kernel=linear; total time=   0.3s
[CV] END svc__C=0.1, svc__class_weight=None, svc__gamma=0.01, svc__kernel=linear; total time=   0.3s
[CV] END svc__C=0.1, svc__class_weight=None, svc__gamma=0.1, svc__kernel=linear; total t

### Neural Network