# Classification on Wine Quality Dataset

## Visualization

## Preprocessing
- Gaussianization
- PCA
(But keep both the reduced and the non-reduced datasets to see which one performs better)
- Show effect of gaussianization on separability
## Methodology
- K-fold cross-validation, using min DCF to select the best configuration (preprocessing and hyperparameters) for each model
- Priors (0.1, 0.5, 0.9)
- Models used (MVG, LogReg, QuadLogReg, SVM (all kernels), GMM)
-
- Thresholds
- Take the best 2 models and perform further calibration (plot ROC, DCF, Bayes error plot)
- Model fusion

# TODO: Integrate pandas support into grid_cv

## Imports

In [1]:
from tiblib import load_wine
from tiblib.model_selection import grid_cv_multiprior
from tiblib.preprocessing import Gaussianizer, StandardScaler, PCA
from tiblib.classification import GaussianClassifier

## Gaussian Classifier

In [2]:
# Load the wine data; only the training split is used by the grid search in this cell.
X_train, X_test, y_train, y_test = load_wine()

# The classifier is handed over as a class (not an instance) together with
# the grid of hyperparameter values to explore.
model = GaussianClassifier
hyperparams = {
    'tied': [False, True],
    'naive': [False, True],
}
prefix = 'gc'
pis = [0.1, 0.5, 0.9]

# Preprocessing steps shared by the pipelines listed below.
gaussianizer = Gaussianizer()
scaler = StandardScaler()
pca1 = PCA(n_dims=9)
pca2 = PCA(n_dims=5)
preprocessings = [
    [],
    [gaussianizer],
    [scaler],
    [scaler, pca1],
    [scaler, pca2],
    [gaussianizer, scaler, pca1],
    [gaussianizer, scaler, pca2],
]

for pr in preprocessings:
    # Label built from the string form of each step; an empty pipeline is
    # labelled 'no_preproc'. The label is printed and embedded in the CSV path.
    filename = '_'.join(str(step) for step in pr) if pr else 'no_preproc'
    print(filename)
    grid_cv_multiprior(
        X_train, y_train,
        pis=pis,
        preprocessing=pr,
        classifier=model,
        hyperparams=hyperparams,
        filename=f'results/results_{prefix}_{filename}.csv',
    )

no_preproc
Showing results for pi = [0.1, 0.5, 0.9]
Full		& 0.797	& 0.313	& 0.839	\\
Naive		& 0.856	& 0.418	& 0.881	\\
Tied		& 0.818	& 0.338	& 0.741	\\
Naive, Tied		& 0.860	& 0.405	& 0.944	\\
Gaussianizer
Showing results for pi = [0.1, 0.5, 0.9]
Full		& 0.772	& 0.299	& 0.772	\\
Naive		& 0.863	& 0.445	& 0.863	\\
Tied		& 0.786	& 0.351	& 0.849	\\
Naive, Tied		& 0.866	& 0.443	& 0.945	\\
StandardScaler
Showing results for pi = [0.1, 0.5, 0.9]
Full		& 0.784	& 0.310	& 0.855	\\
Naive		& 0.867	& 0.419	& 0.926	\\
Tied		& 0.841	& 0.335	& 0.759	\\
Naive, Tied		& 0.868	& 0.409	& 0.947	\\
StandardScaler_PCA (d=9)
Showing results for pi = [0.1, 0.5, 0.9]
Full		& 0.829	& 0.317	& 0.810	\\
Naive		& 0.815	& 0.393	& 0.866	\\
Tied		& 0.835	& 0.342	& 0.746	\\
Naive, Tied		& 0.823	& 0.339	& 0.782	\\
StandardScaler_PCA (d=5)
Showing results for pi = [0.1, 0.5, 0.9]
Full		& 0.865	& 0.401	& 0.893	\\
Naive		& 0.862	& 0.434	& 0.898	\\
Tied		& 0.854	& 0.388	& 0.914	\\
Naive, Tied		& 0.854	& 0.387	& 0.923	\\
Gaussi

## Logistic Regression

In [3]:
from tiblib.classification import LogisticRegression

# Reload the wine data; the test split is not used in this cell.
X_train, X_test, y_train, y_test = load_wine()

# Classifier class and the grid of values for its 'l' hyperparameter
# (shown as lambda in the printed results).
model = LogisticRegression
hyperparams = {'l': [1e-1, 1e-2, 1e-3, 1e-4]}
prefix = 'lr'
pis = [0.1, 0.5, 0.9]

# Preprocessing steps. NOTE: gaussianizer is created but does not appear in
# any of the pipelines below.
gaussianizer = Gaussianizer()
scaler = StandardScaler()
pca1 = PCA(n_dims=9)
pca2 = PCA(n_dims=5)
preprocessings = [
    [],
    [scaler],
    [scaler, pca1],
    [scaler, pca2],
]

for pr in preprocessings:
    # Default label for the empty pipeline; otherwise join the string form
    # of every preprocessing step with underscores.
    filename = 'no_preproc'
    if pr:
        filename = '_'.join(map(str, pr))
    print(filename)
    grid_cv_multiprior(
        X_train,
        y_train,
        pis=pis,
        preprocessing=pr,
        classifier=model,
        hyperparams=hyperparams,
        filename=f'results/results_{prefix}_{filename}.csv',
    )

no_preproc
Showing results for pi = [0.1, 0.5, 0.9]
LogReg ($\lambda = 0.1$)		& 0.874	& 0.423	& 0.974	\\
LogReg ($\lambda = 0.01$)		& 0.844	& 0.400	& 0.816	\\
LogReg ($\lambda = 0.001$)		& 0.826	& 0.354	& 0.684	\\
LogReg ($\lambda = 0.0001$)		& 0.856	& 0.360	& 0.643	\\
StandardScaler
Showing results for pi = [0.1, 0.5, 0.9]
LogReg ($\lambda = 0.1$)		& 0.851	& 0.355	& 0.808	\\
LogReg ($\lambda = 0.01$)		& 0.845	& 0.351	& 0.666	\\
LogReg ($\lambda = 0.001$)		& 0.847	& 0.358	& 0.693	\\
LogReg ($\lambda = 0.0001$)		& 0.852	& 0.356	& 0.666	\\
StandardScaler_PCA (d=9)
Showing results for pi = [0.1, 0.5, 0.9]
LogReg ($\lambda = 0.1$)		& 0.842	& 0.356	& 0.779	\\
LogReg ($\lambda = 0.01$)		& 0.845	& 0.347	& 0.660	\\
LogReg ($\lambda = 0.001$)		& 0.854	& 0.361	& 0.670	\\
LogReg ($\lambda = 0.0001$)		& 0.840	& 0.352	& 0.661	\\
StandardScaler_PCA (d=5)
Showing results for pi = [0.1, 0.5, 0.9]
LogReg ($\lambda = 0.1$)		& 0.844	& 0.387	& 0.881	\\
LogReg ($\lambda = 0.01$)		& 0.856	& 0.383	& 0.847	\\

## Quadratic Logistic Regression

In [None]:
from tiblib.classification import QuadraticLogisticRegression

# Reload the wine data; only the training split is used by the grid search in this cell.
X_train, X_test, y_train, y_test = load_wine()

# Classifier class and the grid of values for its 'l' hyperparameter
# (shown as lambda in the printed results).
model = QuadraticLogisticRegression
hyperparams = {'l': [1e-1, 1e-2, 1e-3, 1e-4]}

# Preprocessing steps. NOTE: gaussianizer is created but does not appear in
# any of the pipelines below.
gaussianizer = Gaussianizer()
scaler = StandardScaler()
pca1 = PCA(n_dims=9)
pca2 = PCA(n_dims=5)
preprocessings = [
    [],
    [scaler],
    [scaler, pca1],
    [scaler, pca2],
]

# Use a prefix distinct from the (linear) logistic regression cell, which
# uses 'lr'. With the same prefix both cells would target the same
# results/results_lr_*.csv paths and clobber each other's results.
prefix = 'qlr'
pis = [0.1, 0.5, 0.9]
for pr in preprocessings:
    # Label built from the string form of each step; an empty pipeline is
    # labelled 'no_preproc'. The label is printed and embedded in the CSV path.
    filename = '_'.join(str(p) for p in pr) if pr else 'no_preproc'
    print(filename)
    grid_cv_multiprior(
        X_train, y_train,
        pis=pis,
        preprocessing=pr,
        classifier=model,
        hyperparams=hyperparams,
        filename=f'results/results_{prefix}_{filename}.csv',
    )

no_preproc
Showing results for pi = [0.1, 0.5, 0.9]
QuadLogReg ($\lambda = 0.1$)		& 0.856	& 0.389	& 0.794	\\
QuadLogReg ($\lambda = 0.01$)		& 0.857	& 0.366	& 0.885	\\
QuadLogReg ($\lambda = 0.001$)		& 0.859	& 0.393	& 0.871	\\
QuadLogReg ($\lambda = 0.0001$)		& 0.827	& 0.366	& 0.861	\\
StandardScaler
Showing results for pi = [0.1, 0.5, 0.9]
QuadLogReg ($\lambda = 0.1$)		& 0.825	& 0.325	& 0.840	\\


## GMM

In [4]:
from tiblib.classification import GaussianMixtureClassifier

# Reload the wine data; the test split is not used in this cell.
X_train, X_test, y_train, y_test = load_wine()

# Classifier class plus the grid of hyperparameter values to explore.
model = GaussianMixtureClassifier
hyperparams = {
    'tied': [False, True],
    'diag': [False, True],
    'n_components': [4, 8, 16],
    'alpha': [0.1, 0.5, 1],
}
prefix = 'gmm'
pis = [0.1, 0.5, 0.9]

# Preprocessing steps. NOTE: pca1 and pca2 are created but do not appear in
# any of the pipelines below.
gaussianizer = Gaussianizer()
scaler = StandardScaler()
pca1 = PCA(n_dims=9)
pca2 = PCA(n_dims=5)
preprocessings = [[], [gaussianizer], [scaler]]

for pr in preprocessings:
    # An empty pipeline is labelled 'no_preproc'; otherwise the label is the
    # underscore-joined string form of its steps.
    filename = 'no_preproc' if not pr else '_'.join([str(step) for step in pr])
    print(filename)
    results_path = f'results/results_{prefix}_{filename}.csv'
    grid_cv_multiprior(X_train, y_train, pis=pis, preprocessing=pr,
                       classifier=model, hyperparams=hyperparams,
                       filename=results_path)

no_preproc
Showing results for pi = [0.1, 0.5, 0.9]
GMM (4 components, $\alpha = 0.1$)		& 0.830	& 0.326	& 0.801	\\
GMM (4 components, $\alpha = 0.5$)		& 0.814	& 0.347	& 0.759	\\
GMM (4 components, $\alpha = 1$)		& 0.760	& 0.323	& 0.643	\\
GMM (8 components, $\alpha = 0.1$)		& 0.793	& 0.335	& 0.763	\\
GMM (8 components, $\alpha = 0.5$)		& 0.811	& 0.347	& 0.734	\\
GMM (8 components, $\alpha = 1$)		& 0.710	& 0.331	& 0.679	\\
GMM (16 components, $\alpha = 0.1$)		& 0.813	& 0.357	& 0.879	\\
GMM (16 components, $\alpha = 0.5$)		& 0.777	& 0.350	& 0.823	\\
GMM (16 components, $\alpha = 1$)		& 0.777	& 0.306	& 0.748	\\
GMM (Diag, 4 components, $\alpha = 0.1$)		& 0.860	& 0.421	& 0.842	\\


KeyboardInterrupt: 

## SVC

In [None]:
from tiblib.classification import SVC

# Reload the wine data; the test split is not used in this cell.
X_train, X_test, y_train, y_test = load_wine()

# Classifier class plus the grid over the 'C' value and the kernel name.
model = SVC
hyperparams = {
    'C': [1e-1, 1e-2, 1e-3, 1e-4],
    'kernel': ['linear', 'poly', 'radial'],
}
prefix = 'svm'
pis = [0.1, 0.5, 0.9]

# Preprocessing steps shared by the pipelines listed below.
gaussianizer = Gaussianizer()
scaler = StandardScaler()
pca1 = PCA(n_dims=9)
pca2 = PCA(n_dims=5)
preprocessings = [
    [],
    [gaussianizer],
    [scaler],
    [scaler, pca1],
    [scaler, pca2],
]

for pr in preprocessings:
    # Underscore-joined string form of the steps, or 'no_preproc' when the
    # pipeline is empty; printed and embedded in the results CSV path.
    if pr:
        filename = '_'.join(str(step) for step in pr)
    else:
        filename = 'no_preproc'
    print(filename)
    grid_cv_multiprior(
        X_train,
        y_train,
        pis=pis,
        preprocessing=pr,
        classifier=model,
        hyperparams=hyperparams,
        filename=f'results/results_{prefix}_{filename}.csv',
    )