# Generative Model Classification - Multivariate Gaussian

In this notebook, we return to winery classification, using the full set of 13 features.

In [None]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt

# Module for dealing with the Gaussian density
from scipy.stats import norm, multivariate_normal 

### Data 

In [None]:
# Read the comma-separated wine table into a 2-D float array.
# Later cells treat column 0 as the winery label and columns 1-13 as features.
data = np.loadtxt(fname='../../_data/wine.data.txt', delimiter=',')

In [None]:
# Human-readable name of each feature column, in column order
# (featurenames[k] describes feature index k used by the model).
featurenames = [
    'Alcohol',
    'Malic acid',
    'Ash',
    'Alcalinity of ash',
    'Magnesium',
    'Total phenols',
    'Flavanoids',
    'Nonflavanoid phenols',
    'Proanthocyanins',
    'Color intensity',
    'Hue',
    'OD280/OD315 of diluted wines',
    'Proline',
]

### Split samples

 - split 178 instances into:
   - training set (trainx, trainy) of size 130
   - test set (testx, testy) of size 48

In [None]:
# Fixed seed so the shuffled split is the same on every run.
np.random.seed(0)
perm = np.random.permutation(178)

# First 130 shuffled rows form the training set, the remaining 48 the test set.
train_rows, test_rows = perm[:130], perm[130:178]

# Column 0 holds the class label; columns 1-13 hold the features.
trainx, trainy = data[train_rows, 1:14], data[train_rows, 0]
testx, testy = data[test_rows, 1:14], data[test_rows, 0]

### Gaussian generative model

We now define a function that fits a Gaussian generative model to the data.
For each class (`j=1,2,3`), we have:
* `pi[j]`: the class weight
* `mu[j,:]`: the mean, a 13-dimensional vector
* `sigma[j,:,:]`: the 13x13 covariance matrix

Note that the code stores these parameters with zero-based indexing: the parameters of class `j` live at index `j-1`. This means that `pi` is a length-3 array, `mu` is a 3x13 array and `sigma` is a 3x13x13 array.

In [None]:
def fit_generative_model(x, y, labels=(1, 2, 3)):
    """Fit one multivariate Gaussian per class label.

    Parameters
    ----------
    x : array-like, shape (n_samples, n_features)
        Feature matrix, one row per sample.
    y : array-like, shape (n_samples,)
        Class label of each row of `x`.
    labels : sequence, optional
        Class labels to fit. Their order defines the row order of the
        returned arrays (row ``i`` belongs to ``labels[i]``).

    Returns
    -------
    mu : ndarray, shape (n_labels, n_features)
        Per-class feature means.
    sigma : ndarray, shape (n_labels, n_features, n_features)
        Per-class covariance matrices, normalised by N (``bias=True``).
    pi : ndarray, shape (n_labels,)
        Fraction of the entries of `y` equal to each label.
    labels : list
        A fresh list copy of the labels, in fitting order.
    """
    x = np.asarray(x)
    y = np.asarray(y)
    # Copy so the caller never receives (and can never mutate) the shared
    # default argument or their own input sequence.
    labels = list(labels)

    n = len(labels)   # number of classes
    d = x.shape[1]    # number of features
    mu = np.zeros((n, d))
    sigma = np.zeros((n, d, d))
    pi = np.zeros(n)

    for i, label in enumerate(labels):
        indices = (y == label)
        members = x[indices, :]
        mu[i] = members.mean(axis=0)
        # rowvar=False: each column is a variable, each row an observation.
        sigma[i] = np.cov(members, rowvar=False, bias=True)
        pi[i] = float(indices.sum()) / float(len(y))
    return mu, sigma, pi, labels

In [None]:
# Estimate the per-class means, covariances and class weights from the training split
mu, sigma, pi, labels = fit_generative_model(x=trainx, y=trainy)

### Make predictions on the test set

In [None]:
# Now test the performance of a predictor based on a subset of features
def test_model(mu, sigma, pi, labels, features, tx, ty):
    score = np.zeros((len(ty), len(labels)))
    
    for i in range(len(ty)):
        for j, label in enumerate(labels):
            # max(log(probability of class * probability of feature)) = max(log(Class Weights) + log(PDF))
            score[i, j] = np.log(pi[j]) + multivariate_normal.logpdf(
                tx[i, features], mean=mu[j, features], cov=sigma[:,features,:][:, :,features][j,:,:])
    predictions = np.asarray(labels)[np.argmax(score, axis=1)]
    
    # Tally up score
    errors = np.sum(predictions!=ty)
    return features, errors

### Best Feature set

In [None]:
# Find the pair of features that, combined with `best_feature`, gives the
# fewest errors on the test set. Uses the `labels` returned by
# fit_generative_model so the label order matches mu / sigma / pi.
best_feature = 6  # index into featurenames ('Flavanoids')
n_features = mu.shape[1]

# Every feature except the fixed one is a candidate partner.
candidates = [f for f in range(n_features) if f != best_feature]
# Unordered pairs (i < j): [i, j, k] and [j, i, k] describe the same feature
# set, so each pair is evaluated only once.
pairs = [(i, j) for pos, i in enumerate(candidates) for j in candidates[pos + 1:]]

min_error = len(testy) + 1  # above any reachable error count
min_ftrs = []
for i, j in pairs:
    features, errors = test_model(mu, sigma, pi, labels, [i, j, best_feature], testx, testy)
    if errors < min_error:
        min_error, min_ftrs = errors, features
    if min_error == 0:
        # A perfect score cannot be improved on; stop the whole search.
        break

print('Feature set with lowest test error: {}[{}], {}[{}], {}[{}]\ntest errors: {}'.format(
    featurenames[min_ftrs[0]], min_ftrs[0],
    featurenames[min_ftrs[1]], min_ftrs[1],
    featurenames[min_ftrs[2]], min_ftrs[2],
    min_error))