# Generative Model Classification - Bivariate Gaussian

Our first generative model for Winery classification used just one feature. Now we use two features, modeling each class by a **bivariate Gaussian**.

### Distribution of two features from one of the wineries

Our goal is to plot the distribution of two features from a particular winery. We will use several helper functions for this. It is worth understanding each of these.

### Data

In [None]:
### %matplotlib inline

import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt

# Useful module for dealing with the Gaussian density
from scipy.stats import norm, multivariate_normal 

# importing packages for interactive graphs
import ipywidgets as widgets
from IPython.display import display
from ipywidgets import interact, interactive, fixed, interact_manual, IntSlider

### Set plot defaults

In [None]:
from matplotlib.pyplot import rcParams

# Plot defaults: 8 x 8 figures with all four axes spines switched off.
rcParams.update({
    'figure.figsize': [8., 8.],
    'axes.spines.left': False,
    'axes.spines.top': False,
    'axes.spines.right': False,
    'axes.spines.bottom': False,
})

In [None]:
# Load data set: parse the comma-separated wine file into the array `data`
# (the path is relative to the notebook's working directory).
data = np.loadtxt('../../_data/wine.data.txt', delimiter=',')

In [None]:
# Display the first three rows, all columns, as a quick sanity check of the load.
data[:3, :]

### Features

In [None]:
# Feature names; a name's position in this list is its feature index.
featurenames = [
    'Alcohol',
    'Malic acid',
    'Ash',
    'Alcalinity of ash',
    'Magnesium',
    'Total phenols',
    'Flavanoids',
    'Nonflavanoid phenols',
    'Proanthocyanins',
    'Color intensity',
    'Hue',
    'OD280/OD315 of diluted wines',
    'Proline',
]
# Forward table (name -> index) and reverse table (index -> name).
feature_names = dict(zip(featurenames, range(len(featurenames))))
feature_lookup = dict(enumerate(featurenames))

### Set target and labels

 - the target is in column 0

In [None]:
# Display the raw target column (column 0) before it is converted to integers.
data[:, 0]

In [None]:
# Target vector: column 0 of the data cast to 8-bit integers, then displayed.
y = data[:, 0].astype('int8')
y

In [None]:
# Count how often each non-negative integer label value occurs in y.
np.bincount(y)

In [None]:
# Shift every label down by one, in place, so the labels can be used directly
# as zero-based indices (see the keys of label_lookup).
# NOTE: this is not idempotent -- re-running the cell shifts the labels again.
y -= 1

In [None]:
# Display names of the classes; a name's position in this list is its label.
labelnames = ['Winery 1', 'Winery 2', 'Winery 3']
# Forward table (name -> label) and reverse table (label -> name).
label_names = dict(zip(labelnames, range(len(labelnames))))
label_lookup = dict(enumerate(labelnames))

### Train-test split

In [None]:
# Feature matrix: columns 1 through 13 of the data (column 0 holds the target).
X = data[:, 1:14]

In [None]:
# Reproducibly shuffle the sample indices, then cut them into train and test.
np.random.seed(0)
n_samples = X.shape[0]  # 178 for the wine data; derived so it cannot drift from the data
n_train = 130           # number of samples used for training
perm = np.random.permutation(n_samples)

# training set (trainx, trainy): the first n_train shuffled samples
train_idx = perm[:n_train]
trainx = X[train_idx, :]
trainy = y[train_idx]

# test set (testx, testy): all remaining shuffled samples
test_idx = perm[n_train:]
testx = X[test_idx, :]
testy = y[test_idx]

### Fit a Gaussian to a dataset feature

 - function returns the mean and covariance matrix of the Gaussian.

In [None]:
def fit_gaussian(x, features):
    """Fit a Gaussian to the selected feature columns of `x`.

    Parameters
    ----------
    x : 2-D array with one row per sample.
    features : list of column indices to keep.

    Returns
    -------
    (mu, sigma) : the per-column mean and the covariance matrix of the
        selected columns, normalised by the number of samples (biased
        estimate).
    """
    selected = x[:, features]
    mean_vector = selected.mean(axis=0)
    # Columns are the variables; bias=True divides by N rather than N - 1.
    covariance = np.cov(selected, rowvar=False, bias=True)
    return mean_vector, covariance

For example, let's look at the Gaussian we get for winery 1, using features 0 ('alcohol') and 6 ('flavanoids').

In [None]:
# Example fit for 'Winery 1' on features 'Alcohol' (0) and 'Flavanoids' (6).
# Labels are zero-based after the shift above, so the label is looked up by
# name: the literal 1 would select 'Winery 2'.
f1, f2 = feature_names['Alcohol'], feature_names['Flavanoids']
label = label_names['Winery 1']
mu, sigma = fit_gaussian(trainx[trainy==label, :], [f1, f2])
print("Mean: " + str(mu), "\n")
print("Covariance matrix:\n" + str(sigma))

We now define a function that will fit a Gaussian generative model to the three classes, restricted to a given list of features. The function returns:
* `mu`: the means of the Gaussians, one per row
* `sigma`: the covariance matrices of the Gaussians, one per class (the callers below store it as `covar`)
* `pi`: list of three class weights summing to 1

In [None]:
# Assumes y takes on the zero-based values 0, 1, 2 (the keys of label_lookup)
def fit_generative_model(x, y, features):
    """Fit one Gaussian per class, restricted to the given features.

    Each label is used directly as a row index into the returned arrays, so
    the labels in `y` must be the integer keys of the module-level
    `label_lookup`.

    Parameters
    ----------
    x : 2-D array with one row per sample.
    y : 1-D array of integer class labels, one per row of `x`.
    features : list of column indices of `x` to model.

    Returns
    -------
    mu : (n_classes, n_features) array, one mean vector per row.
    sigma : (n_classes, n_features, n_features) array of covariance matrices.
    pi : (n_classes,) array with the fraction of samples in each class.
    """
    n = len(label_lookup)        # number of classes
    d = len(features)            # number of features
    mu = np.zeros((n, d))        # class means, one per row
    sigma = np.zeros((n, d, d))  # class covariance matrices
    pi = np.zeros(n)             # class weights (fraction of samples per class)

    for label in label_lookup:
        indices = (y == label)
        mu[label, :], sigma[label, :, :] = fit_gaussian(x[indices, :], features)
        # Count the mask in numpy rather than with the builtin sum().
        pi[label] = np.count_nonzero(indices) / len(y)
    return mu, sigma, pi

### Helper functions for plotting

Next, we will construct a routine for displaying points sampled from a two-dimensional Gaussian, as well as a few contour lines. Part of doing this involves deciding what range to use for each axis. We begin with a little helper function that takes as input an array of numbers (values along a single feature) and returns the range in which these numbers lie.

In [None]:
# Smallest-to-largest span of a sequence of numbers, widened by a buffer
def find_range(x, margin=0.3):
    """Return (lower, upper) bounds that enclose every value in `x`.

    The bounds are the minimum and maximum of `x`, each pushed outwards by
    `margin` times the width of the data (max - min).
    """
    smallest, largest = min(x), max(x)
    padding = margin * (largest - smallest)
    return smallest - padding, largest + padding

Next we define a routine that plots a few contour lines of a given two-dimensional Gaussian.
It takes as input:
* `mu`, `sigma`: the parameters of the Gaussian (mean vector and covariance matrix)
* `x1g`, `x2g`: the grid (along the two axes) at which the density is to be computed
* `col`: the color of the contour lines

In [None]:
def plot_contours(mu, sigma, x1g, x2g, col):
    """Draw three contour lines of a two-dimensional Gaussian on the current axes.

    Parameters
    ----------
    mu, sigma : mean vector and covariance matrix of the Gaussian.
    x1g, x2g : 1-D grids along the horizontal and vertical axes at which the
        log-density is evaluated; they may have different lengths.
    col : colour of the contour lines.

    The lines are drawn where the log-density lies 1, 2 and 3 below its
    value at the mean.
    """
    rv = multivariate_normal(mean=mu, cov=sigma)

    # meshgrid's default 'xy' indexing yields arrays of shape
    # (len(x2g), len(x1g)) -- rows follow x2, columns follow x1 -- which is
    # the layout plt.contour expects for z.
    grid_x1, grid_x2 = np.meshgrid(x1g, x2g)
    # One vectorised call: the last axis of the input holds the (x1, x2) pair.
    z = np.reshape(rv.logpdf(np.dstack((grid_x1, grid_x2))), grid_x1.shape)

    # Log-density at the mean: -0.5 * (d * log(2*pi) + log|sigma|)
    _, logdet = np.linalg.slogdet(sigma)
    peak = -0.5 * (len(mu) * np.log(2 * np.pi) + logdet)

    # plt.contour needs increasing levels, so the outermost line comes first.
    levels = [peak - offset for offset in (3, 2, 1)]
    plt.contour(x1g, x2g, z, levels=levels, colors=col, linewidths=2.0, linestyles='solid')

### Bivariate contour and distribution plot

In [None]:
@interact(feature_1=feature_names, feature_2=feature_names, label=label_names)
def two_features_plot(feature_1, feature_2, label):
    """Plot one class's training points on two features with its fitted Gaussian.

    `feature_1` and `feature_2` are feature indices (horizontal and vertical
    axis); `label` is the integer class label. Prints a hint and returns
    without plotting when both features are the same.
    """
    if feature_1 == feature_2:
        print("Please choose different features for axis")
        return

    # Training rows of the selected class, and their two coordinate columns
    class_rows = trainx[trainy == label, :]
    horizontal = class_rows[:, feature_1]
    vertical = class_rows[:, feature_2]

    # Axis limits: the data range of each feature plus a margin
    h_min, h_max = find_range(horizontal)
    v_min, v_max = find_range(vertical)
    plt.xlim(h_min, h_max)
    plt.ylim(v_min, v_max)

    # Scatter the training points as red circles
    plt.plot(horizontal, vertical, 'ro')

    # Grid on which the density is evaluated for the contour lines
    n_grid = 200
    h_grid = np.linspace(h_min, h_max, n_grid)
    v_grid = np.linspace(v_min, v_max, n_grid)

    # Fit the Gaussian on the two features and draw its contours in black
    mean, covariance = fit_gaussian(class_rows, [feature_1, feature_2])
    plot_contours(mean, covariance, h_grid, v_grid, 'k')

    # Labels, title, and display
    axis_text = dict(fontsize=14, color='red')
    plt.xlabel(featurenames[feature_1], **axis_text)
    plt.ylabel(featurenames[feature_2], **axis_text)
    plt.title('{}'.format(label_lookup[label]), fontsize=14, color='blue')
    plt.show()

### Multi-class bivariate contour plot

In [None]:
@interact(feature_1=feature_names, feature_2=feature_names)
def three_class_plot(feature_1, feature_2):
    """Plot all training points on two features with each class's fitted Gaussian.

    `feature_1` and `feature_2` are feature indices (horizontal and vertical
    axis). Each class is drawn in its own colour. Prints a hint and returns
    without plotting when both features are the same.
    """
    if feature_1 == feature_2:
        print("Please choose different features for axis")
        return

    horizontal = trainx[:, feature_1]
    vertical = trainx[:, feature_2]

    # Axis limits: the range of each feature over the whole training set plus a margin
    h_min, h_max = find_range(horizontal)
    v_min, v_max = find_range(vertical)
    plt.xlim(h_min, h_max)
    plt.ylim(v_min, v_max)

    # Scatter the training points, one colour per class
    colors = ['r', 'k', 'g']
    for label in label_lookup:
        members = (trainy == label)
        plt.plot(horizontal[members], vertical[members], marker='o', ls='None', c=colors[label])

    # Grid on which the densities are evaluated for the contour lines
    n_grid = 200
    h_grid = np.linspace(h_min, h_max, n_grid)
    v_grid = np.linspace(v_min, v_max, n_grid)

    # One Gaussian per class on the two features; the class weights are not needed here
    means, covariances, _ = fit_generative_model(trainx, trainy, [feature_1, feature_2])
    for label in label_lookup:
        plot_contours(means[label, :], covariances[label, :, :], h_grid, v_grid, colors[label])

    # Labels, title, and display
    axis_text = dict(fontsize=14, color='red')
    plt.xlabel(featurenames[feature_1], **axis_text)
    plt.ylabel(featurenames[feature_2], **axis_text)
    plt.title('Wine data', fontsize=14, color='blue')
    plt.show()

### Predict labels for the test points

How well can we predict the class (1, 2, 3) based on just these two features?

We start with a testing procedure that is analogous to what we developed in the 1-d case.

In [None]:
# Now test the performance of a predictor based on a subset of features
@interact(feature_1=feature_names, feature_2=feature_names)
def test_model(feature_1, feature_2, print_out=True):
    """Classify the test set with a two-feature Gaussian generative model.

    A Gaussian is fitted to each class on the training set, restricted to
    the two given feature indices. Every test point is assigned the class
    that maximises log(class weight) + log(class density).

    Parameters
    ----------
    feature_1, feature_2 : indices of the two features to use.
    print_out : when true, print a one-line summary of the result.

    Returns
    -------
    The number of misclassified test points, or None (after printing a
    hint) when both features are the same.
    """
    f1, f2 = feature_1, feature_2

    if f1 == f2:  # need f1 != f2
        print("Please choose different features for axis")
        return
    features = [f1, f2]
    mu, covar, pi = fit_generative_model(trainx, trainy, features)

    # Score all test points against one class at a time: logpdf accepts an
    # array whose last axis holds the components of each point.
    points = testx[:, features]
    score = np.zeros((len(testy), len(label_lookup)))
    for label in label_lookup:
        # log(P(class) * P(point | class)) = log(class weight) + log(density)
        score[:, label] = np.log(pi[label]) + multivariate_normal.logpdf(
            points, mean=mu[label, :], cov=covar[label, :, :])

    # Highest score => predicted label
    predictions = np.argmax(score, axis=1)

    # Sum up errors
    errors = np.sum(predictions != testy)
    if print_out:
        print('Features: {}, {}, test errors: {:.0f}/{}'.format(feature_lookup[f1], feature_lookup[f2], errors, score.shape[0]))
    return errors

### Best Feature pair

Different pairs of features yield different test errors.
* What is the smallest achievable test error?
* Which pair of features achieves this minimum test error?

*Make a note of your answers to these questions, as you will need to enter them as part of this week's assignment.*

In [None]:
# Exhaustive search for the feature pair with the fewest test errors.
r_min = float('inf')  # no assumption about how large the error count can get
ftrs = None
n_features = len(feature_lookup)
for f1 in range(n_features):
    # The model treats (f1, f2) and (f2, f1) alike, so each unordered pair
    # is evaluated once; ties keep the first pair found.
    for f2 in range(f1 + 1, n_features):
        errors = test_model(f1, f2, print_out=False)
        if errors < r_min:
            r_min = errors
            ftrs = feature_lookup[f1], feature_lookup[f2]
if ftrs is None:
    print('No feature pair could be evaluated')
else:
    print('Features with lowest test error: {} & {}, test error: {}'.format(ftrs[0], ftrs[1], r_min))

### Interactive decision boundary on test set

In [None]:
@interact(feature_1=feature_names, feature_2=feature_names)
def show_decision_boundary(feature_1, feature_2, train_data_on=True, test_data_on=True):
    """Shade the decision regions of the two-feature Gaussian generative model.

    Parameters
    ----------
    feature_1, feature_2 : indices of the features on the horizontal and
        vertical axes.
    train_data_on : when true, draw the training points as circles.
    test_data_on : when true, draw the test points as stars.

    The plot title reports the number of test errors from `test_model`.
    Prints a hint and returns without plotting when both features are the
    same.
    """
    f1, f2 = feature_1, feature_2

    if f1 == f2:
        print("Please choose different features for axis")
        return

    # Fit Gaussian to each class
    mu, covar, pi = fit_generative_model(trainx, trainy, [f1, f2])

    # Set up dimensions of plot
    x1_lower, x1_upper = find_range(trainx[:, f1])
    x2_lower, x2_upper = find_range(trainx[:, f2])
    plt.xlim([x1_lower, x1_upper])
    plt.ylim([x2_lower, x2_upper])

    # Plot the data points, one colour per class
    colors = ['r', 'k', 'g']
    for label in label_lookup:
        if train_data_on:
            plt.plot(trainx[trainy==label, f1], trainx[trainy==label, f2], marker='o', ls='None', c=colors[label])
        if test_data_on:
            plt.plot(testx[testy==label, f1], testx[testy==label, f2], marker='*', ls='None', c=colors[label])

    # Define a dense grid; every point in the grid will be classified according to the generative model
    res = 200
    x1g = np.linspace(x1_lower, x1_upper, res)
    x2g = np.linspace(x2_lower, x2_upper, res)
    # Shape (len(x2g), len(x1g), 2): rows follow x2 and columns follow x1,
    # the layout contourf expects; the last axis holds the (x1, x2) pair.
    grid = np.dstack(np.meshgrid(x1g, x2g))

    # Score the whole grid against one class at a time, then pick the best
    # class per grid point. Position along axis 0 equals the label because
    # the label_lookup keys are 0, 1, 2 in order.
    scores = np.stack([
        np.log(pi[label]) + multivariate_normal.logpdf(grid, mean=mu[label, :], cov=covar[label, :, :])
        for label in label_lookup
    ])
    Z = np.argmax(scores, axis=0)

    # Test errors
    errors = test_model(f1, f2, print_out=False)

    # Fill the decision regions. Explicit half-integer levels put exactly one
    # class label in each band, so a region always gets its class colour,
    # even when some class never wins anywhere on the grid.
    n_classes = len(label_lookup)
    plt.contourf(x1g, x2g, Z, levels=np.arange(n_classes + 1) - 0.5, colors=colors[:n_classes], alpha=.3)

    # Finally, show the image
    plt.xlabel(feature_lookup[f1], fontsize=14, color='red')
    plt.ylabel(feature_lookup[f2], fontsize=14, color='red')
    plt.title('Decision boundary\ntest errors: {}/{}'.format(errors, len(testy)))
    plt.show()
