# Generative Model Classification - Univariate Gaussian

The **Wine** data set is the running example for our discussion of the *generative approach to classification*. 

The data can be downloaded from the UCI repository (https://archive.ics.uci.edu/ml/datasets/wine). It contains 178 labeled data points, each corresponding to a bottle of wine:
* The features (`x`): a 13-dimensional vector consisting of visual and chemical features for the bottle of wine
* The label (`y`): the winery from which the bottle came (1,2,3)

### Import

In [None]:
# Standard includes
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Useful module for dealing with the Gaussian density
from scipy.stats import norm, multivariate_normal

# Widgets used for the interactive graphs
import ipywidgets as widgets
from IPython.display import display
from ipywidgets import interact, interactive, fixed, interact_manual, IntSlider

### Data

In [None]:
# IPython shell escape: list every path under ../.. and keep those matching
# "wine.data.txt" (case-insensitive), to check where the data file lives.
!find ../.. | grep -i wine.data.txt

Next, we load the Wine data set. There are 178 data points, each with 13 features and a label (1,2,3).
We will divide these into a training set of 130 points and a test set of 48 points.

In [None]:
# Names of the 13 features, in the order of the feature columns
# (column 0 of the data file is the label, columns 1..13 are the features)
featurenames = [
    'Alcohol',
    'Malic acid',
    'Ash',
    'Alcalinity of ash',
    'Magnesium',
    'Total phenols',
    'Flavanoids',
    'Nonflavanoid phenols',
    'Proanthocyanins',
    'Color intensity',
    'Hue',
    'OD280/OD315 of diluted wines',
    'Proline',
]

# The comma-separated data file is read from the `_data` directory two levels up
data = np.loadtxt('../../_data/wine.data.txt', delimiter=',')

Fix a particular "random" permutation of the data, and use it to make the training / test split.
We get four arrays:
* `trainx`: 130x13, the training points
* `trainy`: length 130, labels of the training points
* `testx`: 48x13, the test points
* `testy`: length 48, labels of the test points

In [None]:
# Shuffle the 178 row indices with a fixed seed so the split is reproducible.
# The first 130 shuffled rows form the training set, the remaining 48 the test set.
# Column 0 is the label; columns 1..13 are the features.
np.random.seed(0)
perm = np.random.permutation(178)

trainx, trainy = data[perm[:130], 1:14], data[perm[:130], 0]
testx, testy = data[perm[130:], 1:14], data[perm[130:], 0]

Let's see how many training points there are from each class.

In [None]:
# Number of training points belonging to wineries 1, 2 and 3
tuple(np.sum(trainy == winery) for winery in (1, 2, 3))

### Label distribution

In [None]:
# modify this cell
# The next two lines count the test points per class in two different ways.
# NOTE(review): their results are discarded -- a notebook cell only displays
# the value of its last expression.
sum(testy==1), sum(testy==2), sum(testy==3)
np.bincount(testy.astype('i'))  # index k holds the number of points with label k
# Displayed output of this cell: the raw test labels
testy

## Feature distribution - Gaussian fit

In [None]:
@interact_manual(feature=IntSlider(0,0,12), label=IntSlider(1,1,3))
def density_plot(feature, label):
    """Plot one feature's histogram for one winery together with its Gaussian fit.

    Parameters
    ----------
    feature : int
        Column index into `trainx` (and into `featurenames`).
    label : int
        Winery label whose training points are plotted.

    The histogram is normalised to a density so that it is on the same scale
    as the fitted Gaussian curve drawn on top of it.
    """
    # Training values of the chosen feature for the chosen winery
    values = trainx[trainy == label, feature]

    # `density=True` replaces the `normed=True` keyword, which was removed
    # from Matplotlib and raises an error on current versions.
    plt.hist(values, density=True)

    # Parameters of the fitted Gaussian (population variance, ddof=0)
    mu = np.mean(values)   # mean
    std = np.std(values)   # standard deviation

    # Plot the Gaussian N(mu, std^2) over +/- 3 standard deviations
    x_axis = np.linspace(mu - 3*std, mu + 3*std, 1000)
    plt.plot(x_axis, norm.pdf(x_axis, mu, std), 'r', lw=2)
    plt.title("Winery "+str(label) )
    plt.xlabel(featurenames[feature], fontsize=14, color='red')
    plt.ylabel('Density', fontsize=14, color='red')
    plt.show()

### Standard Deviation per feature for given label

In [None]:
# Rank the features of one winery by the standard deviation of their
# training values, smallest first.
label_ = 1
np.set_printoptions(precision=3, suppress=True)
sorted(
    (
        (featurenames[col], np.std(trainx[trainy == label_, col]))
        for col in range(trainx.shape[1])
    ),
    key=lambda name_and_std: name_and_std[1],
)

### Fit a Gaussian generative model to each class

Fit a Gaussian generative model to the three classes, restricted to just a single feature.

- mu = means
- var = variances
- pi = class weights (the prior probability of each class, i.e. its share of the training points)

In [None]:
def fit_generative_model(x, y, feature, labels=None):
    """Fit a univariate Gaussian to one feature for each class.

    Parameters
    ----------
    x : array-like, shape (n_points, n_features)
        Data points.
    y : array-like, shape (n_points,)
        Class label of each data point.
    feature : int
        Column of `x` to fit.
    labels : sequence, optional
        Class labels to fit, in output order. Defaults to ``[1, 2, 3]``.

    Returns
    -------
    labels : sequence
        The class labels, in the order used by the arrays below.
    mu : ndarray
        Mean of the feature within each class.
    var : ndarray
        Variance (ddof=0) of the feature within each class.
    pi : ndarray
        Class weight of each class: the fraction of points carrying that label.
    """
    # A fresh list per call: a mutable default would be shared between calls
    # and, because it is returned, could be modified by the caller.
    if labels is None:
        labels = [1, 2, 3]

    x = np.asarray(x)
    y = np.asarray(y)

    n = len(labels)         # number of classes
    mu = np.zeros(n)        # per-class means
    var = np.zeros(n)       # per-class variances
    pi = np.zeros(n)        # per-class weights (prior probabilities)
    for i, label in enumerate(labels):
        mask = (y == label)
        values = x[mask, feature]
        mu[i] = np.mean(values)
        var[i] = np.var(values)
        pi[i] = np.count_nonzero(mask) / float(len(y))
    return labels, mu, var, pi

In [None]:
# Fit the per-class Gaussians on feature 0 ('alcohol') and display the result
feature = 0
labels, mu, var, pi = fit_generative_model(x=trainx, y=trainy, feature=feature)
(labels, mu, var, pi)

### Gaussian distribution for each of the three classes

 - Separation of the distributions is indicative of the feature's predictive power.
 - Good as a feature importance/selection tool

In [None]:
@interact(feature=IntSlider(0, 0, 12))
def show_densities(feature):
    """Overlay the fitted Gaussian density of every class for one feature.

    `feature` is the column index into `trainx` / `featurenames`.
    """
    labels, mu, var, pi = fit_generative_model(trainx, trainy, feature)
    spreads = np.sqrt(var)

    # One curve per class, drawn over +/- 3 standard deviations around its mean
    for label, center, spread, color in zip(labels, mu, spreads, ['r', 'k', 'g']):
        grid = np.linspace(center - 3*spread, center + 3*spread, 1000)
        plt.plot(grid, norm.pdf(grid, center, spread), color, label="class " + str(label))

    plt.xlabel(featurenames[feature], fontsize=14, color='red')
    plt.ylabel('Density', fontsize=14, color='red')
    plt.legend()
    plt.show()

### Predict labels for the test set

How well can we predict the class (1,2,3) based just on one feature? The code below lets us find this out.

 - Not prone to overfitting.
 - Prone to difference in train and test distributions.
 

In [None]:
from collections import OrderedDict

In [None]:
def _predict_labels(values, labels, mu, sd, pi):
    """Return, for each value, the label maximising log(pi_j) + log N(value; mu_j, sd_j)."""
    # Broadcasting a column of values against the per-class parameters gives
    # a (n_points, n_classes) array of log joint scores.
    column = np.asarray(values)[:, np.newaxis]
    log_joint = np.log(pi) + norm.logpdf(column, mu, sd)
    return np.asarray(labels)[np.argmax(log_joint, axis=1)]


@interact(feature=IntSlider(0, 0, 12))
def test_model(feature):
    """Classify with a single feature and plot the fit against the test data.

    Fits one Gaussian per class on the training set, predicts the label of
    every training and test point, and draws the per-class histograms of the
    test set with the fitted densities on top. The plot title reports the
    fraction of correctly classified points on both sets.

    `feature` is the column index into `trainx` / `testx` / `featurenames`.
    """
    labels, mu, var, pi = fit_generative_model(trainx, trainy, feature)
    sd = np.sqrt(var)
    colors = ['r', 'k', 'g']

    # Predictions on the training and the test points
    train_pred = _predict_labels(trainx[:, feature], labels, mu, sd, pi)
    test_pred = _predict_labels(testx[:, feature], labels, mu, sd, pi)

    # Fraction of correctly classified points
    train_accuracy = np.mean(train_pred == trainy)
    test_accuracy = np.mean(test_pred == testy)

    # Draw each class exactly once. Plotting inside the per-point loops drew
    # every curve and histogram again for each data point, which was slow and
    # stacked the translucent histograms until they were opaque.
    for j, label in enumerate(labels):
        # `density=True` replaces the removed `normed=True` keyword
        plt.hist(testx[testy == label, feature], color=colors[j], alpha=.5, density=True)
        x_axis = np.linspace(mu[j] - 3*sd[j], mu[j] + 3*sd[j], 1000)
        plt.plot(x_axis, norm.pdf(x_axis, mu[j], sd[j]), colors[j], label="class " + str(label))

    # The reported numbers are accuracies, so the title says so
    title = 'Feature: {}\ntrain accuracy: {:.3f}\ntest accuracy: {:.3f}'.format(
        featurenames[feature], train_accuracy, test_accuracy)
    plt.xlabel(featurenames[feature], fontsize=14, color='red')
    plt.ylabel('Density', fontsize=14, color='red')
    plt.title(title)

    # Every curve is drawn once, so the legend has no repeated entries
    plt.legend()

    plt.show()