In [4]:
%matplotlib inline
import numpy as np
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from scipy.stats import norm, multivariate_normal
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import time

In [20]:
def fit_gaussian_ND(features_2_build, train_set, train_labels, feature_names):
    """Estimate per-class Gaussian parameters on a subset of feature columns.

    Classes are visited as ``kindx = 0, 1, ..., K-1`` where ``K`` is the number
    of distinct values in ``train_labels``; the rows of class ``kindx`` are the
    ones where ``train_labels == kindx``. Labels are therefore expected to be
    encoded as consecutive integers starting at 0.

    Parameters
    ----------
    features_2_build : sequence of int
        Column indices of ``train_set`` to model.
    train_set : numpy.ndarray
        Training samples, indexed as ``train_set[rows, columns]``.
    train_labels : numpy.ndarray
        One label per row of ``train_set``.
    feature_names : sequence
        Accepted for interface compatibility; not used by the fit.

    Returns
    -------
    mu_xy_all : list
        Per-class mean of the selected columns.
    cov_xy_all : list
        Per-class covariance of the selected columns, normalised by the
        number of class samples (``bias=1``).
    pi_lab_all : list
        Per-class prior: fraction of training labels equal to the class index.
    """
    n_classes = len(np.unique(train_labels))
    n_samples = len(train_labels)

    mu_xy_all = list()
    cov_xy_all = list()
    pi_lab_all = list()

    for kindx in range(n_classes):
        in_class = train_labels == kindx
        class_features = train_set[in_class, :][:, features_2_build]

        mu_xy_all.append(np.mean(class_features, axis=0))
        # rowvar=0: each column is a variable; bias=1: divide by N, not N - 1.
        cov_xy_all.append(np.cov(class_features, rowvar=0, bias=1))
        pi_lab_all.append(np.sum(in_class) / n_samples)
        # No distribution object is built here: the estimates are all that is
        # returned, and constructing one would reject a singular covariance
        # even though the caller never asked for a density.

    return mu_xy_all, cov_xy_all, pi_lab_all

In [21]:
def evaluate_gaussian_ND(features_2_build, train_set, train_labels, feature_names, test_set, test_labels):
    """Fit per-class Gaussians on the training data and classify the test rows.

    Each test row is assigned the class index ``k`` that maximises
    ``log(prior_k) + log N(x | mean_k, cov_k)``, evaluated on the columns
    listed in ``features_2_build``. The number of rows whose prediction
    differs from ``test_labels`` is printed.

    Parameters
    ----------
    features_2_build : sequence of int
        Column indices used for both fitting and scoring.
    train_set, train_labels
        Training samples and labels, forwarded to ``fit_gaussian_ND``.
    feature_names : sequence
        Forwarded to ``fit_gaussian_ND``.
    test_set : numpy.ndarray
        Test samples, indexed as ``test_set[rows, columns]``.
    test_labels : numpy.ndarray
        Labels compared element-wise against the predictions.

    Returns
    -------
    numpy.ndarray
        Float array with one predicted class index per row of ``test_set``.
    """
    mu_xy_all, cov_xy_all, pi_lab_all = fit_gaussian_ND(features_2_build, train_set, train_labels, feature_names)

    n_classes = len(mu_xy_all)
    test_features = test_set[:, features_2_build]

    # Score all test rows against one class at a time, so each class
    # covariance is handled once instead of once per test row.
    score = np.empty((test_set.shape[0], n_classes))
    for k in range(n_classes):
        score[:, k] = np.log(pi_lab_all[k]) + multivariate_normal.logpdf(
            test_features, mean=mu_xy_all[k], cov=cov_xy_all[k]
        )

    # Keep the float dtype the predictions have always been returned with.
    predicted_features = np.argmax(score, axis=1).astype(float)

    errors = np.sum(predicted_features != test_labels)

    print('Test error for features: ', features_2_build, ' is ', errors, '/', test_set.shape[0])
    return predicted_features

In [7]:
# Read the comma-separated wine table; its first column is used as the label
# and the remaining columns as the features.
wine_raw = np.loadtxt('wine.txt', delimiter=',')
feature_names = [
    'Alcohol',
    'Malic acid',
    'Ash',
    'Alcalinity of ash',
    'Magnesium',
    'Total phenols',
    'Flavanoids',
    'Nonflavanoid phenols',
    'Proanthocyanins',
    'Color intensity',
    'Hue',
    'OD280/OD315 of diluted wines',
    'Proline',
]

# Subtract 1 from the stored labels (fit_gaussian_ND selects classes by
# index starting at 0), then keep only the feature columns.
wine_labels = wine_raw[:, 0] - 1
wine_data = wine_raw[:, 1:]
wine_data.shape

(178, 13)

In [8]:
# Hold out 30% of the wine samples for testing; the fixed random_state keeps
# the split the same from run to run.
wine_split = train_test_split(wine_data, wine_labels, test_size=0.3, random_state=42)
train_set, test_set, train_labels, test_labels = wine_split

print("Train set: ", train_set.shape, " Test set: ", test_set.shape)

Train set:  (124, 13)  Test set:  (54, 13)


In [13]:
# Select every feature column by index, then fit the per-class Gaussians
# on the training split.
features_2_build = [*range(len(feature_names))]
wine_fit = fit_gaussian_ND(features_2_build, train_set, train_labels, feature_names)
mu_xy_all, cov_xy_all, pi_lab_all = wine_fit


In [22]:
# Classify the held-out wine samples; the call prints the test error count.
predicted_features = evaluate_gaussian_ND(
    features_2_build,
    train_set,
    train_labels,
    feature_names,
    test_set,
    test_labels,
)

Test error for features:  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]  is  1 / 54


In [24]:
# DataFrame.from_csv is deprecated and no longer exists in pandas >= 1.0.
# read_csv with index_col=None reads the same table: no column is promoted
# to the index.
letter_data = pd.read_csv('letter_recognition.txt', index_col=None)

# The 'lettr' column is the class label; the remaining columns are features.
letter_labels = letter_data['lettr']
letter_data = letter_data.drop(['lettr'], axis=1)

letter_data.head()

  """Entry point for launching an IPython kernel.


Unnamed: 0,x-box,y-box,width,high,onpix,x-bar,y-bar,x2bar,y2bar,xybar,x2ybr,xy2br,x-ege,xegvy,y-ege,yegvx
0,2,8,3,5,1,8,13,0,6,6,10,8,0,8,0,8
1,5,12,3,7,2,10,5,5,4,13,3,9,2,8,4,10
2,4,11,6,8,6,10,6,2,6,10,3,7,3,7,3,9
3,7,11,6,6,3,5,9,4,6,4,4,10,6,10,2,8
4,2,1,3,1,1,8,6,6,6,6,5,9,1,7,5,10


In [25]:
# Replace the letter labels with integer codes; encoder.classes_ keeps the
# original values so codes can be mapped back.
encoder = LabelEncoder()
categorical_labels = letter_labels
labels_encoded = encoder.fit_transform(categorical_labels)
letter_labels = labels_encoded

# Show the distinct codes and the classes the encoder learned.
print(np.unique(labels_encoded))
print(encoder.classes_)

[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25]
['A' 'B' 'C' 'D' 'E' 'F' 'G' 'H' 'I' 'J' 'K' 'L' 'M' 'N' 'O' 'P' 'Q' 'R'
 'S' 'T' 'U' 'V' 'W' 'X' 'Y' 'Z']


In [26]:
# Hold out 20% of the letter samples for testing; the fixed random_state
# keeps the split the same from run to run.
letter_split = train_test_split(letter_data, letter_labels, test_size=0.2, random_state=6969)
train_set, test_set, train_labels, test_labels = letter_split

print('Training set: ', train_set.shape, ' Test set: ', test_set.shape)

Training set:  (16000, 16)  Test set:  (4000, 16)


In [30]:
# evaluate_gaussian_ND indexes its inputs as arrays ([rows, columns]), so
# convert the split to NumPy arrays first.
train_numpy_set, test_numpy_set = np.array(train_set), np.array(test_set)
train_numpy_labels, test_numpy_labels = np.array(train_labels), np.array(test_labels)

# One index per column of the training frame, i.e. use every feature.
feature_names = list(train_set.keys())
features_2_build = list(range(len(feature_names)))

predicted_features = evaluate_gaussian_ND(
    features_2_build,
    train_numpy_set,
    train_numpy_labels,
    feature_names,
    test_numpy_set,
    test_numpy_labels,
)

Test error for features:  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]  is  467 / 4000


In [29]:
# Display the feature indices currently selected.
# NOTE(review): the output recorded under this cell lists 26 indices as a
# NumPy array, while the cell numbered In [30] builds features_2_build as a
# list with one index per column of train_set. This cell carries the earlier
# execution count (In [29]), so its output is presumably stale; re-run the
# notebook from a fresh kernel to confirm.
features_2_build

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25])