In [1]:
# All imports required
import os
import urllib3
import numpy as np
import pandas as pd
import itertools
import matplotlib.pyplot as plt
from scipy.stats import norm, multivariate_normal
from sklearn.model_selection import train_test_split

In [2]:
# Download the Wine dataset from the UCI repository unless a local copy already exists.
if not os.path.exists('wine.data'):
    http = urllib3.PoolManager()
    url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data'
    r = http.request('GET', url)
    # Validate the response before touching the disk: persisting an error page as
    # 'wine.data' would make the existence check above skip the download on every
    # later run and leave the loader with an unparseable file.
    if r.status != 200:
        raise RuntimeError('Failed to download {} (HTTP status {})'.format(url, r.status))
    with open('wine.data', 'wb') as f:
        f.write(r.data)

In [3]:
# Parse the comma-separated wine file into a NumPy array (one row per line of the file)
arr = np.loadtxt(fname='wine.data', delimiter=',')
# Peek at rows 1 through 4 to sanity-check the parse
arr[1:5, :]

array([[1.000e+00, 1.320e+01, 1.780e+00, 2.140e+00, 1.120e+01, 1.000e+02,
        2.650e+00, 2.760e+00, 2.600e-01, 1.280e+00, 4.380e+00, 1.050e+00,
        3.400e+00, 1.050e+03],
       [1.000e+00, 1.316e+01, 2.360e+00, 2.670e+00, 1.860e+01, 1.010e+02,
        2.800e+00, 3.240e+00, 3.000e-01, 2.810e+00, 5.680e+00, 1.030e+00,
        3.170e+00, 1.185e+03],
       [1.000e+00, 1.437e+01, 1.950e+00, 2.500e+00, 1.680e+01, 1.130e+02,
        3.850e+00, 3.490e+00, 2.400e-01, 2.180e+00, 7.800e+00, 8.600e-01,
        3.450e+00, 1.480e+03],
       [1.000e+00, 1.324e+01, 2.590e+00, 2.870e+00, 2.100e+01, 1.180e+02,
        2.800e+00, 2.690e+00, 3.900e-01, 1.820e+00, 4.320e+00, 1.040e+00,
        2.930e+00, 7.350e+02]])

In [4]:
# Wrap the raw array in a labelled DataFrame so that summary statistics are readable.
# One name per column of `arr`, in column order.
feature_names = [
    'Winery',
    'Alcohol',
    'Malic acid',
    'Ash',
    'Alcalinity of ash',
    'Magnesium',
    'Total phenols',
    'Flavanoids',
    'Nonflavanoid phenols',
    'Proanthocyanins',
    'Color intensity',
    'Hue',
    'OD280/OD315 of diluted wines',
    'Proline',
]
df = pd.DataFrame(data=arr, columns=feature_names)
# Per-column count / mean / std / quartiles
df.describe()

Unnamed: 0,Winery,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315 of diluted wines,Proline
count,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0
mean,1.938202,13.000618,2.336348,2.366517,19.494944,99.741573,2.295112,2.02927,0.361854,1.590899,5.05809,0.957449,2.611685,746.893258
std,0.775035,0.811827,1.117146,0.274344,3.339564,14.282484,0.625851,0.998859,0.124453,0.572359,2.318286,0.228572,0.70999,314.907474
min,1.0,11.03,0.74,1.36,10.6,70.0,0.98,0.34,0.13,0.41,1.28,0.48,1.27,278.0
25%,1.0,12.3625,1.6025,2.21,17.2,88.0,1.7425,1.205,0.27,1.25,3.22,0.7825,1.9375,500.5
50%,2.0,13.05,1.865,2.36,19.5,98.0,2.355,2.135,0.34,1.555,4.69,0.965,2.78,673.5
75%,3.0,13.6775,3.0825,2.5575,21.5,107.0,2.8,2.875,0.4375,1.95,6.2,1.12,3.17,985.0
max,3.0,14.83,5.8,3.23,30.0,162.0,3.88,5.08,0.66,3.58,13.0,1.71,4.0,1680.0


In [5]:
# Split data and target into training sets and test sets
# Column 0 of `arr` is the target; the remaining columns are the features.
X, y = arr[:, 1:], arr[:, 0]
# Fix the seed so that Restart & Run All reproduces the same split, and therefore
# the same fitted parameters and accuracy, on every run.
RANDOM_SEED = 0
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=RANDOM_SEED)
print('Dimensions of X_train: {}'.format(X_train.shape))
print('Dimensions of X_test: {}'.format(X_test.shape))
print('Dimensions of y_train: {}'.format(y_train.shape))
print('Dimensions of y_test: {}'.format(y_test.shape))

Dimensions of X_train: (133, 13)
Dimensions of X_test: (45, 13)
Dimensions of y_train: (133,)
Dimensions of y_test: (45,)


In [6]:
# Estimate the parameters of a per-class Gaussian generative model from the training data:
#   mu[k]    - mean feature vector of the rows in class k
#   sigma[k] - covariance matrix of those rows (bias=True: normalised by N, not N - 1)
#   pi[k]    - fraction of training rows that belong to class k
classes = 3
features = X_train.shape[1]
pi = np.zeros(classes)
mu = np.zeros((classes, features))
sigma = np.zeros((classes, features, features))
# Array slot k is filled from the rows whose label equals k + 1.
for slot, class_label in enumerate(range(1, classes + 1)):
    in_class = (y_train == class_label)
    class_rows = X_train[in_class]
    mu[slot] = class_rows.mean(axis=0)
    sigma[slot] = np.cov(class_rows, rowvar=False, bias=True)
    pi[slot] = in_class.mean()
pi.shape

(3,)

In [7]:
# Make predictions on test set
# Each entry of prediction_scores is log(prior) + Gaussian log-likelihood of one
# test row under one class. The matrix is sized from `classes` (not a literal 3)
# so it always agrees with the fitted mu / sigma / pi.
prediction_scores = np.zeros((y_test.size, classes))
for class_index in range(classes):
    # logpdf takes all test rows at once (last axis = feature components), so each
    # class's covariance is handled once rather than once per test row.
    prediction_scores[:, class_index] = np.log(pi[class_index]) + multivariate_normal.logpdf(
        X_test, mean=mu[class_index], cov=sigma[class_index])
# argmax yields the 0-based slot of the best-scoring class; +1 maps it back to the
# 1-based labels used when the parameters were fitted.
predictions = prediction_scores.argmax(axis=1) + 1

In [8]:
# Percentage of test rows whose predicted class equals the true label
(predictions == y_test).mean() * 100

100.0