In [1]:
# All imports required
import os
import urllib3
import numpy as np
import pandas as pd
import itertools
import matplotlib.pyplot as plt
from scipy.stats import norm, multivariate_normal
from sklearn.model_selection import train_test_split

In [2]:
# Check if dataset is already present in project directory, else download from UCI repository
dataset = 'wine.data'
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data'
if not os.path.exists(dataset):
    http = urllib3.PoolManager()
    r = http.request('GET', url)
    # Verify the response before caching it: writing an error page to disk would
    # make every later run skip the download (the file exists) and parse garbage.
    if r.status != 200:
        raise RuntimeError('Failed to download {} (HTTP status {})'.format(url, r.status))
    with open(dataset, 'wb') as f:
        f.write(r.data)

In [3]:
# Dataset properties: position of the target column, optional columns to drop,
# and the header names to assign (the raw file is read without a header row).
target_col_index = 0
cols_to_ignore = None
column_names = [
    'Winery',
    'Alcohol',
    'Malic acid',
    'Ash',
    'Alcalinity of ash',
    'Magnesium',
    'Total phenols',
    'Flavanoids',
    'Nonflavanoid phenols',
    'Proanthocyanins',
    'Color intensity',
    'Hue',
    'OD280/OD315 of diluted wines',
    'Proline',
]

In [4]:
# Read the raw file (no header row; '?' marks missing values) and tidy it up
df = pd.read_csv(dataset, header=None, names=column_names, na_values=['?'])
# Resolve the target's name before any columns are dropped or reordered
target_col_name = df.columns[target_col_index]
if cols_to_ignore:
    df = df.drop(columns=cols_to_ignore)
# Discard incomplete rows, then order columns alphabetically
df = df.dropna()
df = df[sorted(df.columns)]
df.head()

Unnamed: 0,Alcalinity of ash,Alcohol,Ash,Color intensity,Flavanoids,Hue,Magnesium,Malic acid,Nonflavanoid phenols,OD280/OD315 of diluted wines,Proanthocyanins,Proline,Total phenols,Winery
0,15.6,14.23,2.43,5.64,3.06,1.04,127,1.71,0.28,3.92,2.29,1065,2.8,1
1,11.2,13.2,2.14,4.38,2.76,1.05,100,1.78,0.26,3.4,1.28,1050,2.65,1
2,18.6,13.16,2.67,5.68,3.24,1.03,101,2.36,0.3,3.17,2.81,1185,2.8,1
3,16.8,14.37,2.5,7.8,3.49,0.86,113,1.95,0.24,3.45,2.18,1480,3.85,1
4,21.0,13.24,2.87,4.32,2.69,1.04,118,2.59,0.39,2.93,1.82,735,2.8,1


In [5]:
# Split data and target into training sets and test sets
# - random_state pins the shuffle so the split (and every number derived from it)
#   is reproduced on Restart Kernel -> Run All.
# - stratify keeps the class proportions of the target in both partitions, so every
#   class is present in the training set used to fit the per-class statistics.
df_train, df_test = train_test_split(df, random_state=42, stratify=df[target_col_name])
# Re-number rows from 0 in each partition (reassign rather than mutate in place)
df_train = df_train.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)
print('Dimensions of df_train: {}'.format(df_train.shape))
print('Dimensions of df_test: {}'.format(df_test.shape))

Dimensions of df_train: (133, 14)
Dimensions of df_test: (45, 14)


In [6]:
# Distinct class labels found in the training target, in ascending order
classes = np.sort(df_train[target_col_name].unique())
classes

array([1, 2, 3])

In [7]:
# Fit a Gaussian generative model to the training data:
# one mean vector, one covariance matrix and one prior probability per class.
by_class = df_train.groupby(target_col_name)
mu = by_class.mean()
sigma = by_class.cov()
# Relative class frequencies in the training target, ordered by class label
pi = df_train[target_col_name].value_counts(normalize=True).sort_index()

In [8]:
# Score every test row under each class and pick the highest-scoring class.
# Score = log prior + log Gaussian likelihood of the row's features.
test_features = df_test.drop(target_col_name, axis=1)
prediction_scores = pd.DataFrame()
for class_name in classes:
    log_prior = np.log(pi.loc[class_name])
    log_likelihood = multivariate_normal.logpdf(
        test_features,
        mean=mu.loc[class_name],
        cov=sigma.loc[class_name])
    prediction_scores[class_name] = log_prior + log_likelihood
# Column label (class) with the largest score in each row
predictions = prediction_scores.idxmax(axis=1)

In [9]:
# Compare predicted labels with the true test labels and report the tallies
prediction_result = predictions == df_test[target_col_name]
total_predictions = prediction_result.size
correct_predictions = prediction_result.sum()
report = [
    ('Total predictions made : {}', total_predictions),
    ('Correct predictions : {}', correct_predictions),
    ('Incorrect predictions : {}', total_predictions - correct_predictions),
    ('Prediction accuracy = {:0.2%}', correct_predictions / total_predictions),
]
for template, value in report:
    print(template.format(value))

Total predictions made : 45
Correct predictions : 45
Incorrect predictions : 0
Prediction accuracy = 100.00%
