Importing the basic libraries that we will use for the Naive Bayes classifier.

In [56]:
import numpy as np
import scipy.stats as s
import pandas as pd
import matplotlib.pyplot as plt

Importing data file which is in csv format

In [57]:
# NOTE(review): with the default comma separator every row ends up in a single
# column (the shape shown below is (4898, 1)); the following cells split it on ';'.
# pd.read_csv(..., sep=';') would presumably parse the columns directly — confirm.
raw_data = pd.read_csv("winequality-white.csv")

In [58]:
raw_data.head()

Unnamed: 0,"fixed acidity;""volatile acidity"";""citric acid"";""residual sugar"";""chlorides"";""free sulfur dioxide"";""total sulfur dioxide"";""density"";""pH"";""sulphates"";""alcohol"";""quality"""
0,7;0.27;0.36;20.7;0.045;45;170;1.001;3;0.45;8.8;6
1,6.3;0.3;0.34;1.6;0.049;14;132;0.994;3.3;0.49;9...
2,8.1;0.28;0.4;6.9;0.05;30;97;0.9951;3.26;0.44;1...
3,7.2;0.23;0.32;8.5;0.058;47;186;0.9956;3.19;0.4...
4,7.2;0.23;0.32;8.5;0.058;47;186;0.9956;3.19;0.4...


In [59]:
raw_data.shape

(4898, 1)

# As we can see, the data was not parsed into separate columns, so we first need to pre-process it

In [60]:
#lets process our feature variables
column_names = raw_data.columns

In [71]:
column_names

Index(['fixed acidity;"volatile acidity";"citric acid";"residual sugar";"chlorides";"free sulfur dioxide";"total sulfur dioxide";"density";"pH";"sulphates";"alcohol";"quality"'], dtype='object')

In [72]:
column_names_string = column_names[0]

In [73]:
column_names_string

'fixed acidity;"volatile acidity";"citric acid";"residual sugar";"chlorides";"free sulfur dioxide";"total sulfur dioxide";"density";"pH";"sulphates";"alcohol";"quality"'

In [74]:
#replace command will remove " from our column names
column_names_string = column_names_string.replace('"','')

In [75]:
column_names_string

'fixed acidity;volatile acidity;citric acid;residual sugar;chlorides;free sulfur dioxide;total sulfur dioxide;density;pH;sulphates;alcohol;quality'

In [76]:
#now we will separate our feature from each other using split command
column_names_string = column_names_string.split(sep=';')

In [67]:
column_names_string

['fixed acidity',
 'volatile acidity',
 'citric acid',
 'residual sugar',
 'chlorides',
 'free sulfur dioxide',
 'total sulfur dioxide',
 'density',
 'pH',
 'sulphates',
 'alcohol',
 'quality']

In [77]:
# The column names are done; now split every raw row into its individual values.
feature_values = [row.split(sep=';') for row in raw_data.iloc[:, 0]]

In [78]:
feature_values

[['7',
  '0.27',
  '0.36',
  '20.7',
  '0.045',
  '45',
  '170',
  '1.001',
  '3',
  '0.45',
  '8.8',
  '6'],
 ['6.3',
  '0.3',
  '0.34',
  '1.6',
  '0.049',
  '14',
  '132',
  '0.994',
  '3.3',
  '0.49',
  '9.5',
  '6'],
 ['8.1',
  '0.28',
  '0.4',
  '6.9',
  '0.05',
  '30',
  '97',
  '0.9951',
  '3.26',
  '0.44',
  '10.1',
  '6'],
 ['7.2',
  '0.23',
  '0.32',
  '8.5',
  '0.058',
  '47',
  '186',
  '0.9956',
  '3.19',
  '0.4',
  '9.9',
  '6'],
 ['7.2',
  '0.23',
  '0.32',
  '8.5',
  '0.058',
  '47',
  '186',
  '0.9956',
  '3.19',
  '0.4',
  '9.9',
  '6'],
 ['8.1',
  '0.28',
  '0.4',
  '6.9',
  '0.05',
  '30',
  '97',
  '0.9951',
  '3.26',
  '0.44',
  '10.1',
  '6'],
 ['6.2',
  '0.32',
  '0.16',
  '7',
  '0.045',
  '30',
  '136',
  '0.9949',
  '3.18',
  '0.47',
  '9.6',
  '6'],
 ['7',
  '0.27',
  '0.36',
  '20.7',
  '0.045',
  '45',
  '170',
  '1.001',
  '3',
  '0.45',
  '8.8',
  '6'],
 ['6.3',
  '0.3',
  '0.34',
  '1.6',
  '0.049',
  '14',
  '132',
  '0.994',
  '3.3',
  '0.49',
  '9.5

In [79]:
#converting it into array
feature_values = np.array(feature_values)

In [80]:
feature_values

array([['7', '0.27', '0.36', ..., '0.45', '8.8', '6'],
       ['6.3', '0.3', '0.34', ..., '0.49', '9.5', '6'],
       ['8.1', '0.28', '0.4', ..., '0.44', '10.1', '6'],
       ...,
       ['6.5', '0.24', '0.19', ..., '0.46', '9.4', '6'],
       ['5.5', '0.29', '0.3', ..., '0.38', '12.8', '7'],
       ['6', '0.21', '0.38', ..., '0.32', '11.8', '6']], dtype='<U16')

In [82]:
# Pair each column name with its column of parsed values
# (feature_values[:, position] = all rows of one column).
# Iterating over the names instead of a fixed range(0, 12) keeps this cell
# correct if the number of columns ever changes.
D = {
    name: feature_values[:, position]
    for position, name in enumerate(column_names_string)
}

In [84]:
#lets see our data
D

{'fixed acidity': array(['7', '6.3', '8.1', ..., '6.5', '5.5', '6'], dtype='<U16'),
 'volatile acidity': array(['0.27', '0.3', '0.28', ..., '0.24', '0.29', '0.21'], dtype='<U16'),
 'citric acid': array(['0.36', '0.34', '0.4', ..., '0.19', '0.3', '0.38'], dtype='<U16'),
 'residual sugar': array(['20.7', '1.6', '6.9', ..., '1.2', '1.1', '0.8'], dtype='<U16'),
 'chlorides': array(['0.045', '0.049', '0.05', ..., '0.041', '0.022', '0.02'],
       dtype='<U16'),
 'free sulfur dioxide': array(['45', '14', '30', ..., '30', '20', '22'], dtype='<U16'),
 'total sulfur dioxide': array(['170', '132', '97', ..., '111', '110', '98'], dtype='<U16'),
 'density': array(['1.001', '0.994', '0.9951', ..., '0.99254', '0.98869', '0.98941'],
       dtype='<U16'),
 'pH': array(['3', '3.3', '3.26', ..., '2.99', '3.34', '3.26'], dtype='<U16'),
 'sulphates': array(['0.45', '0.49', '0.44', ..., '0.46', '0.38', '0.32'], dtype='<U16'),
 'alcohol': array(['8.8', '9.5', '10.1', ..., '9.4', '12.8', '11.8'], dtype='<U16

In [85]:
#lets convert it into data frame for further process
raw_data = pd.DataFrame(data=D)

In [86]:
raw_data.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45,170,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14,132,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30,97,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47,186,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47,186,0.9956,3.19,0.4,9.9,6


In [87]:
# Convert every column from string to float with one vectorised cast.
# (DataFrame.applymap is deprecated in recent pandas releases, and a
# per-element Python lambda is slower than astype.)
raw_data = raw_data.astype(float)

In [89]:
# The class label is categorical, so keep it as an integer column.
raw_data['quality'] = raw_data['quality'].astype('int64')

In [90]:
raw_data.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [91]:
#lets find out what is total number of class we have(wine score)
number_of_unique_wine_scores = raw_data['quality'].unique()

In [92]:
number_of_unique_wine_scores = sorted(number_of_unique_wine_scores)

In [93]:
print(number_of_unique_wine_scores)

[3, 4, 5, 6, 7, 8, 9]


# now our data is preprocessed and ready to use

Now we will use a rule-of-thumb split to separate the training and testing data from our raw data, i.e. 75% training data and 25% testing data.

In [94]:
# Ordered hold-out split: the first 75% of the rows are used for training and
# the remaining 25% for testing (the rows are not shuffled).
# The boundary is computed once so both slices are guaranteed to line up.
split_index = int(0.75 * len(raw_data))

# .copy() makes both subsets independent of raw_data, so the column drops
# performed on them later cannot affect (or warn about) the original frame.
training_data = raw_data.iloc[:split_index].copy()
testing_data = raw_data.iloc[split_index:].copy()

# Keep the true class labels of the test rows separately for scoring.
actual_class = testing_data['quality']

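Now we will remove the features with the lowest variance (simple variance-based feature selection; note that this is not PCA, which would instead project the data onto new axes).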

In [98]:
feature_variances_training_data = training_data.var()

In [99]:
# Order the per-feature variances from smallest to largest.
feature_variances_training_data = feature_variances_training_data.sort_values()

In [100]:
feature_variances_training_data

density                    0.000009
chlorides                  0.000478
volatile acidity           0.010194
sulphates                  0.013025
citric acid                0.015314
pH                         0.023720
fixed acidity              0.719339
quality                    0.838438
alcohol                    1.437394
residual sugar            26.088071
free sulfur dioxide      282.940889
total sulfur dioxide    1896.459570
dtype: float64

As we can see above, the density feature has the lowest variance, so we can drop it from our data.

It is entirely up to us whether to remove more features, depending on the desired accuracy.

We can proceed by trial and error: remove features and observe the effect on accuracy.


In [101]:
# Remove the label column from the test features and the low-variance
# 'density' column from both sets.
# Plain reassignment (no inplace=True) together with errors='ignore' makes the
# cell idempotent: re-running it no longer raises KeyError for columns that
# were already dropped.
testing_data = testing_data.drop(columns=['quality', 'density'], errors='ignore')
training_data = training_data.drop(columns=['density'], errors='ignore')

In [104]:
def mean_vector_and_cov_mat(wine_score, data=None):
    """Estimate the mean vector and covariance matrix of one wine-score class.

    Parameters
    ----------
    wine_score : int
        Class label; the rows whose 'quality' equals this value are selected.
    data : pandas.DataFrame, optional
        Frame containing the feature columns plus a 'quality' column.
        Defaults to the notebook-level ``training_data``.

    Returns
    -------
    list
        ``[mean_vector, cov_mat]``: the per-feature mean (1-D array) and the
        sample covariance matrix (2-D array) of the selected rows, with the
        'quality' column excluded. A list (not a tuple) is returned to stay
        compatible with cells that store further per-class values next to
        these two.
    """
    if data is None:
        # Backward-compatible default: use the notebook's training split.
        data = training_data

    # Keep only the rows of the requested class and drop the label column.
    class_features = data.loc[data['quality'] == wine_score].drop(columns=['quality'])

    # Column-wise mean of the class.
    mean_vector = class_features.to_numpy().mean(axis=0)
    # Sample covariance between the feature columns of the class.
    cov_mat = class_features.cov().to_numpy()

    return [mean_vector, cov_mat]

In [106]:
# Estimate the [mean vector, covariance matrix] pair of every wine score.
# Iterating over the scores themselves removes the hard-coded class count (7),
# so the cell stays correct if the set of scores changes.
natural_parameters = [
    mean_vector_and_cov_mat(wine_score)
    for wine_score in number_of_unique_wine_scores
]

In [107]:
# The same result without an explicit loop: map the estimator over the scores.
natural_parameters = list(map(mean_vector_and_cov_mat, number_of_unique_wine_scores))

In [108]:
print(natural_parameters)

[[array([7.72777778e+00, 3.41388889e-01, 3.40555556e-01, 6.10277778e+00,
       5.43888889e-02, 4.08611111e+01, 1.56111111e+02, 3.17055556e+00,
       4.62777778e-01, 1.03277778e+01]), array([[ 3.13741830e+00,  2.61062092e-02,  4.48660131e-02,
         2.21962418e+00,  4.61797386e-03, -9.28120915e+00,
        -1.27091503e+00, -2.78545752e-01, -6.69934641e-04,
        -9.19934641e-02],
       [ 2.61062092e-02,  2.14641340e-02,  2.21388889e-03,
         5.46135621e-02,  4.02089869e-03,  1.05498366e-01,
         5.33777778e+00, -6.40964052e-03,  6.84885621e-03,
        -6.83496732e-03],
       [ 4.48660131e-02,  2.21388889e-03,  6.95849673e-03,
        -7.75604575e-02,  9.14477124e-04, -2.21197712e+00,
        -3.16506536e+00, -3.96503268e-03, -5.78104575e-04,
         2.93366013e-02],
       [ 2.21962418e+00,  5.46135621e-02, -7.75604575e-02,
         2.63339624e+01,  7.53723856e-02, -1.05760621e+01,
         1.96845261e+02, -5.09531046e-01,  2.70815359e-01,
        -2.93905229e-01],
   

We know that Bayes' theorem is given by:

# $P(A \mid B) = \frac{P(B \mid A)\,P(A)}{P(B)}$



$P(A)$ = prior probability

$P(B)$ = total probability (evidence)

$P(B \mid A)$ = conditional probability (likelihood)

$P(A \mid B)$ = posterior probability

In [109]:
# Prior probability of a class = share of the training rows carrying that score.
prior_class_probabilities = [
    len(training_data[training_data['quality'] == wine_score]) / len(training_data)
    for wine_score in number_of_unique_wine_scores
]

In [110]:
prior_class_probabilities

[0.004900626191124422,
 0.03593792540157909,
 0.30057173972229784,
 0.429893819765859,
 0.186223795262728,
 0.04111080860332154,
 0.001361285053090117]

In [111]:
# Map every wine score (key) to its [mean vector, covariance matrix] entry (value).
D = {
    wine_score: parameters
    for wine_score, parameters in zip(number_of_unique_wine_scores, natural_parameters)
}

In [112]:

# Store the prior probability as the third element of every class entry.
# Building a fresh [mean, cov, prior] list (instead of appending) keeps the
# cell idempotent: re-running it never grows the entries beyond three items,
# and the lists held in natural_parameters are no longer mutated as a side effect.
for k, p_cap in zip(D.keys(), prior_class_probabilities):
    D[k] = [D[k][0], D[k][1], p_cap]

In [113]:
#lets see for our first wine score its natural parameters and prior probabilty
D[3]

[array([7.72777778e+00, 3.41388889e-01, 3.40555556e-01, 6.10277778e+00,
        5.43888889e-02, 4.08611111e+01, 1.56111111e+02, 3.17055556e+00,
        4.62777778e-01, 1.03277778e+01]),
 array([[ 3.13741830e+00,  2.61062092e-02,  4.48660131e-02,
          2.21962418e+00,  4.61797386e-03, -9.28120915e+00,
         -1.27091503e+00, -2.78545752e-01, -6.69934641e-04,
         -9.19934641e-02],
        [ 2.61062092e-02,  2.14641340e-02,  2.21388889e-03,
          5.46135621e-02,  4.02089869e-03,  1.05498366e-01,
          5.33777778e+00, -6.40964052e-03,  6.84885621e-03,
         -6.83496732e-03],
        [ 4.48660131e-02,  2.21388889e-03,  6.95849673e-03,
         -7.75604575e-02,  9.14477124e-04, -2.21197712e+00,
         -3.16506536e+00, -3.96503268e-03, -5.78104575e-04,
          2.93366013e-02],
        [ 2.21962418e+00,  5.46135621e-02, -7.75604575e-02,
          2.63339624e+01,  7.53723856e-02, -1.05760621e+01,
          1.96845261e+02, -5.09531046e-01,  2.70815359e-01,
         -2.9

{3: [array([7.72777778e+00, 3.41388889e-01, 3.40555556e-01, 6.10277778e+00,
         5.43888889e-02, 4.08611111e+01, 1.56111111e+02, 3.17055556e+00,
         4.62777778e-01, 1.03277778e+01]),
  array([[ 3.13741830e+00,  2.61062092e-02,  4.48660131e-02,
           2.21962418e+00,  4.61797386e-03, -9.28120915e+00,
          -1.27091503e+00, -2.78545752e-01, -6.69934641e-04,
          -9.19934641e-02],
         [ 2.61062092e-02,  2.14641340e-02,  2.21388889e-03,
           5.46135621e-02,  4.02089869e-03,  1.05498366e-01,
           5.33777778e+00, -6.40964052e-03,  6.84885621e-03,
          -6.83496732e-03],
         [ 4.48660131e-02,  2.21388889e-03,  6.95849673e-03,
          -7.75604575e-02,  9.14477124e-04, -2.21197712e+00,
          -3.16506536e+00, -3.96503268e-03, -5.78104575e-04,
           2.93366013e-02],
         [ 2.21962418e+00,  5.46135621e-02, -7.75604575e-02,
           2.63339624e+01,  7.53723856e-02, -1.05760621e+01,
           1.96845261e+02, -5.09531046e-01,  2.708153

In [118]:
D.keys()

dict_keys([3, 4, 5, 6, 7, 8, 9])

In [115]:
# Pooled (within-class) covariance: every class covariance is weighted by its
# degrees of freedom (n_k - 1) and the sum is divided by (N - K), where N is
# the number of training rows and K the number of classes.
alpha = 0.75  # NOTE(review): not used anywhere in the visible notebook

# Taken from the fitted parameters instead of the hard-coded [3, 4, ..., 9].
class_labels = list(D.keys())

pooled_cov = 0
for label in class_labels:
    class_size = len(training_data[training_data['quality'] == label])
    pooled_cov += D[label][1] * (class_size - 1)

# len(class_labels) replaces the hard-coded class count of 7.
pooled_cov = pooled_cov / (len(training_data) - len(class_labels))


In [119]:
pooled_cov

array([[ 7.05829475e-01, -1.35088508e-03,  2.92018922e-02,
         3.30551949e-01, -3.86973015e-04, -9.78556896e-01,
         1.26962475e+00, -5.61563283e-02, -1.91800052e-03,
        -1.08121752e-02],
       [-1.35088508e-03,  9.47251360e-03, -1.54306047e-03,
         4.86880965e-02,  1.02724723e-04, -1.30536389e-01,
         3.81736495e-01, -9.66558957e-04, -4.21142958e-04,
         1.14753470e-02],
       [ 2.92018922e-02, -1.54306047e-03,  1.52734065e-02,
         7.76356092e-02,  2.40217478e-04,  1.66848862e-01,
         4.75039721e-01, -3.53610568e-03,  6.75781954e-04,
        -6.64401439e-03],
       [ 3.30551949e-01,  4.86880965e-02,  7.76356092e-02,
         2.53312947e+01,  6.79557947e-03,  2.61282241e+01,
         8.52936867e+01, -1.22208527e-01, -1.18552581e-02,
        -2.21781367e+00],
       [-3.86973015e-04,  1.02724723e-04,  2.40217478e-04,
         6.79557947e-03,  4.56327747e-04,  3.89137274e-02,
         1.64904225e-01, -1.14227148e-04,  1.10733374e-04,
        -6.

In [44]:
def Naive_Bayes_Classifier_Result(wine_features, class_params=None, shared_cov=None):
    """Predict the wine score with the highest posterior probability.

    Each class is modelled as a multivariate normal with its own mean and a
    covariance matrix shared by all classes; the score maximising
    ``likelihood * prior`` is returned.

    Parameters
    ----------
    wine_features : array-like
        Feature values of one sample, in the same column order as the
        class mean vectors.
    class_params : dict, optional
        Maps class label -> ``[mean_vector, cov_mat, prior]``; only the mean
        (index 0) and the prior (index 2) are read here.
        Defaults to the notebook-level ``D``.
    shared_cov : array-like, optional
        Covariance matrix used for every class.
        Defaults to the notebook-level ``pooled_cov``.

    Returns
    -------
    The class label (a key of ``class_params``) with the largest posterior.
    """
    if class_params is None:
        class_params = D
    if shared_cov is None:
        shared_cov = pooled_cov

    # Return the actual label instead of "argmax + 3", which silently assumed
    # that the labels are contiguous integers starting at 3.
    labels = list(class_params.keys())
    features = np.asarray(wine_features, dtype=float)

    # A zero prior maps to -inf, so such a class can simply never win.
    with np.errstate(divide='ignore'):
        log_priors = np.log([class_params[label][2] for label in labels])

    # Work in log-space: for samples far from every class mean the plain pdf
    # underflows to 0 for all classes, making the normalised posterior 0/0.
    log_likelihoods = np.array([
        s.multivariate_normal.logpdf(x=features, mean=class_params[label][0], cov=shared_cov)
        for label in labels
    ])

    # The evidence P(x) is identical for every class, so it does not change
    # the argmax and the normalisation step can be skipped.
    log_posteriors = log_likelihoods + log_priors

    return labels[int(np.argmax(log_posteriors))]

Now we have completely trained our model .

Now its time for testing our model.

In [120]:
testing_data.shape

(1225, 10)

In [121]:
# Count the test rows whose predicted score equals the true score.
# (Despite its name, `accuracy` holds the number of correct predictions.)
accuracy = sum(
    1
    for row in range(len(testing_data))
    if Naive_Bayes_Classifier_Result(testing_data.iloc[row, :]) == actual_class.iloc[row]
)

In [122]:
accuracy

639

In [125]:
# Accuracy in percent, derived from the variables instead of retyped numbers,
# so it stays correct whenever the data or the model changes.
(accuracy / len(testing_data)) * 100

52.163265306122454

Our model has an accuracy of 52.16%, which is neither very good nor very bad. We may be able to improve it by dropping more columns with low variance.