Importing basic library's which we will be using in Naive Bayes Classifier.

In [1]:
import numpy as np
import scipy.stats as s
import pandas as pd
import matplotlib.pyplot as plt

Importing data file which is in csv format 

In [2]:
raw_data = pd.read_csv("winequality-white.csv")

In [3]:
raw_data.head()

Unnamed: 0,"fixed acidity;""volatile acidity"";""citric acid"";""residual sugar"";""chlorides"";""free sulfur dioxide"";""total sulfur dioxide"";""density"";""pH"";""sulphates"";""alcohol"";""quality"""
0,7;0.27;0.36;20.7;0.045;45;170;1.001;3;0.45;8.8;6
1,6.3;0.3;0.34;1.6;0.049;14;132;0.994;3.3;0.49;9...
2,8.1;0.28;0.4;6.9;0.05;30;97;0.9951;3.26;0.44;1...
3,7.2;0.23;0.32;8.5;0.058;47;186;0.9956;3.19;0.4...
4,7.2;0.23;0.32;8.5;0.058;47;186;0.9956;3.19;0.4...


In [4]:
raw_data.shape

(4898, 1)

Clearly we can see that our data is mismatched i.e. it is all shuffeled(disorganised).

So before going towards our algorithm we have to make the data appear good.

We will organise the column names i.e. lables or features of are data.

In [5]:
column_names = raw_data.columns

In [6]:
column_names

Index(['fixed acidity;"volatile acidity";"citric acid";"residual sugar";"chlorides";"free sulfur dioxide";"total sulfur dioxide";"density";"pH";"sulphates";"alcohol";"quality"'], dtype='object')

Above are the names of our features but still there are some unwanted things which we dont want in our data i.e (" , ;).

In [7]:
column_names_string = column_names[0]

In [8]:
column_names_string

'fixed acidity;"volatile acidity";"citric acid";"residual sugar";"chlorides";"free sulfur dioxide";"total sulfur dioxide";"density";"pH";"sulphates";"alcohol";"quality"'

In [9]:
#First remove  " from our labels by using replace command.
column_names_string = column_names_string.replace('"','')

In [10]:
column_names_string

'fixed acidity;volatile acidity;citric acid;residual sugar;chlorides;free sulfur dioxide;total sulfur dioxide;density;pH;sulphates;alcohol;quality'

In [11]:
# now we will split are labels by using split command
column_names_string = column_names_string.split(sep=';')


In [12]:
column_names_string

['fixed acidity',
 'volatile acidity',
 'citric acid',
 'residual sugar',
 'chlorides',
 'free sulfur dioxide',
 'total sulfur dioxide',
 'density',
 'pH',
 'sulphates',
 'alcohol',
 'quality']

In [13]:
#First remove  " from our labels by using replace command.
feature_values = list(map(lambda x: x.split(sep=';'),raw_data.iloc[:,0]))
#above expression will split the data of 0th column in all rows and make the list of our result


In [14]:
feature_values

[['7',
  '0.27',
  '0.36',
  '20.7',
  '0.045',
  '45',
  '170',
  '1.001',
  '3',
  '0.45',
  '8.8',
  '6'],
 ['6.3',
  '0.3',
  '0.34',
  '1.6',
  '0.049',
  '14',
  '132',
  '0.994',
  '3.3',
  '0.49',
  '9.5',
  '6'],
 ['8.1',
  '0.28',
  '0.4',
  '6.9',
  '0.05',
  '30',
  '97',
  '0.9951',
  '3.26',
  '0.44',
  '10.1',
  '6'],
 ['7.2',
  '0.23',
  '0.32',
  '8.5',
  '0.058',
  '47',
  '186',
  '0.9956',
  '3.19',
  '0.4',
  '9.9',
  '6'],
 ['7.2',
  '0.23',
  '0.32',
  '8.5',
  '0.058',
  '47',
  '186',
  '0.9956',
  '3.19',
  '0.4',
  '9.9',
  '6'],
 ['8.1',
  '0.28',
  '0.4',
  '6.9',
  '0.05',
  '30',
  '97',
  '0.9951',
  '3.26',
  '0.44',
  '10.1',
  '6'],
 ['6.2',
  '0.32',
  '0.16',
  '7',
  '0.045',
  '30',
  '136',
  '0.9949',
  '3.18',
  '0.47',
  '9.6',
  '6'],
 ['7',
  '0.27',
  '0.36',
  '20.7',
  '0.045',
  '45',
  '170',
  '1.001',
  '3',
  '0.45',
  '8.8',
  '6'],
 ['6.3',
  '0.3',
  '0.34',
  '1.6',
  '0.049',
  '14',
  '132',
  '0.994',
  '3.3',
  '0.49',
  '9.5

In [15]:
# now we will amke array of are values
feature_values = np.array(feature_values)

In [16]:
feature_values

array([['7', '0.27', '0.36', ..., '0.45', '8.8', '6'],
       ['6.3', '0.3', '0.34', ..., '0.49', '9.5', '6'],
       ['8.1', '0.28', '0.4', ..., '0.44', '10.1', '6'],
       ...,
       ['6.5', '0.24', '0.19', ..., '0.46', '9.4', '6'],
       ['5.5', '0.29', '0.3', ..., '0.38', '12.8', '7'],
       ['6', '0.21', '0.38', ..., '0.32', '11.8', '6']], dtype='<U16')

In [17]:
#now we will perform the same operation for the rest of our columns by making loop
#First we make the empty dictionary
D = {}

for i in range(0,12):
    
    D[column_names_string[i]] = feature_values[:,i]

In [18]:
D
#our data is almost ready to perform opertions.



{'fixed acidity': array(['7', '6.3', '8.1', ..., '6.5', '5.5', '6'], dtype='<U16'),
 'volatile acidity': array(['0.27', '0.3', '0.28', ..., '0.24', '0.29', '0.21'], dtype='<U16'),
 'citric acid': array(['0.36', '0.34', '0.4', ..., '0.19', '0.3', '0.38'], dtype='<U16'),
 'residual sugar': array(['20.7', '1.6', '6.9', ..., '1.2', '1.1', '0.8'], dtype='<U16'),
 'chlorides': array(['0.045', '0.049', '0.05', ..., '0.041', '0.022', '0.02'],
       dtype='<U16'),
 'free sulfur dioxide': array(['45', '14', '30', ..., '30', '20', '22'], dtype='<U16'),
 'total sulfur dioxide': array(['170', '132', '97', ..., '111', '110', '98'], dtype='<U16'),
 'density': array(['1.001', '0.994', '0.9951', ..., '0.99254', '0.98869', '0.98941'],
       dtype='<U16'),
 'pH': array(['3', '3.3', '3.26', ..., '2.99', '3.34', '3.26'], dtype='<U16'),
 'sulphates': array(['0.45', '0.49', '0.44', ..., '0.46', '0.38', '0.32'], dtype='<U16'),
 'alcohol': array(['8.8', '9.5', '10.1', ..., '9.4', '12.8', '11.8'], dtype='<U16

In [19]:
#lets make it a data frame and see how it looks
raw_data = pd.DataFrame(data=D)

In [20]:
raw_data.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45,170,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14,132,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30,97,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47,186,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47,186,0.9956,3.19,0.4,9.9,6


In [21]:
raw_data = raw_data.applymap(lambda x: float(x))

In [22]:
raw_data['quality'] = raw_data['quality'].apply(lambda x: int(x))

In [23]:
raw_data.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


Our data is ready to use in Naive Bayes Classifier

In [24]:
#lets see how many class we have in our data
number_of_unique_wine_scores = raw_data['quality'].unique()

In [25]:
number_of_unique_wine_scores

array([6, 5, 7, 8, 4, 3, 9], dtype=int64)

In [26]:
# clearly we can see that our data is multi class data.
#lets sort it in asscending order
number_of_unique_wine_scores = sorted(number_of_unique_wine_scores)

In [27]:
print(number_of_unique_wine_scores)

[3, 4, 5, 6, 7, 8, 9]


Lets seprate training data and testing data from our raw data

In [28]:
#training data = 75% of our raw data
training_data = raw_data.iloc[0:int(0.75*len(raw_data))]
#testing data = raw data - training_data
testing_data = raw_data.iloc[int(0.75*len(raw_data)):]

testing_data = pd.DataFrame(testing_data)

actual_class = testing_data['quality']

In [29]:
# we are making function to find out mean vector and covariance mattrix
def mean_vector_and_cov_mat(wine_score):
    
    wine_score_data = training_data[training_data['quality'] == wine_score]
    # we will drop class column from our training data
    dropped_wine_score_data = wine_score_data.drop(['quality'],axis=1)
    
    wine_score_data_array = np.array(dropped_wine_score_data)
    # below we are finding mean of features column's
    mean_vector = np.mean(wine_score_data_array,axis=0)
    #creating covariance matrix
    cov_mat = dropped_wine_score_data.cov()
    
    return [mean_vector,cov_mat]

In [30]:
natural_parameters = []

for i in range(0,7):
    
    natural_parameters.append(mean_vector_and_cov_mat(number_of_unique_wine_scores[i]))
   # our natural parameter is list which contains vector mean and covariance matrix of a feature . So there will be total 7 natural parameters. 

In [31]:
natural_parameters

[[array([7.72777778e+00, 3.41388889e-01, 3.40555556e-01, 6.10277778e+00,
         5.43888889e-02, 4.08611111e+01, 1.56111111e+02, 9.94860556e-01,
         3.17055556e+00, 4.62777778e-01, 1.03277778e+01]),
                        fixed acidity  volatile acidity  citric acid  \
  fixed acidity              3.137418          0.026106     0.044866   
  volatile acidity           0.026106          0.021464     0.002214   
  citric acid                0.044866          0.002214     0.006958   
  residual sugar             2.219624          0.054614    -0.077560   
  chlorides                  0.004618          0.004021     0.000914   
  free sulfur dioxide       -9.281209          0.105498    -2.211977   
  total sulfur dioxide      -1.270915          5.337778    -3.165065   
  density                    0.001965          0.000101    -0.000042   
  pH                        -0.278546         -0.006410    -0.003965   
  sulphates                 -0.000670          0.006849    -0.000578   
  a

In [32]:
#The above task can be acheived like below also without using loop
natural_parameters = list(map(lambda x: mean_vector_and_cov_mat(x),number_of_unique_wine_scores))

In [33]:
print(natural_parameters)

[[array([7.72777778e+00, 3.41388889e-01, 3.40555556e-01, 6.10277778e+00,
       5.43888889e-02, 4.08611111e+01, 1.56111111e+02, 9.94860556e-01,
       3.17055556e+00, 4.62777778e-01, 1.03277778e+01]),                       fixed acidity  volatile acidity  citric acid  \
fixed acidity              3.137418          0.026106     0.044866   
volatile acidity           0.026106          0.021464     0.002214   
citric acid                0.044866          0.002214     0.006958   
residual sugar             2.219624          0.054614    -0.077560   
chlorides                  0.004618          0.004021     0.000914   
free sulfur dioxide       -9.281209          0.105498    -2.211977   
total sulfur dioxide      -1.270915          5.337778    -3.165065   
density                    0.001965          0.000101    -0.000042   
pH                        -0.278546         -0.006410    -0.003965   
sulphates                 -0.000670          0.006849    -0.000578   
alcohol                   -0.

$ prior \ probability = \frac{length \ of\ x\ wine \ score \ data}{length\  of\ total\ training \ data}$

In [34]:
# now we will calculate prior probabilities of all the class's by using lambda function.

prior_class_probabilities = list(map(lambda wine_score: len(training_data[training_data['quality'] == wine_score])/len(training_data),
                                     number_of_unique_wine_scores))

In [35]:
prior_class_probabilities

[0.004900626191124422,
 0.03593792540157909,
 0.30057173972229784,
 0.429893819765859,
 0.186223795262728,
 0.04111080860332154,
 0.001361285053090117]

In [36]:
# zip function is used to combine two list of same length
# dict function is used make dictionary
D = dict(zip(number_of_unique_wine_scores,natural_parameters))

In [37]:
for k,p_cap in zip(D.keys(),prior_class_probabilities):
    
    D[k].append(p_cap)
    

this for loop is apending the prior probabilities  to the dictionary i.e now our dictionary looks like
    $ ( \mu , \sum , P(x) ) $

In [38]:
D[3]

[array([7.72777778e+00, 3.41388889e-01, 3.40555556e-01, 6.10277778e+00,
        5.43888889e-02, 4.08611111e+01, 1.56111111e+02, 9.94860556e-01,
        3.17055556e+00, 4.62777778e-01, 1.03277778e+01]),
                       fixed acidity  volatile acidity  citric acid  \
 fixed acidity              3.137418          0.026106     0.044866   
 volatile acidity           0.026106          0.021464     0.002214   
 citric acid                0.044866          0.002214     0.006958   
 residual sugar             2.219624          0.054614    -0.077560   
 chlorides                  0.004618          0.004021     0.000914   
 free sulfur dioxide       -9.281209          0.105498    -2.211977   
 total sulfur dioxide      -1.270915          5.337778    -3.165065   
 density                    0.001965          0.000101    -0.000042   
 pH                        -0.278546         -0.006410    -0.003965   
 sulphates                 -0.000670          0.006849    -0.000578   
 alcohol         

In [39]:
pooled_cov = 0

for i in [3,4,5,6,7,8,9]:
    
    pooled_cov += D[i][1]*(len(training_data[training_data['quality'] == i])-1)
    
pooled_cov = pooled_cov/(len(training_data)-7)

pooled_cov = pooled_cov + np.random.normal(loc=1,scale=0.25,size=(11,11))

In [40]:
def Naive_Bayes_Classifier_Result(wine_features):
    
    numerators = list(map(lambda wine_score: ((s.multivariate_normal.pdf(x=wine_features,
                                                          mean=D[wine_score][0],cov=pooled_cov))*D[wine_score][2]),D.keys()))
    
    numerators = np.array(numerators)
    
    posterior_class_probabilities = list(map(lambda x: numerators[x]/np.sum(numerators),[0,1,2,3,4,5,6]))
    
    return np.argmax(posterior_class_probabilities)

In [41]:
testing_data.drop(labels=['quality'],axis=1,inplace=True)

In [42]:
accuracy = 0

for i in range(0,len(testing_data)):
    
    predicted_class = Naive_Bayes_Classifier_Result(testing_data.iloc[i,:])
    
    if predicted_class == actual_class.iloc[i]:
        
        accuracy += 1

ValueError: the input matrix must be positive semidefinite

In [None]:
training_data.var()