In [248]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [249]:
data = pd.read_csv("../datasets/cars.csv")
data.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,acceptability
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


In [250]:
atribute_names = ['buying','maint','doors','persons','lug_boot','safety']
objective_names = ['acceptability']

In [365]:
def train_NB(data:pd.DataFrame, x_names:list, y_names:list, k=1.0):
    groups = data.groupby(y_names)
    classes = list(groups.indices.keys())
    numClasses = len(classes)
    possible_values = {a: data[a].unique() for a in x_names}
    
    probabilities = dict()
    for c in classes:
                        # Condition prevents warnings when classes are (not) tuples
        g = groups.get_group(c if type(c) == tuple else (c,)) #For each class we calc
        probabilities[c] = {}
        probabilities[c][c] = int(groups.size()[c])/sum(groups.size()) #Probability of that class
        for a in x_names:
            probabilities[c][a] = (g[a].value_counts().reindex(possible_values[a],fill_value=0) + k)/(g.size +k*len(possible_values[a])) #Conditioned probabilities for that class and each attribute value
   
    return probabilities

def predict(probabilities: dict, example: pd.Series):
    preds = {}
    for c in list(probabilities.keys()):
        pred = np.log(probabilities[c][c])
        for a in list(example.keys()):
            pred += np.log(probabilities[c][a][example[a]])
        preds[c] = float(pred)
    return preds,max(preds,key=lambda x: preds[x])

In [366]:
probabilities = train_NB(data,atribute_names,objective_names)

In [367]:
example = data.iloc[0,0:-1]

In [368]:
predict(probabilities,example)        

({'acc': -31.240123697577744,
  'good': -34.480400510117896,
  'unacc': -18.229669892846225,
  'vgood': -37.65019558919418},
 'unacc')

In [380]:
class NaiveBayesClassifier:
    def __init__(self,k):
        self.k = k
        self.probabilities = {}
    
    def fit(self,dataf:pd.DataFrame, x_names:list, y_names:list):
        groups = dataf.groupby(y_names)
        self.classes = list(groups.indices.keys())
        self.numClasses = len(classes)
        self.possible_values = {a: dataf[a].unique() for a in x_names}
        
        for c in self.classes:
                            # Condition prevents warnings when classes are (not) tuples
            g = groups.get_group(c if type(c) == tuple else (c,)) #For each class we calc
            self.probabilities[c] = {}
            self.probabilities[c][c] = int(groups.size()[c])/sum(groups.size()) #Probability of that class
            for a in x_names:
               self.probabilities[c][a] = (g[a].value_counts().reindex(self.possible_values[a],fill_value=0) + self.k) \
                     /(g.size +self.k*len(self.possible_values[a]))
                    #Conditioned probabilities for that class and each attribute value

    def predict(self,example:pd.Series):
            preds = {}
            for c in list(probabilities.keys()):
                pred = np.log(probabilities[c][c]) #Probability of a class. We use logs to prevent underflow[log(a)+log(b)=log(ab)]
                for a in list(example.keys()):
                    pred += np.log(probabilities[c][a][example[a]]) #Conditional probability of tha atribute value for that class
                preds[c] = float(pred)
            return preds,max(preds,key=lambda x: preds[x])

In [381]:
classifier = NaiveBayesClassifier(k=1)

classifier.fit(dataf=data, x_names=atribute_names, y_names=objective_names)

In [382]:
classifier.predict(example)

({'acc': -31.240123697577744,
  'good': -34.480400510117896,
  'unacc': -18.229669892846225,
  'vgood': -37.65019558919418},
 'unacc')