In [30]:
import pandas as pd
from collections import defaultdict
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

data = pd.read_csv('car_evaluation.csv')

In [36]:
data.head()

Unnamed: 0,vhigh,vhigh.1,2,2.1,small,low,unacc
0,vhigh,vhigh,2,2,small,med,unacc
1,vhigh,vhigh,2,2,small,high,unacc
2,vhigh,vhigh,2,2,med,low,unacc
3,vhigh,vhigh,2,2,med,med,unacc
4,vhigh,vhigh,2,2,med,high,unacc


In [60]:
X = data.drop("unacc", axis=1)
y = data["unacc"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=100)

In [39]:
X_train

Unnamed: 0,vhigh,vhigh.1,2,2.1,small,low
107,vhigh,high,2,2,small,low
900,med,vhigh,3,4,small,med
1708,low,low,5more,2,big,high
705,high,med,4,2,med,med
678,high,med,3,2,med,med
...,...,...,...,...,...,...
1130,med,med,3,more,big,low
1294,med,low,5more,more,big,high
860,high,low,5more,more,big,low
1459,low,high,4,2,small,high


In [52]:
class NaiveBayesClassifier:
    def __init__(self):
        self.class_probabilities = defaultdict(int)
        self.feature_value_probs_given_class = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))
    
    def fit(self, X, y):
        total_samples = len(y)
        unique_classes = y.unique()
        
        # class probabilities
        for cls in unique_classes:
            self.class_probabilities[cls] = (y[y == cls].count()) / total_samples
        
        # Calculate feature probabilities given class
        for cls in unique_classes:
            class_samples = X[y == cls]
            total_samples_in_class = len(class_samples)
            
            for feature in X.columns:
                for value in X[feature].unique():
                    value_count = class_samples[feature][class_samples[feature] == value].count()
                    self.feature_value_probs_given_class[cls][feature][value] = value_count / total_samples_in_class
    
    def predict(self, X):
        predictions = []
        for _, row in X.iterrows():
            class_scores = {}
            for cls, class_prob in self.class_probabilities.items():
                likelihood = 1.0
                for feature, value in row.items():
                    likelihood *= self.feature_value_probs_given_class[cls][feature][value]
                class_scores[cls] = class_prob * likelihood
            predicted_class = max(class_scores, key=class_scores.get)
            predictions.append(predicted_class)
        return predictions

In [61]:
naive_bayes = NaiveBayesClassifier()
naive_bayes.fit(X_train, y_train)

In [62]:
y_pred = naive_bayes.predict(X_test)

In [63]:
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.8458574181117534


In [66]:
naive_bayes.class_probabilities

defaultdict(int,
            {'unacc': 0.6928807947019867,
             'acc': 0.23096026490066227,
             'good': 0.03890728476821192,
             'vgood': 0.037251655629139076})

In [67]:
naive_bayes.feature_value_probs_given_class

defaultdict(<function __main__.NaiveBayesClassifier.__init__.<locals>.<lambda>()>,
            {'unacc': defaultdict(<function __main__.NaiveBayesClassifier.__init__.<locals>.<lambda>.<locals>.<lambda>()>,
                         {'vhigh': defaultdict(int,
                                      {'high': 0.2712066905615293,
                                       'low': 0.21863799283154123,
                                       'vhigh': 0.2951015531660693,
                                       'med': 0.21505376344086022}),
                          'vhigh.1': defaultdict(int,
                                      {'low': 0.22341696535244923,
                                       'vhigh': 0.2986857825567503,
                                       'med': 0.22461170848267623,
                                       'high': 0.2532855436081243}),
                          '2': defaultdict(int,
                                      {'2': 0.2855436081242533,
                                  