# Naive Bayes

In [1]:
from typing import List, Dict
from enum import Enum
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
import generate_random_data

In [2]:
class GirlType(Enum):
    WIFEY_MATERIAL = "Wifey Material"
    PROFESSIONAL_PARTNER = "Professional Partner"
    SEXUALLY_ENTICING = "Sexually Enticing"
    STRANGER = "Stranger"

In [34]:
df = generate_random_data.get_naive_bayes_data(1250)
df = df.sample(frac=1).reset_index(drop=True)
df.sample(10)

Unnamed: 0,Facial Attractiveness,Body Attractiveness,Professional Scale,Feminime Energy Scale,Attitude Scale,Girl Type
1221,Beautiful,Normal,Professional,Masculine,Nice,Professional Partner
958,Mid,Hot,Professional,Feminime,Nice,Professional Partner
375,Ugly,Not Ideal,Professional,Feminime,Normal,Stranger
1197,Mid,Hot,Professional,Masculine,Nice,Professional Partner
285,Mid,Not Ideal,Professional,Masculine,Nice,Professional Partner
704,Beautiful,Hot,Not Professional,Feminime,Normal,Sexually Enticing
1188,Mid,Normal,Professional,Masculine,Nice,Professional Partner
857,Ugly,Not Ideal,Professional,Masculine,Not Nice,Stranger
629,Mid,Not Ideal,Not Professional,Feminime,Normal,Stranger
509,Beautiful,Normal,Professional,Feminime,Nice,Wifey Material


In [35]:
from sklearn.naive_bayes import CategoricalNB
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder

X = df[df.columns[:-1]]
y = df[df.columns[-1]]
X_encoder = OrdinalEncoder()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

sklearn_model = CategoricalNB()
sklearn_model.fit(X_encoder.fit_transform(X_train), y_train)

y_predict = sklearn_model.predict(X_encoder.fit_transform(X_test))
sum(y_test == y_predict) / len(y_test)

0.94

In [37]:
from collections import defaultdict
import math

class MyCategoricalNB:
    def __init__(self):
        self.n_dataset = 0
        self.feature_category_counts = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: 1))) 
        self.class_counts = defaultdict(lambda: 1)
        self.p_Xvalue_given_c = defaultdict(int)

    def fit(self, X, y):
        y = y.reset_index(drop=True)
        self.n_dataset = len(y)
        for i in range(self.n_dataset):
            for j, Xj in enumerate(X[i]):
                self.feature_category_counts[y[i]][j][Xj] += 1
            self.class_counts[y[i]] += 1
    
    def predict(self, X_test):
        y_predictions = []
        for X in X_test:
            for girl_type, features in self.feature_category_counts.items():
                for Xj, Xji in enumerate(X):
                    self.p_Xvalue_given_c[girl_type] += math.log(features[Xj][Xji] / self.class_counts[girl_type])
                self.p_Xvalue_given_c[girl_type] += math.log(self.class_counts[girl_type] / (self.n_dataset + len(self.class_counts)))
            prediction = max(self.p_Xvalue_given_c, key=self.p_Xvalue_given_c.get)
            y_predictions.append(prediction)
            self.p_Xvalue_given_c.clear()
        return y_predictions
    

my_model = MyCategoricalNB()
my_model.fit(X_encoder.fit_transform(X_train), y_train)
y_predict = my_model.predict(X_encoder.fit_transform(X_test))
print(sum(y_test == y_predict) / len(y_test), "Accuracy")
wrong_class_idx = y_test != y_predict 
wrong_class = X_test[wrong_class_idx]
wrong_class["Actual Girl Type"] = y_test[wrong_class_idx]
wrong_class["Predicted Girl Type"] = pd.Series(y_predict, index=wrong_class_idx.index).loc[wrong_class_idx]
wrong_class.sort_values("Actual Girl Type")

0.94 Accuracy


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  wrong_class["Actual Girl Type"] = y_test[wrong_class_idx]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  wrong_class["Predicted Girl Type"] = pd.Series(y_predict, index=wrong_class_idx.index).loc[wrong_class_idx]


Unnamed: 0,Facial Attractiveness,Body Attractiveness,Professional Scale,Feminime Energy Scale,Attitude Scale,Actual Girl Type,Predicted Girl Type
107,Mid,Hot,Professional,Feminime,Nice,Professional Partner,Wifey Material
530,Mid,Hot,Professional,Feminime,Normal,Professional Partner,Sexually Enticing
286,Beautiful,Hot,Not Professional,Feminime,Nice,Sexually Enticing,Wifey Material
620,Beautiful,Normal,Not Professional,Masculine,Nice,Sexually Enticing,Wifey Material
38,Beautiful,Hot,Professional,Feminime,Nice,Sexually Enticing,Wifey Material
779,Beautiful,Hot,Not Professional,Feminime,Nice,Sexually Enticing,Wifey Material
432,Beautiful,Hot,Not Professional,Feminime,Nice,Sexually Enticing,Wifey Material
1176,Beautiful,Hot,Professional,Feminime,Nice,Sexually Enticing,Wifey Material
628,Mid,Hot,Professional,Feminime,Nice,Sexually Enticing,Wifey Material
935,Ugly,Normal,Professional,Feminime,Normal,Stranger,Professional Partner
