# Naive Bayes

In [33]:
import pandas as pd
import numpy as np

# Locations of the training and validation CSV files.
train_path = "data_train.csv"
validation_path = "data_validation.csv"

df_train = pd.read_csv(train_path)
df_validation = pd.read_csv(validation_path)

# Features handled as categorical values (smoothed frequency estimates).
nominal_columns = [
    'blue',
    'dual_sim',
    'four_g',
    'three_g',
    'touch_screen',
    'wifi',
]

# Features handled as continuous values (Gaussian likelihoods).
ratio_columns = [
    'battery_power',
    'clock_speed',
    'fc',
    'int_memory',
    'm_dep',
    'mobile_wt',
    'n_cores',
    'pc',
    'px_height',
    'px_width',
    'ram',
    'sc_h',
    'sc_w',
    'talk_time',
]

# Separate the feature columns from the 'price_range' label for both splits.
x_train = df_train.drop(['price_range'], axis=1)
y_train = df_train['price_range']

x_validation = df_validation.drop(['price_range'], axis=1)
y_validation = df_validation['price_range']


In [34]:
from pandas import DataFrame, Series
import json
import math

class NaiveBayesClassifier:
    """Naive Bayes classifier for a mix of continuous and categorical features.

    Continuous ("ratio") features are scored with a per-class Gaussian density;
    categorical ("nominal") features are scored with per-class relative
    frequencies, optionally smoothed. The label column is always named
    'price_range'.

    Attributes set by ``fit`` / ``fit_from_json``:
        means, vars: per-class mean and variance of every feature.
        prior: per-class relative frequency.
        classes: the distinct class labels.
        result_dict: ``{str(class): {column: {value_key: probability}}}``.
        nominal_unique_counts: number of distinct values per nominal column.
    """
    df_train: DataFrame

    # Key in each per-(class, column) table holding the probability used for a
    # category value that has no entry of its own.
    _UNSEEN_KEY = '__unseen__'
    # Floor for the standard deviation so a constant feature (variance 0) or an
    # undefined variance (NaN) cannot produce NaN or a division by zero.
    _MIN_STD = 1e-9

    def __init__(self, smoothing: bool = True, nominal_columns=None, ratio_columns=None):
        """
            Parameters:
                smoothing (bool) : Whether categorical probabilities are smoothed
                nominal_columns (iterable of str, optional) : Names of the categorical
                    features. When omitted, the module-level `nominal_columns` is used.
                ratio_columns (iterable of str, optional) : Names of the continuous
                    features. When omitted, the module-level `ratio_columns` is used.
        """
        self.is_smoothing = smoothing
        self.smoothing_factor = 2  # Additive smoothing factor
        self.result_dict = {}
        self._nominal_override = None if nominal_columns is None else list(nominal_columns)
        self._ratio_override = None if ratio_columns is None else list(ratio_columns)

    def _get_nominal_columns(self):
        """Return the categorical column names (constructor value or module default)."""
        override = getattr(self, '_nominal_override', None)
        return override if override is not None else nominal_columns

    def _get_ratio_columns(self):
        """Return the continuous column names (constructor value or module default)."""
        override = getattr(self, '_ratio_override', None)
        return override if override is not None else ratio_columns

    def _effective_smoothing(self):
        """Return the additive smoothing term, or 0 when smoothing is disabled."""
        return self.smoothing_factor if self.is_smoothing else 0

    @staticmethod
    def _value_key(value):
        """
            Map a category value to the string key used in `result_dict`.

            Integral numbers collapse to the same key whatever their type
            (1, 1.0, numpy integers and "1" all give "1"), so lookups keep working
            after a JSON round trip, where every key becomes a string.
        """
        try:
            as_float = float(value)
        except (TypeError, ValueError):
            return str(value)
        if as_float.is_integer():
            return str(int(as_float))
        return str(value)

    # Gaussian Probability
    def _calculate_gaussian_pdf(self, feature, mean, std):
        """
            Return the Gaussian density of `feature` for the given mean and
            standard deviation. A standard deviation that is zero, negative or
            NaN is replaced by a small positive floor.
        """
        if not std > self._MIN_STD:
            std = self._MIN_STD
        exponent = -((feature - mean) ** 2) / (2 * std ** 2)
        return math.exp(exponent) / (math.sqrt(2 * math.pi) * std)

    def _calculate_probabilities(self):
        """
            Fill `result_dict` with the conditional probability of every observed
            (class, categorical value) pair, plus the probability to use for a
            value that has no entry of its own.
        """
        alpha = self._effective_smoothing()
        nominal = self._get_nominal_columns()
        class_counts = self.df_train.groupby('price_range').size()

        for feature_column in self.df_train.columns.drop('price_range'):
            if feature_column not in nominal:
                continue

            n_categories = self.df_train[feature_column].nunique()

            # Every class gets a table, even for values it never takes.
            for cls, class_count in class_counts.items():
                table = self.result_dict.setdefault(str(cls), {}).setdefault(feature_column, {})
                table[self._UNSEEN_KEY] = float(alpha / (class_count + alpha * n_categories))

            occurrences = self.df_train.groupby(['price_range', feature_column]).size()
            for (cls, value), occurrence in occurrences.items():
                denominator = class_counts.loc[cls] + alpha * n_categories
                self.result_dict[str(cls)][feature_column][self._value_key(value)] = \
                    float((occurrence + alpha) / denominator)

    def _convert_keys_to_string(self, dictionary):
        """
            Utility that recursively converts dictionary keys to strings
        """
        if isinstance(dictionary, dict):
            return {str(key): self._convert_keys_to_string(value) for key, value in dictionary.items()}
        return dictionary

    def _print_to_json(self):
        """
            Save the fitted model as JSON files in the current working directory
        """
        self.means.to_json('means.json', indent=4)
        self.vars.to_json('variance.json', indent=4)
        self.prior.to_json('prior.json', indent=4)
        with open('result_dict.json', 'w') as json_file:
            json.dump(self._convert_keys_to_string(self.result_dict), json_file, indent=4)
        with open('nominal_unique_counts.json', 'w') as file:
            json.dump(self.nominal_unique_counts, file, indent=4)

    def fit(self, x: DataFrame, y: Series, save: bool = False):
        """
        Train the model on the given data. The caller's `x` is left untouched.
            Parameters:
                x (Dataframe) : Training data without the label column
                y (Series) : Labels for the rows of `x`
                save (bool) : When true, also write the fitted model to JSON files
        """
        # Work on a copy so the label column is not added to the caller's frame.
        training = x.copy()
        training['price_range'] = y
        self.df_train = training

        grouped = training.groupby('price_range')
        self.means = grouped.mean()
        self.vars = grouped.var()
        self.prior = grouped.size() / len(training)
        self.classes = np.unique(y)
        # Only count nominal columns that are actually present in the data.
        self.nominal_unique_counts = {
            col: int(training[col].nunique())
            for col in self._get_nominal_columns()
            if col in training.columns
        }
        self.result_dict = {}
        self._calculate_probabilities()

        if save:
            self._print_to_json()

    def fit_from_json(self, means_file: str, variance_file: str, prior_file: str, result_dict_file: str, nominal_unique_counts_file:str):
        """
            Configure the model from previously saved JSON files
            Parameters:
                means_file (String) : Path of the per-class means
                variance_file (String) : Path of the per-class variances
                prior_file (String) : Path of the per-class prior probabilities
                result_dict_file (String) : Path of the categorical probability tables
                nominal_unique_counts_file (String) : Path of the distinct-value count
                    of each nominal column
        """
        self.means = pd.read_json(means_file)
        self.vars = pd.read_json(variance_file)
        self.prior = pd.read_json(prior_file, typ='series')
        with open(result_dict_file, 'r') as file:
            loaded = json.load(file)
        # Normalise the keys so they match the keys `predict` looks up.
        self.result_dict = {
            str(cls): {
                col: {self._value_key(key): prob for key, prob in table.items()}
                for col, table in columns.items()
            }
            for cls, columns in loaded.items()
        }
        with open(nominal_unique_counts_file, 'r') as file:
            self.nominal_unique_counts = json.load(file)
        self.classes = self.means.index.unique()

    def predict(self, to_predict: DataFrame):
        """
            Predict the label of every row of the given data
            Parameters:
                to_predict (Dataframe) : Rows to classify
            Returns:
                predictions (list) : Predicted class label for each row, in row order
        """
        nominal = self._get_nominal_columns()
        ratio = self._get_ratio_columns()
        alpha = self._effective_smoothing()

        predictions = []
        for _, instance in to_predict.iterrows():
            class_likelihood = []
            for cls in self.classes:
                prior = self.prior.loc[cls]
                feature_likelihood = [prior]
                for col in to_predict.columns:
                    data = instance[col]
                    if col in ratio:
                        std = np.sqrt(self.vars[col].loc[cls])
                        feature_likelihood.append(
                            self._calculate_gaussian_pdf(data, self.means[col].loc[cls], std))
                    elif col in nominal:
                        table = self.result_dict[str(cls)][col]
                        key = self._value_key(data)
                        if key in table:
                            likelihood = table[key]
                        elif self._UNSEEN_KEY in table:
                            likelihood = table[self._UNSEEN_KEY]
                        else:
                            # Tables saved without an unseen-value entry: keep the
                            # earlier estimate based on the size of `to_predict`.
                            likelihood = alpha / (
                                prior * len(to_predict) + alpha * self.nominal_unique_counts[col])
                        feature_likelihood.append(likelihood)

                class_likelihood.append(np.prod(feature_likelihood))
            # Map the winning position back to the class label itself.
            predictions.append(self.classes[int(np.argmax(class_likelihood))])
        return predictions

In [64]:
from sklearn.metrics import accuracy_score
# Candidate feature subset; it is defined here but not applied in this cell.
selected_features = [
    'three_g',
    'battery_power',
    'fc',
    'int_memory',
    'mobile_wt',
    'px_height',
    'px_width',
    'ram',
    'sc_h',
    'sc_w',
]

model = NaiveBayesClassifier(True)
# fit() trains the model in place and has no return value, so nothing is bound.
# The third argument asks fit() to also save the fitted model as JSON files.
model.fit(x_train, y_train, True)
# A previously saved model can be loaded instead of retraining with
# model.fit_from_json('means.json', 'variance.json', 'prior.json',
#                     'result_dict.json', 'nominal_unique_counts.json').
results = model.predict(x_validation)

accuracy = accuracy_score(y_validation, results)
print(accuracy)

0.7816666666666666
