In [1]:
from IPython.display import HTML,display
display(HTML("""
    <style>
        .container {
            font-size: 1.5rem;
        }
    </style>
"""))

# Outline

- [Necessary packages](#necessary_packages)
- [The implementation](#the_implementation)
- [Data preparation](#data_preparation)
- [Training](#training)
- [Testing & evaluation](#testing_and__evaluation)

## About the dataset

link : https://www.kaggle.com/datasets/iabhishekofficial/mobile-price-classification

<a id="necessary_packages"></a>

## Necessary packages

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

<a id="the_implementation" ></a>

## The implementation

In [3]:
class MixteNaiveBayes:
    
    def __init__(self, categorical):
        self.model = None
        self.categorical = categorical
        self.classes = None
        
    def fit(self, X, Y):
        
        self.model = []
        self.classes, counts = np.unique(Y, return_counts = True)
        
        for i in range(X.shape[1]):
                        
            if i in self.categorical:
                p = self.__handle_categorical_attribute__(X[:,i], Y)
            else:
                p = self.__handle_numerical_attribute__(X[:,i], Y)
                
            self.model.append(p)
        
        probabilities = dict(zip(self.classes, counts / counts.sum()))
        self.model.append(probabilities)
        
            
    def predict(self, X):
        
        if self.model is None:
            raise Exception("fit was not called")
        
        y_hat = []
        
        for x in X:
            probailities = []
            for group in self.classes:
                p = 1
                for i in range(X.shape[1]):
                    if i in self.categorical:
                        p = p * self.model[i][group][x[i]]
                    else:
                        std = self.model[i][group]['std']
                        mean = self.model[i][group]['mean']
                        p = p * self.__gauss__(mean, std, x[i])
                p = p * self.model[-1][group]
                probailities.append(p)
            probailities = np.array(probailities)
            probailities = probailities / probailities.sum()
            prediction = self.classes[np.argmax(probailities)]
            y_hat.append(prediction)
            
        y_hat = np.array(y_hat)
        return y_hat
        
    def __gauss__(self, mean, std, x):
        return np.exp(-0.5 * ((x - mean) / std) ** 2) / (np.sqrt(2 * np.pi) * std)
            
    def __handle_categorical_attribute__(self, column, Y):
        
        probabilities = {}
        values = np.unique(column)
        
        for group in self.classes:
            mask = Y == group
            _Y = Y[mask]
            _column = column[mask]
            probabilities[group] = {}
            for value in values:
                probabilities[group][value] = _Y[_column == value].shape[0] / _Y.shape[0]
                
        return probabilities
    
    def __handle_numerical_attribute__(self, column, Y):
        
        probabilities = {}
        
        for group in self.classes:
            
            _column = column[Y == group]
            
            mean = np.mean(_column)
            std = np.std(_column, ddof=1)
            
            probabilities[group] = {
                'mean': mean,
                'std': std
            }
            
        return probabilities

<a id="data_preparation" ></a>

## Data preparation

### Load the data

In [4]:
path = os.path.join('data', 'train.csv')
data = pd.read_csv(path)

### Get familiar with the data

In [5]:
data.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1


In [6]:
data.tail()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
1995,794,1,0.5,1,0,1,2,0.8,106,6,...,1222,1890,668,13,4,19,1,1,0,0
1996,1965,1,2.6,1,0,0,39,0.2,187,4,...,915,1965,2032,11,10,16,1,1,1,2
1997,1911,0,0.9,1,1,1,36,0.7,108,8,...,868,1632,3057,9,1,5,1,1,0,3
1998,1512,0,0.9,0,4,1,46,0.1,145,5,...,336,670,869,18,10,19,1,1,1,0
1999,510,1,2.0,1,5,1,45,0.9,168,6,...,483,754,3919,19,4,2,1,1,1,3


In [7]:
data.shape

(2000, 21)

### Check some basic information about the features

In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   battery_power  2000 non-null   int64  
 1   blue           2000 non-null   int64  
 2   clock_speed    2000 non-null   float64
 3   dual_sim       2000 non-null   int64  
 4   fc             2000 non-null   int64  
 5   four_g         2000 non-null   int64  
 6   int_memory     2000 non-null   int64  
 7   m_dep          2000 non-null   float64
 8   mobile_wt      2000 non-null   int64  
 9   n_cores        2000 non-null   int64  
 10  pc             2000 non-null   int64  
 11  px_height      2000 non-null   int64  
 12  px_width       2000 non-null   int64  
 13  ram            2000 non-null   int64  
 14  sc_h           2000 non-null   int64  
 15  sc_w           2000 non-null   int64  
 16  talk_time      2000 non-null   int64  
 17  three_g        2000 non-null   int64  
 18  touch_sc

### Check some statistical properties about the dataset

In [9]:
data.describe()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
count,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,...,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0
mean,1238.5185,0.495,1.52225,0.5095,4.3095,0.5215,32.0465,0.50175,140.249,4.5205,...,645.108,1251.5155,2124.213,12.3065,5.767,11.011,0.7615,0.503,0.507,1.5
std,439.418206,0.5001,0.816004,0.500035,4.341444,0.499662,18.145715,0.288416,35.399655,2.287837,...,443.780811,432.199447,1084.732044,4.213245,4.356398,5.463955,0.426273,0.500116,0.500076,1.118314
min,501.0,0.0,0.5,0.0,0.0,0.0,2.0,0.1,80.0,1.0,...,0.0,500.0,256.0,5.0,0.0,2.0,0.0,0.0,0.0,0.0
25%,851.75,0.0,0.7,0.0,1.0,0.0,16.0,0.2,109.0,3.0,...,282.75,874.75,1207.5,9.0,2.0,6.0,1.0,0.0,0.0,0.75
50%,1226.0,0.0,1.5,1.0,3.0,1.0,32.0,0.5,141.0,4.0,...,564.0,1247.0,2146.5,12.0,5.0,11.0,1.0,1.0,1.0,1.5
75%,1615.25,1.0,2.2,1.0,7.0,1.0,48.0,0.8,170.0,7.0,...,947.25,1633.0,3064.5,16.0,9.0,16.0,1.0,1.0,1.0,2.25
max,1998.0,1.0,3.0,1.0,19.0,1.0,64.0,1.0,200.0,8.0,...,1960.0,1998.0,3998.0,19.0,18.0,20.0,1.0,1.0,1.0,3.0


### Check if the dataset is balanced

In [10]:
data['price_range'].value_counts()

price_range
1    500
2    500
3    500
0    500
Name: count, dtype: int64

<a id="training" ></a>

## Training

In [20]:
def split_dataset(ds, frqs, target):
    ds = ds.sample(frac = 1)
    features = ds.columns[ds.columns != target]
    indices = (frqs * ds.shape[0]).astype(np.int64)
    train_set, test_set = np.split(ds, indices_or_sections = indices)
    X_train, Y_train = train_set[features], train_set[target]
    X_test, Y_test = test_set[features], test_set[target]
    return X_train.to_numpy(), Y_train.to_numpy(),X_test.to_numpy(), Y_test.to_numpy()

In [21]:
X_train, Y_train,X_test, Y_test = split_dataset(ds = data, frqs = np.array([0.8]), target = 'price_range')
print(X_train.shape)
print(Y_train.shape)
print(X_test.shape)
print(Y_test.shape)

(1600, 20)
(1600,)
(400, 20)
(400,)


In [22]:
categorical = []
for i in range(X_train.shape[1]):
    if np.unique(X_train[:,i]).shape[0] <= 8:
        categorical.append(i)
categorical

[1, 3, 5, 9, 17, 18, 19]

In [23]:
model = MixteNaiveBayes(categorical = categorical)

In [24]:
model.fit(X_train, Y_train)

<a id="testing_and__evaluation" ></a>

## Testing and evaluation

In [25]:
y_hat = model.predict(X_train)

In [26]:
(y_hat == Y_train).mean()

0.803125

In [27]:
y_hat_test = model.predict(X_test)

In [28]:
(y_hat_test == Y_test).mean()

0.82