## READING DATASET

In [22]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv('datasets/adult.csv')
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


### DROPPING unnecessary COLUMNS

In [3]:
col = ['race', 'sex', 'education.num','marital.status','relationship']
data.drop(columns=col,inplace=True)
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,occupation,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,?,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,Exec-managerial,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,?,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,Machine-op-inspct,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,Prof-specialty,0,3900,40,United-States,<=50K


### Handling MISSING VALUES BY ENCODING THEM

```
After one hot encoding if all the values are false that means it was a missing value
and the model will handle it as a separate category
```

In [9]:
y = data['income']
X = data.drop(columns=['income'])


x_encoded = pd.get_dummies(X, drop_first=True)
x_encoded.head()


Unnamed: 0,age,fnlwgt,capital.gain,capital.loss,hours.per.week,workclass_Federal-gov,workclass_Local-gov,workclass_Never-worked,workclass_Private,workclass_Self-emp-inc,...,native.country_Portugal,native.country_Puerto-Rico,native.country_Scotland,native.country_South,native.country_Taiwan,native.country_Thailand,native.country_Trinadad&Tobago,native.country_United-States,native.country_Vietnam,native.country_Yugoslavia
0,90,77053,0,4356,40,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
1,82,132870,0,4356,18,False,False,False,True,False,...,False,False,False,False,False,False,False,True,False,False
2,66,186061,0,4356,40,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
3,54,140359,0,3900,40,False,False,False,True,False,...,False,False,False,False,False,False,False,True,False,False
4,41,264663,0,3900,40,False,False,False,True,False,...,False,False,False,False,False,False,False,True,False,False


### SPLITTING DATASET

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
x_train, x_test, y_train, y_test = train_test_split(x_encoded, y, test_size=0.3, random_state=42)
x_test,x_val, y_test, y_val = train_test_split(x_test, y_test, test_size=0.5, random_state=42)

print("x_train shape:", x_train.shape)
print("x_test shape:", x_test.shape)
print("x_val shape:", x_val.shape)

print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)
print("y_val shape:", y_val.shape)

x_train shape: (22792, 83)
x_test shape: (4884, 83)
x_val shape: (4885, 83)
y_train shape: (22792,)
y_test shape: (4884,)
y_val shape: (4885,)


###  Analyzing class distributions and feature-target relationships

In [21]:
# Still working on the analysis of class distributions and feature-target relationships
# Asked the TA for clarification on this part
# Waiting for his response before proceeding further

## BUILDING THE MODEL

In [None]:
class naive_bayes:
    def __init__(self,alpha=1.0):
        self.alpha=alpha
        self.class_priors={}
        self.likelihoods={}
        self.classes=None
        
    def fit(self,x,y):
        class_count = y.value_counts().to_dict()
        self.classes = y.unique()
        
        
        for c in self.classes:
            num_samples_in_class = class_count.get(c, 0)
            # Calculating class priors  RULE : P(C) = (count(C) + alpha) / (total_samples + alpha * num_classes)
            self.class_priors[c] = (num_samples_in_class + self.alpha) / (len(y) + self.alpha * len(self.classes))
            
            # Calculating likelihoods  RULE : P(F|C) = (count(F,C) + alpha) / (count(C) + alpha * num_feature_values)
            x_c = x[y == c]
            feature_likelihoods = {}
            
            for feature in x.columns:
                # getting counts of feature values given class c
                feature_counts = x_c[feature].value_counts().to_dict()
                # getting number of unique values for the feature
                num_feature_values = x[feature].nunique()
                likelihoods = {}
                
                for feature_value in x[feature].unique():
                    likelihoods[feature_value] = (feature_counts.get(feature_value, 0) + self.alpha) / (num_samples_in_class + self.alpha * num_feature_values)
                feature_likelihoods[feature] = likelihoods
                
            self.likelihoods[c] = feature_likelihoods
            
            
            
            
    def predict(self,x):
        predictions = []
        
        for _, row in x.iterrows():
            class_probs = {}
            
            for c in self.classes:
                # Start with the class prior
                class_prob = self.class_priors[c]
                
                # Multiply by the likelihoods of each feature given the class
                for feature in x.columns:
                    feature_value = row[feature]
                    # adding a default value if the feature is never seen before by using laplace smoothing where count = 0
                    likelihood = self.likelihoods[c][feature].get(feature_value, 1 / len(self.likelihoods[c][feature]) )
                    # using log probabilities to avoid underflow
                    class_prob += np.log(likelihood)
                    
                class_probs[c] = class_prob
                
            # Choose the class with the highest probability
            predictions.append(max(class_probs, key=class_probs.get))
            
        return predictions

## Testing and Evaluating the Model

In [40]:
Model = naive_bayes(alpha=1.0)
Model.fit(x_train, y_train)
predictions = Model.predict(x_test)

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
print("Accuracy:", accuracy_score(y_test, predictions))
print("Classification Report:\n", classification_report(y_test, predictions))
print("Confusion Matrix:\n", confusion_matrix(y_test, predictions))

Accuracy: 0.8046683046683046
Classification Report:
               precision    recall  f1-score   support

       <=50K       0.90      0.83      0.87      3692
        >50K       0.58      0.71      0.64      1192

    accuracy                           0.80      4884
   macro avg       0.74      0.77      0.75      4884
weighted avg       0.82      0.80      0.81      4884

Confusion Matrix:
 [[3080  612]
 [ 342  850]]


## Comparing model performance with sklearn's Naive Bayes 

In [41]:
from sklearn.naive_bayes import MultinomialNB
sklearn_model = MultinomialNB(alpha=1.0)
sklearn_model.fit(x_train, y_train)
sklearn_predictions = sklearn_model.predict(x_test)
print("Sklearn Model Accuracy:", accuracy_score(y_test, sklearn_predictions))
print("Sklearn Classification Report:\n", classification_report(y_test, sklearn_predictions))
print("Sklearn Confusion Matrix:\n", confusion_matrix(y_test, sklearn_predictions))

Sklearn Model Accuracy: 0.7800982800982801
Sklearn Classification Report:
               precision    recall  f1-score   support

       <=50K       0.80      0.95      0.87      3692
        >50K       0.63      0.24      0.35      1192

    accuracy                           0.78      4884
   macro avg       0.71      0.60      0.61      4884
weighted avg       0.75      0.78      0.74      4884

Sklearn Confusion Matrix:
 [[3520  172]
 [ 902  290]]
