In [1]:
import pandas as pd
import numpy as np

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
def read_data(file_path):
    """Load a CSV file into a pandas DataFrame.

    Parameters
    ----------
    file_path : str, path-like or file-like
        Anything accepted by ``pd.read_csv`` as its first argument.

    Returns
    -------
    pandas.DataFrame
        The parsed table, read with pandas' default CSV options.
    """
    frame = pd.read_csv(file_path)
    return frame

In [8]:
class NaiveBayes:
    """Categorical naive Bayes classifier built from raw frequency counts.

    Every feature value is treated as a discrete category. No smoothing is
    applied: a feature value that was never observed together with a class
    (or never observed at all during ``fit``) contributes a factor of 0 and
    therefore zeroes out that class's score.

    Attributes
    ----------
    class_probs : dict
        Maps each class label to its prior ``P(class)``.
    feature_probs : dict
        Nested mapping ``class -> feature name -> value -> P(value | class)``.
    """

    def __init__(self):
        self.class_probs = {}
        self.feature_probs = {}

    def fit(self, X_train, y_train):
        """Estimate class priors and per-feature conditional probabilities.

        Parameters
        ----------
        X_train : pandas.DataFrame
            One row per sample, one column per (categorical) feature.
        y_train : array-like
            Class label for each row of ``X_train``, matched by position.
            Labels may be of any type ``np.unique`` can sort (ints,
            strings, ...).

        Notes
        -----
        Any state from a previous call to ``fit`` is discarded.
        """
        # Work on plain arrays so rows and labels are matched by position,
        # independent of any pandas index carried by the inputs.
        labels = np.asarray(y_train)
        classes, counts = np.unique(labels, return_counts=True)
        total_samples = len(labels)

        # Rebuild both tables from scratch so a refit leaves no stale classes.
        self.class_probs = {
            c: count / total_samples for c, count in zip(classes, counts)
        }
        self.feature_probs = {c: {} for c in classes}

        for feature in X_train.columns:
            column = X_train[feature]
            column_values = column.to_numpy()
            unique_values = column.unique()
            # Pair each class with its own sample count. Indexing ``counts``
            # by the label itself is only correct when labels are 0..k-1.
            for c, class_count in zip(classes, counts):
                in_class = column_values[labels == c]
                self.feature_probs[c][feature] = {
                    value: np.sum(in_class == value) / class_count
                    for value in unique_values
                }

    def predict(self, X_test):
        """Return the most probable class for every row of ``X_test``.

        Parameters
        ----------
        X_test : pandas.DataFrame
            Rows to classify; must have the columns seen during ``fit``.

        Returns
        -------
        list
            One predicted label per row. When several classes tie (for
            example when all scores are 0), the class that comes first in
            ``class_probs`` is returned.
        """
        predictions = []
        for _, row in X_test.iterrows():
            best_prob = -1
            best_class = None
            for c, prior in self.class_probs.items():
                prob = prior
                for feature, value in row.items():
                    # Values unseen during fit have probability 0.
                    prob *= self.feature_probs[c][feature].get(value, 0)
                # Strict '>' keeps the earliest class on ties.
                if prob > best_prob:
                    best_prob = prob
                    best_class = c
            predictions.append(best_class)
        return predictions



In [48]:
# Load the ads dataset from a CSV given by relative path.
data = read_data("Social_Network_Ads.csv")
# Features: columns at positions 1-3 (shown in the next cell's output as
# Gender, Age, EstimatedSalary); column 0 is left out.
X = data.iloc[:,1:4]
# Target: the 'Purchased' column.
y = data['Purchased']

In [49]:
# Display the selected feature columns as this cell's output.
X

Unnamed: 0,Gender,Age,EstimatedSalary
0,Male,19,19000
1,Male,35,20000
2,Female,26,43000
3,Female,27,57000
4,Male,19,76000
...,...,...,...
395,Female,46,41000
396,Male,51,23000
397,Female,50,20000
398,Male,36,33000


In [50]:
# Display the target labels as this cell's output.
y

0      0
1      0
2      0
3      0
4      0
      ..
395    1
396    1
397    1
398    0
399    1
Name: Purchased, Length: 400, dtype: int64

In [51]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; random_state is fixed so the
# same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Fit the NaiveBayes classifier defined earlier in this notebook on the
# training split only.
nb_model = NaiveBayes()
nb_model.fit(X_train, y_train)

In [52]:
# Predict one class label per held-out row; predict() returns a plain list.
y_pred = nb_model.predict(X_test)
print("predictions: ",y_pred)

predictions:  [1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0]


In [54]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
# Compare the held-out labels with the model's predictions.
accuracy = accuracy_score(y_test, y_pred)
# average='weighted': per-class scores averaged with weights proportional to
# each class's support (number of true instances).
precision = precision_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
# Per-class precision/recall/F1 and support in one text table.
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.825
Precision: 0.8226190476190476
Recall: 0.825
F1 Score: 0.8215099715099715
Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.90      0.87        52
           1       0.79      0.68      0.73        28

    accuracy                           0.82        80
   macro avg       0.82      0.79      0.80        80
weighted avg       0.82      0.82      0.82        80



SyntaxError: invalid syntax (3672308396.py, line 4)