## Evaluation Section:

### Build a Naive Bayes classifier from scratch to predict whether someone will attempt to evade taxes, based on the following features:

#### Refund: Categorical feature (Yes/No)
#### Marital Status: Categorical feature (Single/Married/Divorced)
#### Taxable Income: Continuous feature
#### Evade: Target class (Yes/No)

#### Assume categorical features follow a categorical distribution, and continuous features follow a Gaussian distribution.

In [None]:
import pandas as pd
import numpy as np
from math import sqrt, pi, exp
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, f1_score

### 1. Prepare the dataset

In [None]:
# Training set written one taxpayer per row so each record can be read at a glance:
# (Refund, Marital Status, Taxable Income, Evade)
column_names = ('Refund', 'Marital Status', 'Taxable Income', 'Evade')
records = [
    ('Yes', 'Single',   125000, 'No'),
    ('No',  'Married',  100000, 'No'),
    ('No',  'Single',    70000, 'No'),
    ('Yes', 'Married',  120000, 'No'),
    ('No',  'Divorced',  95000, 'Yes'),
    ('No',  'Married',   60000, 'Yes'),
    ('Yes', 'Divorced', 130000, 'No'),
    ('No',  'Single',    75000, 'Yes'),
    ('Yes', 'Single',   115000, 'No'),
    ('No',  'Married',   90000, 'Yes'),
]

# Transpose the rows into the column-oriented dict the rest of the notebook expects
data = {
    name: list(column)
    for name, column in zip(column_names, zip(*records))
}

# Wrap the columns in a DataFrame and show the full (tiny) training table
df = pd.DataFrame(data)
print(df)

  Refund Marital Status  Taxable Income Evade
0    Yes         Single          125000    No
1     No        Married          100000    No
2     No         Single           70000    No
3    Yes        Married          120000    No
4     No       Divorced           95000   Yes
5     No        Married           60000   Yes
6    Yes       Divorced          130000    No
7     No         Single           75000   Yes
8    Yes         Single          115000    No
9     No        Married           90000   Yes


### 2. Calculating Prior Probabilities and the Remaining Steps

In [None]:
# Class priors P(Evade=c): the fraction of training rows labelled with each class.
evade_labels = df['Evade']

# Number of training rows per class (boolean mask -> count of True values)
count_yes = int((evade_labels == 'Yes').sum())
count_no = int((evade_labels == 'No').sum())

# Size of the training set
total_samples = df.shape[0]

# Relative class frequencies
prior_yes = count_yes / total_samples
prior_no = count_no / total_samples

print(f"P(Evade=Yes): {prior_yes}")
print(f"P(Evade=No): {prior_no}")


P(Evade=Yes): 0.4
P(Evade=No): 0.6


### 3. Likelihood calculation and prediction


In [None]:
def likelihood(feature, value, target_class, df, target_col,
               categorical_features=None, min_std=1e-9):
    """Return the class-conditional likelihood P(feature = value | target_class).

    Categorical features use a Laplace (add-one) smoothed frequency estimate;
    every other feature is modelled with a Gaussian fitted to the rows of the
    requested class.

    Parameters
    ----------
    feature : str
        Column of ``df`` holding the feature.
    value :
        Observed feature value to score.
    target_class :
        Class label to condition on.
    df : pandas.DataFrame
        Training data.
    target_col : str
        Column of ``df`` holding the class labels.
    categorical_features : collection of str, optional
        Features to treat as categorical. Defaults to
        ``('Refund', 'Marital Status')``.
    min_std : float, optional
        Lower bound applied to the class standard deviation so the Gaussian
        density stays finite when a class has identical values or one row.

    Returns
    -------
    float
        A smoothed probability for categorical features, or a Gaussian
        density value for continuous ones (0.0 if the class has no rows).
    """
    if categorical_features is None:
        categorical_features = ('Refund', 'Marital Status')

    # Rows of the training set that belong to the class being conditioned on
    subset = df[df[target_col] == target_class]

    if feature in categorical_features:
        # Add-one smoothing: pretend every category was seen once more in this
        # class, so unseen (value, class) pairs never yield a zero probability.
        numerator = len(subset[subset[feature] == value]) + 1
        denominator = len(subset) + len(df[feature].unique())
        return numerator / denominator

    # A Gaussian cannot be fitted to an empty class; the mean would be NaN and
    # the NaN would poison every posterior comparison downstream.
    if len(subset) == 0:
        return 0.0

    mean = subset[feature].mean()
    std = subset[feature].std()  # sample standard deviation (pandas default ddof=1)

    # std is 0 when all class values coincide and NaN when the class has one
    # row; both would produce NaN/inf below, so clamp to a small positive floor.
    if not np.isfinite(std) or std < min_std:
        std = min_std

    exponent = np.exp(-((value - mean) ** 2) / (2 * std ** 2))
    return (1 / (np.sqrt(2 * np.pi) * std)) * exponent

def predict_tax_evasion(refund, marital_status, taxable_income):
    """Classify one taxpayer as 'Yes' (evades) or 'No' with Naive Bayes.

    Uses the notebook-level training frame ``df`` and the class priors
    ``prior_yes`` / ``prior_no``. Both unnormalised posteriors are printed
    before the label is returned; a tie resolves to 'No'.
    """
    observed = (
        ('Refund', refund),
        ('Marital Status', marital_status),
        ('Taxable Income', taxable_income),
    )

    def joint_likelihood(label):
        # Naive independence assumption: the joint likelihood is the product
        # of the per-feature likelihoods conditioned on the class label.
        product = 1.0
        for feature_name, observed_value in observed:
            product = product * likelihood(feature_name, observed_value, label, df, 'Evade')
        return product

    yes_likelihood = joint_likelihood('Yes')
    no_likelihood = joint_likelihood('No')

    # Unnormalised posteriors: prior * likelihood (the evidence term is the
    # same for both classes and does not affect the comparison).
    posterior_yes = prior_yes * yes_likelihood
    posterior_no = prior_no * no_likelihood

    print(f"Posterior Yes: {posterior_yes}")
    print(f"Posterior No: {posterior_no}")

    if posterior_yes > posterior_no:
        return 'Yes'
    return 'No'

# Sanity check: score a single hand-picked taxpayer and report the predicted label
test_prediction = predict_tax_evasion(
    refund='Yes',
    marital_status='Single',
    taxable_income=85000,
)
print(f"Prediction for Refund=Yes, Marital Status=Single, Taxable Income=85000: {test_prediction}")


Posterior Yes: 4.5715769464005176e-07
Posterior No: 1.5874032085856848e-06
Prediction for Refund=Yes, Marital Status=Single, Taxable Income=85000: No


EVALUATION OF MODEL


In [None]:

# Hand-written hold-out examples: (Refund, Marital Status, Taxable Income)
test_data = [
    ['Yes', 'Single', 85000],
    ['No', 'Married', 95000],
    ['No', 'Divorced', 60000],
    ['Yes', 'Single', 115000],
]

# Ground-truth Evade label for each hold-out example, in the same order
test_labels = ['No', 'Yes', 'Yes', 'No']

# Score every example and tally how many predictions match the ground truth
correct_predictions = 0

for (refund, marital_status, taxable_income), actual in zip(test_data, test_labels):
    prediction = predict_tax_evasion(refund, marital_status, taxable_income)
    print(f"Predicted: {prediction}, Actual: {actual}")
    correct_predictions += int(prediction == actual)

# Fraction of hold-out examples classified correctly
accuracy = correct_predictions / len(test_data)
print(f"Model Accuracy: {accuracy * 100:.2f}%")


Posterior Yes: 4.5715769464005176e-07
Posterior No: 1.5874032085856848e-06
Predicted: No, Actual: No
Posterior Yes: 2.2983147518749972e-06
Posterior No: 1.0743921373737253e-06
Predicted: Yes, Actual: Yes
Posterior Yes: 1.0797300213807536e-06
Posterior No: 7.028909370782407e-08
Predicted: Yes, Actual: Yes
Posterior Yes: 4.147241038825172e-08
Posterior No: 2.928072559129381e-06
Predicted: No, Actual: No
Model Accuracy: 100.00%
