<a href="https://colab.research.google.com/github/abdulla41mamun/CSE713-Advanced-Synctactic-Pattern-Recognition/blob/main/Naive_Bayes_Slide_Based.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np

# 1. Training set, held column-wise and wrapped in a pandas DataFrame.
# The ten rows are copied from the presentation slide.
data = {
    'Age': [35, 30, 40, 35, 45, 35, 35, 25, 28, 35],
    'Income': ['Medium', 'High', 'Low', 'Medium', 'Low', 'High', 'Medium', 'Low', 'High', 'Medium'],
    'Student': ['Yes', 'No', 'Yes', 'No', 'No', 'No', 'No', 'No', 'No', 'Yes'],
    'Credit rating': ['Fair', 'Average', 'Good', 'Fair', 'Fair', 'Excellent', 'Good', 'Good', 'Average', 'Average'],
    'Buys computer': ['Yes', 'No', 'No', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
}
df = pd.DataFrame(data)

# Horizontal rule printed between the sections of the walkthrough.
rule = "=" * 50

print("--- Dataset ---")
print(df)
print("\n" + rule + "\n")

# 2. The unseen instance X (feature vector from the slide) to be classified.
X = {
    'Age': 21,
    'Income': 'Medium',
    'Student': 'Yes',
    'Credit rating': 'Fair',
}
print(f"--- New Instance to Classify (X) ---\n{X}\n")
print(rule + "\n")


# 3. Prior probabilities: the share of training rows carrying each class label.
print("--- Step 1: Calculate Prior Probabilities ---")
total_count = df.shape[0]
target_counts = df['Buys computer'].value_counts()
prior_prob_yes, prior_prob_no = (
    target_counts[label] / total_count for label in ('Yes', 'No')
)

print(f"P(Buys computer = Yes) = {target_counts['Yes']}/{total_count} = {prior_prob_yes}")
print(f"P(Buys computer = No) = {target_counts['No']}/{total_count} = {prior_prob_no}")
print("\n" + rule + "\n")


# 4. Conditional probabilities
# 'Age' is scored with a Gaussian density; the categorical features use the
# plain ratio count / class total.

print("--- Step 2: Calculate Conditional Probabilities P(Feature | Class) ---")

# Helper function for Gaussian (Normal) distribution PDF
def gaussian_pdf(x, mean, std):
    """Return the Gaussian (normal) probability density evaluated at ``x``.

    Args:
        x: Point at which the density is evaluated.
        mean: Mean of the distribution.
        std: Standard deviation of the distribution. A value below ``1e-9``
            or a NaN (which pandas' sample ``std()`` yields for a single
            observation) is replaced by ``1e-9`` so the division below is
            always well defined.

    Returns:
        ``exp(-(x - mean)**2 / (2 * std**2)) / (sqrt(2 * pi) * std)``.
    """
    epsilon = 1e-9
    # NaN compares False against every number, so `std < epsilon` alone would
    # let it through and the result (and every product built on it) would be NaN.
    if np.isnan(std) or std < epsilon:
        std = epsilon
    exponent = np.exp(-((x - mean) ** 2 / (2 * std ** 2)))
    return (1 / (np.sqrt(2 * np.pi) * std)) * exponent

# Split the training rows by class label.
buys = df['Buys computer']
df_yes = df[buys == 'Yes']
df_no = df[buys == 'No']
n_yes = len(df_yes)
n_no = len(df_no)

# cond_prob[label][feature] stores P(feature = X[feature] | label).
cond_prob = {'Yes': {}, 'No': {}}
categorical_features = ['Income', 'Student', 'Credit rating']

# --- Class = 'Yes' ---
print("\n--- For Class: Buys computer = Yes ---")
# 'Age' is numeric: evaluate a Gaussian built from this class's mean and std.
age_mean_yes, age_std_yes = df_yes['Age'].mean(), df_yes['Age'].std()
cond_prob['Yes']['Age'] = gaussian_pdf(X['Age'], age_mean_yes, age_std_yes)
print(f"P(Age={X['Age']} | Yes) (Gaussian) -> {cond_prob['Yes']['Age']:.6f}")

for feature in categorical_features:
    value = X[feature]
    # Rows of this class whose feature equals the value observed in X.
    count_feature_yes = int((df_yes[feature] == value).sum())
    # Plain relative frequency: count / rows in the class (no smoothing).
    prob_feature_yes = count_feature_yes / n_yes
    cond_prob['Yes'][feature] = prob_feature_yes
    print(f"P({feature}={value} | Yes) = {count_feature_yes}/{len(df_yes)} = {prob_feature_yes:.4f}")

# --- Class = 'No' ---
print("\n--- For Class: Buys computer = No ---")
age_mean_no, age_std_no = df_no['Age'].mean(), df_no['Age'].std()
cond_prob['No']['Age'] = gaussian_pdf(X['Age'], age_mean_no, age_std_no)
print(f"P(Age={X['Age']} | No) (Gaussian) -> {cond_prob['No']['Age']:.6f}")

for feature in categorical_features:
    value = X[feature]
    count_feature_no = int((df_no[feature] == value).sum())
    # Plain relative frequency: count / rows in the class (no smoothing).
    prob_feature_no = count_feature_no / n_no
    cond_prob['No'][feature] = prob_feature_no
    # For Credit rating = Fair this is 0/4, which drives the final product to 0.
    print(f"P({feature}={value} | No) = {count_feature_no}/{len(df_no)} = {prob_feature_no:.4f}")

print("\n" + "="*50 + "\n")


# 5. Score each class with the unnormalised posterior P(X | Class) * P(Class):
# start from the class prior and multiply in one conditional per feature of X.
print("--- Step 3: Calculate Likelihoods P(X | Class) * P(Class) ---")

scores = {'Yes': prior_prob_yes, 'No': prior_prob_no}
for label in scores:
    for feature in X:
        scores[label] *= cond_prob[label][feature]
likelihood_yes = scores['Yes']
likelihood_no = scores['No']

print(f"P(X | Yes) * P(Yes) = {likelihood_yes:.4e}")
print(f"P(X | No) * P(No) = {likelihood_no:.4e}  <-- Note: This is zero as expected")
print("\n" + "="*50 + "\n")


# 6. Classify: 'Yes' wins only on a strictly larger score; anything else,
# including a tie, is reported as 'No'.
print("--- Step 4: Make Prediction ---")
yes_wins = likelihood_yes > likelihood_no
if not yes_wins:
    print("Prediction: The person will NOT buy a computer.")
    print(f"Reason: The probability for 'No' ({likelihood_no:.4e}) is greater than or equal to for 'Yes' ({likelihood_yes:.4e}).")
else:
    print("Prediction: The person WILL buy a computer.")
    print(f"Reason: The probability for 'Yes' ({likelihood_yes:.4e}) is greater than for 'No' ({likelihood_no:.4e}).")
print("\n" + "="*50 + "\n")


# 7. The same comparison in log space, as on the slides: products of small
# probabilities become sums of logs.
print("--- Bonus: Using Logarithms to Avoid Underflow ---")
# np.log(0) evaluates to -inf (with a RuntimeWarning), so a small epsilon is
# added to every conditional probability to keep each class score finite.
epsilon = 1e-9

log_scores = {'Yes': np.log(prior_prob_yes), 'No': np.log(prior_prob_no)}
for label in log_scores:
    for feature in X:
        log_scores[label] += np.log(cond_prob[label][feature] + epsilon)
log_prob_yes = log_scores['Yes']
log_prob_no = log_scores['No']

print(f"Log-Probability(Yes) ≈ {log_prob_yes:.2f}")
print(f"Log-Probability(No) ≈ {log_prob_no:.2f}  <-- Note: This is a large negative number due to log(0+epsilon)")


print("\n--- Prediction using Log-Probabilities ---")
# 'Yes' needs a strictly larger log score; a tie is reported as 'No'.
yes_wins_log = log_prob_yes > log_prob_no
if not yes_wins_log:
    print("Prediction: The person will NOT buy a computer.")
    print(f"Reason: The log-probability for 'No' ({log_prob_no:.2f}) is greater than or equal to for 'Yes' ({log_prob_yes:.2f}).")
else:
    print("Prediction: The person WILL buy a computer.")
    print(f"Reason: The log-probability for 'Yes' ({log_prob_yes:.2f}) is greater than for 'No' ({log_prob_no:.2f}).")


--- Dataset ---
   Age  Income Student Credit rating Buys computer
0   35  Medium     Yes          Fair           Yes
1   30    High      No       Average            No
2   40     Low     Yes          Good            No
3   35  Medium      No          Fair           Yes
4   45     Low      No          Fair           Yes
5   35    High      No     Excellent           Yes
6   35  Medium      No          Good            No
7   25     Low      No          Good           Yes
8   28    High      No       Average            No
9   35  Medium     Yes       Average           Yes


--- New Instance to Classify (X) ---
{'Age': 21, 'Income': 'Medium', 'Student': 'Yes', 'Credit rating': 'Fair'}


--- Step 1: Calculate Prior Probabilities ---
P(Buys computer = Yes) = 6/10 = 0.6
P(Buys computer = No) = 4/10 = 0.4


--- Step 2: Calculate Conditional Probabilities P(Feature | Class) ---

--- For Class: Buys computer = Yes ---
P(Age=21 | Yes) (Gaussian) -> 0.005443
P(Income=Medium | Yes) = 3/6 = 0.5000
