<a href="https://colab.research.google.com/github/benmanjackson/CS290/blob/main/partnerProject2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from scipy.stats import norm
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

# Data Exploration

In [None]:
# Fix NumPy's global random state so any NumPy-based randomness is repeatable
np.random.seed(seed=17)

In [None]:
# Read the penguins CSV directly from its GitHub URL into a DataFrame
penguins = pd.read_csv(
    filepath_or_buffer="https://github.com/mbrudd/csci290/raw/refs/heads/main/data/penguins.csv"
)

In [None]:
# Preview the first five rows
penguins.head(n=5)

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,male,2007
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,female,2007
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,female,2007
3,Adelie,Torgersen,,,,,,2007
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,female,2007


In [None]:
# Number of penguins recorded on each island
penguins.value_counts(subset=['island'])

Unnamed: 0_level_0,count
island,Unnamed: 1_level_1
Biscoe,168
Dream,124
Torgersen,52


In [None]:
# Print column dtypes and non-null counts to spot columns with missing values
penguins.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            344 non-null    object 
 1   island             344 non-null    object 
 2   bill_length_mm     342 non-null    float64
 3   bill_depth_mm      342 non-null    float64
 4   flipper_length_mm  342 non-null    float64
 5   body_mass_g        342 non-null    float64
 6   sex                333 non-null    object 
 7   year               344 non-null    int64  
dtypes: float64(4), int64(1), object(3)
memory usage: 21.6+ KB


# Naive Bayes Classifier

In [None]:
def calculate_priors(y):
    """Estimate the prior probability of every class label.

    Args:
        y: pandas Series holding one class label per training row.

    Returns:
        dict mapping each distinct label to its relative frequency in ``y``.
    """
    class_frequencies = y.value_counts(normalize=True)
    return class_frequencies.to_dict()

In [None]:
# Calculate the conditional distribution of every feature given the class,
# i.e. P(feature | class)
def calculate_likelihoods(X, y):
    """Fit the per-class likelihood model for every feature column.

    Non-numeric columns are modelled as categorical distributions with
    Laplace (add-one) smoothing. Numeric columns are modelled as Gaussians.

    Args:
        X: pandas DataFrame of training features.
        y: pandas Series of class labels, index-aligned with ``X``.

    Returns:
        Nested dict ``likelihoods[feature][cls]``:
          * categorical feature: ``{value: probability, ..., '__UNSEEN__': p}``
            with one entry for every value observed anywhere in the training
            column plus one reserved ``'__UNSEEN__'`` slot; the entries sum
            to 1 for each class.
          * numeric feature: ``{'mean': ..., 'std': ...}`` (sample standard
            deviation; NaN when the class has fewer than two observations).
    """
    likelihoods = {}
    classes = y.unique()

    for feature in X.columns:
        column = X[feature]
        likelihoods[feature] = {}

        if not pd.api.types.is_numeric_dtype(column):
            # Categorical feature. The vocabulary is taken from the whole
            # column so a value that a class never produced still gets a
            # smoothed, non-zero probability in that class's table.
            categories = column.dropna().unique()
            n_slots = len(categories) + 1  # +1 reserves mass for unseen values

            for cls in classes:
                # value_counts ignores missing entries, so the denominator is
                # built from the same non-missing rows as the numerators.
                counts = column[y == cls].value_counts().to_dict()
                denominator = sum(counts.values()) + n_slots

                table = {
                    value: float((counts.get(value, 0) + 1) / denominator)
                    for value in categories
                }
                table['__UNSEEN__'] = float(1 / denominator)
                likelihoods[feature][cls] = table
        else:
            # Numerical feature (assume a Gaussian distribution per class)
            for cls in classes:
                class_values = column[y == cls]
                likelihoods[feature][cls] = {
                    'mean': class_values.mean(),
                    'std': class_values.std(),
                }

    return likelihoods

In [None]:
def predict_naive_bayes(instance, priors, likelihoods, classes):
    """Return the class with the highest log-posterior for one instance.

    Args:
        instance: dict mapping feature name to the observed value.
        priors: dict mapping class to prior probability.
        likelihoods: nested dict as produced by ``calculate_likelihoods``:
            ``likelihoods[feature][cls]`` is either a categorical table
            (``{value: prob, ..., '__UNSEEN__': prob}``) or a Gaussian table
            (``{'mean': ..., 'std': ...}``).
        classes: iterable of candidate class labels.

    Returns:
        The class label whose log-posterior is largest.

    Raises:
        KeyError: if a class is missing from ``priors`` or a feature of
            ``instance`` is missing from ``likelihoods``.
    """
    log_two_pi = np.log(2 * np.pi)
    posteriors = {}

    for clas in classes:
        # Work in log space, starting from log(prior)
        posterior = np.log(priors[clas])

        for feature, value in instance.items():
            table = likelihoods[feature].get(clas, {})
            if not table:
                continue  # nothing was learned for this feature/class pair
            is_gaussian = set(table) == {'mean', 'std'}

            if isinstance(value, str):
                if is_gaussian:
                    continue  # a string cannot be scored by a Gaussian
                # A value this class never produced must be penalised, not
                # skipped: skipping is an implicit likelihood of 1 and would
                # favour exactly the classes that never saw the value.
                likelihood = table.get(value, table.get('__UNSEEN__', 1e-6))
                posterior += np.log(likelihood)
            elif is_gaussian and not pd.isna(value):
                # Missing numeric values are skipped so NaN cannot poison the sum
                mean = table['mean']
                std = table['std']
                if std > 0:  # also False for NaN std; avoids division by zero
                    # Gaussian log-density evaluated directly, so values far
                    # in the tail do not underflow to log(0) = -inf
                    z = (value - mean) / std
                    posterior += -np.log(std) - 0.5 * log_two_pi - 0.5 * z ** 2

        posteriors[clas] = posterior

    # Return the class with the highest posterior probability
    return max(posteriors, key=posteriors.get)

In [None]:
def naive_bayes(X, y, new_instance):
    """Fit the Naive Bayes model on (X, y) and classify a single instance.

    Args:
        X: DataFrame of training features.
        y: Series of training class labels.
        new_instance: dict mapping feature name to value for the row to classify.

    Returns:
        The predicted class label for ``new_instance``.
    """
    class_priors = calculate_priors(y)
    feature_likelihoods = calculate_likelihoods(X, y)
    candidate_classes = y.unique()
    return predict_naive_bayes(
        new_instance, class_priors, feature_likelihoods, candidate_classes
    )

In [None]:
# Target: the species label; features: four body measurements plus sex
y = penguins.loc[:, 'species']
X = penguins.loc[
    :,
    ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g', 'sex'],
]

In [None]:
# A single hand-made penguin to classify.
# The dataset stores sex in lowercase ('male' / 'female'); the value must match
# exactly or the classifier treats it as a category it has never seen.
new_penguin_instance = {
    'bill_length_mm': 45.0,
    'bill_depth_mm': 14.0,
    'flipper_length_mm': 210.0,
    'body_mass_g': 4500,
    'sex': 'male'
}

In [None]:
# Train on the full dataset and classify the hand-made instance
predicted_species = naive_bayes(X=X, y=y, new_instance=new_penguin_instance)
print(f"Predicted species: {predicted_species}")

Predicted species: Gentoo


# scikit-learn Classifier

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import CategoricalNB, GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import KBinsDiscretizer, OneHotEncoder, OrdinalEncoder

In [None]:
# Fresh, independent copy of the penguins data for the scikit-learn comparison.
# Copying the frame already in memory avoids a second network download, and the
# species encoding applied to this copy later leaves `penguins` untouched.
gauss_penguins = penguins.copy()

In [None]:
# Group the feature columns by type, integer-encode the target, and hold out
# 20% of the rows for testing (same seed for both splits)
numerical_features = ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']
categorical_features = ['island', 'sex']

gauss_penguins['species'] = (
    gauss_penguins['species']
    .astype('category')
    .cat.codes
)

y = gauss_penguins['species']
X = gauss_penguins.drop(columns='species')

# Split over every feature column
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# Split restricted to the categorical columns
X_train_cat, X_test_cat, y_train_cat, y_test_cat = train_test_split(
    X[categorical_features], y, test_size=0.2, random_state=42
)


In [None]:
# Batch prediction with our custom Naive Bayes implementation
def our_naive_bayes_prediction(X, y, X_test):
    """Classify every row of ``X_test`` with the custom Naive Bayes model.

    The model (priors and likelihoods) depends only on the training data, so
    it is estimated once up front instead of being re-fitted for each row.

    Args:
        X: DataFrame of training features.
        y: Series of training class labels.
        X_test: DataFrame of rows to classify, with the same columns as ``X``.

    Returns:
        list of predicted class labels, one per row of ``X_test``, in row order.
    """
    priors = calculate_priors(y)
    likelihoods = calculate_likelihoods(X, y)
    classes = y.unique()

    return [
        predict_naive_bayes(instance.to_dict(), priors, likelihoods, classes)
        for _, instance in X_test.iterrows()
    ]

In [None]:
# Score the custom classifier twice: on the categorical columns only, and on
# every feature column
cat_custom_predictions = our_naive_bayes_prediction(X_train_cat, y_train_cat, X_test_cat)
cat_custom_accuracy = accuracy_score(y_test_cat, cat_custom_predictions)

custom_predictions = our_naive_bayes_prediction(X_train, y_train, X_test)
custom_accuracy = accuracy_score(y_test, custom_predictions)

print(f"Custom Categorical Naive Bayes Accuracy: {cat_custom_accuracy}")
print(f"Custom Naive Bayes Accuracy: {custom_accuracy}")

Custom Categorical Naive Bayes Accuracy: 0.34782608695652173
Custom Naive Bayes Accuracy: 0.9565217391304348


In [None]:
# GaussianNB pipeline and its test accuracy
gaussian_preprocessor = ColumnTransformer(
    transformers=[
        # Numerical columns: impute missing values, then pass through unchanged
        (
            'num',
            Pipeline([
                ('imputer', SimpleImputer()),
                ('passthrough', 'passthrough'),
            ]),
            numerical_features,
        ),
        # Categorical columns: impute with the most frequent value, then one-hot encode
        (
            'cat',
            Pipeline([
                ('imputer', SimpleImputer(strategy='most_frequent')),
                ('onehot', OneHotEncoder()),
            ]),
            categorical_features,
        ),
    ])

gaussian_pipeline = Pipeline([
    ('preprocessor', gaussian_preprocessor),
    ('classifier', GaussianNB()),
])

# fit() returns the fitted pipeline itself, so prediction can be chained
gaussian_predictions = gaussian_pipeline.fit(X_train, y_train).predict(X_test)
gaussian_accuracy = accuracy_score(y_test, gaussian_predictions)
print(f"Gaussian Naive Bayes Accuracy: {gaussian_accuracy}")

Gaussian Naive Bayes Accuracy: 0.855072463768116


In [None]:
#CategoricalNB Pipeline
# CategoricalNB expects every feature to be an integer category code. Raw
# continuous measurements would be truncated to integers and each distinct
# value treated as its own category (and, presumably, a test value above the
# training maximum would fail at predict time - TODO confirm). The numerical
# columns are therefore binned into a few ordinal categories first.
categorical_preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='mean')),  # Impute with mean for numerical features
            ('binner', KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform'))  # Discretise into 5 equal-width bins
        ]), numerical_features),

        ('cat', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),  # Impute with most frequent for categorical
            ('ordinal', OrdinalEncoder())                           # Ordinal encode for categorical
        ]), categorical_features)
    ])
categorical_pipeline = Pipeline([
    ('preprocessor', categorical_preprocessor),
    ('classifier', CategoricalNB())
])
categorical_pipeline.fit(X_train, y_train)
categorical_predictions = categorical_pipeline.predict(X_test)
categorical_accuracy = accuracy_score(y_test, categorical_predictions)
print(f"Categorical Naive Bayes Accuracy: {categorical_accuracy}")

Categorical Naive Bayes Accuracy: 1.0


In [None]:
# Show the configuration of both scikit-learn pipelines
for _label, _pipe in (
    ("GaussianNB Pipeline:", gaussian_pipeline),
    ("CategoricalNB Pipeline:", categorical_pipeline),
):
    print(_label, _pipe)

GaussianNB Pipeline: Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer()),
                                                                  ('passthrough',
                                                                   'passthrough')]),
                                                  ['bill_length_mm',
                                                   'bill_depth_mm',
                                                   'flipper_length_mm',
                                                   'body_mass_g']),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                      