# CENG476 Machine Learning Homework-I Naive Bayes Classifier
# 171180758 Candan Baykan

## Dependencies

In [1]:
# Import required modules.
from IPython.display import display
import math
import pandas as pd

## Naive Bayes Implementation

In [2]:
# My Naive Bayes Classifier implementation.
class NaiveBayesClassifier:
    """Naive Bayes classifier for mixed discrete and continuous features.

    Discrete features are modelled by their relative frequency inside each
    class; continuous features are modelled by a per-class Gaussian whose
    mean and standard deviation are estimated from the training data.
    """

    def __init__(self, df, target_column, discrete_columns, continuous_columns):
        """Store the training data and the role of each column.

        Args:
            df: Training data (pandas DataFrame) including the target column.
            target_column: Name of the column holding the class labels.
            discrete_columns: Names of the categorical feature columns.
            continuous_columns: Names of the numeric feature columns.
        """
        self._df = df
        self._target_column = target_column
        self._discrete_columns = discrete_columns
        self._continuous_columns = continuous_columns
        # Filled by fit():
        self._target_dfs = dict()              # class label -> rows of that class
        self._target_probabilities = dict()    # class label -> prior probability
        self._discrete_probabilities = dict()  # column -> DataFrame[value, class]
        self._continuous_stats = dict()        # replaced by a DataFrame in fit()

    @staticmethod
    def _stat_column(target, stat):
        """Return the statistics-table column name for a class and statistic.

        Built with an f-string so that non-string class labels (e.g. 0 / 1)
        work as well as string labels.
        """
        return f'{target}_{stat}'

    def fit(self):
        """Estimate priors, discrete likelihood tables and Gaussian statistics."""
        for target in self._df[self._target_column].unique():
            # Rows belonging to this class, without the target column itself.
            self._target_dfs[target] = self._df[self._df[self._target_column] == target].drop(self._target_column, axis=1)
            # Prior: fraction of training rows that belong to this class.
            self._target_probabilities[target] = self._target_dfs[target].shape[0] / self._df.shape[0]

        # One likelihood table per discrete feature.
        for column in self._discrete_columns:
            self._discrete_probabilities[column] = self._discrete_probability(column)

        # Per-class mean / standard deviation of every continuous feature.
        self._continuous_stats = self._continuous_statistics()

    def predict(self, X):
        """Predict the class of every row of ``X``.

        Args:
            X: DataFrame containing the discrete and continuous feature
                columns given to the constructor.

        Returns:
            A list with one predicted class label per row, in row order.

        Note:
            A discrete value that never occurred in the training data has no
            row in the likelihood table, so its lookup fails.
        """
        predictions = list()
        targets = list(self._target_dfs.keys())

        for _, row in X.iterrows():
            # Unnormalised posterior of each class for this row.
            products = dict()
            for target in targets:
                score = 1

                # Likelihood of each discrete feature value given the class.
                for column in self._discrete_columns:
                    score *= self._discrete_probabilities[column].loc[row[column], target]

                # Gaussian density of each continuous feature value given the class.
                for column in self._continuous_columns:
                    score *= self._gaussian_distribution(column, row[column], target)

                # Finally weigh by the class prior.
                score *= self._target_probabilities[target]
                products[target] = score

            # Keep the class with the highest score.
            predictions.append(max(products, key=products.get))

        return predictions

    def _discrete_probability(self, column):
        """Build the likelihood table P(value | class) for one discrete column.

        Returns:
            DataFrame indexed by the column's unique values, with one column
            per class label.
        """
        feature_values = self._df[column].unique()
        target_names = list(self._target_dfs.keys())

        pdf = pd.DataFrame(index=feature_values, columns=target_names)

        for value in feature_values:
            for target in target_names:
                tdf = self._target_dfs[target]
                # Share of this class's rows that carry the given value.
                pdf.loc[value, target] = tdf[tdf[column] == value].shape[0] / tdf.shape[0]

        return pdf

    def _continuous_statistics(self):
        """Compute per-class mean and standard deviation of continuous columns.

        Returns:
            DataFrame indexed by continuous column name, with a
            ``<class>_mean`` and a ``<class>_std`` column for every class.
        """
        target_names = list(self._target_dfs.keys())
        columns = list()
        for target in target_names:
            columns.append(self._stat_column(target, 'mean'))
            columns.append(self._stat_column(target, 'std'))

        sdf = pd.DataFrame(index=self._continuous_columns, columns=columns)

        for column in self._continuous_columns:
            for target in target_names:
                values = self._target_dfs[target][column]
                sdf.loc[column, self._stat_column(target, 'mean')] = values.mean()
                sdf.loc[column, self._stat_column(target, 'std')] = values.std()

        return sdf

    def _gaussian_distribution(self, column, value, target):
        """Return the Gaussian density of ``value`` for a column and class.

        Uses the mean and standard deviation stored by fit() and evaluates
        ``exp(-(value - mean)^2 / (2 * variance)) / sqrt(2 * pi * variance)``.
        """
        mean = self._continuous_stats.loc[column, self._stat_column(target, 'mean')]
        variance = self._continuous_stats.loc[column, self._stat_column(target, 'std')] ** 2
        # The normalisation constant needs pi itself, not an integer stand-in.
        coefficient = 1.0 / math.sqrt(2 * math.pi * variance)
        return coefficient * math.exp(-((value - mean) ** 2) / (2 * variance))

    def get_discrete_probabilities(self):
        """Return the dict mapping each discrete column to its likelihood table."""
        return self._discrete_probabilities

    # Read-only attribute-style access to the likelihood tables.
    discrete_probabilities = property(fget=get_discrete_probabilities)

### Utility Functions for Accuracy Calculation

In [3]:
# Calculate total accuracy.
def calculate_accuracy(y_test, predictions):
    """Return the fraction of predictions that equal the true labels.

    Labels and predictions are paired by position, so ``y_test`` may be a
    list or a pandas Series with any index (a plain ``y_test[i]`` on a Series
    would look up the *label* ``i`` instead of the i-th element).

    Args:
        y_test: Sequence of true labels.
        predictions: Sequence of predicted labels, same length as ``y_test``.

    Returns:
        Number of matching pairs divided by ``len(y_test)``.

    Raises:
        ValueError: If the two sequences differ in length.
        ZeroDivisionError: If ``y_test`` is empty.
    """
    if len(y_test) != len(predictions):
        raise ValueError('y_test and predictions must have the same length')

    correct = sum(1 for actual, predicted in zip(y_test, predictions) if actual == predicted)
    return correct / len(y_test)

# Calculate the accuracy for a given class.
def calculate_class_accuracy(y_test, predictions, value):
    """Return the fraction of samples of class ``value`` predicted correctly.

    Only positions whose true label equals ``value`` are considered (this is
    the per-class recall). Labels and predictions are paired by position, so
    ``y_test`` may be a list or a pandas Series with any index.

    Args:
        y_test: Sequence of true labels.
        predictions: Sequence of predicted labels, same length as ``y_test``.
        value: The class label to evaluate.

    Returns:
        Correctly predicted samples of the class divided by the number of
        samples whose true label is ``value``.

    Raises:
        ValueError: If the two sequences differ in length.
        ZeroDivisionError: If ``value`` never occurs in ``y_test``.
    """
    if len(y_test) != len(predictions):
        raise ValueError('y_test and predictions must have the same length')

    class_total = 0
    class_correct = 0
    for actual, predicted in zip(y_test, predictions):
        if actual == value:
            class_total += 1
            if actual == predicted:
                class_correct += 1

    return class_correct / class_total

## Training

In [4]:
# Read training data from CSV file then train the classifier.
# Load the training set and fit the classifier: 'sex' is the target,
# two categorical features and four numeric features are used.
df = pd.read_csv('Training_Penguins_data.csv')
nvc = NaiveBayesClassifier(
    df,
    target_column='sex',
    discrete_columns=['species', 'island'],
    continuous_columns=['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_kg'],
)
nvc.fit()

# Show the learned likelihood table of each discrete feature.
for feature in ('species', 'island'):
    display(nvc.discrete_probabilities[feature])

Unnamed: 0,female,male
Adelie,0.411348,0.4
Gentoo,0.375887,0.386207
Chinstrap,0.212766,0.213793


Unnamed: 0,female,male
Torgersen,0.141844,0.137931
Biscoe,0.489362,0.489655
Dream,0.368794,0.372414


## Testing

In [5]:
# Read test data from CSV file then predict their classes using classifier.
# Load the test set, separate the true labels from the features and predict.
tdf = pd.read_csv('Testing_Penguins_Data.csv')
X_test = tdf.drop(columns='sex')
y_test = tdf['sex']
predictions = nvc.predict(X_test)

### Accuracy Calculations

In [6]:
# Report overall accuracy, then the accuracy within each class.
total_accuracy = calculate_accuracy(y_test, predictions)
print(f'Total accuracy: {total_accuracy:.2f}')
male_accuracy = calculate_class_accuracy(y_test, predictions, "male")
print(f'Male accuracy: {male_accuracy:.2f}')
female_accuracy = calculate_class_accuracy(y_test, predictions, "female")
print(f'Female accuracy: {female_accuracy:.2f}')

Total accuracy: 0.67
Male accuracy: 0.50
Female accuracy: 0.84


### Confusion Matrix

In [7]:
# Rows are predicted classes, columns are actual classes; all counts start at 0.
confusion_matrix = pd.DataFrame(
    0,
    index=['Predicted Male', 'Predicted Female'],
    columns=['Actual Male', 'Actual Female'],
)

# Map each label to the matrix row / column it is counted in.
predicted_rows = {'male': 'Predicted Male', 'female': 'Predicted Female'}
actual_columns = {'male': 'Actual Male', 'female': 'Actual Female'}

for i in range(len(y_test)):
    predicted_label = predictions[i]
    actual_label = y_test[i]
    # Pairs containing any other label are left uncounted.
    if predicted_label in predicted_rows and actual_label in actual_columns:
        confusion_matrix.loc[predicted_rows[predicted_label], actual_columns[actual_label]] += 1

display(confusion_matrix)

Unnamed: 0,Actual Male,Actual Female
Predicted Male,12,4
Predicted Female,12,21
