In [None]:
import numpy as np
import pandas as pd
import math
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder

## Read the data: 333 samples in total, each with 8 attributes

In [None]:
# Load the penguin data set; the first CSV column is used as the row index.
penguins_af = pd.read_csv('penguins_af.csv', index_col = 0) 
# Print (n_rows, n_columns) as a quick sanity check of the load.
print(penguins_af.shape)

In [None]:
# Display the loaded frame (Jupyter renders the cell's last expression).
penguins_af

# Preprocessing the data

In [None]:
f_names = ['bill_length_mm', 'bill_depth_mm','flipper_length_mm', 'body_mass_g']
# Keep the four numeric features and place the 'species' label in the last column.
penguins = penguins_af[f_names + ['species']]
# Keep only the Adelie and Chinstrap rows. The explicit .copy() makes penguins2C an
# independent frame, so assigning to one of its columns is unambiguous and does not
# raise pandas' SettingWithCopyWarning.
penguins2C = penguins.loc[penguins['species'].isin(['Adelie','Chinstrap'])].copy()

In [None]:
# Show the rows remaining after restricting to the two species.
penguins2C

In [None]:
# Encode the two species names as integer class codes.
labelencoder = LabelEncoder()
# assign() returns a new frame instead of writing into a frame derived from a
# filter, which avoids pandas' SettingWithCopyWarning and keeps the column order.
penguins2C = penguins2C.assign(
    species=labelencoder.fit_transform(penguins2C['species'])
)

# Use LabelEncoder to convert the two species categories to numbers (0, 1)

In [None]:
# Show the frame again to check the 'species' column after label encoding.
penguins2C


In [None]:
import random
# data_df is a second name for the same DataFrame object (no copy is made here).
data_df = penguins2C

# Split data into a train set and a test set (e.g. ratio=0.7 keeps 7 of every 10 samples for training).
def splitData(data_list, ratio, seed=None):
    """Randomly partition ``data_list`` into a train part and a test part.

    Parameters
    ----------
    data_list : sequence
        The samples to split. It is NOT modified; a shallow copy is shuffled.
    ratio : float
        Fraction of the samples that goes into the train part; the train size
        is ``int(len(data_list) * ratio)``.
    seed : optional
        When given, a private ``random.Random(seed)`` performs the shuffle so
        the split is reproducible. When ``None`` (default) the module-level
        ``random`` generator is used, as before.

    Returns
    -------
    (list, list)
        ``(train_set, test_set)``; together they contain every input sample
        exactly once.
    """
    # Shuffle a copy so the caller's list keeps its original order.
    shuffled = list(data_list)
    if seed is None:
        random.shuffle(shuffled)
    else:
        random.Random(seed).shuffle(shuffled)

    train_size = int(len(shuffled) * ratio)
    return shuffled[:train_size], shuffled[train_size:]

# Seed the global RNG before splitting: splitData shuffles with the `random`
# module, so without a fixed seed every downstream number changes on each run.
SPLIT_SEED = 42
random.seed(SPLIT_SEED)

# Convert the frame into a plain list of rows (one list per sample).
data_list = np.array(data_df).tolist()
trainset,testset = splitData(data_list,ratio = 0.7)
print('Split {0} samples into {1} train and {2} test samples '.format(len(data_df), len(trainset), len(testset)))

# Naive Bayes Classifier

In [None]:
def seprateByClass(dataset):
    """Group the rows of ``dataset`` by class label.

    The label of a row is its last element.

    Returns
    -------
    (dict, dict)
        First dict maps each label to the list of rows carrying it (in input
        order); second dict maps each label to the number of such rows.
    """
    rows_by_label = {}
    for row in dataset:
        rows_by_label.setdefault(row[-1], []).append(row)
    count_by_label = {label: len(rows) for label, rows in rows_by_label.items()}
    return rows_by_label, count_by_label

# Group the training rows by class: train_separated maps each label to its rows,
# train_info maps each label to the number of rows carrying it.
train_separated, train_info = seprateByClass(trainset)

In [None]:
# Per-class sample counts in the training set.
train_info

In [None]:
def calulateClassPriorProb(dataset,dataset_info):
    """Estimate the prior probability of every class.

    Parameters
    ----------
    dataset : sized collection
        All samples; only its length is used.
    dataset_info : dict
        Maps each class label to its number of samples.

    Returns
    -------
    dict
        Maps each class label to ``count / len(dataset)``.
    """
    total = float(len(dataset))
    return {label: count / total for label, count in dataset_info.items()}

# Prior probability of each class = (training samples of that class) / (all training samples).
# For example, a class holding 100 of 149 training samples gets a prior of 100 / (100 + 49).
prior_prob = calulateClassPriorProb(trainset, train_info)

In [None]:
# Show the estimated class priors.
prior_prob

In [None]:
def mean(list):
    """Return the arithmetic mean of the given values as a float.

    Every element is converted with ``float()`` first, so numeric strings are
    accepted. The parameter name shadows the builtin ``list`` inside this
    function; it is kept because it is part of the existing signature.
    """
    as_floats = tuple(float(item) for item in list)
    return sum(as_floats) / float(len(as_floats))


def var(list):
    """Return the sample variance (divisor ``n - 1``) of the given values.

    Every element is converted with ``float()`` first. The parameter name
    shadows the builtin ``list`` inside this function; it is kept because it
    is part of the existing signature.
    """
    samples = [float(item) for item in list]
    centre = mean(samples)
    squared_deviations = [math.pow((value - centre), 2) for value in samples]
    return sum(squared_deviations) / float(len(samples) - 1)

# Conditional probability of one attribute value under a Gaussian model
def calculateProb(x, mean, var):
    """Evaluate the normal density with the given mean and variance at ``x``.

    Computes ``exp(-(x - mean)^2 / (2 * var)) / sqrt(2 * pi * var)``.
    """
    squared_deviation = math.pow((x - mean), 2)
    gaussian_kernel = math.exp(squared_deviation / (-2 * var))
    normaliser = 1 / math.sqrt(2 * math.pi * var)
    return normaliser * gaussian_kernel

def summarizeAttribute(dataset):
    """Return ``[(mean, variance), ...]`` with one pair per attribute column.

    The last column of ``dataset`` is treated as the label and is dropped
    before the statistics are computed.
    """
    features = np.delete(dataset, -1, axis=1)  # drop the label column
    stats = []
    for column in zip(*features):  # zip(*rows) iterates column by column
        stats.append((mean(column), var(column)))
    return stats

# Mean and variance of every attribute over the whole training set (all classes pooled).
summary = summarizeAttribute(trainset)

def summarizeByClass(dataset):
    """Return per-class attribute statistics.

    The rows are grouped by class label, then every group is summarised
    independently.

    Returns
    -------
    dict
        Maps each class label to the ``[(mean, variance), ...]`` list produced
        by ``summarizeAttribute`` for that class's rows.
    """
    rows_by_class, _counts = seprateByClass(dataset)
    return {
        class_value: summarizeAttribute(rows)
        for class_value, rows in rows_by_class.items()
    }

# Mean and variance of every attribute, computed separately for each class.
train_Summary_by_class = summarizeByClass(trainset)

# The functions above together form the "fit" step, divided into several separate calculations.

def calculateClassProb(input_data, train_Summary_by_class):
    """Return the class-conditional likelihood of ``input_data`` for each class.

    For every class, the Gaussian densities of the individual attribute
    values are multiplied together. The class prior is not applied here.

    Parameters
    ----------
    input_data : sequence
        Attribute values of one sample, indexed in the same order as the
        per-class statistics.
    train_Summary_by_class : dict
        Maps each class label to a list of ``(mean, variance)`` pairs.

    Returns
    -------
    dict
        Maps each class label to the product of the per-attribute densities.
    """
    prob = {}
    for class_value, attribute_stats in train_Summary_by_class.items():
        likelihood = 1
        for index, (attr_mean, attr_var) in enumerate(attribute_stats):
            likelihood *= calculateProb(input_data[index], attr_mean, attr_var)
        prob[class_value] = likelihood
    return prob

# Demo on one test sample: for each class, multiply the conditional probabilities of all attributes.
input_vector = testset[1]
input_data = input_vector[:-1]  # drop the trailing class label
# NOTE(review): train_Summary_by_class is already computed earlier in this notebook;
# this recomputation looks redundant.
train_Summary_by_class = summarizeByClass(trainset)
class_prob = calculateClassProb(input_data, train_Summary_by_class)




In [None]:
def bayesianPredictOneSample(input_data, prior_prob=None, train_Summary_by_class=None):
    """Predict the class of a single sample with Gaussian Naive Bayes.

    Parameters
    ----------
    input_data : sequence
        Attribute values of the sample (without the label).
    prior_prob : dict, optional
        Maps each class label to its prior probability. When omitted it is
        computed from the notebook-level ``trainset`` / ``train_info``.
    train_Summary_by_class : dict, optional
        Maps each class label to its ``[(mean, variance), ...]`` statistics.
        When omitted it is computed from the notebook-level ``trainset``.

    Returns
    -------
    The class label with the largest ``likelihood * prior``.

    Notes
    -----
    Passing the two optional arguments lets a caller fit once and reuse the
    result for many predictions, instead of re-scanning the training set on
    every call. Calling with ``input_data`` only behaves exactly as before.
    """
    # Fit from the global training data only when no fitted model was supplied.
    if prior_prob is None:
        prior_prob = calulateClassPriorProb(trainset, train_info)
    if train_Summary_by_class is None:
        train_Summary_by_class = summarizeByClass(trainset)

    likelihood_by_class = calculateClassProb(input_data, train_Summary_by_class)
    # Unnormalised posterior: likelihood times prior, per class.
    posterior = {
        class_value: likelihood * prior_prob[class_value]
        for class_value, likelihood in likelihood_by_class.items()
    }
    return max(posterior, key=posterior.get)


In [None]:
# Peek at the first ten test rows.
testset[:10]

In [None]:
# Classify a single test sample and print the predicted class.
input_vector = testset[8]
input_data = input_vector[:-1]  # features only; the last element is the label
result = bayesianPredictOneSample(input_data)
print("the sameple is predicted to class: {0}.".format(result))


In [None]:
def calculateAccByBeyesian(dataset):
    """Return the fraction of rows in ``dataset`` that are classified correctly.

    Each row holds the attribute values followed by the true label in the
    last position; the prediction comes from ``bayesianPredictOneSample``.
    """
    hits = sum(
        1
        for row in dataset
        if bayesianPredictOneSample(row[:-1]) == row[-1]
    )
    return hits / len(dataset)

# Fraction of test samples whose predicted class equals the true label.
acc = calculateAccByBeyesian(testset)


In [None]:
# Report the accuracy as a percentage. round() replaces int() because int()
# truncates toward zero and under-reports the score (for example 0.29 * 100 is
# 28.999999999999996 in floating point, which int() turns into 28).
print("The Accuracy of our Naive Bayes is:", round(acc * 100, 2), "%")