<a href="https://colab.research.google.com/github/anthrofax/artificial-intelligence/blob/main/Tugas_Pertemuan_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Naive Bayes On The Iris Dataset
from csv import reader
from random import seed
from random import randrange
from math import sqrt
from math import exp
from math import pi

In [None]:
# Load a CSV file
def load_csv(filename):
    """Read a CSV file into a list of rows.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list with one entry per non-empty CSV record; each entry is the
        list of string fields produced by ``csv.reader``.
    """
    dataset = list()
    # newline='' hands line-ending handling to the csv module, so line breaks
    # embedded in quoted fields are preserved instead of being rewritten by
    # universal-newline translation.
    with open(filename, 'r', newline='') as file:
        for row in reader(file):
            # csv.reader yields an empty list for a blank line; skip those.
            if not row:
                continue
            dataset.append(row)
    return dataset

In [None]:
#Convert string column to float
def str_column_to_float(dataset, column):
    """Convert one column of every row from text to float, in place.

    Args:
        dataset: List of rows (mutable sequences); modified in place.
        column: Index of the column to convert.

    Surrounding whitespace is stripped from the text before conversion.
    """
    for record in dataset:
        raw_text = record[column]
        record[column] = float(raw_text.strip())
In [None]:
# Convert string column to integer
def str_column_to_int(dataset, column):
    """Replace the values of one column with integer codes, in place.

    Codes are assigned in sorted order of the distinct values, so the same
    data always yields the same mapping. Iterating a plain ``set`` of strings
    would not guarantee that, because its order can change between
    interpreter runs.

    Args:
        dataset: List of rows (mutable sequences); modified in place.
        column: Index of the column to encode.

    Returns:
        A dict mapping each original value to the integer code that
        replaced it.
    """
    distinct_values = sorted({row[column] for row in dataset})
    lookup = {value: code for code, value in enumerate(distinct_values)}
    for row in dataset:
        row[column] = lookup[row[column]]
    return lookup

In [None]:
# Split a dataset into k folds

def cross_validation_split(dataset, n_folds):
    """Randomly partition the dataset into n_folds equally sized folds.

    Rows are drawn without replacement from a shallow copy, so the input
    list itself is not modified. Every fold holds
    int(len(dataset) / n_folds) rows; rows left over when the length is not
    divisible by n_folds are not placed in any fold.

    Args:
        dataset: List of rows.
        n_folds: Number of folds to build.

    Returns:
        A list of n_folds lists of rows.
    """
    remaining = list(dataset)
    rows_per_fold = int(len(dataset) / n_folds)
    return [
        [remaining.pop(randrange(len(remaining))) for _ in range(rows_per_fold)]
        for _ in range(n_folds)
    ]

In [None]:
#Calculate accuracy percentage

def accuracy_metric(actual, predicted):
    """Return the percentage of positions where predicted matches actual.

    Args:
        actual: Sequence of true labels.
        predicted: Sequence of predicted labels, indexed in parallel
            with actual.

    Returns:
        A float between 0.0 and 100.0.
    """
    hits = sum(1 for i, truth in enumerate(actual) if truth == predicted[i])
    return hits / float(len(actual)) * 100.0

In [None]:
#Evaluate an algorithm using a cross validation split

def evaluate_algorithm(dataset, algorithm, n_folds, args):
    """Score an algorithm with k-fold cross validation.

    Each fold produced by cross_validation_split is held out once as the
    test set while the rows of all remaining folds form the training set.

    Args:
        dataset: List of rows whose last element is the class label.
        algorithm: Callable invoked as algorithm(train_set, test_set, *args);
            it must return one prediction per test row.
        n_folds: Number of folds to split the dataset into.
        args: Iterable of extra positional arguments passed to algorithm.

    Returns:
        A list with one accuracy percentage per fold.
    """
    folds = cross_validation_split(dataset, n_folds)
    scores = list()
    for fold in folds:
        # Training data is every fold except the one currently held out.
        train_folds = list(folds)
        train_folds.remove(fold)
        # Flatten in a single pass; sum(list_of_lists, []) would rebuild the
        # accumulated list on every addition.
        train_set = [row for part in train_folds for row in part]
        test_set = list()
        for row in fold:
            # Copy each test row and blank its label so the algorithm
            # cannot read the answer.
            row_copy = list(row)
            row_copy[-1] = None
            test_set.append(row_copy)
        predicted = algorithm(train_set, test_set, *args)
        actual = [row[-1] for row in fold]
        scores.append(accuracy_metric(actual, predicted))
    return scores

In [None]:
#Split the dataset by class values, returns a dictionary

def separate_by_class(dataset):
    """Group rows by their class value (the last element of each row).

    Args:
        dataset: List of rows whose last element is the class value.

    Returns:
        A dict mapping each class value to the list of rows carrying it,
        in their original order.
    """
    groups = {}
    for record in dataset:
        groups.setdefault(record[-1], []).append(record)
    return groups

In [None]:
#Calculate the mean of a list of numbers

def mean(numbers):
    """Return the arithmetic mean of a sequence of numbers as a float."""
    total = sum(numbers)
    count = float(len(numbers))
    return total / count

In [None]:
#Calculate the standard deviation of a list of numbers

def stdev(numbers):
    """Return the sample standard deviation of a sequence of numbers.

    The squared deviations from the mean are divided by len(numbers) - 1.
    """
    center = mean(numbers)
    squared_deviations = [(value - center) ** 2 for value in numbers]
    return sqrt(sum(squared_deviations) / float(len(numbers) - 1))

In [None]:
# Calculate the mean, stdev and count for each column in a dataset

def summarize_dataset(dataset):
    """Summarize every column of a dataset except the last one.

    Args:
        dataset: List of equally long rows; the last column is treated as
            the class column and excluded from the result.

    Returns:
        A list of (mean, stdev, count) tuples, one per remaining column.
    """
    summaries = []
    # zip(*dataset) transposes the rows into columns.
    for column in zip(*dataset):
        summaries.append((mean(column), stdev(column), len(column)))
    # Statistics are computed for the final (class) column too, then dropped.
    del summaries[-1]
    return summaries

In [None]:
# Split the dataset by class, then calculate statistics for each column within each class

def summarize_by_class(dataset):
    """Build per-class column summaries for a dataset.

    Args:
        dataset: List of rows whose last element is the class value.

    Returns:
        A dict mapping each class value to the list of per-column summaries
        that summarize_dataset produces for the rows of that class.
    """
    grouped = separate_by_class(dataset)
    return {
        label: summarize_dataset(members)
        for label, members in grouped.items()
    }

In [None]:
#Calculate the Gaussian probability distribution function for x

def calculate_probability(x, mean, stdev):
    """Evaluate the Gaussian probability density function at x.

    Args:
        x: Value at which to evaluate the density.
        mean: Mean of the distribution.
        stdev: Standard deviation of the distribution.

    Returns:
        The density of the normal distribution with the given mean and
        standard deviation at x.
    """
    squared_distance = (x - mean) ** 2
    twice_variance = 2 * stdev ** 2
    exponent = exp(-(squared_distance / twice_variance))
    normalizer = 1 / (sqrt(2 * pi) * stdev)
    return normalizer * exponent

In [None]:
# Calculate the probabilities of predicting each class for a given row

def calculate_class_probabilities(summaries, row):
    """Score how well a row fits each class.

    For every class the score starts at the class prior (rows of that class
    divided by all rows) and is multiplied by the Gaussian density of each
    feature value under that class's per-column mean and stdev. The scores
    are not normalized.

    Args:
        summaries: Dict mapping class value to a list of
            (mean, stdev, count) tuples, one per feature column.
        row: Sequence of feature values, indexed like the summaries.

    Returns:
        A dict mapping each class value to its score.
    """
    # The count stored with the first column is the row count of the class.
    total_rows = sum(stats[0][2] for stats in summaries.values())
    probabilities = {}
    for class_value, class_summaries in summaries.items():
        score = class_summaries[0][2] / float(total_rows)
        for i, (mu, sigma, _count) in enumerate(class_summaries):
            score *= calculate_probability(row[i], mu, sigma)
        probabilities[class_value] = score
    return probabilities

In [None]:
# Predict the class for a given row

def predict(summaries, row):
    """Return the class value with the highest score for a row.

    Args:
        summaries: Per-class column summaries, as accepted by
            calculate_class_probabilities.
        row: Sequence of feature values.

    Returns:
        The class value whose score is largest, or None when summaries
        contains no classes. On ties the class encountered first wins.
    """
    scores = calculate_class_probabilities(summaries, row)
    winner, top_score = None, -1
    for candidate, score in scores.items():
        # Skip candidates that do not strictly beat the current leader.
        if winner is not None and not (score > top_score):
            continue
        winner, top_score = candidate, score
    return winner

In [None]:
# Naive Bayes Algorithm

def naive_bayes(train, test):
    """Fit Naive Bayes on the training rows and classify the test rows.

    Args:
        train: List of rows whose last element is the class value.
        test: List of rows to classify.

    Returns:
        A list with one predicted class value per test row, in order.
    """
    class_model = summarize_by_class(train)
    return [predict(class_model, sample) for sample in test]

In [None]:
#Make a prediction with Naive Bayes on Iris Dataset

# Load the raw rows; every field is still a string at this point.
filename = 'iris.csv'
dataset = load_csv(filename)
print(dataset)
# The last column holds the class label; all columns before it are features.
class_column = len(dataset[0]) - 1
# Convert the feature columns to floats.
for i in range(class_column):
    str_column_to_float(dataset, i)
# Convert the class column to integers (the returned lookup is the cell's value).
str_column_to_int(dataset, class_column)

[['5.1', '3.5', '1.4', '0.2', 'Iris-setosa'], ['4.9', '3.0', '1.4', '0.2', 'Iris-setosa'], ['4.7', '3.2', '1.3', '0.2', 'Iris-setosa'], ['4.6', '3.1', '1.5', '0.2', 'Iris-setosa'], ['5.0', '3.6', '1.4', '0.2', 'Iris-setosa'], ['5.4', '3.9', '1.7', '0.4', 'Iris-setosa'], ['4.6', '3.4', '1.4', '0.3', 'Iris-setosa'], ['5.0', '3.4', '1.5', '0.2', 'Iris-setosa'], ['4.4', '2.9', '1.4', '0.2', 'Iris-setosa'], ['4.9', '3.1', '1.5', '0.1', 'Iris-setosa'], ['5.4', '3.7', '1.5', '0.2', 'Iris-setosa'], ['4.8', '3.4', '1.6', '0.2', 'Iris-setosa'], ['4.8', '3.0', '1.4', '0.1', 'Iris-setosa'], ['4.3', '3.0', '1.1', '0.1', 'Iris-setosa'], ['5.8', '4.0', '1.2', '0.2', 'Iris-setosa'], ['5.7', '4.4', '1.5', '0.4', 'Iris-setosa'], ['5.4', '3.9', '1.3', '0.4', 'Iris-setosa'], ['5.1', '3.5', '1.4', '0.3', 'Iris-setosa'], ['5.7', '3.8', '1.7', '0.3', 'Iris-setosa'], ['5.1', '3.8', '1.5', '0.3', 'Iris-setosa'], ['5.4', '3.4', '1.7', '0.2', 'Iris-setosa'], ['5.1', '3.7', '1.5', '0.4', 'Iris-setosa'], ['4.6', '

{'Iris-setosa': 0, 'Iris-virginica': 1, 'Iris-versicolor': 2}

In [None]:
# Fit the model on the full dataset: for each class, a (mean, stdev, count)
# summary of every feature column.
model = summarize_by_class(dataset)

In [None]:
# Define a new record to classify: feature values only, indexed in the same
# column order as the training rows, with no class label.
row = [5.7,2.9,4.2,1.3]

In [None]:
# Predict the label for the new record. The result is the integer code that
# str_column_to_int assigned to the class, not the original class name.

label = predict(model, row)
print('Data=%s, Predicted: %s' % (row, label))

Data=[5.7, 2.9, 4.2, 1.3], Predicted: 2
