# **Dhruv Karmokar**
# **21BAI1604**

## **Lab 4 - Naive Bayes Classifier**

### **Naive Bayes Classification with Cross-Validation**

In [90]:
# Import necessary modules and functions
import csv
from math import sqrt, exp, pi

# Load a CSV file and return the dataset as a list of lists
def load_csv(filename):
    dataset = []
    with open(filename, 'r') as file:
        csv_reader = csv.reader(file)
        next(csv_reader)  # Skip the header row
        for row in csv_reader:
            if row:
                dataset.append(row)
    return dataset

# Convert a specific column in the dataset from string to float
def str_column_to_float(dataset, column):
    for row in dataset:
        value = row[column].strip()
        if value:
            row[column] = float(value)
        else:
            row[column] = None

# Convert a specific column in the dataset from string to integer
def str_column_to_int(dataset, column):
    class_values = [row[column] for row in dataset]
    unique = set(class_values)
    lookup = dict()
    for i, value in enumerate(unique):
        lookup[value] = i
    for row in dataset:
        row[column] = lookup[row[column]]
    return lookup

# Split the dataset by class values and return a dictionary
def separate_by_class(dataset):
    separated = dict()
    for i in range(len(dataset)):
        vector = dataset[i]
        class_value = vector[-1]
        if class_value not in separated:
            separated[class_value] = list()
        separated[class_value].append(vector)
    return separated

# Calculate the mean of a list of numbers
def mean(numbers):
    filtered_numbers = [num for num in numbers if num is not None]
    return sum(filtered_numbers) / float(len(filtered_numbers))

# Calculate the standard deviation of a list of numbers
def stdev(numbers):
    avg = mean(numbers)
    filtered_numbers = [num for num in numbers if num is not None]
    variance = sum([(x - avg) ** 2 for x in filtered_numbers]) / float(len(filtered_numbers) - 1)
    return sqrt(variance)


# Calculate the mean, standard deviation, and count for each column in the dataset
def summarize_dataset(dataset):
    summaries = [(mean(column), stdev(column), len(column)) for column in zip(*dataset)]
    del summaries[-1]
    return summaries

# Split dataset by class and calculate statistics for each row
def summarize_by_class(dataset):
    separated = separate_by_class(dataset)
    summaries = dict()
    for class_value, rows in separated.items():
        summaries[class_value] = summarize_dataset(rows)
    return summaries

# Calculate the Gaussian probability distribution function for x
def calculate_probability(x, mean, stdev):
    exponent = exp(-((x - mean) ** 2 / (2 * stdev ** 2)))
    return (1 / (sqrt(2 * pi) * stdev)) * exponent

# Calculate the probabilities of predicting each class for a given row
def calculate_class_probabilities(summaries, row):
    total_rows = sum([summaries[label][0][2] for label in summaries])
    probabilities = dict()
    for class_value, class_summaries in summaries.items():
        probabilities[class_value] = summaries[class_value][0][2] / float(total_rows)
        for i in range(len(class_summaries)):
            mean, stdev, _ = class_summaries[i]
            probabilities[class_value] *= calculate_probability(row[i], mean, stdev)
    return probabilities

# Set the random seed for reproducibility
seed(1)

# Specify the filename of the CSV dataset
filename = '/content/dataset.csv'

# Load the CSV dataset
dataset = load_csv(filename)

# Convert the columns of the dataset from string to float
for i in range(len(dataset[0]) - 1):
    str_column_to_float(dataset, i)

# Convert the class column to integers
str_column_to_int(dataset, len(dataset[0]) - 1)

# Summarize the dataset by class
summaries = summarize_by_class(dataset)

# Retrieve the new record from the dataset
new_record = dataset[0] 

# Calculate the probabilities of predicting each class for the new record
probabilities = calculate_class_probabilities(summaries, new_record)

# Print the calculated probabilities
print(probabilities)


{0: 6.909375512489914e-13, 1: 6.379088344378128e-13}


### **Naive Bayes Classification with Accuracy Evaluation**

In [91]:
# Import necessary modules and functions
from csv import reader
from random import seed, randrange
from math import sqrt, exp, pi

# Load a CSV file and return the dataset as a list of lists
def load_csv(filename):
    dataset = list()
    with open(filename, 'r') as file:
        csv_reader = reader(file)
        next(csv_reader)  # Skip the header row
        for row in csv_reader:
            if not row:
                continue
            dataset.append(row)
    return dataset


# Convert a specific column in the dataset from string to float
def str_column_to_float(dataset, column):
    for row in dataset:
        if row[column].strip() == "":
            row[column] = 0.0  # replace empty string with 0.0
        else:
            row[column] = float(row[column].strip())

# Convert a specific column in the dataset from string to integer
def str_column_to_int(dataset, column):
    class_values = [row[column] for row in dataset]
    unique = set(class_values)
    lookup = dict()
    for i, value in enumerate(unique):
        lookup[value] = i
    for row in dataset:
        row[column] = lookup[row[column]]
    return lookup

# Split a dataset into k folds
def cross_validation_split(dataset, n_folds):
    dataset_split = list()
    dataset_copy = list(dataset)
    fold_size = int(len(dataset) / n_folds)
    for _ in range(n_folds):
        fold = list()
        while len(fold) < fold_size:
            index = randrange(len(dataset_copy))
            fold.append(dataset_copy.pop(index))
        dataset_split.append(fold)
    return dataset_split

# Calculate accuracy percentage
def accuracy_metric(actual, predicted):
    correct = 0
    for i in range(len(actual)):
        if actual[i] == predicted[i]:
            correct += 1
    return correct / float(len(actual)) * 100.0

# Evaluate an algorithm using a cross validation split
def evaluate_algorithm(dataset, algorithm, n_folds, *args):
    folds = cross_validation_split(dataset, n_folds)
    scores = list()
    for fold in folds:
        train_set = list(folds)
        train_set.remove(fold)
        train_set = sum(train_set, [])
        test_set = list()
        for row in fold:
            row_copy = list(row)
            test_set.append(row_copy)
            row_copy[-1] = None
        predicted = algorithm(train_set, test_set, *args)
        actual = [row[-1] for row in fold]
        accuracy = accuracy_metric(actual, predicted)
        scores.append(accuracy)
    return scores

# Split the dataset by class values and return a dictionary
def separate_by_class(dataset):
    separated = dict()
    for i in range(len(dataset)):
        vector = dataset[i]
        class_value = vector[-1]
        if class_value not in separated:
            separated[class_value] = list()
        separated[class_value].append(vector)
    return separated

# Calculate the mean of a list of numbers
def mean(numbers):
    return sum(numbers) / float(len(numbers))

# Calculate the standard deviation of a list of numbers
def stdev(numbers):
    avg = mean(numbers)
    variance = sum([(x - avg) ** 2 for x in numbers]) / float(len(numbers) - 1)
    return sqrt(variance)

# Calculate the mean, standard deviation, and count for each column in the dataset
def summarize_dataset(dataset):
    summaries = [(mean(column), stdev(column), len(column)) for column in zip(*dataset)]
    del summaries[-1]
    return summaries

# Split dataset by class and calculate statistics for each row
def summarize_by_class(dataset):
    separated = separate_by_class(dataset)
    summaries = dict()
    for class_value, rows in separated.items():
        summaries[class_value] = summarize_dataset(rows)
    return summaries

# Calculate the Gaussian probability distribution function for x
def calculate_probability(x, mean, stdev):
    exponent = exp(-((x - mean) ** 2 / (2 * stdev ** 2)))
    return (1 / (sqrt(2 * pi) * stdev)) * exponent

# Calculate the probabilities of predicting each class for a given row
def calculate_class_probabilities(summaries, row):
    total_rows = sum([summaries[label][0][2] for label in summaries])
    probabilities = dict()
    for class_value, class_summaries in summaries.items():
        probabilities[class_value] = summaries[class_value][0][2] / float(total_rows)
        for i in range(len(class_summaries)):
            mean, stdev, _ = class_summaries[i]
            probabilities[class_value] *= calculate_probability(row[i], mean, stdev)
    return probabilities

# Predict the class for a given row
def predict(summaries, row):
    probabilities = calculate_class_probabilities(summaries, row)
    best_label, best_prob = None, -1
    for class_value, probability in probabilities.items():
        if best_label is None or probability > best_prob:
            best_prob = probability
            best_label = class_value
    return best_label

# Naive Bayes Algorithm
def naive_bayes(train, test):
    summarize = summarize_by_class(train)
    predictions = list()
    for row in test:
        output = predict(summarize, row)
        predictions.append(output)
    return predictions

# Set the random seed
seed(1)

# Specify the filename of the CSV dataset
filename = '/content/dataset.csv'  # Update with the path to your dataset

# Load the CSV dataset
dataset = load_csv(filename)

# Convert the columns of the dataset from string to float
for i in range(len(dataset[0]) - 1):
    str_column_to_float(dataset, i)

# Convert the class column to integers
class_column_index = len(dataset[0]) - 1  # Update with the index of your class column
str_column_to_int(dataset, class_column_index)

# Evaluate the Naive Bayes algorithm
n_folds = 5
scores = evaluate_algorithm(dataset, naive_bayes, n_folds)
print('Scores: %s' % scores)
print('Mean Accuracy: %.3f%%' % (sum(scores) / float(len(scores))))


Scores: [80.5, 81.0, 79.5, 76.5, 80.0]
Mean Accuracy: 79.500%


### **Naive Bayes Prediction on the Dataset**

In [92]:
# Make Predictions with Naive Bayes On The Dataset
from csv import reader
from math import sqrt
from math import exp
from math import pi

# Load a CSV file
def load_csv(filename):
    dataset = list()
    with open(filename, 'r') as file:
        csv_reader = reader(file)
        next(csv_reader)  # Skip the header row
        for row in csv_reader:
            if not row:
                continue
            dataset.append(row)
    return dataset

# Convert string column to float
def str_column_to_float(dataset, column):
    for row in dataset:
        value = row[column].strip()
        if value:
            try:
                row[column] = float(value)
            except ValueError:
                row[column] = 0.0  # Assign a default value or skip the row if needed
        else:
            row[column] = 0.0  # Assign a default value or skip the row if needed

# Convert string column to integer
def str_column_to_int(dataset, column):
    class_values = [row[column] for row in dataset]
    unique = set(class_values)
    lookup = dict()
    for i, value in enumerate(unique):
        lookup[value] = i
        print('[%s] => %d' % (value, i))  # Print mapping of class labels to integers
    for row in dataset:
        row[column] = lookup[row[column]]
    return lookup

# Split the dataset by class values, returns a dictionary
def separate_by_class(dataset):
    separated = dict()
    for i in range(len(dataset)):
        vector = dataset[i]
        class_value = vector[-1]
        if class_value not in separated:
            separated[class_value] = list()
        separated[class_value].append(vector)
    return separated

# Calculate the mean of a list of numbers
def mean(numbers):
    return sum(numbers) / float(len(numbers))

# Calculate the standard deviation of a list of numbers
def stdev(numbers):
    avg = mean(numbers)
    variance = sum([(x - avg) ** 2 for x in numbers]) / float(len(numbers) - 1)
    return sqrt(variance)

# Calculate the mean, stdev, and count for each column in a dataset
def summarize_dataset(dataset):
    summaries = []
    for column in zip(*dataset):
        column_values = [value for value in column if isinstance(value, float)]
        if column_values:
            mean_val = mean(column_values)
            stdev_val = stdev(column_values)
            count_val = len(column_values)
            summaries.append((mean_val, stdev_val, count_val))
    return summaries


# Split dataset by class then calculate statistics for each row
def summarize_by_class(dataset):
    separated = separate_by_class(dataset)
    summaries = dict()
    for class_value, rows in separated.items():
        summaries[class_value] = summarize_dataset(rows)
    return summaries

# Calculate the Gaussian probability distribution function for x
def calculate_probability(x, mean, stdev):
    exponent = exp(-((x - mean) ** 2 / (2 * stdev ** 2)))
    return (1 / (sqrt(2 * pi) * stdev)) * exponent

# Predict the class for a given row
def predict(summaries, row):
    probabilities = calculate_class_probabilities(summaries, row)
    best_label, best_prob = None, -1
    for class_value, probability in probabilities.items():
        if best_label is None or probability > best_prob:
            best_prob = probability
            best_label = class_value
    return best_label

# Calculate class probabilities for a given row
def calculate_class_probabilities(summaries, row):
    probabilities = dict()
    for class_value, class_summaries in summaries.items():
        probabilities[class_value] = 1
        for i in range(len(class_summaries)):
            mean_val, stdev_val, _ = class_summaries[i]
            x = row[i]
            probabilities[class_value] *= calculate_probability(x, mean_val, stdev_val)
    return probabilities

# Make a prediction with Naive Bayes on the Dataset
filename = '/content/dataset.csv'
dataset = load_csv(filename)
for i in range(1, len(dataset[0])):
    str_column_to_float(dataset, i)
# Convert class column to integers
str_column_to_int(dataset, len(dataset[0]) - 1)
# Fit model
model = summarize_by_class(dataset)

# Define new records
rows = [
    [-0.31, -0.54, -0.43, -0.54, -0.31, -0.54, 0.57, -0.82, -1.24, -2.07],
    [-1.42, 2.62, -0.43, 2.62, 2.42, 2.62, -1.05, 3.24, 4.4, 0.02],
    [-1.29, 0.16, -0.35, 0.16, -0.72, 0.16, 0.22, -1.11, -0.95, 0.79]
]

# Predict the labels
for row in rows:
    label = predict(model, row)
    print('Data=%s, Predicted: %s' % (row, label))


[0.0] => 0
[1.0] => 1
Data=[-0.31, -0.54, -0.43, -0.54, -0.31, -0.54, 0.57, -0.82, -1.24, -2.07], Predicted: 0
Data=[-1.42, 2.62, -0.43, 2.62, 2.42, 2.62, -1.05, 3.24, 4.4, 0.02], Predicted: 0
Data=[-1.29, 0.16, -0.35, 0.16, -0.72, 0.16, 0.22, -1.11, -0.95, 0.79], Predicted: 1
