In [1]:
import os
import numpy as np
import concurrent.futures
import logging
import pickle
import time
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
working_dir = "/Users/emre/GitHub/HU-AI/AIN313/Assignment 2"
DATASET_PATH = os.path.join(working_dir, "dataset")
PROCESSED_DATASET_PATH = os.path.join(DATASET_PATH, "processed")
MODELS_PATH = os.path.join(working_dir, "models")
GRAPH_PATH = os.path.join(working_dir, "graphs")
OUTPUTS_PATH = os.path.join(working_dir, "outputs")

In [3]:
train_labels = np.load(os.path.join(PROCESSED_DATASET_PATH, "train_labels.npy"))
test_labels = np.load(os.path.join(PROCESSED_DATASET_PATH, "test_labels.npy"))

experiment_dict = {
    "train_rgb": os.path.join(PROCESSED_DATASET_PATH, "train_rgb.npy"),
    "test_rgb": os.path.join(PROCESSED_DATASET_PATH, "test_rgb.npy"),
    "train_infra": os.path.join(PROCESSED_DATASET_PATH, "train_infra.npy"),
    "test_infra": os.path.join(PROCESSED_DATASET_PATH, "test_infra.npy"),
    "train_whole": os.path.join(PROCESSED_DATASET_PATH, "train_whole.npy"),
    "test_whole": os.path.join(PROCESSED_DATASET_PATH, "test_whole.npy"),
}

In [4]:
class NaiveBayesClassifier:
    def __init__(
        self,
        verbose=False,
        multithreading=False,
        num_threads=None,
        save_model=False,
        model_file=None,
        batch_size=1000,
    ):
        self.verbose = verbose
        self.class_summary = None
        self.multithreading = multithreading
        self.num_threads = num_threads
        self.save_model = save_model
        self.model_file = model_file
        self.batch_size = batch_size
        self.logger = self._setup_logger()

    def _setup_logger(self):
        logger = logging.getLogger(__name__)
        formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
        stream_handler = logging.StreamHandler()
        stream_handler.setFormatter(formatter)
        logger.addHandler(stream_handler)
        logger.setLevel(logging.INFO)
        return logger

    def log(self, message):
        self.logger.info(message)

    def separate_classes(self, X, y):
        separated_classes = {}
        for i, class_name in enumerate(y):
            if class_name not in separated_classes:
                separated_classes[class_name] = []
            separated_classes[class_name].append(X[i])
        return separated_classes

    def stat_info(self, X):
        return [
            {"std": np.std(feature), "mean": np.mean(feature)} for feature in zip(*X)
        ]

    def fit(self, X, y):
        start_time = time.time()

        self.log(f"Training the model using {self.num_threads} thread(s)...")
        separated_classes = self.separate_classes(X, y)
        self.class_summary = {}

        executor_class = (
            concurrent.futures.ThreadPoolExecutor
            if self.multithreading
            else concurrent.futures.ProcessPoolExecutor
        )
        with executor_class(max_workers=self.num_threads) as executor:
            futures = {
                executor.submit(
                    self._fit_class, class_name, feature_values, X
                ): class_name
                for class_name, feature_values in separated_classes.items()
            }

            for future in concurrent.futures.as_completed(futures):
                class_name = futures[future]
                self.class_summary[class_name] = future.result()

        elapsed_time = time.time() - start_time
        self.log(f"Training completed in {elapsed_time:.2f} seconds.")

        if self.save_model:
            self.save_model_to_file()

        return self.class_summary

    def _fit_class(self, class_name, feature_values, X):
        result = {
            "prior_proba": len(feature_values) / len(X),
            "summary": {
                "std": np.std(feature_values, axis=0),
                "mean": np.mean(feature_values, axis=0),
            },
        }
        self.log(f"Class {class_name} trained.")
        return result

    def distribution(self, x, mean, std):
        exponent = np.exp(-((x - mean) ** 2) / (2 * std**2))
        return exponent / (np.sqrt(2 * np.pi) * std)

    def predict(self, X):
        start_time = time.time()

        if self.class_summary is None and self.model_file:
            self.load_model_from_file()

        self.log(
            f"Predicting the class using {self.num_threads} thread(s) and batch size {self.batch_size}..."
        )

        predictions = []

        executor_class = (
            concurrent.futures.ThreadPoolExecutor
            if self.multithreading
            else concurrent.futures.ProcessPoolExecutor
        )
        with executor_class(max_workers=self.num_threads) as executor:
            batch_size = self.batch_size
            num_batches = len(X) // batch_size
            remainder = len(X) % batch_size

            futures = {
                executor.submit(self._predict_batch, batch, i): i
                for i, batch in enumerate(np.array_split(X, num_batches))
            }

            if remainder > 0:
                last_batch_index = num_batches
                last_batch = X[last_batch_index * batch_size :]
                futures.update(
                    {
                        executor.submit(
                            self._predict_batch, last_batch, last_batch_index
                        ): last_batch_index
                    }
                )

            predictions = []
            actual_sizes = []
            for future in concurrent.futures.as_completed(futures):
                futures[future]
                pred_batch, actual_size = future.result()
                predictions.extend(pred_batch)
                actual_sizes.append(actual_size)

        elapsed_time = time.time() - start_time
        self.log(f"Prediction completed in {elapsed_time:.2f} seconds.")

        return np.array(predictions)[: sum(actual_sizes)]  # Adjust the slicing

    def _predict_batch(self, batch, batch_index):
        self.log(f"Running prediction for batch {batch_index}...")
        predictions = np.concatenate([self._predict_row(row) for row in batch])
        return predictions, len(batch)  # Return the batch size

    def _predict_row(self, row):
        max_log_proba = float("-inf")
        predicted_class = None
        predictions = []

        for class_name, features in self.class_summary.items():
            log_likelihood = np.sum(
                np.log(
                    self.distribution(
                        row,
                        features["summary"]["mean"],
                        features["summary"]["std"] + 1e-8,  # Add a small constant
                    )
                )
            )
            log_posterior = np.log(features["prior_proba"]) + log_likelihood

            if log_posterior > max_log_proba:
                max_log_proba = log_posterior
                predicted_class = class_name

        predictions.append(predicted_class)

        return np.array(predictions)

    def accuracy(self, y_test, y_pred):
        if len(y_test) != len(y_pred):
            raise ValueError("Length of y_test and y_pred must be the same.")

        accuracy_value = np.sum(np.array(y_test) == np.array(y_pred)) / len(y_test)
        self.log(f"Accuracy: {accuracy_value}")
        return accuracy_value

    def save_model_to_file(self, file_path=None):
        if file_path is None:
            file_path = self.model_file

        if file_path:
            with open(file_path, "wb") as file:
                pickle.dump(self.class_summary, file)
                self.log(f"Model saved to {file_path}.")

    def load_model_from_file(self, file_path=None):
        if file_path is None:
            file_path = self.model_file

        if file_path:
            try:
                with open(file_path, "rb") as file:
                    self.class_summary = pickle.load(file)
                self.log(f"Model loaded from {file_path}.")
            except FileNotFoundError:
                self.log(f"No model file found at {file_path}. Training a new model.")
            except Exception as e:
                self.log(f"Error loading the model: {e}. Training a new model.")

### RGB Model

In [5]:
rgb_train_data = np.load(experiment_dict["train_rgb"])
print(
    "Train data shape:", rgb_train_data.shape, "Train data type:", rgb_train_data.dtype
)
print(
    "Train labels shape:", train_labels.shape, "Train labels type:", train_labels.dtype
)

rgb_model_path = os.path.join(MODELS_PATH, "naive_bayes_rgb.pkl")
rgb_model = NaiveBayesClassifier(
    verbose=True,
    multithreading=True,
    num_threads=8,
    save_model=True,
    model_file=rgb_model_path,
    batch_size=10000,
)

# check if model file not exists
if not os.path.exists(rgb_model_path):
    rgb_model.fit(rgb_train_data, train_labels)

rgb_test_data = np.load(experiment_dict["test_rgb"])
print("Test data shape:", rgb_test_data.shape, "Test data type:", rgb_test_data.dtype)

rgb_test_pred = rgb_model.predict(rgb_test_data)
print("Test pred shape:", rgb_test_pred.shape, "Test pred type:", rgb_test_pred.dtype)

rgb_acc = rgb_model.accuracy(test_labels, rgb_test_pred[: len(test_labels)])
print("RGB Accuracy:", rgb_acc)

2023-11-15 18:45:59,409 - INFO - Model loaded from /Users/emre/GitHub/HU-AI/AIN313/Assignment 2/models/naive_bayes_rgb.pkl.
2023-11-15 18:45:59,411 - INFO - Predicting the class using 8 thread(s) and batch size 10000...
2023-11-15 18:45:59,415 - INFO - Running prediction for batch 0...
2023-11-15 18:45:59,416 - INFO - Running prediction for batch 1...
2023-11-15 18:45:59,422 - INFO - Running prediction for batch 2...


Train data shape: (37102350, 3) Train data type: float32
Train labels shape: (37102350,) Train labels type: int8
Test data shape: (15892956, 3) Test data type: float32


2023-11-15 18:45:59,423 - INFO - Running prediction for batch 3...
2023-11-15 18:45:59,430 - INFO - Running prediction for batch 4...
2023-11-15 18:45:59,472 - INFO - Running prediction for batch 5...
2023-11-15 18:45:59,491 - INFO - Running prediction for batch 6...
2023-11-15 18:45:59,523 - INFO - Running prediction for batch 7...
2023-11-15 18:46:04,183 - INFO - Running prediction for batch 8...
2023-11-15 18:46:04,864 - INFO - Running prediction for batch 9...
2023-11-15 18:46:05,042 - INFO - Running prediction for batch 10...
2023-11-15 18:46:05,314 - INFO - Running prediction for batch 11...
2023-11-15 18:46:05,994 - INFO - Running prediction for batch 12...
2023-11-15 18:46:06,406 - INFO - Running prediction for batch 13...
2023-11-15 18:46:07,810 - INFO - Running prediction for batch 14...
2023-11-15 18:46:08,442 - INFO - Running prediction for batch 15...
2023-11-15 18:46:08,817 - INFO - Running prediction for batch 16...
2023-11-15 18:46:09,736 - INFO - Running prediction for

Test pred shape: (15895912,) Test pred type: int8


ValueError: Length of y_test and y_pred must be the same.

### Infra Model

In [8]:
infra_train_data = np.load(experiment_dict["train_infra"])
print(
    "Train data shape:",
    infra_train_data.shape,
    "Train data type:",
    infra_train_data.dtype,
)
print(
    "Train labels shape:", train_labels.shape, "Train labels type:", train_labels.dtype
)

infra_model_path = os.path.join(MODELS_PATH, "naive_bayes_infra.pkl")
infra_model = NaiveBayesClassifier(
    verbose=True,
    multithreading=True,
    num_threads=8,
    save_model=True,
    model_file=rgb_model_path,
    batch_size=10000,
)

# check if model file not exists
if not os.path.exists(infra_model_path):
    infra_model.fit(infra_train_data, train_labels)

infra_test_data = np.load(experiment_dict["test_infra"])
print(
    "Test data shape:", infra_test_data.shape, "Test data type:", infra_test_data.dtype
)

infra_test_pred = infra_model.predict(infra_test_data)
print(
    "Test pred shape:", infra_test_pred.shape, "Test pred type:", infra_test_pred.dtype
)

infra_acc = infra_model.accuracy(test_labels, infra_test_pred[: len(test_labels)])
print("Infra Accuracy:", infra_acc)

2023-11-15 19:10:40,356 - INFO - Training the model using 8 thread(s)...
2023-11-15 19:10:40,356 - INFO - Training the model using 8 thread(s)...


Train data shape: (37102350, 3) Train data type: float32
Train labels shape: (37102350,) Train labels type: int8


2023-11-15 19:10:52,123 - INFO - Class 5 trained.
2023-11-15 19:10:52,123 - INFO - Class 5 trained.
2023-11-15 19:10:52,250 - INFO - Class 7 trained.
2023-11-15 19:10:52,250 - INFO - Class 7 trained.
2023-11-15 19:10:52,506 - INFO - Class 9 trained.
2023-11-15 19:10:52,506 - INFO - Class 9 trained.
2023-11-15 19:10:53,324 - INFO - Class 4 trained.
2023-11-15 19:10:53,324 - INFO - Class 4 trained.
2023-11-15 19:10:56,158 - INFO - Class 3 trained.
2023-11-15 19:10:56,158 - INFO - Class 3 trained.
2023-11-15 19:11:37,923 - INFO - Class 1 trained.
2023-11-15 19:11:37,923 - INFO - Class 1 trained.
2023-11-15 19:11:50,862 - INFO - Class 2 trained.
2023-11-15 19:11:50,862 - INFO - Class 2 trained.
2023-11-15 19:11:55,974 - INFO - Class 8 trained.
2023-11-15 19:11:55,974 - INFO - Class 8 trained.
2023-11-15 19:11:57,761 - INFO - Class 0 trained.
2023-11-15 19:11:57,761 - INFO - Class 0 trained.
2023-11-15 19:12:02,196 - INFO - Class 6 trained.
2023-11-15 19:12:02,196 - INFO - Class 6 trained.


Test data shape: (15892956, 3) Test data type: float32


2023-11-15 19:12:05,050 - INFO - Running prediction for batch 4...
2023-11-15 19:12:05,068 - INFO - Running prediction for batch 5...
2023-11-15 19:12:05,068 - INFO - Running prediction for batch 5...
2023-11-15 19:12:05,075 - INFO - Running prediction for batch 6...
2023-11-15 19:12:05,087 - INFO - Running prediction for batch 7...
2023-11-15 19:12:05,075 - INFO - Running prediction for batch 6...
2023-11-15 19:12:05,087 - INFO - Running prediction for batch 7...
2023-11-15 19:12:09,921 - INFO - Running prediction for batch 8...
2023-11-15 19:12:09,921 - INFO - Running prediction for batch 8...
2023-11-15 19:12:10,266 - INFO - Running prediction for batch 9...
2023-11-15 19:12:10,266 - INFO - Running prediction for batch 9...
2023-11-15 19:12:10,830 - INFO - Running prediction for batch 10...
2023-11-15 19:12:10,830 - INFO - Running prediction for batch 10...
2023-11-15 19:12:10,849 - INFO - Running prediction for batch 11...
2023-11-15 19:12:10,924 - INFO - Running prediction for bat

Test pred shape: (15895912,) Test pred type: int8
Infra Accuracy: 0.5980504822387981


### Whole Model

In [9]:
whole_train_data = np.load(experiment_dict["train_whole"])
print(
    "Train data shape:",
    whole_train_data.shape,
    "Train data type:",
    whole_train_data.dtype,
)
print(
    "Train labels shape:", train_labels.shape, "Train labels type:", train_labels.dtype
)

whole_model_path = os.path.join(MODELS_PATH, "naive_bayes_whole.pkl")
whole_model = NaiveBayesClassifier(
    verbose=True,
    multithreading=True,
    num_threads=8,
    save_model=True,
    model_file=rgb_model_path,
    batch_size=10000,
)

# check if model file not exists
if not os.path.exists(whole_model_path):
    whole_model.fit(whole_train_data, train_labels)

whole_test_data = np.load(experiment_dict["test_whole"])
print(
    "Test data shape:", whole_test_data.shape, "Test data type:", whole_test_data.dtype
)

whole_test_pred = whole_model.predict(whole_test_data)
print(
    "Test pred shape:", whole_test_pred.shape, "Test pred type:", whole_test_pred.dtype
)

whole_acc = whole_model.accuracy(test_labels, whole_test_pred[: len(test_labels)])
print("Whole Accuracy:", whole_acc)

2023-11-15 19:28:09,837 - INFO - Training the model using 8 thread(s)...
2023-11-15 19:28:09,837 - INFO - Training the model using 8 thread(s)...
2023-11-15 19:28:09,837 - INFO - Training the model using 8 thread(s)...


Train data shape: (37102350, 6) Train data type: float32
Train labels shape: (37102350,) Train labels type: int8


2023-11-15 19:28:19,435 - INFO - Class 5 trained.
2023-11-15 19:28:19,435 - INFO - Class 5 trained.
2023-11-15 19:28:19,435 - INFO - Class 5 trained.
2023-11-15 19:28:19,561 - INFO - Class 7 trained.
2023-11-15 19:28:19,561 - INFO - Class 7 trained.
2023-11-15 19:28:19,561 - INFO - Class 7 trained.
2023-11-15 19:28:19,821 - INFO - Class 9 trained.
2023-11-15 19:28:19,821 - INFO - Class 9 trained.
2023-11-15 19:28:19,821 - INFO - Class 9 trained.
2023-11-15 19:28:20,702 - INFO - Class 4 trained.
2023-11-15 19:28:20,702 - INFO - Class 4 trained.
2023-11-15 19:28:20,702 - INFO - Class 4 trained.
2023-11-15 19:28:24,026 - INFO - Class 3 trained.
2023-11-15 19:28:24,026 - INFO - Class 3 trained.
2023-11-15 19:28:24,026 - INFO - Class 3 trained.
2023-11-15 19:29:06,782 - INFO - Class 1 trained.
2023-11-15 19:29:06,782 - INFO - Class 1 trained.
2023-11-15 19:29:06,782 - INFO - Class 1 trained.
2023-11-15 19:29:20,915 - INFO - Class 2 trained.
2023-11-15 19:29:20,915 - INFO - Class 2 trained.


Test data shape: (15892956, 6) Test data type: float32


2023-11-15 19:29:35,663 - INFO - Running prediction for batch 4...
2023-11-15 19:29:35,663 - INFO - Running prediction for batch 5...
2023-11-15 19:29:35,662 - INFO - Running prediction for batch 3...
2023-11-15 19:29:35,663 - INFO - Running prediction for batch 4...
2023-11-15 19:29:35,663 - INFO - Running prediction for batch 5...
2023-11-15 19:29:35,678 - INFO - Running prediction for batch 6...
2023-11-15 19:29:35,694 - INFO - Running prediction for batch 7...
2023-11-15 19:29:35,663 - INFO - Running prediction for batch 5...
2023-11-15 19:29:35,678 - INFO - Running prediction for batch 6...
2023-11-15 19:29:35,678 - INFO - Running prediction for batch 6...
2023-11-15 19:29:35,694 - INFO - Running prediction for batch 7...
2023-11-15 19:29:35,694 - INFO - Running prediction for batch 7...
2023-11-15 19:29:40,472 - INFO - Running prediction for batch 8...
2023-11-15 19:29:40,472 - INFO - Running prediction for batch 8...
2023-11-15 19:29:40,481 - INFO - Running prediction for batch 

Test pred shape: (15895912,) Test pred type: int8
Whole Accuracy: 0.6916292979103447


### Show Results

In [2]:
# Create a dataframe to store the results
df_results = pd.DataFrame(
    {
        "Model": ["RGB", "Infra", "Whole"],
        "Accuracy": [rgb_acc, infra_acc, whole_acc],
    }
)

# Save the DataFrame to a CSV file
df_results.to_csv(os.path.join(OUTPUTS_PATH, "naive_bayes_results.csv"), index=False)

# Plot the results as a bar plot with numbers over the bars
plt.figure(figsize=(10, 5))
plt.title("Naive Bayes Accuracy")
plt.xlabel("Model")
plt.ylabel("Accuracy")
plt.ylim(0, 1)
plt.grid()
bars = plt.bar(
    df_results["Model"], df_results["Accuracy"], color=["blue", "green", "red"]
)

# Add numbers over the bars
for bar in bars:
    yval = bar.get_height()
    plt.text(
        bar.get_x() + bar.get_width() / 2,
        yval,
        round(yval, 2),
        ha="center",
        va="bottom",
    )

plt.tight_layout()

# Save the plot
plt.savefig(os.path.join(GRAPH_PATH, "07-naive_bayes_accuracy_barplot.png"))
plt.show()

NameError: name 'rgb_acc' is not defined