In [1]:
import os
import zipfile
import string
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer # BoW vectorization
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ashut\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Define the path to the "archive.zip" file
archive_path = './archive.zip'

# Extract the data from the ZIP file
with zipfile.ZipFile(archive_path, 'r') as zip_ref:
    zip_ref.extractall('extracted_data')
    

# Initialize an empty set for the vocabulary
vocabulary = set()

# Define a translation table to remove punctuation
translator = str.maketrans('', '', string.punctuation)
stop_words = set(stopwords.words('english'))

# Recursive function to process files in a directory
def process_directory(directory):
    for root, _, files in os.walk(directory):
        for file in files:
            file_path = os.path.join(root, file)
            with open(file_path, 'r', errors='ignore') as f:
                content = f.read()
                # Convert text to lowercase, remove punctuation, and split into words
                words = content.lower().translate(translator).split()
                words = [word for word in words if word.lower() not in stop_words]

                # Add unique words to the vocabulary set
                vocabulary.update(words)

# Process the data directory to build the vocabulary
process_directory('extracted_data')

# Convert the vocabulary set into a dictionary that maps words to indices
vocabulary_dict = {word: idx for idx, word in enumerate(sorted(vocabulary))}




In [3]:
print(len(vocabulary))

33494


In [4]:
#For Bernaulli and Poisson distributions
# Load and preprocess the data while mapping words to their indices
def load_and_preprocess_data(data_path, vocabulary_dict):
    data_bern = []
    data_pois = []
    data_multi = []
    labels = []

    newsgroup_folders = os.listdir(data_path)
    label_to_index = {folder: i for i, folder in enumerate(newsgroup_folders)}

    for folder in newsgroup_folders:
        folder_path = os.path.join(data_path, folder)
        files = os.listdir(folder_path)

        for file in files:
            file_path = os.path.join(folder_path, file)
            with open(file_path, 'r', errors='ignore') as f:
                content = f.read()
                # Tokenize the content into words and convert to lowercase
                words = content.lower().translate(translator).split()
                words = [word for word in words if word.lower() not in stop_words]
                data_curr = np.full(len(vocabulary), 0)
                data_currp = np.full(len(vocabulary), 0)
                data_currm = np.full(len(words), -1)
                for j, word in enumerate(words):
                    i = vocabulary_dict.get(word, -1)
                    data_currm[j] = i
                    # print(i)
                    if i != -1 :
                        data_curr[i] += 1
                        data_currp[i] = 1
                data_bern.append(data_curr)
                data_pois.append(data_currp)
                data_multi.append(data_currm)
                labels.append(label_to_index[folder])

    return data_bern, data_pois, data_multi, labels

# Load and preprocess the data while mapping words to their indices using vocabulary_dict
data_bern, data_pois, data_multi, labels = load_and_preprocess_data('extracted_data', vocabulary_dict)


Xb_train, Xb_test, yb_train, yb_test = train_test_split(data_bern, labels, test_size=0.2)
Xp_train, Xp_test, yp_train, yp_test = train_test_split(data_pois, labels, test_size=0.2)
Xm_train, Xm_test, ym_train, ym_test = train_test_split(data_multi, labels, test_size=0.2)



In [5]:
# print(data_pois)
# print(len(data_bern[0]))
# # Count the number of 1s in data[0]
# def count_ones(i):
#     return np.count_nonzero(data_pois[i] == 0)
# print(f"Number of nonzeroes in datapois[0]: {len(data_pois[2])} - {count_ones(2)} = {len(data_pois[2]) - count_ones(2)}")
# print(f"Number of nonzeroes in datapois[0]: {len(data_pois[3])} - {count_ones(3)} = {len(data_pois[3]) - count_ones(3)}")
# print(f"Number of nonzeroes in datapois[0]: {len(data_pois[4])} - {count_ones(4)} = {len(data_pois[4]) - count_ones(4)}")
# print(f"Number of nonzeroes in datapois[0]: {len(data_pois[5])} - {count_ones(5)} = {len(data_pois[5]) - count_ones(5)}")
# print(len(data_multi[0]))
# print(len(data_multi[1]))
# print(len(data_multi[2]))
# print(len(data_multi[3]))
# count_ones = np.count_nonzero(data_multi[1] == 0)
# print(f"Number of nonzeroes in datamulti[0]: {len(data_multi[1])} - {count_ones} = {len(data_multi[1]) - count_ones}")
# # print(Xb_train, yb_train)

# print(Xm_train)

In [6]:
class NaiveBayes:
    
    def __init__(self, distribution='multinomial'):
        self.distribution = distribution
        self.class_probs = {}
        self.feature_probs = {}
    
    def train(self, X, y, alpha=1):
        
        # Calculate class probabilities
        total_samples = len(y)
        unique_classes = np.unique(y)
        for class_label in unique_classes:
            class_count = np.sum(y == class_label)
            self.class_probs[class_label] = class_count / total_samples
    
            # Calculate feature probabilities based on distribution
            if self.distribution == 'bernoulli':
                theta_j = []
                for j in range(len(vocabulary_dict)):
                    numerator = 0
                    for i in range(len(X)):
                        if y[i] == class_label:
                            numerator += X[i][j]
                    theta_j.append((numerator + 1*alpha) / (class_count + 33000*alpha))  
                self.feature_probs[class_label] = theta_j
                
            elif self.distribution == 'poisson':
                lambda_j = []
                for j in range(len(vocabulary_dict)):
                    numerator = 0
                    for i in range(len(X)):
                        if y[i] == class_label:
                            numerator += X[i][j]
                    lambda_j.append((numerator + 1*alpha) / (class_count + 33000*alpha))  
                self.feature_probs[class_label] = lambda_j
                
            elif self.distribution == 'multinomial':
                theta_jl = []
                for l in range(len(vocabulary_dict)):  # Assuming you have a vocabulary_dict
                    numerator = 0
                    denominator = 0
                    
                    for i in range(len(X)):
                        if y[i] == class_label:
                            # Count occurrences of the word in documents of this class
                            word_count = np.sum(X[i] == l)
                            numerator += word_count
                            denominator += len(X[i])

                    # Apply Laplace smoothing to avoid zero probabilities
                    smoothed_prob = (numerator + 1*alpha) / (denominator + (len(vocabulary_dict) * alpha))
                    theta_jl.append(smoothed_prob)
                
                self.feature_probs[class_label] = theta_jl
          
    
    def predict(self, X):
        predictions = []

        if self.distribution == 'bernoulli':
            for sample in X:
                # Calculate class probabilities using feature probabilities and class probabilities
                class_scores = {}  # Dictionary to store scores for each class
                for class_label, class_prob in self.class_probs.items():
                    score = np.log(class_prob)  # Initialize with class probability
                    for j, x_j in enumerate(sample):
                        if x_j == 1:
                            # Add the log-probability for feature j being 1
                            score += np.log(self.feature_probs[class_label][j])
                        else:
                            # Add the log-probability for feature j being 0 (1 - the probability of being 1)
                            score += np.log(1 - self.feature_probs[class_label][j])
                    class_scores[class_label] = score

                # Select the class with the highest score
                predicted_class = max(class_scores, key=class_scores.get)
                predictions.append(predicted_class)

        elif self.distribution == 'poisson':
            for sample in X:
                # Calculate class probabilities using feature probabilities and class probabilities
                class_scores = {}  # Dictionary to store scores for each class
                for class_label, class_prob in self.class_probs.items():
                    score = np.log(class_prob)  # Initialize with class probability
                    for j, x_j in enumerate(sample):
                        # Calculate the Poisson probability for each feature j
                        # Assuming self.feature_probs stores λ values
                        poisson_prob = np.exp(-self.feature_probs[class_label][j]) * (self.feature_probs[class_label][j] ** x_j) / np.math.factorial(x_j)
                        score += np.log(poisson_prob)
                    class_scores[class_label] = score

                # Select the class with the highest score
                predicted_class = max(class_scores, key=class_scores.get)
                predictions.append(predicted_class)

        elif self.distribution == 'multinomial':
            for sample in X:
                # Calculate class probabilities using feature probabilities and class probabilities
                class_scores = {}  # Dictionary to store scores for each class
                for class_label, class_prob in self.class_probs.items():
                    score = np.log(class_prob)  # Initialize with class probability
                    for j, x_j in enumerate(sample):
                        # Assuming self.feature_probs stores word probabilities for each class
                        word_prob = self.feature_probs[class_label][x_j]
                        score += np.log(word_prob)
                    class_scores[class_label] = score

                # Select the class with the highest score
                predicted_class = max(class_scores, key=class_scores.get)
                predictions.append(predicted_class)

        return predictions



In [7]:
nb_classifier_b = NaiveBayes(distribution='bernoulli',)
nb_classifier_b.train(Xb_train, yb_train)

In [None]:
# print(nb_classifier_b.class_probs)

In [None]:
# print(nb_classifier_b.feature_probs)

In [None]:
predictions_b = nb_classifier_b.predict(Xb_test)

In [None]:

predictions_b = np.array(predictions_b)
yb_test = np.array(yb_test)
# Assuming y_pred and y_actual are NumPy arrays or lists
accuracy = np.mean(predictions_b == yb_test)
print(f"Accuracy for Bernoulli Disribution: {accuracy}")
# predictions_b


Accuracy for Bernoulli Disribution: 0.815


In [None]:
nb_classifier_p = NaiveBayes(distribution='poisson')
nb_classifier_p.train(Xp_train, yp_train)

In [None]:
# print(nb_classifier_p.class_probs)

In [None]:
# print(nb_classifier_p.feature_probs)

In [None]:
predictions_p = nb_classifier_p.predict(Xp_test)

  poisson_prob = np.exp(-self.feature_probs[class_label][j]) * (self.feature_probs[class_label][j] ** x_j) / np.math.factorial(x_j)


In [None]:

predictions_p = np.array(predictions_p)
print(len(predictions_p))
yp_test = np.array(yp_test)
accuracy = np.mean(predictions_p == yp_test)
print(f"Accuracy for Poisson Distribution: {accuracy}")
# predictions_p

200
Accuracy for Poisson Distribution: 0.87


In [None]:
nb_classifier_m = NaiveBayes(distribution='multinomial')
nb_classifier_m.train(Xm_train, ym_train)

In [None]:
# print(nb_classifier_m.class_probs)

In [None]:
# print(nb_classifier_m.feature_probs)

In [None]:
predictions_m = nb_classifier_m.predict(Xm_test)

In [None]:
predictions_m = np.array(predictions_m)
ym_test = np.array(ym_test)
# Assuming y_pred and y_actual are NumPy arrays or lists
accuracy = np.mean(predictions_m == ym_test)
print(f"Accuracy for Multinoulli Distribution: {accuracy}")
# predictions_m

Accuracy for Multinoulli Distribution: 0.985
