# In [1]:
import os
import pandas as pd
import numpy as np
import re

# Root directory of the spam corpus. It defaults to the original hard-coded
# location but can be redirected without editing this file by setting the
# SPAM_DATA_DIR environment variable.
spam_data_root = os.environ.get('SPAM_DATA_DIR', '/Users/baljinnyam/Downloads/spam_data')

# Spam and ham folders for the train and dev sets: <root>/<split>/<label>
train_spam_folder = os.path.join(spam_data_root, 'train', 'spam')
train_ham_folder = os.path.join(spam_data_root, 'train', 'ham')
dev_spam_folder = os.path.join(spam_data_root, 'dev', 'spam')
dev_ham_folder = os.path.join(spam_data_root, 'dev', 'ham')

# Define a function to read in the email text from a folder and return a dataframe of word counts
def create_word_counts_dataframe(folder_path):
    """Read every email file in a folder and return its word-count matrix.

    Args:
        folder_path: Directory in which each regular file is one email.

    Returns:
        A pandas DataFrame with one row per successfully read email (in
        sorted filename order) and one column per token produced by a
        default scikit-learn ``CountVectorizer``; each cell is the number of
        times that token occurs in that email. An empty DataFrame is
        returned when the folder yields no readable email.

    Note:
        The vectorizer is fitted on this folder only, so dataframes built
        from different folders generally have different columns.
    """
    email_text = []

    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order reproducible across runs and machines.
    for filename in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, filename)

        # Sub-directories (and other non-file entries) cannot be read as an
        # email; skip them instead of crashing in open().
        if not os.path.isfile(file_path):
            continue

        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                email_text.append(f.read())
        except UnicodeDecodeError:
            # Best effort: report the undecodable file and carry on.
            print(f"Error: could not read file {filename} due to encoding issue")

    # CountVectorizer raises on an empty corpus, so short-circuit here.
    if not email_text:
        return pd.DataFrame()

    # Imported lazily so that scikit-learn is only required once there is
    # actually text to vectorize.
    from sklearn.feature_extraction.text import CountVectorizer

    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(email_text)

    # Densify the sparse count matrix into a labelled dataframe
    df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

    return df

# Define a function to calculate the probability of each word appearing in a spam or non-spam email
def _smoothed_word_probabilities(counts_df, vocabulary, a):
    """Return {word: smoothed probability} for one class over *vocabulary*."""
    # Words of the shared vocabulary that this class never used get count 0.
    word_counts = counts_df.sum().reindex(vocabulary, fill_value=0)
    smoothed = (word_counts + a) / (word_counts.sum() + a * len(vocabulary))
    return smoothed.to_dict()


def calculate_word_probabilities(spam_df, ham_df, a=1):
    """Estimate class priors and additively smoothed per-word probabilities.

    Both returned word dictionaries are defined over the *union* of the spam
    and ham columns, so a word seen in only one class still has a (smoothed)
    probability under the other class, and each dictionary sums to 1 when
    ``a > 0``.

    Args:
        spam_df: Word-count dataframe of spam emails (rows are emails,
            columns are words).
        ham_df: Word-count dataframe of non-spam emails, same layout.
        a: Additive smoothing constant added to every word count.

    Returns:
        A tuple ``(p_spam, p_ham, spam_word_probs, ham_word_probs)`` where the
        first two are the fractions of emails in each class and the last two
        map each vocabulary word to its smoothed probability in that class.

    Raises:
        ZeroDivisionError: If both dataframes have no rows.
    """
    # Class priors from the number of emails in each dataset
    num_spam_emails = spam_df.shape[0]
    num_ham_emails = ham_df.shape[0]
    num_emails = num_spam_emails + num_ham_emails
    p_spam = num_spam_emails / num_emails
    p_ham = num_ham_emails / num_emails

    # Smoothing must use one shared vocabulary; using each class's own
    # columns would leave words unseen in a class without any probability.
    vocabulary = spam_df.columns.union(ham_df.columns)

    spam_word_probs = _smoothed_word_probabilities(spam_df, vocabulary, a)
    ham_word_probs = _smoothed_word_probabilities(ham_df, vocabulary, a)

    return p_spam, p_ham, spam_word_probs, ham_word_probs

# Vectorize the training emails: one word-count dataframe per class
train_spam_df, train_ham_df = (
    create_word_counts_dataframe(folder)
    for folder in (train_spam_folder, train_ham_folder)
)

# Vectorize the dev emails the same way
dev_spam_df, dev_ham_df = (
    create_word_counts_dataframe(folder)
    for folder in (dev_spam_folder, dev_ham_folder)
)

# Estimate the class priors and per-word probabilities from the training data
p_spam, p_ham, spam_word_probs, ham_word_probs = calculate_word_probabilities(
    spam_df=train_spam_df,
    ham_df=train_ham_df,
)

# Define a function to calculate the probability of an email being spam or non-spam
def calculate_email_probabilities(email_df, p_spam, p_ham, spam_word_probs, ham_word_probs):
    """Return the naive Bayes posterior P(spam | email) for each row of *email_df*.

    Each distinct word present in an email (count > 0) contributes its word
    probability once, regardless of how often it occurs. Words that lack a
    probability under either class are ignored, so both class scores are
    always built from the same set of words.

    Args:
        email_df: Word-count dataframe (rows are emails, columns are words).
        p_spam: Prior probability of spam.
        p_ham: Prior probability of non-spam.
        spam_word_probs: Mapping of word to its probability in spam.
        ham_word_probs: Mapping of word to its probability in non-spam.

    Returns:
        A list with one float per row of ``email_df``: the probability,
        between 0 and 1, that the email is spam.
    """
    # Only words known to both classes can be scored consistently.
    known_words = [
        word for word in email_df.columns
        if word in spam_word_probs and word in ham_word_probs
    ]

    # present[i, j] is True when email i contains known word j at least once
    present = email_df[known_words].to_numpy() > 0

    # log(0) legitimately yields -inf here (impossible class/word); silence
    # the warning rather than the value.
    with np.errstate(divide='ignore'):
        log_prior_spam = np.log(p_spam)
        log_prior_ham = np.log(p_ham)
        spam_log_probs = np.log(np.array([spam_word_probs[w] for w in known_words], dtype=float))
        ham_log_probs = np.log(np.array([ham_word_probs[w] for w in known_words], dtype=float))

    # Sum log-probabilities instead of multiplying probabilities to avoid
    # underflow. np.where (rather than a matrix product) keeps absent words
    # at exactly 0 even when a word's log-probability is -inf.
    log_score_spam = log_prior_spam + np.where(present, spam_log_probs, 0.0).sum(axis=1)
    log_score_ham = log_prior_ham + np.where(present, ham_log_probs, 0.0).sum(axis=1)

    # P(spam | email) = exp(s) / (exp(s) + exp(h)), normalised in log space
    log_evidence = np.logaddexp(log_score_spam, log_score_ham)
    email_probs = np.exp(log_score_spam - log_evidence)

    return email_probs.tolist()

# Score every email of the dev spam set, then of the dev ham set, with the
# model estimated from the training data
dev_spam_probs = calculate_email_probabilities(
    email_df=dev_spam_df,
    p_spam=p_spam,
    p_ham=p_ham,
    spam_word_probs=spam_word_probs,
    ham_word_probs=ham_word_probs,
)
dev_ham_probs = calculate_email_probabilities(
    email_df=dev_ham_df,
    p_spam=p_spam,
    p_ham=p_ham,
    spam_word_probs=spam_word_probs,
    ham_word_probs=ham_word_probs,
)

# Show both score lists, one per line
print(dev_spam_probs, dev_ham_probs, sep='\n')

# Define a function to calculate the accuracy of the model
def calculate_accuracy(email_probs, threshold, labels=None):
    """Return the fraction of emails whose thresholded prediction is correct.

    An email is predicted to be spam when its probability is at least
    ``threshold``; the prediction is correct when it matches the email's
    true label.

    Args:
        email_probs: Sequence of spam probabilities, one per email.
        threshold: Minimum probability at which an email is predicted spam.
        labels: Optional iterable of truthy/falsy values parallel to
            ``email_probs`` (truthy means the email really is spam). When
            omitted, every email is treated as spam, so the result is the
            fraction of probabilities that reach the threshold.

    Returns:
        The accuracy as a float between 0 and 1.

    Raises:
        ValueError: If ``labels`` is given with a different length than
            ``email_probs``.
        ZeroDivisionError: If ``email_probs`` is empty.
    """
    if labels is None:
        labels = [True] * len(email_probs)
    else:
        labels = list(labels)
        if len(labels) != len(email_probs):
            raise ValueError("labels and email_probs must have the same length")

    # A prediction is correct when "flagged as spam" agrees with the label
    num_correct = sum(
        bool(prob >= threshold) == bool(is_spam)
        for prob, is_spam in zip(email_probs, labels)
    )

    return num_correct / len(email_probs)

# Calculate the accuracy of the model.
# calculate_accuracy(probs, t) yields the fraction of probs that are >= t,
# i.e. the share of emails flagged as spam. Being flagged is correct for the
# spam set only; for the ham set the correct predictions are the emails that
# were NOT flagged, so the two sets must be scored separately.
threshold = 0.5
spam_hits = (
    calculate_accuracy(dev_spam_probs, threshold) * len(dev_spam_probs)
    if dev_spam_probs else 0
)
ham_hits = (
    (1 - calculate_accuracy(dev_ham_probs, threshold)) * len(dev_ham_probs)
    if dev_ham_probs else 0
)
accuracy = (spam_hits + ham_hits) / (len(dev_spam_probs) + len(dev_ham_probs))
print(accuracy)



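# Recorded notebook output: ModuleNotFoundError: No module named 'pandas'
# (pandas must be installed in the kernel's environment before this cell can run)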