In [1]:
import csv
from itertools import count
import operator
import math

EPSILON = 0.000001


def calc_term_doc_given_author(prob_map, counts):
    """
    How likely is the document, given the counts of words in the doc
    and the authors prob_map
    """
    prob = 1
    return prob


# If a word is in a probability dictionary, return its probability
# otherwise, return epsilon
def get_word_prob(word_prob_map, word):
    """
    Gets probability of a word:
    Returns probability if word exists, EPSILON if not
    """
    if word in word_prob_map:
        return word_prob_map[word]
    return EPSILON

# From a file name, approximate the probability of a word
# being generated from the same distribution as the file.
# Assume that each word is produced independently, regardless
# of order.
def make_word_prob_map(fileName):
    """
    Calculates word probabilities:
    1. Counts word frequencies
    2. Converts counts to probabilities
    Returns: word probability dictionary
    """
    wordMap, nWords = make_word_count_map(fileName)
    # print(fileName)
    # print("----------------------------")
    # print(wordMap)
    # print("----------------------------")
    # print(nWords)
    # print("----------------------------")
    probabilityMap = {}
    for word in wordMap:
        count = wordMap[word]
        p = float(count) / nWords
        probabilityMap[word] = p
    return probabilityMap

# From a file name, count the number of times each word exists
# in that file. Return the result as a map (aka a dictionary)
def make_word_count_map(fileName):
    """
    Reads a file and counts word frequencies:
    1. Opens file
    2. Splits into words
    3. Standardizes each word
    4. Counts occurrences
    Returns: (wordMap, total word count)
    """
    wordMap = {}
    nWords = 0
    with open(fileName ,encoding="utf-8") as f:
        for line in f:
            words = line.split(' ')
            for word in words:
                word = standardize(word)
                add_word_to_count_map(wordMap, word)
                nWords+= 1
    return wordMap, nWords

# Add a word to a count map. Makes sure not to crash if the
# word has not been seen before.
def add_word_to_count_map(wordMap, word):
    """
    Updates word count in dictionary:
    1. Skips stop words
    2. Initializes count if new word
    3. Increments count if existing word
    """
    if is_stop(word):
        return
    if not word in wordMap:
        wordMap[word] = 0
    wordMap[word] += 1


def standardize(word):
    """
    Standardizes words by:
    1. Converting to lowercase
    2. Removing punctuation
    3. Keeping only alphabetic characters
    """
    standard = word.lower().strip()
    # remove punctuation
    standard = ''.join([i for i in standard if i.isalpha()])
    return standard

def is_stop(word):
    """
    Removes common words that don't help in analysis
    """
    stop_words = ['to', 'i', 'the', 'and', 'of']
    return word in stop_words


def main():
    # Calculate all the ps and qs
    # Eg zohairWordProb['congress'] = 0.005
    # zohair_word_prob['piech'] = 0.0
    # zohair_word_prob['the'] = 0.001

    hamilton_word_prob = make_word_prob_map('zohair.txt')
    madison_word_prob = make_word_prob_map('ateeb.txt')

    # print("********************************************")
    # print("zohair_word_prob", zohair_word_prob)
    # print("----------------------------------------------")
    # print("ateeb_word_prob", ateeb_word_prob)
    # print("--------------------------------------")
    # print("********************************************")

    # print(zohair_word_prob["independent"])
    # print(ateeb_word_prob["independent"])

    unknown_doc_count, n_words = make_word_count_map('unknown.txt')

    print("*" * 50)
    print(unknown_doc_count)
    print("*" * 50)
    print(n_words)


if __name__ == '__main__':
    main()

**************************************************
{'crime': 1, 'report': 1, 'midnight': 3, 'arsons': 3, '': 4, 'city': 2, 'brookdale': 2, 'has': 2, 'been': 2, 'rocked': 1, 'by': 2, 'a': 7, 'series': 1, 'deliberate': 1, 'fires': 3, 'set': 1, 'under': 1, 'cover': 1, 'darkness': 1, 'over': 1, 'past': 1, 'four': 1, 'months': 1, 'at': 3, 'least': 1, 'eight': 1, 'buildingsranging': 1, 'from': 2, 'abandoned': 1, 'warehouses': 1, 'occupied': 1, 'residential': 1, 'complexeshave': 1, 'gone': 1, 'up': 1, 'in': 1, 'flames': 1, 'investigators': 1, 'have': 3, 'identified': 1, 'pattern': 1, 'accelerants': 1, 'found': 1, 'scene': 1, 'fire': 3, 'alarms': 1, 'mysteriously': 1, 'disabled': 1, 'security': 1, 'footage': 1, 'corrupted': 1, 'beyond': 1, 'retrieval': 1, 'these': 2, 'calculated': 1, 'actions': 1, 'suggest': 1, 'skilled': 1, 'arsonist': 2, 'with': 2, 'deep': 1, 'understanding': 1, 'both': 1, 'technology': 1, 'one': 1, 'most': 1, 'devastating': 1, 'incidents': 1, 'occurred': 1, 'historic': 1, '

In [3]:
import csv
import operator
import math

EPSILON = 0.000001  # Small probability for unseen words


def calc_term_doc_given_author(prob_map, counts):
    """
    Computes the probability of the document being written by a specific author.
    Uses the independent word assumption (Bag of Words model).
    """
    prob = 1.0
    for word, count in counts.items():
        p_word = get_word_prob(prob_map, word)
        prob *= p_word ** count  # Probability raised to count frequency
    return prob


def get_word_prob(word_prob_map, word):
    """
    Returns the probability of a word occurring in an author's vocabulary.
    If the word is not found, return a small probability (EPSILON).
    """
    return word_prob_map.get(word, EPSILON)


def make_word_prob_map(file_name):
    """
    Reads a file and calculates word probabilities.
    Returns a dictionary where keys are words and values are probabilities.
    """
    word_map, total_words = make_word_count_map(file_name)
    probability_map = {word: count / total_words for word, count in word_map.items()}
    return probability_map


def make_word_count_map(file_name):
    """
    Reads a file and counts word frequencies.
    Returns: 
        - wordMap (dictionary with word counts)
        - total word count
    """
    word_map = {}
    total_words = 0

    try:
        with open(file_name, encoding="utf-8") as f:
            for line in f:
                words = line.split()
                for word in words:
                    word = standardize(word)
                    if word:  # Ensure non-empty word
                        add_word_to_count_map(word_map, word)
                        total_words += 1
    except FileNotFoundError:
        print(f"Error: File '{file_name}' not found.")
        return {}, 0  # Return empty dictionary and zero words

    return word_map, total_words


def add_word_to_count_map(word_map, word):
    """
    Updates the word count in the given dictionary.
    """
    if is_stop(word):
        return
    word_map[word] = word_map.get(word, 0) + 1


def standardize(word):
    """
    Cleans and standardizes words:
        - Converts to lowercase
        - Removes punctuation
        - Keeps only alphabetic characters
    """
    return ''.join(char for char in word.lower().strip() if char.isalpha())


def is_stop(word):
    """
    Filters out common stop words to improve classification accuracy.
    """
    stop_words = {'to', 'i', 'the', 'and', 'of'}
    return word in stop_words


def main():
    # Load probability maps for each author
    zohair_word_prob = make_word_prob_map('zohair.txt')
    ateeb_word_prob = make_word_prob_map('ateeb.txt')

    # Load unknown document for classification
    unknown_doc_count, n_words = make_word_count_map('unknown.txt')

    if n_words == 0:
        print("Error: Unknown document is empty or missing.")
        return

    # Safely access word probability without KeyError
    print("zohair['midnight']\t", zohair_word_prob.get('midnight', 'Not Found'))
    print("ateeb['midnight']\t", ateeb_word_prob.get('midnight', 'Not Found'))
    print("doc_count['midnight']\t", unknown_doc_count.get('midnight', 'Not Found'))

    print("Total words in unknown document:", n_words)

    # Compute likelihood of each author having written the document
    zohair_term = calc_term_doc_given_author(zohair_word_prob, unknown_doc_count)
    ateeb_term = calc_term_doc_given_author(ateeb_word_prob, unknown_doc_count)

    print("===================================")
    print("Likelihood Scores:")
    print(f"Zohair's probability: {zohair_term}")
    print(f"Ateeb's probability: {ateeb_term}")

    # Determine the most likely author
    if zohair_term > ateeb_term:
        print("The document was most likely written by Zohair.")
    elif ateeb_term > zohair_term:
        print("The document was most likely written by Ateeb.")
    else:
        print("The document's author is inconclusive.")

if __name__ == '__main__':
    main()


zohair['midnight']	 Not Found
ateeb['midnight']	 Not Found
doc_count['midnight']	 3
Total words in unknown document: 259
Likelihood Scores:
Zohair's probability: 0.0
Ateeb's probability: 0.0
The document's author is inconclusive.
