In [2]:
import csv
from itertools import count
import operator
import math

EPSILON = 0.000001


def calc_term_doc_given_author(prob_map, counts):
    """
    How likely is the document, given the counts of words in the doc
    and the authors prob_map
    """
    prob = 1
    for word in counts:
        word_prob = get_word_prob(prob_map, word)
        word_count = counts[word]
        prob *= word_prob ** word_count  # Multiply probabilities based on word frequency
    return prob


# If a word is in a probability dictionary, return its probability
# otherwise, return epsilon
def get_word_prob(word_prob_map, word):
    """
    Gets probability of a word:
    Returns probability if word exists, EPSILON if not
    """
    if word in word_prob_map:
        return word_prob_map[word]
    return EPSILON


# From a file name, approximate the probability of a word
# being generated from the same distribution as the file.
# Assume that each word is produced independently, regardless
# of order.
def make_word_prob_map(fileName):
    """
    Calculates word probabilities:
    1. Counts word frequencies
    2. Converts counts to probabilities
    Returns: word probability dictionary
    """
    wordMap, nWords = make_word_count_map(fileName)
    probabilityMap = {}
    for word in wordMap:
        count = wordMap[word]
        p = float(count) / nWords
        probabilityMap[word] = p
    return probabilityMap


# From a file name, count the number of times each word exists
# in that file. Return the result as a map (aka a dictionary)
def make_word_count_map(fileName):
    """
    Reads a file and counts word frequencies:
    1. Opens file
    2. Splits into words
    3. Standardizes each word
    4. Counts occurrences
    Returns: (wordMap, total word count)
    """
    wordMap = {}
    nWords = 0
    with open(fileName ,encoding="utf-8") as f:
        for line in f:
            words = line.split(' ')
            for word in words:
                word = standardize(word)
                add_word_to_count_map(wordMap, word)
                nWords+= 1
    return wordMap, nWords


# Add a word to a count map. Makes sure not to crash if the
# word has not been seen before.
def add_word_to_count_map(wordMap, word):
    """
    Updates word count in dictionary:
    1. Skips stop words
    2. Initializes count if new word
    3. Increments count if existing word
    """
    if is_stop(word):
        return
    if not word in wordMap:
        wordMap[word] = 0
    wordMap[word] += 1


def standardize(word):
    """
    Standardizes words by:
    1. Converting to lowercase
    2. Removing punctuation
    3. Keeping only alphabetic characters
    """
    standard = word.lower().strip()
    # remove punctuation
    standard = ''.join([i for i in standard if i.isalpha()])
    return standard


def is_stop(word):
    """
    Removes common words that don't help in analysis
    """
    stop_words = ['to', 'i', 'the', 'and', 'of']
    return word in stop_words


def main():
    # Calculate all the ps and qs
    # Eg zohairWordProb['congress'] = 0.005
    # zohair_word_prob['piech'] = 0.0
    # zohair_word_prob['the'] = 0.001

    zohair_word_prob = make_word_prob_map('zohair.txt')
    ateeb_word_prob = make_word_prob_map('ateeb.txt')

    # Get the word count of the unknown document
    # Eg unknown_doc_count['congress'] = 5
    unknown_doc_count, n_words = make_word_count_map('unknown.txt')

    # Check if 'congress' exists in the probability maps before printing
    print("zohair['congress']\t", zohair_word_prob.get('congress', EPSILON))  # Use get() to avoid KeyError
    print("ateeb['congress']\t", ateeb_word_prob.get('congress', EPSILON))  # Use get() to avoid KeyError
    print("doc_count['congress']\t", unknown_doc_count.get('congress', 0))  # Use get() to avoid KeyError
    print("n_words", n_words)

    zohair_term = calc_term_doc_given_author(zohair_word_prob, unknown_doc_count)
    print('---'*10)
    ateeb_term = calc_term_doc_given_author(ateeb_word_prob, unknown_doc_count)
    print("Zohair Term\t", zohair_term)
    print("Ateeb Term\t", ateeb_term)


if __name__ == '__main__':
    main()


zohair['congress']	 1e-06
ateeb['congress']	 1e-06
doc_count['congress']	 0
n_words 263
------------------------------
Zohair Term	 0.0
Ateeb Term	 0.0


In [3]:
import csv
from itertools import count
import operator
import math

EPSILON = 0.000001


def calc_term_doc_given_author(prob_map, counts):
    """
    How likely is the document, given the counts of words in the doc
    and the authors prob_map
    """
    prob = 1
    for word, c_i in counts.items():
        p_word = get_word_prob(prob_map, word)
        prob *= p_word ** c_i
        print("word: ", word, "| prob_word: ", p_word, "| c_i: ", c_i, "| prob: ", prob)
    return prob


# If a word is in a probability dictionary, return its probability
# otherwise, return epsilon
def get_word_prob(word_prob_map, word):
    if word in word_prob_map:
        return word_prob_map[word]
    return EPSILON


# From a file name, approximate the probability of a word
# being generated from the same distribution as the file.
# Assume that each word is produced independently, regardless
# of order.
def make_word_prob_map(fileName):
    wordMap, nWords = make_word_count_map(fileName)
    probabilityMap = {}
    for word in wordMap:
        count = wordMap[word]
        p = float(count) / nWords
        probabilityMap[word] = p
    return probabilityMap


# From a file name, count the number of times each word exists
# in that file. Return the result as a map (aka a dictionary)
def make_word_count_map(fileName):
    wordMap = {}
    nWords = 0
    with open(fileName, encoding='utf-8') as f:
        for line in f:
            words = line.split(' ')
            for word in words:
                word = standardize(word)
                add_word_to_count_map(wordMap, word)
                nWords += 1
    return wordMap, nWords


# Add a word to a count map. Makes sure not to crash if the
# word has not been seen before.
def add_word_to_count_map(wordMap, word):
    if is_stop(word):
        return
    if word not in wordMap:
        wordMap[word] = 0
    wordMap[word] += 1


# Standardizes a word. For now, we are just going to make it
# lower case.
def standardize(word):
    standard = word.lower().strip()
    # remove punctuation
    standard = ''.join([i for i in standard if i.isalpha()])
    return standard


def is_stop(word):
    stop_words = ['to', 'i', 'the', 'and', 'of']
    return word in stop_words


def main():
    # Calculate all the ps and qs
    # Eg zohairWordProb['judiciary'] = 0.005
    # zohair_word_prob['arson'] = 0.0
    # zohair_word_prob['silent'] = 0.001

    zohair_word_prob = make_word_prob_map('zohair.txt')  # Updated file name
    ateeb_word_prob = make_word_prob_map('ateeb.txt')  # Updated file name

    # Get the word count of the unknown document
    # Eg unknown_doc_count['judiciary'] = 5
    unknown_doc_count, n_words = make_word_count_map('unknown.txt')  # Updated file name

    # Change 'judiciary' or 'arson' based on the words found in the text files
    test_word = 'judiciary'  # You can change this word based on your requirement
    
    print(f"zohair['{test_word}']\t", zohair_word_prob.get(test_word, EPSILON))  # Use get() to avoid KeyError
    print(f"ateeb['{test_word}']\t", ateeb_word_prob.get(test_word, EPSILON))  # Use get() to avoid KeyError
    print(f"doc_count['{test_word}']\t", unknown_doc_count.get(test_word, 0))  # Use get() to avoid KeyError
    print("n_words", n_words)

    zohair_term = calc_term_doc_given_author(zohair_word_prob, unknown_doc_count)
    print('---' * 10)
    ateeb_term = calc_term_doc_given_author(ateeb_word_prob, unknown_doc_count)
    print(f"Zohair Term\t", zohair_term)
    print(f"Ateeb Term\t", ateeb_term)


if __name__ == '__main__':
    main()


zohair['judiciary']	 1e-06
ateeb['judiciary']	 0.01384083044982699
doc_count['judiciary']	 0
n_words 263
word:  crime | prob_word:  0.003115264797507788 | c_i:  1 | prob:  0.003115264797507788
word:  report | prob_word:  0.003115264797507788 | c_i:  1 | prob:  9.70487475859124e-06
word:  midnight | prob_word:  1e-06 | c_i:  3 | prob:  9.704874758591239e-24
word:  arsons | prob_word:  1e-06 | c_i:  3 | prob:  9.704874758591237e-42
word:   | prob_word:  0.01557632398753894 | c_i:  4 | prob:  5.712810560841155e-49
word:  city | prob_word:  1e-06 | c_i:  2 | prob:  5.7128105608411546e-61
word:  brookdale | prob_word:  1e-06 | c_i:  2 | prob:  5.712810560841154e-73
word:  has | prob_word:  0.003115264797507788 | c_i:  2 | prob:  5.544211101252078e-78
word:  been | prob_word:  0.003115264797507788 | c_i:  2 | prob:  5.380587437284264e-83
word:  rocked | prob_word:  1e-06 | c_i:  1 | prob:  5.3805874372842635e-89
word:  by | prob_word:  1e-06 | c_i:  2 | prob:  5.380587437284263e-101
word:  a

In [4]:
import csv
from itertools import count
import operator
import math

EPSILON = 0.000001

def main():
    # Calculate all the ps and qs for each author
    # Example: zohairWordProb['judiciary'] = 0.005
    # ateeb_word_prob['silent'] = 0.0
    # zohair_word_prob['arson'] = 0.001

    zohair_word_prob = make_word_prob_map('zohair.txt')  # Changed to 'zohair.txt'
    ateeb_word_prob = make_word_prob_map('ateeb.txt')    # Changed to 'ateeb.txt'

    # Get the word count of the unknown document
    # Example: unknown_doc_count['judiciary'] = 5
    unknown_doc_count, n_words = make_word_count_map('unknown.txt')  # Changed to 'unknown.txt'

    # Select a word to test against, for instance 'judiciary' or 'arson'
    test_word = 'judiciary'  # Change this to any word relevant to your text files

    # Print out word probabilities and document counts
    print(f"zohair['{test_word}']\t", zohair_word_prob.get(test_word, EPSILON))  # Avoiding KeyError
    print(f"ateeb['{test_word}']\t", ateeb_word_prob.get(test_word, EPSILON))  # Avoiding KeyError
    print(f"doc_count['{test_word}']\t", unknown_doc_count.get(test_word, 0))  # Avoiding KeyError
    print("n_words", n_words)

    # Calculate log probabilities for both authors
    zohair_term = calc_log_pr_doc_given_author(zohair_word_prob, unknown_doc_count)
    print('---' * 10)
    ateeb_term = calc_log_pr_doc_given_author(ateeb_word_prob, unknown_doc_count)

    # Print out the calculated log probabilities
    print(f"Log P(D|Zohair)\t", zohair_term)
    print(f"Log P(D|Ateeb)\t", ateeb_term)

    # Calculate and print the difference
    print(f"diff\t", zohair_term - ateeb_term)

def calc_log_pr_doc_given_author(prob_map, counts):
    """
    Calculate the log probability of the document, given the counts of words in the doc
    and the author's probability map.
    """
    log_prob = math.log(1)  # Start with the log of 1, since we are multiplying probabilities
    for word_i, c_i in counts.items():
        p_i = get_word_prob(prob_map, word_i)
        log_prob += c_i * math.log(p_i)
    return log_prob

# If a word is in a probability dictionary, return its probability,
# otherwise, return epsilon (a small probability to avoid zero probability)
def get_word_prob(word_prob_map, word):
    if word in word_prob_map:
        return word_prob_map[word]
    return EPSILON

# Approximate the probability of a word being generated from the same distribution as the file.
# Assume that each word is produced independently, regardless of order.
def make_word_prob_map(fileName):
    wordMap, nWords = make_word_count_map(fileName)
    probabilityMap = {}
    for word in wordMap:
        count = wordMap[word]
        p = float(count) / nWords
        probabilityMap[word] = p
    return probabilityMap

# From a file name, count the number of times each word exists in that file.
# Return the result as a map (i.e., a dictionary).
def make_word_count_map(fileName):
    wordMap = {}
    nWords = 0
    with open(fileName, encoding='utf-8') as f:
        for line in f:
            words = line.split(' ')
            for word in words:
                word = standardize(word)
                add_word_to_count_map(wordMap, word)
                nWords += 1
    return wordMap, nWords

# Add a word to a count map. Ensures not to crash if the word hasn't been seen before.
def add_word_to_count_map(wordMap, word):
    if not word in wordMap:
        wordMap[word] = 0
    wordMap[word] += 1

# Standardizes a word. Converts it to lowercase and removes any punctuation.
def standardize(word):
    standard = word.lower().strip()
    # Remove punctuation
    standard = ''.join([i for i in standard if i.isalpha()])
    return standard

if __name__ == '__main__':
    main()


zohair['judiciary']	 1e-06
ateeb['judiciary']	 0.01384083044982699
doc_count['judiciary']	 0
n_words 263
------------------------------
Log P(D|Zohair)	 -2611.2663936765807
Log P(D|Ateeb)	 -2876.146408718518
diff	 264.88001504193744
