In [1]:
import re
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from typing import List, Dict, Tuple
# Fetch the Punkt tokenizer data from NLTK (used for tokenization below).
nltk.download('punkt')
# Class labels, one per book; they are matched by position to the book files.
y_book_array = ["HP1", "HP2", "HP3", "HP4", "HP5", "HP6", "HP7"]


# Function to read and preprocess text
def read_and_preprocess(files, encoding='utf-8'):
    """Read text files and tokenize them line by line.

    Each file is lower-cased and split on newline characters; every resulting
    line is treated as one "page". Characters that are neither word characters
    nor whitespace are removed before the page is passed to ``word_tokenize``.

    Args:
        files: Iterable of paths to the text files, one per book.
        encoding: Text encoding used to decode every file. Defaults to UTF-8 so
            that the result does not depend on the platform's locale encoding.

    Returns:
        A ``(corpus, book_tokens)`` tuple. ``corpus`` is a flat list of every
        token from every file. ``book_tokens`` holds, for each file, a list of
        pages, each page being its list of tokens.
    """
    corpus = []
    book_tokens = []

    for file in files:
        # Decode with an explicit encoding; the bare open() default is
        # locale-dependent and can mis-decode or fail on some platforms.
        with open(file, 'r', encoding=encoding) as f:
            text = f.read().lower()

        # The whole file is in memory, so the handle is released before the
        # (comparatively slow) tokenization starts.
        book_pages_tokens = []
        for page in text.split('\n'):
            # Remove punctuation
            page = re.sub(r'[^\w\s]', '', page)
            # Tokenize
            tokens = word_tokenize(page)
            corpus.extend(tokens)
            book_pages_tokens.append(tokens)

        book_tokens.append(book_pages_tokens)
    return corpus, book_tokens

# Paths to the seven books, listed in the same order as the labels in y_book_array.
files = [
    'HarryPotter/HP1.txt',
    'HarryPotter/HP2.txt',
    'HarryPotter/HP3.txt',
    'HarryPotter/HP4.txt',
    'HarryPotter/HP5.txt',
    'HarryPotter/HP6.txt',
    'HarryPotter/HP7.txt',
]

# Preprocess the text.
# `corpus` is a flat list of every token. `tokens` is nested three levels deep:
# one list per book, each containing one list of tokens per page.
corpus, tokens = read_and_preprocess(files)

[nltk_data] Error loading punkt: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:997)>


In [2]:
def extract_ngrams_from_tokens(tokens, book, n):
    """Build the space-joined n-grams of a token sequence.

    Args:
        tokens: Sequence of string tokens.
        book: Label that is repeated once for every generated n-gram.
        n: Number of consecutive tokens per n-gram; must be at least 1.

    Returns:
        A ``(n_grams, y_array)`` tuple: the n-grams in order of appearance, and
        a list of the same length filled with ``book``. Both lists are empty
        when ``tokens`` has fewer than ``n`` items.

    Raises:
        ValueError: If ``n`` is smaller than 1. Without this check a
            non-positive ``n`` would silently yield empty or malformed n-grams.
    """
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n!r}")

    # range() is empty when there are fewer than n tokens, so short inputs
    # naturally produce no n-grams.
    n_grams = [' '.join(tokens[start:start + n]) for start in range(len(tokens) - n + 1)]
    y_array = [book] * len(n_grams)
    return n_grams, y_array

# Size of the n-grams used as features. With n = 2 these are bigrams; the
# 'trigrams' column name and the x_y_trigrams variable are kept unchanged
# because later cells read them.
N_GRAM_SIZE = 2

# Collect one row per page (its n-grams plus the label of the book it came
# from) and build the DataFrame once at the end, instead of concatenating a
# new frame for every book.
page_ngrams = []
page_labels = []
for book_idx, book_pages in enumerate(tokens):
    book_label = y_book_array[book_idx]
    for page_tokens in book_pages:
        # Only the n-grams are needed; the per-n-gram label list is discarded
        # because each page gets a single label.
        n_grams = extract_ngrams_from_tokens(page_tokens, book_label, N_GRAM_SIZE)[0]
        page_ngrams.append(n_grams)
        page_labels.append(book_label)

x_y_trigrams = pd.DataFrame({'trigrams': page_ngrams, 'book': page_labels})

display(x_y_trigrams)

Unnamed: 0,trigrams,book
0,"[mr and, and mrs, mrs dursley, dursley of, of ...",HP1
1,"[met for, for several, several years, years in...",HP1
2,"[the cat, cat it, it stared, stared back, back...",HP1
3,"[calls and, and shouted, shouted a, a bit, bit...",HP1
4,"[he found, found it, it a, a lot, lot harder, ...",HP1
...,...,...
4702,"[youre right, right sorry, sorry said, said ro...",HP7
4703,"[no said, said harry, harry firmly, firmly you...",HP7
4704,"[ginny kissed, kissed albus, albus goodbye, go...",HP7
4705,"[he had, had never, never told, told any, any ...",HP7


In [3]:
from collections import Counter
# Hold out 10% of the pages as the final test set.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    x_y_trigrams["trigrams"],
    x_y_trigrams["book"],
    test_size=0.1,
    random_state=42,
)

# Split the remaining pages again: 20% of them become the validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val,
    y_train_val,
    test_size=0.2,
    random_state=42,
)


In [4]:
from sklearn.utils.class_weight import compute_class_weight

def calculate_class_weights(y_train):
    """Map every distinct label in ``y_train`` to its class weight.

    The weights come from ``compute_class_weight`` called with ``None`` as the
    weighting scheme; in the recorded run of this notebook that yields a
    weight of 1.0 for every class.

    Args:
        y_train: Sequence of class labels.

    Returns:
        Dict from each unique label (in ``np.unique`` order) to its weight.
    """
    labels = np.unique(y_train)
    weights = compute_class_weight(None, classes=labels, y=y_train)
    return {label: weight for label, weight in zip(labels, weights)}

# Example usage: compute and show the weights for the training-split labels.
class_weights = calculate_class_weights(y_train)
print(class_weights)


{'HP1': 1.0, 'HP2': 1.0, 'HP3': 1.0, 'HP4': 1.0, 'HP5': 1.0, 'HP6': 1.0, 'HP7': 1.0}


In [5]:
from collections import Counter

#Calculating the priors
def get_priors(y_train):
    """Estimate each book's prior as its share of the training labels.

    Args:
        y_train: Sequence of class labels, one per training page.

    Returns:
        Dict mapping every label that occurs in ``y_train`` to
        ``count / len(y_train)``. Empty input gives an empty dict.
    """
    n_labels = len(y_train)
    return {book: count / n_labels for book, count in Counter(y_train).items()}

# Counting the occurrences of an n-gram per book

def get_occurrences(n_gram: str, x_train: List[List[str]], y_train: List[str], classes: List[str]) -> Dict[str, int]:
    """Count, per class, how many training pages contain ``n_gram``.

    A page counts at most once for its class, however often the n-gram
    appears in it.

    Args:
        n_gram: The n-gram to look for.
        x_train: Training pages, each a list of n-grams.
        y_train: Class label of each page, aligned with ``x_train``.
        classes: All class labels; each starts with a count of zero.

    Returns:
        Dict from class label to the number of its pages containing ``n_gram``.
    """
    page_counts = dict.fromkeys(classes, 0)
    for page_ngrams, label in zip(x_train, y_train):
        if n_gram not in page_ngrams:
            continue
        page_counts[label] += 1
    return page_counts
    
from collections import defaultdict
import numpy as np

def get_likelihood(book: str, n_gram: str, n_gram_map: Dict[str, Dict[str, int]], total_trigrams_per_class: Dict[str, int], vocab_size: int, delta: float = 1.0) -> float:
    """Return the additively smoothed likelihood of ``n_gram`` given ``book``.

    Computes ``(delta + count) / (delta * vocab_size + total)``, where
    ``count`` is how often ``n_gram`` was seen for ``book`` and ``total`` is
    the number of n-grams seen for ``book`` overall.

    Args:
        book: Class label to evaluate.
        n_gram: The n-gram whose likelihood is wanted.
        n_gram_map: Mapping n-gram -> {book -> count}. Unseen n-grams or books
            count as zero.
        total_trigrams_per_class: Mapping book -> total n-gram count.
        vocab_size: Number of distinct n-grams in the training data.
        delta: Smoothing constant. The default of 1.0 is Laplace smoothing and
            matches the previously hard-coded value.

    Returns:
        The smoothed likelihood as a float.

    Raises:
        KeyError: If ``book`` is missing from ``total_trigrams_per_class``.
    """
    # .get() is used on purpose: if n_gram_map is a defaultdict, plain indexing
    # would insert empty entries for every unseen n-gram that is looked up.
    seen_count = n_gram_map.get(n_gram, {}).get(book, 0)

    numerator = delta + seen_count
    denominator = delta * vocab_size + total_trigrams_per_class[book]

    return numerator / denominator


def n_gram_mapping(x_train: List[List[str]], y_train: List[str], classes: List[str]) -> Tuple[Dict[str, Dict[str, int]], Dict[str, int]]:
    """Tally n-gram counts per class over the training pages.

    Args:
        x_train: Training pages, each a list of n-grams.
        y_train: Class label of each page, aligned with ``x_train``.
        classes: All class labels; each starts with a total of zero.

    Returns:
        A ``(n_gram_map, total_trigrams_per_class)`` tuple. ``n_gram_map`` is a
        nested defaultdict mapping n-gram -> class -> number of occurrences.
        ``total_trigrams_per_class`` maps each class to the number of n-grams
        seen for it, counting repeats.
    """
    counts_by_ngram = defaultdict(lambda: defaultdict(int))
    totals_by_class = dict.fromkeys(classes, 0)

    for page_ngrams, label in zip(x_train, y_train):
        for gram in page_ngrams:
            per_class = counts_by_ngram[gram]
            per_class[label] += 1
            totals_by_class[label] += 1

    return counts_by_ngram, totals_by_class


def prob_page_in_book(page, priors, n_gram_map, total_trigrams_per_class, classes, vocab_size, class_weights):
    """Score a page against every class in log space.

    For each class the score starts at ``log(prior * class_weight)`` and the
    log-likelihood of every n-gram of the page (from ``get_likelihood``) is
    added to it.

    Args:
        page: The n-grams of the page to score.
        priors: Mapping class -> prior probability.
        n_gram_map: Mapping n-gram -> {class -> count}.
        total_trigrams_per_class: Mapping class -> total n-gram count.
        classes: Class labels, in the order the scores are returned.
        vocab_size: Number of distinct n-grams in the training data.
        class_weights: Mapping class -> multiplicative weight on the prior.

    Returns:
        List of log scores, one per entry of ``classes``, in the same order.
    """
    def score_for(book):
        # Weighted prior first, then one log-likelihood term per n-gram.
        running_total = np.log(priors[book] * class_weights[book])
        for gram in page:
            gram_likelihood = get_likelihood(book, gram, n_gram_map, total_trigrams_per_class, vocab_size)
            running_total += np.log(gram_likelihood)
        return running_total

    return [score_for(book) for book in classes]


def predict(X: List[List[str]], y: List[str], X_test: List[List[str]], classes: List[str]) -> List[str]:
    """Fit the n-gram Naive Bayes statistics on ``(X, y)`` and label ``X_test``.

    Args:
        X: Training pages, each a list of n-grams.
        y: Class label of each training page, aligned with ``X``.
        X_test: Pages to classify, each a list of n-grams.
        classes: Candidate class labels; every prediction is one of these.

    Returns:
        One predicted label per page of ``X_test``: the class with the highest
        log score (the first such class on ties, as ``np.argmax`` does).
    """
    # Training statistics: priors, per-class n-gram counts, vocabulary size.
    class_priors = get_priors(y)
    ngram_counts, ngram_totals = n_gram_mapping(X, y, classes)
    distinct_ngrams = {gram for page_ngrams in X for gram in page_ngrams}
    vocab_size = len(distinct_ngrams)

    weights = calculate_class_weights(y)

    def classify(page):
        scores = prob_page_in_book(page, class_priors, ngram_counts, ngram_totals, classes, vocab_size, weights)
        return classes[np.argmax(scores)]

    return [classify(page) for page in X_test]





In [6]:

# Accuracy on the validation split: fraction of pages whose predicted book
# matches the true one.
predictions = predict(X_train, y_train, X_val, y_book_array)
n_correct = sum(1 for predicted, actual in zip(predictions, y_val) if predicted == actual)
accuracy = n_correct / len(y_val)
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 70.49%


In [7]:
# Accuracy on the held-out test split, using the same training data as above.
predictions = predict(X_train, y_train, X_test, y_book_array)
n_correct = sum(1 for predicted, actual in zip(predictions, y_test) if predicted == actual)
accuracy = n_correct / len(y_test)
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 71.76%
