In [1]:
import re
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
nltk.download('punkt')

y_book_array = ["HP1", "HP2", "HP3", "HP4"]


# Function to read and preprocess text
def read_and_preprocess(files):
    corpus = []
    book_tokens = []
   
    for file in files:
        with open(file, 'r') as f:
            text = f.read().lower()
            # Split text into pages
            pages = text.split('\n')
            
            book_pages_tokens = []
            for page in pages:
                # Remove punctuation
                page = re.sub(r'[^\w\s]', '', page)
                # Tokenize
                tokens = word_tokenize(page)
                corpus.extend(tokens)
                book_pages_tokens.append(tokens)
            
            book_tokens.append(book_pages_tokens)
    return corpus, book_tokens

# Specify the files for the seven books
files = ['HarryPotter/HP1.txt', 'HarryPotter/HP2.txt', 'HarryPotter/HP3.txt', 'HarryPotter/HP4.txt']

# Preprocess the text
# tokens is now a 3d array with each sub array has sub-subarrays that are each page for each book
corpus, tokens = read_and_preprocess(files)

[nltk_data] Error loading punkt: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:997)>


LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - '/Users/ashleasmith/nltk_data'
    - '/Library/Frameworks/Python.framework/Versions/3.10/nltk_data'
    - '/Library/Frameworks/Python.framework/Versions/3.10/share/nltk_data'
    - '/Library/Frameworks/Python.framework/Versions/3.10/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


In [None]:
def extract_ngrams_from_tokens(tokens, book,  n):
    # Step 1: Initialize an empty list to store N-grams
    n_grams = []
    y_array = []
    # Step 2: Generate N-grams
    for i in range(len(tokens) - n + 1):
        n_gram = ' '.join(tokens[i:i+n])
        n_grams.append(n_gram)
        y_array.append(book)
    return n_grams, y_array

for i in range(len(files)):
    if i ==0:
        pages_trigramed = []
        pages_y_array = []
        # for each page 
        for j in range(len(tokens[i])): 
            trigrams, trigrams_y_array = extract_ngrams_from_tokens(tokens[i][j],y_book_array[i], 3)
            pages_trigramed.append(trigrams)
            pages_y_array.append(y_book_array[i])
        x_y_trigrams = pd.DataFrame({'trigrams': pages_trigramed, 'book': pages_y_array})
    else:
        pages_trigramed = []
        pages_y_array = []
        for j in range(len(tokens[i])): 
            trigrams, trigrams_y_array = extract_ngrams_from_tokens(tokens[i][j],y_book_array[i], 3)
            pages_trigramed.append(trigrams)
            pages_y_array.append(y_book_array[i])
        book_x_y_trigrams = pd.DataFrame({"trigrams": pages_trigramed, "book": pages_y_array})
        x_y_trigrams = pd.concat([x_y_trigrams, book_x_y_trigrams], ignore_index=True)
    
display(x_y_trigrams)

Unnamed: 0,trigrams,book
0,"[mr and mrs, and mrs dursley, mrs dursley of, ...",HP1
1,"[met for several, for several years, several y...",HP1
2,"[the cat it, cat it stared, it stared back, st...",HP1
3,"[calls and shouted, and shouted a, shouted a b...",HP1
4,"[he found it, found it a, it a lot, a lot hard...",HP1
...,...,...
2018,"[harry potter and, potter and the, and the gob...",HP4
2019,"[harry potter and, potter and the, and the gob...",HP4
2020,"[harry potter and, potter and the, and the gob...",HP4
2021,"[harry thanks george, thanks george muttered, ...",HP4


In [None]:

X_train_val, X_test, y_train_val, y_test = train_test_split(x_y_trigrams["trigrams"], x_y_trigrams["book"], test_size=0.2, random_state=42)

X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42)


<h2>Naive Bayes Classifier </h2>

In [None]:
from collections import Counter

#Calculating the priors
def get_priors(y_train):
    book_counts = Counter(y_train)
    total_count = len(y_train)
    priors = {i: book_counts[i]/total_count for i in book_counts}
    
    return priors

In [None]:
#calculating the occurences

def get_occurences(n_gram, x_train, y_train, classes):
    occurences = {i: 0 for i in classes}
    
    for trigrams, book in zip(x_train, y_train):
        if n_gram in trigrams:
            occurences[book] +=1
        return occurences
    


In [None]:
#caluclating the likelihood
#vocab size = len(page)
def get_likelihood(book_index,n_gram, n_gram_mapping, total_trigrams_per_class, vocab_size):
    delta = 0.1
    
    if n_gram in n_gram_mapping:
        numerator = delta + n_gram_mapping[n_gram][book_index]
        denominator = delta * vocab_size + total_trigrams_per_class[book_index]
    else:
        numerator = delta
        denominator = delta * vocab_size + total_trigrams_per_class[book_index]
    
    return numerator / denominator

In [None]:
#n-gram mapping -> mapping from each n_gram to its occurences count in each book and 
# calculates the total number of trigrams per class

def n_gram_mapping(x_train, y_train, classes):
    n_gram_mapping = {}
    total_trigrams_per_class = {c: 0 for c in classes}
    
    for trigrams, book in zip(x_train, y_train):
        for n_gram in trigrams:
            if n_gram not in n_gram_mapping:
                n_gram_mapping[n_gram] = get_occurences(n_gram, x_train, y_train, classes)
            total_trigrams_per_class[book] += 1
    return n_gram_mapping, total_trigrams_per_class
            

In [None]:
#calculating the probability of page being in a book

def prob_page_in_book(page, priors, n_gram_mapping, total_trigrams_per_class, classes, vocab_size):
    probs =[]
    
    for c in classes:
        prior = priors[c]
        likelihood = 1
        
        for n_gram in page:
            likelihood *= get_likelihood(c, n_gram, n_gram_mapping, total_trigrams_per_class, vocab_size)
        probs.append(prior * likelihood)
    return probs
        

SyntaxError: incomplete input (2250654496.py, line 3)

In [None]:
#one-hot encoding 
# used to identify the class with the highest probability

def convert_prob_to_one_hot(probs):
    probs = np.array(probs)
    
    one_hots = np.zeros(len(probs))
    one_hot_index = np.random.choice(np.where(probs == probs.max())[0])
    one_hots[one_hot_index] = 1
    return one_hots

#getting the actual one hot 

def get_actual_one_hot_encoding(index, length):
    one_hots = np.zeros(length)
    one_hots[index] = 1
    return one_hots


    

In [None]:
# split data into training and testing sets
def filter_books(x, y, train_ratio = 0.6):
    x_train, x_test = [], []
    y_train, y_test = [], []
    
    for i in range(len(x)):
        split_idx = int(len(x[i]) * train_ratio)
        X_train.append(x[i][:split_idx])
        y_train.append(y[i][:split_idx])
        X_test.append(x[i][split_idx:])
        y_test.append(y[i][split_idx:])
        
    X_train_flat = [item for sublist in x_train for item in sublist]
    y_train_flat = [item for sublist in y_train for item in sublist]
    X_test_flat = [item for sublist in x_test for item in sublist]
    y_test_flat = [item for sublist in y_test for item in sublist]
    
    return X_train_flat, X_test_flat, y_train_flat, y_test_flat
    
    

SyntaxError: incomplete input (3321048428.py, line 1)

In [None]:

X = x_y_trigrams['trigrams'].tolist()
y = x_y_trigrams['book'].tolist()

# Split the data
X_train, X_test, y_train, y_test = filter_books(X, y)

# Get priors and n-gram mappings
classes = list(set(y_train))
priors = get_priors(y_train)
ngram_mapping, total_trigrams_per_class = n_gram_mapping(X_train, y_train, classes)
vocab_size = len(set([ngram for trigrams in X_train for ngram in trigrams]))

# Predict for a test page
page = X_test[0]
probs = prob_page_in_book(page, priors, ngram_mapping, total_trigrams_per_class, classes, vocab_size)
one_hot_prediction = convert_prob_to_one_hot(probs)

# Compare with the actual label
actual_one_hot = get_actual_one_hot_encoding(y_test[0], len(classes))
print("Predicted:", one_hot_prediction)
print("Actual:", actual_one_hot)

In [None]:
def predict_page_class(page, priors, ngram_mapping, total_trigrams_per_class, classes, vocab_size):
    probs = prob_page_in_book(page, priors, ngram_mapping, total_trigrams_per_class, classes, vocab_size)
    return np.argmax(probs)  #return the index of the highest probability

def calculate_accuracy(X_test, y_test, priors, ngram_mapping, total_trigrams_per_class, classes, vocab_size):
    correct_predictions = 0
    total_predictions = len(X_test)

    for i, page in enumerate(X_test):
        predicted_class = predict_page_class(page, priors, ngram_mapping, total_trigrams_per_class, classes, vocab_size)
        actual_class = y_test[i]
        if predicted_class == actual_class:
            correct_predictions += 1

    accuracy = correct_predictions / total_predictions
    return accuracy

In [None]:
# Calculate accuracy on the test set
accuracy = calculate_accuracy(X_test, y_test, priors, ngram_mapping, total_trigrams_per_class, classes, vocab_size)
print(f"Model Accuracy: {accuracy * 100:.2f}%")