In [1]:
import re
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
# word_tokenize needs the Punkt tokenizer data. Newer NLTK releases load it
# from the 'punkt_tab' package rather than 'punkt', so fetch both to keep the
# notebook runnable on a fresh environment with either NLTK generation.
nltk.download('punkt')
nltk.download('punkt_tab')

# Function to read and preprocess text
def read_and_preprocess(files):
    """Read text files and return one flat list of lowercase word tokens.

    Each file is read in full, lowercased, stripped of every character that
    is neither a word character nor whitespace, and tokenized with
    ``word_tokenize``. Tokens from all files are concatenated in the order
    the files are given.

    Args:
        files: Iterable of paths to the text files to read.

    Returns:
        list[str]: All tokens from all files, in reading order. Empty when
        ``files`` is empty.

    Raises:
        OSError: If a file cannot be opened.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    corpus = []
    for file in files:
        # Decode explicitly as UTF-8 instead of relying on the platform's
        # default locale encoding, which differs between operating systems.
        # NOTE(review): assumes the book files are UTF-8 encoded - confirm.
        with open(file, 'r', encoding='utf-8') as f:
            text = f.read().lower()
        # Remove punctuation (this also drops apostrophes inside words).
        text = re.sub(r'[^\w\s]', '', text)
        # Tokenize
        corpus.extend(word_tokenize(text))
    return corpus

# Paths of the seven book files, listed in the order they are read into the corpus.
files = [
    'harry_potter(1)/HP1.txt',
    'harry_potter(1)/HP2.txt',
    'harry_potter(1)/HP3.txt',
    'harry_potter(1)/HP4.txt',
    'harry_potter(1)/HP5.txt',
    'harry_potter(1)/HP6.txt',
    'harry_potter(1)/HP7.txt',
]

# Read every file and collect the tokens into a single flat list.
corpus = read_and_preprocess(files)


[nltk_data] Downloading package punkt to /Users/manisha/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Create vocabulary and mappings.
# The vocabulary is sorted so that every word gets the same index on every
# kernel restart; iterating a bare set of strings has no guaranteed order, which
# would make the learned embedding rows non-reproducible between runs.
unique_words = sorted(set(corpus))
word_to_idx = {word: i for i, word in enumerate(unique_words)}
idx_to_word = dict(enumerate(unique_words))

# Label encoding for corpus: one integer id per token.
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(corpus)

# One-hot encoding for integer encoded corpus.
# NOTE(review): with sparse_output=False this materializes a dense matrix with
# one row per corpus token and one column per distinct token. No visible cell
# reads `onehot_encoded`; consider sparse_output=True or dropping this step if
# memory becomes a problem.
onehot_encoder = OneHotEncoder(sparse_output=False, categories='auto')
# The encoder expects a 2-D column, so reshape the 1-D id vector to (n_tokens, 1).
integer_encoded = integer_encoded.reshape(-1, 1)
onehot_encoded = onehot_encoder.fit_transform(integer_encoded)

In [3]:
# Create training data
def generate_training_data(corpus, word_to_idx, window_size):
    """Build (center, context) index pairs from a token sequence.

    For every position in ``corpus`` the token's index is paired with the
    index of each token lying at most ``window_size`` positions to its left
    or right. The position itself is excluded and the window is clipped at
    the start and end of the corpus.

    Args:
        corpus: Sequence of tokens.
        word_to_idx: Mapping from token to integer index.
        window_size: Number of neighbours to take on each side.

    Returns:
        Two NumPy arrays of equal length: the center-word index repeated once
        per context word, and the matching context-word indices.

    Raises:
        KeyError: If a token is missing from ``word_to_idx``.
    """
    total = len(corpus)
    centers = []
    contexts = []
    for pos, token in enumerate(corpus):
        center_id = word_to_idx[token]
        # Clip the window to the valid index range of the corpus.
        first = max(pos - window_size, 0)
        stop = min(pos + window_size + 1, total)
        neighbour_ids = [
            word_to_idx[corpus[k]] for k in range(first, stop) if k != pos
        ]
        # One training pair per neighbour, all sharing the same center word.
        centers += [center_id] * len(neighbour_ids)
        contexts += neighbour_ids
    return np.array(centers), np.array(contexts)

# Number of neighbouring tokens taken on each side of a center word.
window_size = 1
# X holds center-word indices and y the matching context-word indices, one entry per pair.
X, y = generate_training_data(corpus, word_to_idx, window_size)


In [4]:
# Parameters
vocab_size = len(unique_words)
embedding_dim = 100  # Dimension of the embedding vectors

# Seed the Python, NumPy and TensorFlow random generators so weight
# initialization and batch shuffling are repeatable between runs.
tf.keras.utils.set_random_seed(42)

# Define the model: one word index in, a softmax over the vocabulary out.
input_layer = tf.keras.layers.Input(shape=(1,))
# `input_length` is omitted: it is deprecated in Keras 3 and the sequence
# length is already fixed to 1 by the Input layer above.
embedding_layer = tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim)(input_layer)
flatten_layer = tf.keras.layers.Flatten()(embedding_layer)
output_layer = tf.keras.layers.Dense(vocab_size, activation='softmax')(flatten_layer)

model = tf.keras.Model(inputs=input_layer, outputs=output_layer)
# Targets are integer word indices, hence the sparse variant of the loss.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

# Split dataset: hold out 10% of the (center, context) pairs for validation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# NOTE(review): in the run recorded below this cell, val_loss is lowest at
# epoch 2 and rises afterwards while the training loss keeps falling; consider
# fewer epochs or an early-stopping callback if validation loss matters here.
model.fit(X_train, y_train, epochs=50, batch_size=40, validation_data=(X_test, y_test))

print("Model training completed.")


Epoch 1/50




[1m49092/49092[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m518s[0m 11ms/step - loss: 6.6995 - val_loss: 6.1416
Epoch 2/50
[1m49092/49092[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m559s[0m 11ms/step - loss: 6.0079 - val_loss: 6.0751
Epoch 3/50
[1m49092/49092[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m637s[0m 13ms/step - loss: 5.8778 - val_loss: 6.0864
Epoch 4/50
[1m49092/49092[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m629s[0m 13ms/step - loss: 5.8123 - val_loss: 6.1211
Epoch 5/50
[1m49092/49092[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m612s[0m 12ms/step - loss: 5.7658 - val_loss: 6.1489
Epoch 6/50
[1m49092/49092[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m612s[0m 12ms/step - loss: 5.7187 - val_loss: 6.1729
Epoch 7/50
[1m49092/49092[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m611s[0m 12ms/step - loss: 5.6814 - val_loss: 6.1889
Epoch 8/50
[1m49092/49092[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m615s[0m 13ms/step - loss: 5.6504 - val_loss: 6.

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Create a model to extract embeddings: a sub-model that runs from the trained
# model's input up to the output of the layer at index 1.
# NOTE(review): index 1 is expected to be the Embedding layer, since it is built
# directly after the Input layer in the model cell - verify if the architecture changes.
embedding_model = tf.keras.Model(inputs=model.input, outputs=model.get_layer(index=1).output)
embeddings = embedding_model.get_weights()[0]  # First weight array of the sub-model, used as the embedding matrix (rows indexed by word index)

# Function to get embedding for a word
def get_embedding(word):
    """Return the row of the global ``embeddings`` matrix for ``word``.

    The row index is looked up in the global ``word_to_idx`` mapping.

    Raises:
        KeyError: If ``word`` is not in ``word_to_idx``.
    """
    return embeddings[word_to_idx[word]]

# Compute similarity between the target word and all other words
def get_most_similar_words(target_word, top_n=10):
    """Rank vocabulary words by cosine similarity to ``target_word``.

    The target's vector comes from ``get_embedding``; every other entry of the
    global ``idx_to_word`` mapping is scored against it using its row of the
    global ``embeddings`` matrix. All similarities are computed in a single
    vectorized pass rather than one library call per vocabulary word.

    Args:
        target_word: Word to compare against; it is excluded from the result.
        top_n: Slice bound applied to the ranked list (default 10).

    Returns:
        list[tuple[str, float]]: ``(word, similarity)`` pairs sorted by
        descending similarity, truncated with ``[:top_n]``. Words with equal
        similarity keep their ``idx_to_word`` order.

    Raises:
        KeyError: If ``target_word`` is not in the vocabulary.
    """
    target_vec = np.asarray(get_embedding(target_word), dtype=np.float64)

    # Every vocabulary entry except the query word itself is a candidate.
    candidates = [(idx, word) for idx, word in idx_to_word.items() if word != target_word]
    if not candidates:
        return []

    row_ids = [idx for idx, _ in candidates]
    matrix = np.asarray(embeddings, dtype=np.float64)[row_ids]

    dots = matrix @ target_vec
    norms = np.linalg.norm(matrix, axis=1) * np.linalg.norm(target_vec)
    # A zero-length vector has no direction; score it 0 instead of dividing by zero.
    scores = np.divide(dots, norms, out=np.zeros_like(dots), where=norms > 0)

    # Stable sort on the negated scores: descending order, ties in original order.
    ranking = np.argsort(-scores, kind="stable")[:top_n]
    return [(candidates[i][1], scores[i]) for i in ranking]

# Example usage
target_word = "harry"
top_related_words = get_most_similar_words(target_word, top_n=10)

print(f"Top related words to '{target_word}':")
for word, similarity in top_related_words:
    print(f"{word}: {similarity:.4f}")


Top related words to 'harry':
hermione: 0.6103
ron: 0.5970
james: 0.5350
dirk: 0.4983
ringleaders: 0.4949
feebly: 0.4848
witheringly: 0.4692
aberforth: 0.4628
magorian: 0.4619
mournfully: 0.4601


In [12]:
from sklearn.metrics.pairwise import cosine_similarity

# Create a model to extract embeddings: a sub-model that runs from the trained
# model's input up to the output of the layer at index 1.
# NOTE(review): index 1 is expected to be the Embedding layer, since it is built
# directly after the Input layer in the model cell - verify if the architecture changes.
embedding_model = tf.keras.Model(inputs=model.input, outputs=model.get_layer(index=1).output)
embeddings = embedding_model.get_weights()[0]  # First weight array of the sub-model, used as the embedding matrix (rows indexed by word index)

# Function to get embedding for a word
def get_embedding(word):
    """Return the row of the global ``embeddings`` matrix for ``word``.

    The row index is looked up in the global ``word_to_idx`` mapping.

    Raises:
        KeyError: If ``word`` is not in ``word_to_idx``.
    """
    return embeddings[word_to_idx[word]]

# Compute similarity between the target word and all other words
def get_most_similar_words(target_word, top_n=10):
    """Rank vocabulary words by cosine similarity to ``target_word``.

    The target's vector comes from ``get_embedding``; every other entry of the
    global ``idx_to_word`` mapping is scored against it using its row of the
    global ``embeddings`` matrix. All similarities are computed in a single
    vectorized pass rather than one library call per vocabulary word.

    Args:
        target_word: Word to compare against; it is excluded from the result.
        top_n: Slice bound applied to the ranked list (default 10).

    Returns:
        list[tuple[str, float]]: ``(word, similarity)`` pairs sorted by
        descending similarity, truncated with ``[:top_n]``. Words with equal
        similarity keep their ``idx_to_word`` order.

    Raises:
        KeyError: If ``target_word`` is not in the vocabulary.
    """
    target_vec = np.asarray(get_embedding(target_word), dtype=np.float64)

    # Every vocabulary entry except the query word itself is a candidate.
    candidates = [(idx, word) for idx, word in idx_to_word.items() if word != target_word]
    if not candidates:
        return []

    row_ids = [idx for idx, _ in candidates]
    matrix = np.asarray(embeddings, dtype=np.float64)[row_ids]

    dots = matrix @ target_vec
    norms = np.linalg.norm(matrix, axis=1) * np.linalg.norm(target_vec)
    # A zero-length vector has no direction; score it 0 instead of dividing by zero.
    scores = np.divide(dots, norms, out=np.zeros_like(dots), where=norms > 0)

    # Stable sort on the negated scores: descending order, ties in original order.
    ranking = np.argsort(-scores, kind="stable")[:top_n]
    return [(candidates[i][1], scores[i]) for i in ranking]

# Example usage
target_word = "harry"
top_related_words = get_most_similar_words(target_word, top_n=10)

print(f"Top related words to '{target_word}':")
for word, similarity in top_related_words:
    print(f"{word}: {similarity:.4f}")


Top related words to 'harry':
hermione: 0.7459
ron: 0.7029
neville: 0.6917
hagrid: 0.6775
he: 0.6774
snape: 0.6748
fred: 0.6644
wormtail: 0.6555
dudley: 0.6297
she: 0.6290
