In [1]:
import json
import re
from collections import Counter
import numpy as np
import random
import torch
from tqdm import tqdm
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

from context_model import ContextModel
import preprocess_books as prep
import utils

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\camer\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Load the training books and combine them into a single string
combined_books_text = prep.load_book_data('data/train.json')

# Clean/filter the text. Returns the processed word list, per-word counts,
# and the vocabulary (all three are reused by later cells).
words_list, word_counts, vocab = prep.preprocess_text(combined_books_text)

# Create training data: np array shape [N, window_size]
train_data = prep.prepare_training_data(words_list, window_size=10)

# Sanity check: report the array shape and show one randomly chosen window.
# The row is picked by explicit index rather than random.choice(train_data):
# random.choice's emptiness check has been implemented via truthiness in some
# CPython 3.11 releases, which is ambiguous for multi-element NumPy arrays.
# NOTE(review): no RNG seed is set, so the sample shown differs between runs.
sample_idx = random.randrange(len(train_data))

print(f'train data shape: {train_data.shape}')
print(f'train sample: {train_data[sample_idx]}')

train data shape: (232446, 10)
train sample: ['elements' 'regular' '<unk>' 'moral' 'material' 'world' 'depend'
 'complained' 'smoking' 'blood']


In [None]:
# Create model
model = ContextModel(
    K_size=350,             # Number of neurons
    vocab_size=len(vocab),  # Size of vocab
    k=5,                    # Update top-k neurons
    lr=.1,                  # Learning rate
    norm_rate=5             # Normalization rate
)

# Create encoder (built from the vocabulary produced during preprocessing)
enc = utils.Encoder(vocab)

# Train model: every epoch is one full pass over train_data, feeding the model
# one encoded window at a time.
num_epochs = 3

for epoch in range(num_epochs):
    # tqdm only wraps the iterable to draw a per-epoch progress bar.
    for sample in tqdm(train_data, desc=f'Epoch {epoch+1}/{num_epochs}', ncols=100, leave=True):
        enc_sample = enc.one_hot(sample)  # presumably a one-hot encoding of the window; see utils.Encoder
        model.update(enc_sample)

# Save model (the epoch count is embedded in the file name)
utils.save_model(model, f'trained_models/context_model_epoch{num_epochs}_books.pt')

Epoch 1/3: 100%|██████████████████████████████████████████| 232446/232446 [01:43<00:00, 2251.27it/s]
Epoch 2/3: 100%|██████████████████████████████████████████| 232446/232446 [01:47<00:00, 2161.46it/s]
Epoch 3/3: 100%|██████████████████████████████████████████| 232446/232446 [01:49<00:00, 2118.14it/s]


In [None]:
# Nearest-neighbour lookup: list the vocabulary words whose embeddings are
# most similar to the embedding of `target_word`.

# Query settings
target_word = 'ship'
top_N_closest = 20   # forwarded as top_N
hash_length = 70     # forwarded as hash_len; see utils.calc_print_sim_words for its meaning

# Optional: query a previously saved checkpoint instead of the model trained
# above by uncommenting the next line.
#model = utils.load_model('trained_models/context_model_epoch3_books.pt')

# Prints a table of word / similarity / frequency for the closest matches.
utils.calc_print_sim_words(
    model=model,
    vocab=vocab,
    word_counts=word_counts,
    word=target_word,
    top_N=top_N_closest,
    hash_len=hash_length,
)

Word            Similarity Frequency 
-----------------------------------
ship                1.000       1203
vessel              0.811        366
ocean               0.811        322
land                0.806        932
waves               0.806        288
conseil             0.806        285
_nautilus_          0.806        507
shore               0.800        586
ned                 0.800        359
canadian            0.800        145
seas                0.794        216
sea                 0.789       1717
board               0.789        575
boat                0.789       1038
screw               0.789         90
crew                0.789        311
floating            0.789        158
observations        0.783         68
island              0.783        628
ice                 0.783        239
