In [8]:
import json
import os
import random
import re
import ssl
from collections import Counter

import nltk
import numpy as np
import torch
from nltk.corpus import stopwords
from tqdm import tqdm

# Make sure the NLTK stopwords corpus is installed, downloading it only when it
# is actually missing so a normal re-run of this cell needs no network access.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    # The download is performed with HTTPS certificate verification disabled
    # (workaround for environments where verifying the download server fails).
    # Verification is switched off ONLY for the duration of this download and
    # restored afterwards, so every later HTTPS request made from this kernel
    # is verified again.
    # NOTE(review): if other project modules download NLTK data themselves,
    # they no longer inherit the unverified context from this cell.
    _unverified_https_context = getattr(ssl, '_create_unverified_context', None)
    if _unverified_https_context is None:
        # Legacy Python without ssl._create_unverified_context: nothing to patch.
        nltk.download('stopwords')
    else:
        _saved_https_context = ssl._create_default_https_context
        ssl._create_default_https_context = _unverified_https_context
        try:
            nltk.download('stopwords')
        finally:
            ssl._create_default_https_context = _saved_https_context
from context_model import ContextModel
import preprocess_books as prep
import utils

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/naturalhg/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
# Read the training books and merge them into one text string
combined_books_text = prep.load_book_data('data/train.json')

# Clean/filter the text into a word sequence, per-word counts and the vocabulary
words_list, word_counts, vocab = prep.preprocess_text(combined_books_text)
total_num_of_words = len(words_list)
print(f'Number of words: {total_num_of_words}')
print(f'Number of unique words: {len(vocab)}')

# Relative frequency of each vocabulary word, in the iteration order of `vocab`
vocab_possibility = [word_counts[word] / total_num_of_words for word in vocab]
# Sanity check: the frequencies should add up to 1 (up to floating-point error)
print(f'Vocab possibility sum: {sum(vocab_possibility)}')
# Convert to a tensor. The list is repeated twice (list repetition, not scaling),
# giving a tensor of length 2 * len(vocab).
# NOTE(review): presumably one copy for the context half and one for the target
# half of the encoded sample -- confirm against Encoder.one_hot / model.learning.
vocab_possibility = torch.tensor(vocab_possibility * 2)


# Cut the word sequence into fixed-size training windows
train_data = prep.prepare_training_data(words_list, window_size=10)

print(f'train data shape: {train_data.shape}')
print(f'train sample: {random.choice(train_data)}')

Number of words: 2324461
Number of unique words: 6132
Vocab possibility sum: 1.0000000000000002
train data shape: (232446, 10)
train sample: ['surprised' 'gentleman' 'experienced' 'change' 'feeling' 'drew' 'back'
 'chair' 'took' 'newspaper']


In [None]:
from original_model import OriginalModel

# Build the model: N_size is the vocabulary size, kc_size is fixed at 350, and
# the model is placed on the GPU when one is available (CPU otherwise).
model = OriginalModel(
    N_size=len(vocab),
    kc_size=350,
    device=torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
    debug=False
)

from utils import Encoder
# Create encoder
enc = Encoder(vocab)

# Train model
num_epochs = 15

# Create the checkpoint directory up front. save_checkpoint is not visible
# here; if it does not create the folder itself, a missing directory would
# only surface after a full epoch of training has already been spent.
checkpoint_dir = 'trained_models'
os.makedirs(checkpoint_dir, exist_ok=True)

# NOTE(review): vocab_possibility is created without an explicit device --
# confirm model.learning moves it to the model's device when running on CUDA.
for i in range(num_epochs):
    # One pass over every training window; tqdm shows per-epoch progress.
    for sample in tqdm(train_data, desc=f'Epoch {i+1}/{num_epochs}', ncols=100, leave=True):
        enc_sample = enc.one_hot(sample, create_target_vector=True)
        model.learning(enc_sample, vocab_possibility, learning_rate=0.1)
    # Persist the model after each epoch so an interrupted run can be resumed.
    model.save_checkpoint(f'{checkpoint_dir}/original_model_epoch{i+1}_books.pt')
        

Epoch 1/15: 100%|█████████████████████████████████████████| 232446/232446 [00:43<00:00, 5306.99it/s]
Epoch 2/15:   7%|██▊                                       | 15361/232446 [00:03<00:44, 4869.56it/s]