In [5]:
# Fetch the gzipped Movies & TV review dump; --continue resumes a partially
# downloaded file instead of starting over.
!wget --continue "https://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Movies_and_TV.json.gz"

--2023-06-07 18:11:04--  https://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Movies_and_TV.json.gz
Resolving snap.stanford.edu (snap.stanford.edu)... 171.64.75.80
Connecting to snap.stanford.edu (snap.stanford.edu)|171.64.75.80|:443... connected.
HTTP request sent, awaiting response... 206 Partial Content
Length: 1359979688 (1.3G), 1262265512 (1.2G) remaining [application/x-gzip]
Saving to: ‘reviews_Movies_and_TV.json.gz’

d_TV.json.gz         23%[+==>                ] 310.62M   130KB/s    eta 43m 43s^C


In [2]:
# Decompress the archive in place (produces reviews_Movies_and_TV.json).
# The download must have finished first: a truncated archive makes gzip fail
# with "unexpected end of file".
!gzip -d reviews_Movies_and_TV.json.gz


gzip: reviews_Movies_and_TV.json.gz: unexpected end of file


In [3]:
# --- Dataset sizes -----------------------------------------------------------
# Total number of reviews used, and how many are intended for each split.
# The test split is whatever remains after the training and evaluation splits.
REVIEWS = 30_000
TRAIN_COUNT = 20_000
EVALUATE_COUNT = 5_000
TEST_COUNT = REVIEWS - (TRAIN_COUNT + EVALUATE_COUNT)

# --- Vocabulary / data loading ------------------------------------------------
# MINIMUM_FREQUENCY is handed to EnglishData when the training set is built
# (presumably a token-frequency cutoff for the vocabulary -- confirm in english_data).
MINIMUM_FREQUENCY = 5
BATCH_SIZE = 64
NUM_WORKERS = 12

# --- Optimisation ---------------------------------------------------------------
MAX_EPOCHS = 30
LEARNING_RATE = 2e-5

# --- Model shape ------------------------------------------------------------------
# Passed positionally to LitLSTMLanguageModel after the vocabulary size.
EMBEDDING_DIMENSIONS = 300
HIDDEN_STATE_DIMENSIONS = 400
STACKS = 2

In [4]:
import json
from tqdm import tqdm
from english_data import EnglishData

# Read the first REVIEWS lines of the JSON-lines review dump and split them, in
# file order, into training, validation and test sets.
with open("reviews_Movies_and_TV.json") as f:
    # Each generator below is lazy: it parses one line from the shared file
    # handle per item, only while it is being iterated. The three generators
    # therefore carve consecutive, non-overlapping slices out of the file as
    # long as they are consumed in the order train -> validation -> test.
    #
    # The training split takes TRAIN_COUNT lines and the validation split takes
    # EVALUATE_COUNT lines (previously the two counts were swapped, so the model
    # trained on the small split and validated on the large one).
    train_reviews = (json.loads(next(f))["reviewText"] for _ in
                     tqdm(range(TRAIN_COUNT), total=TRAIN_COUNT, desc="Loading Training Data"))
    validate_reviews = (json.loads(next(f))["reviewText"] for _ in
                        tqdm(range(EVALUATE_COUNT), total=EVALUATE_COUNT, desc="Loading Validation Data"))

    test_reviews = (json.loads(next(f))["reviewText"] for _ in
                    tqdm(range(TEST_COUNT), total=TEST_COUNT, desc="Loading Testing Data"))

    # NOTE(review): this relies on EnglishData fully consuming its iterable
    # inside the constructor, while the file is still open -- confirm.
    # The vocabulary comes from the training set only and is reused for the
    # validation and test sets.
    train_dataset = EnglishData(train_reviews, MINIMUM_FREQUENCY)
    vocabulary = train_dataset.vocabulary
    validate_dataset = EnglishData(validate_reviews, vocabulary=vocabulary)
    test_dataset = EnglishData(test_reviews, vocabulary=vocabulary)


FileNotFoundError: [Errno 2] No such file or directory: 'reviews_Movies_and_TV.json'

In [None]:
from torch.utils.data import DataLoader
# Build one DataLoader per split. Every loader uses its own dataset's `collate`
# method for batching and shares the same batch size and worker count.
# No `shuffle` argument is passed, so each loader uses DataLoader's default ordering.
train_dataloader, validation_dataloader, test_dataloader = (
    DataLoader(
        split,
        collate_fn=split.collate,
        batch_size=BATCH_SIZE,
        num_workers=NUM_WORKERS,
    )
    for split in (train_dataset, validate_dataset, test_dataset)
)

In [None]:
from lightning.pytorch.callbacks import RichProgressBar
import lightning.pytorch as pl
# Lightning trainer: runs for at most MAX_EPOCHS epochs, uses
# "english_train_movies" as its root directory, and reports progress through
# the Rich progress bar callback.
trainer = pl.Trainer(
    callbacks=[RichProgressBar()],
    max_epochs=MAX_EPOCHS,
    default_root_dir="english_train_movies",
)

In [None]:
from language_model import LitLSTMLanguageModel
# Instantiate the language model. Arguments are positional, in this order:
# learning rate, vocabulary size, embedding size, hidden-state size, stack count.
model = LitLSTMLanguageModel(
    LEARNING_RATE,
    len(vocabulary.get_itos()),  # size of the vocabulary built from the training set
    EMBEDDING_DIMENSIONS,
    HIDDEN_STATE_DIMENSIONS,
    STACKS,
)

In [None]:
# Train the model, validating against the validation loader.
trainer.fit(
    model=model,
    train_dataloaders=train_dataloader,
    val_dataloaders=validation_dataloader,
)

In [None]:
# Evaluate the model on the held-out test loader.
trainer.test(
    model=model,
    dataloaders=test_dataloader,
)

In [None]:
# Write the trainer's current state to a checkpoint file in the working directory.
trainer.save_checkpoint(
    "english_movies_lm.ckpt"
)