# Experiments with Text Corpora Datasets

## Imports

In [1]:
import os
import sys

from torch.utils.data import Dataset, DataLoader


In [2]:
# Put the parent directory on sys.path (at most once) so that modules living
# there can be imported from this notebook.
module_path = os.path.abspath(os.pardir)
if module_path not in sys.path:
    sys.path.append(module_path)

from lcats.datasets import torchdata

In [3]:
# Development aid: change the condition to True and re-run this cell to
# re-import the torchdata module after editing it locally.
if False:  # Code to reload modules if we make local code changes, off by default.
    from importlib import reload
    reload(torchdata)

## Story Dataset

In [None]:
# Root directory of the story data used by the cells below; the path is
# relative, so it resolves against the notebook's working directory.
NOTEBOOK_ROOT_DIR = "../data"

def display_dataset(dataset, limit=None):
    """Print a summary of the stories in a dataset.

    For every story listed, prints its name, the length of its body in
    characters, and the path of the file at the same index in
    ``dataset.file_paths``. A trailer line reports how many stories were
    left out (if any), followed by the total story count.

    Args:
        dataset: A sized, indexable dataset exposing ``data_dir`` and
            ``file_paths`` attributes, whose items are mappings with
            ``'name'`` and ``'body'`` keys.
        limit: Maximum number of stories to list. ``None`` (the default)
            lists every story; negative values are treated as zero.
    """
    total = len(dataset)
    # Decide up front how many stories to list. Clamping to [0, total] keeps
    # the "not shown" count exact: exactly `limit` stories are listed and the
    # trailer is printed only when something was really left out.
    shown = total if limit is None else max(0, min(limit, total))

    print(f"Data Directory: {dataset.data_dir}")
    for index in range(shown):
        story = dataset[index]
        print(f" - {story['name']:60} - {len(story['body'])} characters")
        print(f"   {dataset.file_paths[index]}")
    if shown < total:
        print(f"[{total - shown} more stories not shown]")
    print(f"Total stories in dataset: {total}")
    print()

# Build a dataset over the whole data root (no subdirectory given) and print
# a summary of it, capping the listing via limit=10.
complete_dataset = torchdata.JsonDataset(NOTEBOOK_ROOT_DIR)
display_dataset(complete_dataset, limit=10)




In [None]:
# Show the 'metadata' entry of the first item in the complete dataset.
complete_dataset[0]['metadata']

In [None]:
# Wrap the complete dataset in a DataLoader that yields shuffled batches of
# two items, then print every batch it produces.
complete_dataloader = DataLoader(complete_dataset, batch_size=2, shuffle=True)
for batch in complete_dataloader:
    print(batch)


### Sherlock Subdirectory

In [None]:
# Build a dataset from the "sherlock" subdirectory of the data root and print
# a summary of all of its stories (no listing limit).
sherlock_dataset = torchdata.JsonDataset(
    root_dir=NOTEBOOK_ROOT_DIR,
    subdirectory="sherlock",
)
display_dataset(sherlock_dataset)

In [None]:
# Show the 'metadata' entry of the first item in the sherlock dataset.
sherlock_dataset[0]['metadata']

In [None]:
# Wrap the sherlock dataset in a DataLoader that yields shuffled batches of
# two items, then print every batch it produces.
sherlock_dataloader = DataLoader(sherlock_dataset, batch_size=2, shuffle=True)
for batch in sherlock_dataloader:
    print(batch)


### Lovecraft Subdirectory

In [None]:

# Build a dataset from the "lovecraft" subdirectory of the data root and
# print a summary of all of its stories (no listing limit).
lovecraft_dataset = torchdata.JsonDataset(
    root_dir=NOTEBOOK_ROOT_DIR,
    subdirectory="lovecraft",
)
display_dataset(lovecraft_dataset)

In [None]:
# Show the 'metadata' entry of the first item in the lovecraft dataset.
lovecraft_dataset[0]['metadata']

In [None]:
# Wrap the lovecraft dataset in a DataLoader that yields shuffled batches of
# two items, then print every batch it produces.
lovecraft_dataloader = DataLoader(lovecraft_dataset, batch_size=2, shuffle=True)
for batch in lovecraft_dataloader:
    print(batch)


## Corpora Directory

Test loading from the main GitHub corpora, rather than the dynamic local copy.

In [None]:
# Build a dataset rooted at the corpora directory two levels up and print a
# summary of it, capping the listing via limit=10.
corpora_dataset = torchdata.JsonDataset("../../corpora")
display_dataset(corpora_dataset, limit=10)
