In [2]:
import json
import torch
import os
from torch.utils.data import Dataset, DataLoader
from collections import Counter, defaultdict
import string

# Make sure the output directory is present before anything is written to it
os.makedirs('data', exist_ok=True)

# Toy records used to seed each split; every record carries the three text
# fields ('name', 'quote', 'location') that the loading code reads back.
sample_data = [
    dict(name='John Doe', quote='Hello world!', location='New York'),
    dict(name='Jane Smith', quote='Python is awesome', location='London'),
    dict(name='Bob Wilson', quote='Deep learning rocks', location='Paris'),
]

# Write one JSON object per line for every split that is missing on disk
for split in ('train', 'valid', 'test'):
    filepath = f'data/{split}.json'
    if os.path.exists(filepath):
        # Never overwrite a split that is already there
        continue
    with open(filepath, 'w') as f:
        for item in sample_data:
            print(json.dumps(item), file=f)
    print(f'Created sample {split} file at {filepath}')

# Custom simple tokenizer
def simple_tokenizer(text):
    """Split ``text`` into lowercase, punctuation-free tokens.

    Args:
        text: The string to tokenize.

    Returns:
        A list of tokens: ``text`` lowercased, with every character in
        ``string.punctuation`` removed, split on runs of whitespace. An input
        with no remaining non-whitespace characters yields an empty list.
    """
    # Translation table that deletes every ASCII punctuation character
    punctuation_remover = str.maketrans('', '', string.punctuation)
    lowered = text.lower()
    without_punctuation = lowered.translate(punctuation_remover)
    return without_punctuation.split()

# Vocabulary builder
class Vocabulary:
    """Bidirectional token <-> index mapping built with ``simple_tokenizer``.

    Special tokens take the lowest indices, in the order given; corpus tokens
    follow in first-seen order.

    Attributes:
        specials: The special tokens passed to the constructor.
        stoi: Mapping from token string to integer index.
        itos: Mapping from integer index to token string.
        counts: Token frequencies observed by the most recent ``build`` call
            (empty until ``build`` has been called).
    """

    def __init__(self, special_tokens=('<pad>', '<unk>')):
        """Create a vocabulary that initially holds only ``special_tokens``.

        Args:
            special_tokens: Tokens to reserve at indices ``0..len-1``.
        """
        self.specials = special_tokens
        self.stoi = {}
        self.itos = {}
        self.counts = Counter()
        # Register the specials right away so that a vocabulary which was
        # never built can still map tokens to '<unk>' instead of raising
        # KeyError on the first lookup.
        self._register_specials()

    def _register_specials(self):
        """Reset both mappings (in place) so they hold only the special tokens."""
        self.stoi.clear()
        self.itos.clear()
        for idx, tok in enumerate(self.specials):
            self.stoi[tok] = idx
            self.itos[idx] = tok

    def build(self, texts, min_freq=1):
        """(Re)build the vocabulary from an iterable of raw strings.

        Any previous contents are discarded, so calling ``build`` again never
        leaves stale tokens or reused indices behind.

        Args:
            texts: Iterable of strings; each is split with ``simple_tokenizer``.
            min_freq: Minimum number of occurrences a token needs to be kept.
        """
        self._register_specials()

        # Keep the frequencies on the instance so callers can inspect them.
        self.counts = Counter()
        for text in texts:
            self.counts.update(simple_tokenizer(text))

        next_idx = len(self.specials)
        for word, count in self.counts.items():
            if count < min_freq:
                continue
            if word in self.stoi:
                # A corpus token spelled like a special token must not steal
                # (or duplicate) that special token's reserved index.
                continue
            self.stoi[word] = next_idx
            self.itos[next_idx] = word
            next_idx += 1

    def numericalize(self, text):
        """Convert ``text`` into a list of token indices.

        Args:
            text: Raw string; tokenized with ``simple_tokenizer``.

        Returns:
            A list of integer indices, one per token. Tokens not present in
            the vocabulary map to the index of ``'<unk>'``.

        Raises:
            KeyError: If a token is unknown and ``'<unk>'`` is not one of the
                special tokens.
        """
        unk_idx = self.stoi.get('<unk>')
        indices = []
        for token in simple_tokenizer(text):
            idx = self.stoi.get(token, unk_idx)
            if idx is None:
                raise KeyError('<unk>')
            indices.append(idx)
        return indices

    def __len__(self):
        """Return the number of distinct tokens, special tokens included."""
        return len(self.stoi)

# Custom Dataset class
class JSONDataset(Dataset):
    """Dataset over a JSON-lines file of ``name`` / ``quote`` / ``location`` records.

    Every record is numericalized once, at construction time, and kept in
    ``self.data`` as a dict of index lists under the keys ``'name'``,
    ``'saying'`` and ``'place'``. Tensors are created on access.
    """

    def __init__(self, filepath, name_vocab, saying_vocab, place_vocab):
        """Load and numericalize ``filepath``.

        Args:
            filepath: Path to a file holding one JSON object per line, each
                with the keys ``'name'``, ``'quote'`` and ``'location'``.
            name_vocab: Object whose ``numericalize(text)`` encodes names.
            saying_vocab: Object whose ``numericalize(text)`` encodes quotes.
            place_vocab: Object whose ``numericalize(text)`` encodes locations.

        A missing file is not an error: a warning is printed and the dataset
        is left empty.
        """
        self.data = []
        try:
            # Read as UTF-8 explicitly rather than relying on the platform's
            # default text encoding.
            with open(filepath, encoding='utf-8') as f:
                for line in f:
                    if not line.strip():
                        # Blank lines (e.g. a trailing newline at the end of
                        # the file) are not records; json.loads would raise.
                        continue
                    entry = json.loads(line)
                    self.data.append({
                        'name': name_vocab.numericalize(entry['name']),
                        'saying': saying_vocab.numericalize(entry['quote']),
                        'place': place_vocab.numericalize(entry['location'])
                    })
        except FileNotFoundError:
            print(f"Warning: {filepath} not found, using empty dataset")

    def __len__(self):
        """Return the number of loaded records."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return record ``idx`` as a dict of 1-D ``torch.long`` tensors.

        The dict has the keys ``'name'``, ``'saying'`` and ``'place'``.
        """
        record = self.data[idx]
        return {
            field: torch.tensor(token_ids, dtype=torch.long)
            for field, token_ids in record.items()
        }

# Build vocabularies
def build_vocabs(train_file):
    """Build the name, saying and place vocabularies from a JSON-lines file.

    Args:
        train_file: Path to a file holding one JSON object per line, each
            with the keys ``'name'``, ``'quote'`` and ``'location'``.

    Returns:
        A ``(name_vocab, saying_vocab, place_vocab)`` tuple of built
        ``Vocabulary`` objects. If ``train_file`` does not exist, a message
        is printed and the vocabularies are built from no text at all, so
        they contain only their special tokens.
    """
    names = []
    sayings = []
    places = []

    try:
        # Read as UTF-8 explicitly rather than relying on the platform's
        # default text encoding.
        with open(train_file, encoding='utf-8') as f:
            for line in f:
                if not line.strip():
                    # Skip blank lines; json.loads would raise on them.
                    continue
                entry = json.loads(line)
                names.append(entry['name'])
                sayings.append(entry['quote'])
                places.append(entry['location'])
    except FileNotFoundError:
        # No sample data is substituted here: the lists stay empty and the
        # vocabularies below end up holding only their special tokens.
        print(f"Error: {train_file} not found, using empty vocabularies")

    # Always call build(), even on empty input, so the special tokens are
    # registered and the returned vocabularies can numericalize text.
    vocabs = []
    for texts in (names, sayings, places):
        vocab = Vocabulary()
        vocab.build(texts)
        vocabs.append(vocab)

    return tuple(vocabs)

# Locations of the three JSON-lines splits
train_file, valid_file, test_file = 'data/train.json', 'data/valid.json', 'data/test.json'

# Vocabularies are fitted on the training split only and shared by every split
name_vocab, saying_vocab, place_vocab = build_vocabs(train_file)

# One dataset per split, all encoded with the same three vocabularies
train_dataset, valid_dataset, test_dataset = (
    JSONDataset(split_path, name_vocab, saying_vocab, place_vocab)
    for split_path in (train_file, valid_file, test_file)
)

# Custom collate function
def collate_fn(batch):
    """Regroup a list of per-sample dicts into a dict of per-field lists.

    Args:
        batch: Sequence of samples, each a mapping with the keys ``'name'``,
            ``'saying'`` and ``'place'``.

    Returns:
        A dict with those same three keys; each value is the list of that
        field's entries, in batch order. Entries are passed through as-is
        (no stacking or padding is performed).
    """
    collated = {}
    for field in ('name', 'saying', 'place'):
        collated[field] = [sample[field] for sample in batch]
    return collated

# Create dataloaders
BATCH_SIZE = 1
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# The datasets above may legitimately be empty (a missing file only prints a
# warning). Shuffling goes through a random sampler which, in at least some
# PyTorch releases, refuses to be constructed over zero samples, so only
# request shuffling when there is something to shuffle.
train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=len(train_dataset) > 0,
    collate_fn=collate_fn,
)
# Evaluation splits keep their on-disk order
valid_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, collate_fn=collate_fn)

# Example usage
print("\nFirst training example:")
if len(train_dataset) > 0:
    print(train_dataset[0])
else:
    print("No training data found")


def _show_first_batch(loader, split_name):
    """Print the first batch produced by ``loader``, or a notice if it yields none.

    Args:
        loader: Any iterable of batches (here: a DataLoader).
        split_name: Lowercase label for the split, e.g. ``'training'``; used
            in both the heading and the "no data" notice.
    """
    print(f"\n{split_name.capitalize()} batches:")
    # An empty loader simply yields nothing; it does not raise. Detect that
    # case explicitly instead of hiding every possible error behind a bare
    # ``except`` (which would also swallow KeyboardInterrupt and real bugs).
    first_batch = next(iter(loader), None)
    if first_batch is None:
        print(f"No {split_name} data available")
    else:
        print(first_batch)


for _loader, _split_name in (
    (train_loader, 'training'),
    (valid_loader, 'validation'),
    (test_loader, 'test'),
):
    _show_first_batch(_loader, _split_name)

Created sample train file at data/train.json
Created sample valid file at data/valid.json
Created sample test file at data/test.json

First training example:
{'name': tensor([2, 3]), 'saying': tensor([2, 3]), 'place': tensor([2, 3])}

Training batches:
{'name': [tensor([2, 3])], 'saying': [tensor([2, 3])], 'place': [tensor([2, 3])]}

Validation batches:
{'name': [tensor([2, 3])], 'saying': [tensor([2, 3])], 'place': [tensor([2, 3])]}

Test batches:
{'name': [tensor([2, 3])], 'saying': [tensor([2, 3])], 'place': [tensor([2, 3])]}
