In [20]:
import os
from torch.utils.data import Dataset
import torch
from wordPieceTokenizer import WordPieceTokenizer
import json
import re
import urllib.request
from collections import defaultdict, Counter
import random
import numpy as np

random.seed(0)

def load_separate_and_clean_stories(filename):
    with open(filename, 'r') as file:
        content = file.read()

    stories = content.split('\n\n\n\n')

    cleaned_stories = []
    for story in stories:
        cleaned_story = re.sub(r'\n\s*\n', '\n', story.strip())
        cleaned_stories.append(cleaned_story)
    
    return cleaned_stories

In [21]:
class MyDataset(Dataset):

    def __init__(self, dict_,mask=False):
        super().__init__()
        self.dict = dict_
        self.mask = mask
    def __len__(self):
        return len(self.dict)

    def __getitem__(self, idx):
        
        text = self.dict[str(idx)]['text']

        tokens = torch.tensor(self.dict[str(idx)]['tokens'])

        if self.mask!=False:
            idxs = np.linspace(0,len(tokens)-1,len(tokens)).astype(int)
            pos = random.choices(idxs, k=int(len(idxs)*0.15))
            tokens[pos] = self.mask

        return text,tokens

In [4]:
# Load the tokenizer
tokenizer = WordPieceTokenizer()
tokenizer.load('wordPieceVocab.json')

# Load the dataset
dataset_txt = load_separate_and_clean_stories("dataset/combined_stories.txt")

dict_ = {}
for i in range(len(dataset_txt)//20):
    dict_[i] = {'text':dataset_txt[i],
                'tokens':tokenizer.encode_n(dataset_txt[i])
                 }

with open("dataset/dataset_dict.json", "w") as outfile: 
    json.dump(dict_, outfile)  # We can read this file to avoid computing again the dict_

In [22]:
with open('dataset/dataset_dict.json') as json_file:
    dict_ = json.load(json_file)
    
complete_dataloader = MyDataset(dict_,tokenizer.word2idx["[MASK]"])

In [24]:
len(complete_dataloader.__getitem__(0)[1])

1597

In [25]:
len(np.where(complete_dataloader.__getitem__(0)[1]==4)[0])

223