In [1]:
import os
from torch.utils.data import Dataset
import torch
#from wordPieceTokenizer import WordPieceTokenizer
import json
import re
import urllib.request
from collections import defaultdict, Counter
import random
import numpy as np

random.seed(0)
import pandas as pd
def load_separate_and_clean_stories(filename):
    with open(filename, 'r') as file:
        content = file.read()

    stories = content.split('\n\n\n\n')

    cleaned_stories = []
    for story in stories:
        cleaned_story = re.sub(r'\n\s*\n', '\n', story.strip())
        cleaned_stories.append(cleaned_story)
    
    return cleaned_stories

def separate_sentences(text):
    text = text.replace('...','#^')
    text = text.replace('.','~.')
    text = text.replace('?','@?')
    text = text.replace('!','%!')
    
    b = re.split('[.?!^]' , text)                                                                                                                                                                                                                                                                                                                                          
    c = [w.replace('~', '.') for w in b]
    c = [w.replace('@', '?') for w in c]
    c = [w.replace('#', '...') for w in c]
    c = [w.replace('%', '!') for w in c]
    
    return(c)

## V.1

In [21]:
class MyDataset(Dataset):

    def __init__(self, dict_,mask=False):
        super().__init__()
        self.dict = dict_
        self.mask = mask
    def __len__(self):
        return len(self.dict)

    def __getitem__(self, idx):
        
        text = self.dict[str(idx)]['text']

        tokens = torch.tensor(self.dict[str(idx)]['tokens'])

        if self.mask!=False:
            idxs = np.linspace(0,len(tokens)-1,len(tokens)).astype(int)
            pos = random.choices(idxs, k=int(len(idxs)*0.15))
            tokens[pos] = self.mask

        return text,tokens

In [4]:
# Load the tokenizer
tokenizer = WordPieceTokenizer()
tokenizer.load('wordPieceVocab.json')

# Load the dataset
dataset_txt = load_separate_and_clean_stories("dataset/combined_stories.txt")

dict_ = {}
for i in range(len(dataset_txt)//20):
    dict_[i] = {'text':dataset_txt[i],
                'tokens':tokenizer.encode_n(dataset_txt[i])
                 }

with open("dataset/dataset_dict.json", "w") as outfile: 
    json.dump(dict_, outfile)  # We can read this file to avoid computing again the dict_

In [22]:
with open('dataset/dataset_dict.json') as json_file:
    dict_ = json.load(json_file)
    
complete_dataloader = MyDataset(dict_,tokenizer.word2idx["[MASK]"])

In [24]:
len(complete_dataloader.__getitem__(0)[1])

1597

In [25]:
len(np.where(complete_dataloader.__getitem__(0)[1]==4)[0])

223

## V.2

In [43]:
class MyDataset(Dataset):

    def __init__(self, dataset,sentences):
        super().__init__()
        self.dataset = dataset
        self.sentences = sentences
    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        
        title = self.dataset.iloc[idx]['Title']
        text = separate_sentences(self.dataset.iloc[idx]['cleaned_story'])
        list_sentences = [''.join(map(str, text[i:i+self.sentences])) for i in range(0, len(text), self.sentences)]

        return title,text,list_sentences

In [44]:
dataset = pd.read_csv('dataset/merged_stories(1).csv')
dataset.head(5)

Unnamed: 0.1,Unnamed: 0,Title,cleaned_story
0,0,Thumbelina,"Once upon a time, in a world of wonder and enc..."
1,1,The Star Money,"Once upon a time, in a quaint village nestled ..."
2,2,The Twelve Dancing Princesses,In a kingdom where castles touched the clouds ...
3,3,The Elves and the Shoemaker,In a quaint village nestled at the edge of a l...
4,4,The Fox and the Cat,"Once upon a time, in a lush forest filled with..."


In [47]:
complete_dataloader = MyDataset(dataset,6)
print(complete_dataloader.__getitem__(0)[0])
print(len(complete_dataloader.__getitem__(0)[1]))
print(len(complete_dataloader.__getitem__(0)[2]))


Thumbelina
60
10


In [58]:
t = 'My text...My text. My text! My text?'
separate_sentences(t)

['My text...', 'My text.', ' My text!', ' My text?', '']

## V.3

In [2]:
class Custom_Dataset(Dataset):

    def __init__(self, dataset,sentences):
        super().__init__()
        self.dataset = dataset
        self.sentences = sentences
    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        
        title = self.dataset.iloc[idx]['Title']
        text = separate_sentences(self.dataset.iloc[idx]['cleaned_story'])
        list_sentences = [''.join(map(str, text[i:i+self.sentences])) for i in range(0, len(text), self.sentences)]
        it = random.randint(0,len(list_sentences)-2)
        sentence = list_sentences[it]

        if random.random()<0.5:
            next_sentence = list_sentences[it+1]
            is_next = True
            
        else:
            idx2 = idx
            while idx2 == idx:
                idx2 = random.randint(0,len(self.dataset)-1)
            text2 = separate_sentences(self.dataset.iloc[idx2]['cleaned_story'])
            list_sentences2 = [''.join(map(str, text2[i:i+self.sentences])) for i in range(0, len(text2), self.sentences)]
            it = random.randint(0,len(list_sentences2)-1)
            next_sentence = list_sentences2[it]
            
            is_next = False
        return title,sentence,next_sentence,is_next

In [3]:
dataset = pd.read_csv('/Users/francesccarandellverdaguer/fairyTaleAI/dataset/merged_stories_full.csv')
dataset.head(5)

Unnamed: 0.1,Unnamed: 0,Title,cleaned_story
0,0,Thumbelina,"Once upon a time, in a world of wonder and enc..."
1,1,The Star Money,"Once upon a time, in a quaint village nestled ..."
2,2,The Twelve Dancing Princesses,In a kingdom where castles touched the clouds ...
3,3,The Elves and the Shoemaker,In a quaint village nestled at the edge of a l...
4,4,The Fox and the Cat,"Once upon a time, in a lush forest filled with..."


In [4]:
complete_dataloader = Custom_Dataset(dataset,2)
for i in range(len(dataset)):
    try:
        complete_dataloader.__getitem__(i)
    except Exception as e:
        print(e)
        print(dataset.iloc[i].Title)

empty range for randrange() (0, 0, 0)
Icarus and the Wax Wings: A Greek Myth
