In [95]:
import torch 
from torch.utils.data import DataLoader
import requests
import spacy
import re
from torchtext.vocab import build_vocab_from_iterator
import torchtext.transforms as T
import torchdata.datapipes as dp
import urllib.request
from bs4 import BeautifulSoup

In [436]:
filename = 'train.txt'

# Read the whole file into memory, one list entry per line. The context
# manager closes the handle even if reading raises.
with open(filename) as file:
    lines = file.readlines()

# Last expression of the cell so the first line is actually displayed.
lines[0]

In [438]:
# Display the last line that was read from the file.
lines[-1]

'“I thank you, Walton,” he said, “for your kind intentions towards so miserable a wretch; but when you speak of new ties and fresh affections, think you that any can replace those who are gone? Can any man be to me as Clerval was, or any woman another Elizabeth? Even where the affections are not strongly moved by any superior excellence, the companions of our childhood always possess a certain power over our minds which hardly any later friend can obtain. They know our infantine dispositions, which, however they may be afterwards modified, are never eradicated; and they can judge of our actions with more certain conclusions as to the integrity of our motives. A sister or a brother can never, unless indeed such symptoms have been shown early, suspect the other of fraud or false dealing, when another friend, however strongly he may be attached, may, in spite of himself, be contemplated with suspicion. But I enjoyed friends, dear not only through habit and association, but from their own 

In [439]:
# Lower-case every line, leaving out chapter headings and bare newlines.
output = [
    line.lower()
    for line in lines
    if line != '\n' and not line.startswith('Chapter')
]
        

In [440]:
# Replace the file's contents with the cleaned text. Mode 'w' (not 'a') keeps
# this cell idempotent: appending would leave the raw text in place and add
# another copy of the cleaned text on every re-run.
with open(filename, 'w') as file:
    n_chars_written = file.write(''.join(output))

# Number of characters written.
n_chars_written

324704

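See also:
- https://betterprogramming.pub/intro-to-rnn-character-level-text-generation-with-pytorch-db02d7e18d89 (intro to RNN character-level text generation with PyTorch)
- https://pytorch.org/tutorials/beginner/torchtext_custom_dataset_tutorial.html (torchtext custom dataset tutorial)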

In [197]:
# spaCy English pipeline; only its tokenizer is used by engTokenize.
eng = spacy.load("en_core_web_sm")


def engTokenize(text):
    """
    Split an English string into tokens with the spaCy tokenizer.

    Args:
        text: The string to tokenize.

    Returns:
        A list holding the ``.text`` of every token, in order.
    """
    tokens = []
    for token in eng.tokenizer(text):
        tokens.append(token.text)
    return tokens

def getTokens(data_iter):
    """
    Yield the token list for the first field of every usable row.

    Args:
        data_iter: Iterable of rows (sequences of strings). Only ``row[0]``
            is looked at.

    Yields:
        ``engTokenize(row[0])`` for each non-empty row whose first field does
        not consist solely of digits and spaces.
    """
    for row in data_iter:
        if len(row) == 0:
            continue
        # fullmatch rather than match: match() only anchors at the start, so it
        # would also discard real sentences that merely begin with a digit or
        # a space instead of just the digit/space-only lines.
        if re.fullmatch('[0-9 ]+', row[0]) is None:
            yield engTokenize(row[0])
def getTransform(vocab):
    """
    Build the transform applied to one tokenized sentence.

    Args:
        vocab: Vocabulary handed to ``T.VocabTransform``.

    Returns:
        A ``T.Sequential`` that maps tokens to vocabulary indices, then puts
        index 1 (``<sos>``) at the front and index 2 (``<eos>``) at the end.
    """
    # tokens -> indices according to the given vocabulary
    lookup = T.VocabTransform(vocab=vocab)
    # <sos> goes first; 1 is its position in the vocabulary's specials list
    prepend_sos = T.AddToken(1, begin=True)
    # <eos> goes last; 2 is its position in the vocabulary's specials list
    append_eos = T.AddToken(2, begin=False)
    return T.Sequential(lookup, prepend_sos, append_eos)

In [252]:
FILE_PATH = 'val.txt'

# path -> opened file -> rows split on '.', each row returned as a tuple
data_pipe = dp.iter.FileOpener(
    dp.iter.IterableWrapper([FILE_PATH]),
    mode='r',
).parse_csv(skip_lines=0, delimiter='.', as_tuple=True)

# Show the tokens of the first field of the first row only.
for sample in data_pipe:
    print(engTokenize(sample[0]))
    break

['“', 'it', 'is', 'with', 'considerable', 'difficulty', 'that', 'i', 'remember', 'the', 'original', 'era', 'of', 'my', 'being', ';', 'all', 'the', 'events', 'of', 'that', 'period', 'appear', 'confused', 'and', 'indistinct']


In [253]:
# Special tokens, placed first in the vocabulary:
#   <pad>  padding token
#   <sos>  start of sentence
#   <eos>  end of sentence
#   <unk>  unknown words, e.g. those skipped because of min_freq=2
source_vocab = build_vocab_from_iterator(
    getTokens(data_pipe),
    min_freq=2,
    specials=['<pad>', '<sos>', '<eos>', '<unk>'],
    special_first=True,
)

# Any token missing from the vocabulary is looked up as <unk>.
source_vocab.set_default_index(source_vocab['<unk>'])

print(source_vocab.get_itos()[:30])

['<pad>', '<sos>', '<eos>', '<unk>', ',', 'the', 'i', '“', 'of', 'and', 'a', 'to', 'was', 'in', 'my', 'it', 'when', 'had', 'that', 'on', 'at', 'this', 'which', ';', 'as', 'but', 'by', 'day', 'found', 'with']


In [254]:
# Materialise the pipe and take the first field of its second row.
temp_list = list(data_pipe)
some_sentence = temp_list[1][0]
print(f"Some sentence={some_sentence}")

# tokens -> indices, wrapped in <sos>/<eos>
transformed_sentence = getTransform(source_vocab)(engTokenize(some_sentence))
print(f"Transformed sentence={transformed_sentence}")

# Map the indices back to tokens to check the round trip.
index_to_string = source_vocab.get_itos()
for index in transformed_sentence:
    print(index_to_string[index], end=" ")

Some sentence=“it was dark when i awoke; i felt cold also, and half frightened, as it were, instinctively, finding myself so desolate
Transformed sentence=[1, 7, 15, 12, 3, 16, 6, 43, 23, 6, 3, 66, 62, 4, 9, 3, 3, 4, 24, 15, 107, 4, 3, 4, 3, 51, 53, 3, 2]
<sos> “ it was <unk> when i awoke ; i <unk> cold also , and <unk> <unk> , as it were , <unk> , <unk> myself so <unk> <eos> 

In [255]:
def applyTransform(sequence):
    """
    Turn one row into an (input, target) pair of index lists.

    Args:
        sequence: A row from the datapipe; only ``sequence[0]`` is used.

    Returns:
        ``(X, target)`` where ``X`` is the transformed sentence without its
        last element and ``target`` is the same sentence without its first.
    """
    indices = getTransform(source_vocab)(engTokenize(sequence[0]))
    inputs = indices[:-1]   # X
    targets = indices[1:]   # target
    return (inputs, targets)


# Apply the function to each element of the iterator.
data_pipe = data_pipe.map(applyTransform)

In [256]:
# Preview the first four (input, target) pairs.
# The index -> token table does not change between samples, so fetch it once.
index_to_string = source_vocab.get_itos()

for i, sample in enumerate(data_pipe):
    # Only the target is decoded back to text; the input is the same sentence
    # shifted by one position (it starts at <sos> and stops before <eos>).
    print("\n\nTransformed input sentence=", end="")
    print(sample[0])

    print("Output sentence=", end="")
    print(" ".join(index_to_string[index] for index in sample[1]))
    print("Transformed output sentence=", end="")
    print(sample[1])

    if i >= 3:
        break



Input sentence=Transformed input sentence=[1, 7, 15, 3, 29, 67, 3, 18, 6, 95, 5, 3, 3, 8, 14, 65, 23, 3, 5, 3, 8, 18, 93, 3, 3, 9, 3]


Output sentence=“ it <unk> with considerable <unk> that i remember the <unk> <unk> of my being ; <unk> the <unk> of that period <unk> <unk> and <unk> <eos> Transformed output sentence=[7, 15, 3, 29, 67, 3, 18, 6, 95, 5, 3, 3, 8, 14, 65, 23, 3, 5, 3, 8, 18, 93, 3, 3, 9, 3, 2]


Input sentence=Transformed input sentence=[1, 7, 15, 12, 3, 16, 6, 43, 23, 6, 3, 66, 62, 4, 9, 3, 3, 4, 24, 15, 107, 4, 3, 4, 3, 51, 53, 3]


Output sentence=“ it was <unk> when i awoke ; i <unk> cold also , and <unk> <unk> , as it were , <unk> , <unk> myself so <unk> <eos> Transformed output sentence=[7, 15, 12, 3, 16, 6, 43, 23, 6, 3, 66, 62, 4, 9, 3, 3, 4, 24, 15, 107, 4, 3, 4, 3, 51, 53, 3, 2]


Input sentence=Transformed input sentence=[1, 7, 101, 10, 3, 49, 3, 3, 5, 81, 9, 3, 34, 10, 3, 8, 94]


Output sentence=“ soon a <unk> light <unk> <unk> the heavens and <unk> me a <

In [257]:
# Scan the pipe for the longest input sequence, printing every sequence that
# sets a new record (the very first one only initialises the record).
index_to_string = source_vocab.get_itos()  # do not rely on an earlier cell's variable

size_t = None  # longest input length seen so far; stays None for an empty pipe
for sample in data_pipe:
    length = len(sample[0])
    if size_t is None:
        size_t = length
    elif size_t < length:
        size_t = length
        print()
        for index in sample[0]:
            print(index_to_string[index], end=" ")
print(size_t)


<sos> “ it was <unk> when i awoke ; i <unk> cold also , and <unk> <unk> , as it were , <unk> , <unk> myself so <unk> 
<sos> “ several <unk> of day and night passed , and the <unk> of night had greatly lessened , when i <unk> to <unk> my <unk> from each <unk> 
<sos> “ one day , when i was <unk> by cold , i found a fire which had been <unk> by some <unk> <unk> , and was <unk> with <unk> at the warmth i experienced from it 
<sos> “ it was <unk> <unk> in the morning , and i longed to <unk> food and shelter ; at <unk> i <unk> a <unk> <unk> , on a <unk> ground , which had <unk> been <unk> for the <unk> of some <unk> 
<sos> “ it was noon when i awoke , and <unk> by the warmth of the sun , which <unk> <unk> on the <unk> ground , i <unk> to <unk> my <unk> ; and , <unk> the <unk> of the <unk> <unk> <unk> in a <unk> i found , i <unk> <unk> the <unk> for several <unk> , until at <unk> i <unk> at a <unk> 66


In [242]:
 def applyPadding(pair_of_sequences):
    """
    Convert sequences to tensors and apply padding
    """
    return (T.PadTransform(77, 0)(T.ToTensor()(pair_of_sequences[0])), 
            T.PadTransform(77, 0)(T.ToTensor()(pair_of_sequences[1])))


data_pipe = data_pipe.map(applyPadding)

In [260]:
class ShakespeareDataSet():
    """
    Text pipeline that turns ``<split>.txt`` into next-token training pairs.

    On construction it loads the spaCy English model, builds a vocabulary
    from the file and exposes:

    * ``source_vocab`` / ``len_vocab`` - the vocabulary and its size,
    * ``data_pipe`` - a datapipe of padded ``(input, target)`` tensor pairs,
    * ``dl`` - a ``DataLoader`` over that datapipe with ``batch_size=1``.
    """

    # Special tokens; their indices follow from this order because the
    # vocabulary is built with special_first=True.
    SPECIALS = ['<pad>', '<sos>', '<eos>', '<unk>']
    PAD_IDX = 0
    SOS_IDX = 1
    EOS_IDX = 2
    # Length handed to PadTransform. Shorter sequences are padded up to it.
    # NOTE(review): longer sequences appear to keep their own length (a
    # 107-long batch was observed), so batch_size > 1 would need truncation
    # or a custom collate_fn - confirm before raising the batch size.
    MAX_SEQ_LEN = 77

    def __init__(
            self, split: str, vocab_size: int = 30_000
            ):
        """
        Args:
            split: Name of the split, e.g. ``'train'`` or ``'val'``; the data
                is read from ``f'{split}.txt'``.
            vocab_size: Stored on the instance.
                NOTE(review): not used when building the vocabulary.
        """
        super().__init__()
        self.split = split
        self.vocab_size = vocab_size
        self.eng = spacy.load("en_core_web_sm")  # English model, used for tokenizing
        self.data_pipe = self.prepare_data()
        self.dl = self.get_dataloader()

    def _file_path(self):
        """Return the text file that belongs to this instance's split."""
        return f'{self.split}.txt'

    def engTokenize(self, text):
        """
        Tokenize an English text and return a list of tokens
        """
        return [token.text for token in self.eng.tokenizer(text)]

    def getTokens(self, data_iter):
        """
        Yield the token list for the first field of every usable row.

        Rows that are empty, or whose first field consists solely of digits
        and spaces, are skipped.
        """
        for row in data_iter:
            if len(row) == 0:
                continue
            # fullmatch rather than match: match() only anchors at the start
            # and would also drop sentences that merely begin with a digit
            # or a space.
            if re.fullmatch('[0-9 ]+', row[0]) is None:
                yield self.engTokenize(row[0])

    def getTransform(self, vocab):
        """
        Create transforms based on given vocabulary. The returned transform is
        applied to a sequence of tokens.
        """
        text_transform = T.Sequential(
            # convert the tokens to indices based on the given vocabulary
            T.VocabTransform(vocab=vocab),
            # add <sos> at the beginning of each sentence
            T.AddToken(self.SOS_IDX, begin=True),
            # add <eos> at the end of each sentence
            T.AddToken(self.EOS_IDX, begin=False)
        )
        return text_transform

    def applyTransform(self, sequence):
        """
        Apply transforms to one row and create input & target index lists.

        Only ``sequence[0]`` is used. The input is the transformed sentence
        without its last element, the target the same sentence without its
        first element.
        """
        tokenized = self.engTokenize(sequence[0])
        # Use the transform built once in prepare_data (and this class's own
        # getTransform) rather than a notebook-level function of the same name.
        transformed = self._text_transform(tokenized)

        return (transformed[:-1],  # X
                transformed[1:])   # target

    def applyPadding(self, pair_of_sequences):
        """
        Convert sequences to tensors and apply padding
        """
        to_tensor = T.ToTensor()
        pad = T.PadTransform(self.MAX_SEQ_LEN, self.PAD_IDX)
        return (pad(to_tensor(list(pair_of_sequences[0]))),
                pad(to_tensor(list(pair_of_sequences[1]))))

    def prepare_data(self):
        """
        Build the vocabulary and return the datapipe of padded tensor pairs.

        Sets ``source_vocab`` and ``len_vocab`` on the instance as a side
        effect.
        """
        # Honour the requested split instead of always reading 'train.txt'.
        data_pipe = dp.iter.IterableWrapper([self._file_path()])
        data_pipe = dp.iter.FileOpener(data_pipe, mode='r')
        data_pipe = data_pipe.parse_csv(skip_lines=0, delimiter='\r', as_tuple=False)
        self.source_vocab = build_vocab_from_iterator(
            self.getTokens(data_pipe),
            min_freq=2,
            specials=list(self.SPECIALS),
            special_first=True
        )
        # Tokens missing from the vocabulary are looked up as <unk>.
        self.source_vocab.set_default_index(self.source_vocab['<unk>'])
        self.len_vocab = len(self.source_vocab)
        # The transform only depends on the vocabulary: build it once here
        # rather than once per sample.
        self._text_transform = self.getTransform(self.source_vocab)
        data_pipe = data_pipe.map(self.applyTransform)  # applied to each element lazily
        data_pipe = data_pipe.map(self.applyPadding)
        return data_pipe

    def get_dataloader(self):
        """Wrap the prepared datapipe in a DataLoader with batch_size=1."""
        return DataLoader(dataset=self.data_pipe, batch_size=1)
   

In [261]:
# Build the vocabulary, datapipe and DataLoader for the training split.
sd = ShakespeareDataSet(split='train')

In [262]:
# Pull a single batch: element 0 holds the inputs, element 1 the targets.
first = next(iter(sd.dl))
features, labels = first[0], first[1]
print(f"Labels batch shape: {labels.size()}")
print(f"Feature batch shape: {features.size()}")
print(f"{labels = }\n{features = }")

# Count the batches with one full pass over the loader.
n_sample = sum(1 for _ in sd.dl)
print(f"{n_sample= }")

Labels batch shape: torch.Size([1, 107])
Feature batch shape: torch.Size([1, 107])
labels = tensor([[   9,   94,   33,  514,   12,    3,    4,    7,   11,  293,   40,   48,
            8,    5,   99, 1020,    8,   16,    3,    6,   11,    3,   17,   47,
           29,  188,  128,    3,    7,    3,    4,    7,   11,   62,   17,  455,
          163,  704,    3,   21,    3,    7,    3,    6,   24,   13, 1800,   33,
           46,   54,  230,   66,   29,   31,    3,    7, 1603,  388,   10,  704,
            3,    6,   24,  136,   31, 1992,  215, 1722,  413,   33,    5,    3,
            8,   31,  214,   15,   12, 1956,    8,  322,   17, 1747,   31, 1653,
          536,    4,  266,   13,   26,  209,    5, 1430,    8,   68,   16,   24,
          130,   12,    3,    7,    5,   62,    8,   12,  293,    6,    2]])
features = tensor([[   1,    9,   94,   33,  514,   12,    3,    4,    7,   11,  293,   40,
           48,    8,    5,   99, 1020,    8,   16,    3,    6,   11,    3,   17,
          

In [427]:
# Flatten the label batch to one dimension and allocate a zero matrix with
# one row per label and one column per vocabulary entry.
flat_l = labels.view(-1)
l = flat_l.shape
arr = torch.zeros((l[0], len(sd.source_vocab)))

In [429]:
# Shape of the zero matrix: (number of labels, vocabulary size).
arr.shape

torch.Size([107, 1995])

In [430]:
# One-hot encode: in row k, set the column given by the k-th label to 1.
row_ids = range(arr.shape[0])
arr[row_ids, flat_l] = 1

In [431]:
# Display the matrix shape again after the in-place assignment.
arr.shape

torch.Size([107, 1995])

In [435]:
# Print the row of the matrix that belongs to the last label.
print(arr[-1])

tensor([0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 

In [415]:
# First element of flat_l.
# NOTE(review): the output recorded for this cell is a full vector of labels,
# which is not what indexing the flattened one-dimensional flat_l with [0]
# would give - presumably it is stale from an earlier run in which flat_l was
# defined differently. Re-run after a kernel restart to confirm.
flat_l[0]

tensor([   9,   94,   33,  514,   12,    3,    4,    7,   11,  293,   40,   48,
           8,    5,   99, 1020,    8,   16,    3,    6,   11,    3,   17,   47,
          29,  188,  128,    3,    7,    3,    4,    7,   11,   62,   17,  455,
         163,  704,    3,   21,    3,    7,    3,    6,   24,   13, 1800,   33,
          46,   54,  230,   66,   29,   31,    3,    7, 1603,  388,   10,  704,
           3,    6,   24,  136,   31, 1992,  215, 1722,  413,   33,    5,    3,
           8,   31,  214,   15,   12, 1956,    8,  322,   17, 1747,   31, 1653,
         536,    4,  266,   13,   26,  209,    5, 1430,    8,   68,   16,   24,
         130,   12,    3,    7,    5,   62,    8,   12,  293,    6,    2])