# Dataset & preprocessing
https://medium.com/codex/sentencepiece-a-simple-and-language-independent-subword-tokenizer-and-detokenizer-for-neural-text-ffda431e704e

In [2]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.14.6-py3-none-any.whl (493 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m493.7/493.7 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0.0,>=0.14.0 (from datasets)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: dill, multiprocess, huggingface-hub, datasets
Successfully installed datasets-2.1

In [3]:
!pip install sentencepiece

Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.99


In [4]:
import time
from datasets import load_dataset
import sentencepiece as spm

In [5]:
dataset = load_dataset("iwslt2017",'iwslt2017-en-de')

Downloading builder script:   0%|          | 0.00/8.17k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/18.5k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/16.8M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/206112 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/8079 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/888 [00:00<?, ? examples/s]

In [6]:
dataset

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 206112
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 8079
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 888
    })
})

In [7]:
with open("iwslt2017-en.txt", "w") as f_en, open("iwslt2017-de.txt", "w") as f_de:
    for it in dataset["train"]:
        f_en.write(it["translation"]["en"] + "\n")
        f_de.write(it["translation"]["de"] + "\n")

In [13]:
t1 = time.time()
_ = spm.SentencePieceTrainer.train(
    input='iwslt2017-en.txt',
    model_prefix='en-sp',
    model_type="bpe",
    vocab_size=10000,
    )
print("en-sentpiece google time:", time.time() - t1)

en-sentpiece google time: 12.271727323532104


In [23]:
sp_en = spm.SentencePieceProcessor(model_file='./en-sp.model')

example_en='This is an example sentence'
encoded = sp_en.encode(example_en)
print("len:", len(encoded), encoded)
print("------------------")
print(example_en)
print(sp_en.decode(encoded)) # 다시 decode해도 손실이 없음

len: 5 [305, 54, 96, 807, 4451]
------------------
This is an example sentence
This is an example sentence


In [18]:
with open('iwslt2017-en.txt', 'r') as f:
    en = f.readlines()
with open('iwslt2017-de.txt', 'r') as f:
    de = f.readlines()

In [20]:
import pandas as pd
train_df=pd.DataFrame()
train_df['en']=pd.DataFrame(en)
train_df['de']=pd.DataFrame(de)

In [21]:
train_df

Unnamed: 0,de,en
0,"Vielen Dank, Chris.\n","Thank you so much, Chris.\n"
1,"Es ist mir wirklich eine Ehre, zweimal auf die...",And it's truly a great honor to have the oppor...
2,Ich bin wirklich begeistert von dieser Konfere...,"I have been blown away by this conference, and..."
3,"Das meine ich ernst, teilweise deshalb -- weil...","And I say that sincerely, partly because I ne..."
4,Versetzen Sie sich mal in meine Lage!\n,Put yourselves in my position.\n
...,...,...
206107,Den alten Griechen fiel auch nicht eines Tages...,The Ancient Greeks didn't just wake up one day...
206108,"Wir Menschen brauchten Jahrhunderte, um Sachen...","It took centuries, even, for humans to realize..."
206109,Deshalb müssen wir unser Verständnis von Norma...,And so we must continuously challenge our noti...
206110,Figuren: Danke. Danke. Danke. Danke. Danke.\n,Characters: Thank you. Thank you. Thank you. T...


In [27]:
train_df['en_encoded']=train_df['en'].apply(lambda x:sp_en.encode(x))

In [28]:
t1 = time.time()
_ = spm.SentencePieceTrainer.train(
    input='iwslt2017-de.txt',
    model_prefix='de-sp',
    model_type="bpe",
    vocab_size=10000,
    )
print("de-sentpiece google time:", time.time() - t1)

de-sentpiece google time: 24.256990671157837


In [29]:
sp_de = spm.SentencePieceProcessor(model_file='./de-sp.model')

example_de='das ist ein Beispielsatz'
encoded = sp_de.encode(example_de)
print("len:", len(encoded), encoded)
print("------------------")
print(example_de)
print(sp_de.decode(encoded))

len: 6 [36, 61, 22, 536, 5106, 713]
------------------
das ist ein Beispielsatz
das ist ein Beispielsatz


In [30]:
train_df['de_encoded']=train_df['de'].apply(lambda x:sp_en.encode(x))

In [39]:
train_df

Unnamed: 0,de,en,en_encoded,de_encoded
0,"Vielen Dank, Chris.\n","Thank you so much, Chris.\n","[666, 47, 123, 396, 9951, 2784, 9953]","[842, 7093, 22, 325, 486, 9951, 2784, 9953]"
1,"Es ist mir wirklich eine Ehre, zweimal auf die...",And it's truly a great honor to have the oppor...,"[74, 60, 9956, 9938, 2791, 5, 552, 4962, 25, 1...","[9215, 54, 9933, 4267, 7, 97, 9955, 9941, 233,..."
2,Ich bin wirklich begeistert von dieser Konfere...,"I have been blown away by this conference, and...","[35, 106, 352, 477, 375, 879, 235, 78, 3634, 9...","[35, 88, 8433, 7, 97, 9955, 9941, 233, 58, 142..."
3,"Das meine ich ernst, teilweise deshalb -- weil...","And I say that sincerely, partly because I ne...","[74, 35, 317, 41, 11, 3912, 85, 86, 9951, 7260...","[325, 44, 135, 259, 9931, 233, 9931, 1757, 80,..."
4,Versetzen Sie sich mal in meine Lage!\n,Put yourselves in my position.\n,"[7436, 8509, 34, 163, 2797, 9953]","[842, 152, 63, 3811, 83, 220, 11, 233, 3321, 3..."
...,...,...,...,...
206107,Den alten Griechen fiel auch nicht eines Tages...,The Ancient Greeks didn't just wake up one day...,"[148, 1098, 2540, 4638, 974, 654, 9956, 9933, ...","[8215, 199, 983, 337, 3867, 88, 22, 27, 7093, ..."
206108,"Wir Menschen brauchten Jahrhunderte, um Sachen...","It took centuries, even, for humans to realize...","[171, 937, 5652, 9951, 461, 9951, 91, 1905, 25...","[95, 97, 212, 403, 88, 22, 2262, 390, 983, 456..."
206109,Deshalb müssen wir unser Verständnis von Norma...,And so we must continuously challenge our noti...,"[74, 123, 50, 1379, 9530, 2004, 196, 3792, 31,...","[4110, 7092, 9952, 21, 0, 5052, 22, 7, 97, 476..."
206110,Figuren: Danke. Danke. Danke. Danke. Danke.\n,Characters: Thank you. Thank you. Thank you. T...,"[3431, 4747, 9976, 666, 47, 9953, 666, 47, 995...","[350, 82, 177, 9936, 9976, 4617, 89, 9953, 461..."


In [54]:
max_en_len=max(len(encoded) for encoded in train_df['en_encoded'])
max_de_len=max(len(encoded) for encoded in train_df['de_encoded'])
max_len=max(max_en_len,max_de_len)

In [58]:
for en_sentence, de_sentence in zip(train_df['en_encoded'],train_df['de_encoded']):
    while len(en_sentence) < max_len:
        en_sentence.append(0)
    while len(de_sentence) < max_len:
        de_sentence.append(0)


dataloader

In [46]:
from torch.utils.data import Dataset,DataLoader
import torch

In [64]:
class CustomDataset(Dataset):
  def __init__(self,en,de):
    self.en=torch.tensor(en)
    self.de=torch.tensor(de)
  def __len__(self):
    return len(self.en)
  def __getitem__(self, index):
      return self.en[index], self.de[index]

In [65]:
train_dataset=CustomDataset(train_df['en_encoded'],train_df['de_encoded'])

In [66]:
train_dataloader=DataLoader(train_dataset,batch_size=16,shuffle=True)

In [70]:
for i,(en,de) in enumerate(train_dataloader):
    print(en,de)
    break

tensor([[ 570,   47,  227,  ...,    0,    0,    0],
        [ 126,   35,  463,  ...,    0,    0,    0],
        [ 481, 9956, 9938,  ...,    0,    0,    0],
        ...,
        [9931,    0, 9943,  ...,    0,    0,    0],
        [3627, 9953,    0,  ...,    0,    0,    0],
        [  74,  117, 2136,  ...,    0,    0,    0]]) tensor([[8215, 1173,   83,  ...,    0,    0,    0],
        [  35,   88, 8433,  ...,    0,    0,    0],
        [9093,   43,  401,  ...,    0,    0,    0],
        ...,
        [ 325,  220,  350,  ...,    0,    0,    0],
        [4617,   89, 9953,  ...,    0,    0,    0],
        [1300, 9938, 9931,  ...,    0,    0,    0]])
