# Author: Yoonhyuck WOO / JBNU_Industrial Information system Engineering
# Date: 2. 22. 2022 - 2. . 2022
# Title: Korean_NER
# Professor: Seung-Hoon Na

In [3]:
import os
import torch
from torch.utils.data import Dataset, DataLoader
import random

In [4]:
def make_random_len_data_list(min_len, max_len, num_data):
    """Generate ``num_data`` lists of random digits, each of random length.

    The length of every sample is drawn with
    ``random.randrange(min_len, max_len)``, so ``max_len`` itself is never
    produced. Each element is a random integer from 0 to 9 inclusive.

    Args:
        min_len: Smallest possible sample length (inclusive).
        max_len: Upper bound on the sample length (exclusive).
        num_data: Number of samples to generate.

    Returns:
        A list containing ``num_data`` lists of ints.
    """
    def _one_sample():
        # Draw the length first, then the digits, one sample at a time.
        length = random.randrange(min_len, max_len)
        return [random.randint(0, 9) for _ in range(length)]

    return [_one_sample() for _ in range(num_data)]

In [5]:
# Demo: 10 samples whose lengths are drawn from 10..19 (the upper bound 20 is exclusive).
make_random_len_data_list(10, 20, 10)

[[6, 5, 1, 0, 3, 8, 8, 6, 0, 7, 1, 3],
 [2, 9, 3, 4, 5, 5, 0, 4, 2, 6, 9, 7, 0, 6, 1, 7, 9, 5, 2],
 [2, 9, 4, 8, 0, 5, 5, 8, 0, 0, 7],
 [9, 8, 3, 5, 1, 5, 7, 3, 4, 2, 3],
 [2, 2, 3, 0, 8, 6, 8, 5, 3, 7, 7],
 [3, 8, 7, 1, 1, 2, 8, 2, 0, 3],
 [0, 2, 4, 1, 2, 1, 2, 0, 6, 6, 2, 1],
 [0, 3, 4, 0, 8, 7, 2, 9, 8, 2],
 [5, 6, 6, 7, 8, 1, 4, 6, 0, 7, 9, 5, 1],
 [8, 2, 8, 3, 0, 8, 1, 5, 8, 9, 4, 2, 9]]

# __getitem__
 - Indexing or slicing an object (e.g. `obj[0:3]`) internally calls its `__getitem__` method, so a class must define `__getitem__` for its instances to support slicing.
 - Defining the **`__getitem__` special method** lets callers index or slice the object itself instead of reaching into its instance variables directly. The method must accept the index (an int or a slice) as its argument.
 
 # __len__
- Defining a `__len__()` method in the class lets an instance of the class be passed to the built-in `len()` function.

In [6]:
class Dataset_custom(Dataset):
    """Thin ``Dataset`` wrapper around an in-memory sequence of samples.

    The wrapped sequence is stored as ``self.x``; length queries and item
    access are forwarded to it unchanged.
    """

    def __init__(self, data):
        """Store ``data`` by reference; no copy is made."""
        self.x = data

    def __len__(self):
        """Return how many samples the wrapped sequence holds."""
        n_samples = len(self.x)
        return n_samples

    def __getitem__(self, idx):
        """Return ``self.x[idx]``; ``idx`` may be anything ``self.x`` accepts."""
        item = self.x[idx]
        return item

# Padding

In [25]:
def make_same_len(batch, pad_id=0):
    """Right-pad every sample in ``batch`` to the length of the longest one.

    The per-sample lengths are printed as a debugging aid.

    Args:
        batch: Sequence of samples; each sample is a sequence of token ids.
        pad_id: Value appended to samples shorter than the longest one.
            Defaults to 0.

    Returns:
        A new list of lists, all of the same length. The input samples are
        not modified. An empty ``batch`` yields an empty list.
    """
    each_len_list = [len(sample) for sample in batch]
    print('each_len_list', each_len_list)

    # ``default=0`` keeps an empty batch from raising ValueError in max().
    max_len = max(each_len_list, default=0)

    # list(sample) lets tuples (or other sequences) be padded as well.
    return [
        list(sample) + [pad_id] * (max_len - len(sample))
        for sample in batch
    ]

In [26]:
# Five random samples with lengths in 2..10, then padded to a common length.
rand = make_random_len_data_list(min_len=2, max_len=11, num_data=5)
example = make_same_len(rand)

# Show the raw samples followed by their padded counterparts.
print('rand', rand, sep='\n')
print('example', example, sep='\n')

each_len_list [2, 7, 4, 4, 9]
rand
[[2, 7], [1, 5, 3, 8, 6, 1, 1], [5, 4, 1, 7], [7, 8, 6, 1], [9, 8, 6, 1, 9, 1, 7, 1, 7]]
example
[[2, 7, 0, 0, 0, 0, 0, 0, 0], [1, 5, 3, 8, 6, 1, 1, 0, 0], [5, 4, 1, 7, 0, 0, 0, 0, 0], [7, 8, 6, 1, 0, 0, 0, 0, 0], [9, 8, 6, 1, 9, 1, 7, 1, 7]]


In [18]:
def collate_fn_custom(batch):
    """Collate variable-length samples into a single tensor.

    The samples are padded to a common length with ``make_same_len`` and the
    resulting nested list is converted with ``torch.tensor``.
    """
    return torch.tensor(make_same_len(batch))

In [9]:
# Ten random samples with lengths in 10..19, wrapped in the custom dataset.
rd = make_random_len_data_list(min_len=10, max_len=20, num_data=10)
ds = Dataset_custom(data=rd)

In [10]:
# Number of samples, answered by Dataset_custom.__len__.
print(len(ds))
# Slicing is routed through Dataset_custom.__getitem__, which forwards the slice to the wrapped data.
ds[0:3]

10


[[8, 5, 6, 1, 5, 0, 9, 5, 3, 9, 5, 3, 5, 4, 3, 5],
 [2, 1, 2, 1, 5, 0, 6, 0, 9, 8, 1],
 [4, 7, 2, 8, 6, 9, 3, 8, 5, 6, 9, 1, 8, 8, 0, 0, 7, 6, 8]]

In [11]:
# Pad the first three samples to equal length and convert them to one tensor.
collate_fn_custom(ds[0:3])

tensor([[8, 5, 6, 1, 5, 0, 9, 5, 3, 9, 5, 3, 5, 4, 3, 5, 0, 0, 0],
        [2, 1, 2, 1, 5, 0, 6, 0, 9, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0],
        [4, 7, 2, 8, 6, 9, 3, 8, 5, 6, 9, 1, 8, 8, 0, 0, 7, 6, 8]])

In [12]:
# Serve the dataset two samples per batch in shuffled order; every batch is
# assembled by collate_fn_custom.
dl = DataLoader(
    ds,
    batch_size=2,
    shuffle=True,
    collate_fn=collate_fn_custom,
)

In [13]:
# Walk the loader once and show each collated batch.
# The enumerate() index was never used, so iterate the loader directly.
for batch in dl:
    print(batch)

tensor([[1, 7, 4, 4, 4, 4, 5, 9, 1, 4, 1, 9, 1, 3, 2],
        [7, 0, 5, 9, 4, 0, 9, 9, 6, 8, 0, 0, 0, 0, 0]])
tensor([[8, 9, 7, 0, 2, 4, 8, 8, 4, 3, 7, 4, 4, 8, 4, 3, 1],
        [9, 6, 0, 0, 7, 2, 6, 5, 4, 6, 0, 0, 0, 0, 0, 0, 0]])
tensor([[8, 5, 6, 1, 5, 0, 9, 5, 3, 9, 5, 3, 5, 4, 3, 5, 0, 0, 0],
        [4, 7, 2, 8, 6, 9, 3, 8, 5, 6, 9, 1, 8, 8, 0, 0, 7, 6, 8]])
tensor([[1, 9, 8, 6, 8, 3, 0, 4, 3, 9, 1, 7, 3, 0],
        [5, 9, 0, 6, 8, 7, 4, 7, 5, 5, 5, 2, 6, 1]])
tensor([[5, 4, 3, 2, 0, 5, 4, 7, 3, 7, 2, 8, 5],
        [2, 1, 2, 1, 5, 0, 6, 0, 9, 8, 1, 0, 0]])
