In [4]:
import datasets
from datasets import load_dataset
import numpy as np
import pandas as pd
import os

  from .autonotebook import tqdm as notebook_tqdm


### Dataset Load

In [None]:
load_dataset("iwslt2017", 'iwslt2017-de-en', cache_dir='./data/')

In [35]:
import xml.etree.ElementTree as ET


def make_dataset(file_path, docs) :
    f = open(file_path, "w")
    for doc in docs :
        segs = doc.findall('seg')
        for seg in segs :
            text = seg.text.lower()
            try :
                f.write(text)
                f.write('\n')
            except : 
                print(text)
    f.close()
    
train_folder_path = "./data/de-en/training_and_development/"
test_folder_path = "./data/de-en/test/"

## train english
tree_en = ET.parse(train_folder_path + 'IWSLT17.TED.dev2010.de-en.en.xml')
root_en = tree_en.getroot()
refset = root_en.find('refset')
docs = refset.findall('doc')
make_dataset("./data/de-en/train.en", docs)

 video: ♪♫ frosty the coal man is a jolly, happy soul.    
 curious historical footnote:  when the moors invaded southern spain, they took this custom with them  and the pronunciation changed over the centuries  from "allah, allah, allah," to "olé, olé, olé,"  which you still hear in bullfights and in flamenco dances.    
 in spain, when a performer has done something impossible and magic,  "allah, olé, olé, allah, magnificent, bravo,"  incomprehensible, there it is -- a glimpse of god.    
 if the divine, cockeyed genius assigned to your case  decides to let some sort of wonderment be glimpsed, for just one moment  through your efforts, then "olé!"    
 and "olé!" to you, nonetheless.    
 "olé!" to you, nonetheless,  just for having the sheer human love and stubbornness  to keep showing up.    
   june cohen: olé!    
 my favorite is the middle one --  the mp3 player, nose hair trimmer, and crème brûlée torch.    


### Extract text from XML Data

In [2]:
import xml.etree.ElementTree as ET


def make_dataset(file_path, docs) :
    f = open(file_path, "w", encoding='utf-8')
    for doc in docs :
        segs = doc.findall('seg')
        for seg in segs :
            text = seg.text.lower()
            try :
                f.write(text)
                f.write('\n')
            except : 
                print(text)
    f.close()

train_folder_path = "./data/de-en/training_and_development/"
test_folder_path = "./data/de-en/test/"

## train english
tree_en = ET.parse(train_folder_path + 'IWSLT17.TED.dev2010.de-en.en.xml')
root_en = tree_en.getroot()
refset = root_en.find('refset')
docs = refset.findall('doc')
make_dataset("./data/de-en/train.en", docs)

## train deutsch
tree_de = ET.parse(train_folder_path + "IWSLT17.TED.dev2010.de-en.de.xml")
root_de = tree_de.getroot()
srcset = root_de.find('srcset')
docs = srcset.findall('doc')
make_dataset("./data/de-en/train.de", docs)

## test en
tree_en_test = ET.parse(test_folder_path + 'IWSLT17.TED.tst2017.mltlng.en-de.en.xml')
root_en_test = tree_en_test.getroot()
srcset = root_en_test.find('srcset')
docs = srcset.findall('doc')
make_dataset("./data/de-en/test.en", docs)

## test deutsch
tree_de_test = ET.parse(test_folder_path + 'IWSLT17.TED.tst2017.mltlng.de-en.de.xml')
root_de_test = tree_de_test.getroot()
srcset = root_de_test.find('srcset')
docs = srcset.findall('doc')
make_dataset("./data/de-en/test.de", docs)



### Split Data Set

In [5]:
import xml.etree.ElementTree as ET
train_folder_path = "./data/de-en/training_and_development/"
test_folder_path = "./data/de-en/test/"

## train english
tree_en = ET.parse(train_folder_path + 'IWSLT17.TED.dev2010.de-en.en.xml')
root_en = tree_en.getroot()
refset = root_en.find('refset')
docs = refset.findall('doc')

len(docs)

8

### Make Vocabulary : BPE Encoding

In [3]:
import re

## return (maximum counted pair, count)
## ex - (h st, 10)
def search_max_pair(dict) :
    pair_count = {}
    
    ## pair count
    for word in dict:
        unit_list = word.split(' ')
        count_of_word = dict[word] 
        #print(unit_list)
        for i in range(len(unit_list) - 1):
            new_word = unit_list[i] + ' ' + unit_list[i+1] ## insert space for distinguish frontword and backword
            pair_count[new_word] = pair_count[new_word] + count_of_word if (new_word in pair_count) else count_of_word
    ## search maximum pair
    max_count = 0
    max_pair = ''
    for pair in pair_count:
        if pair_count[pair] > max_count :
            max_count = pair_count[pair]
            max_pair = pair
    return (max_pair, max_count)

def merge_word_dict(dict, frontword, backword) :
    new_word_dict = {}
    for word in dict :
        unit_list = word.split(' ')
        new_word = ''
        i=0
        while i < len(unit_list) :
            if i == len(unit_list)-1 or unit_list[i] != frontword or unit_list[i+1] != backword : 
                new_word = new_word + unit_list[i] + ' '
            else :
                new_word = new_word + unit_list[i] + unit_list[i+1] + ' '
                i += 1 ## skip next unit
            i += 1
        new_word = new_word[:-1]
        new_word_dict[new_word] = dict[word] ## change word with new key
    return new_word_dict
                
                
        

def byte_pair_encoding(file_path, count=10) : 
    f = open(file_path, "r", encoding='utf-8')
    stop_word_set = set([',', ' ', '-', '', '\n', '.', '!'])
    word_dict = {}  ## key-bpe encoded words / value-count : word set for calculate maximun counted pair
    vocabulary = {} ## key-vacab word / value-count : vocabulary for encoding, 
    vocabulary['</w>'] = 0
    while True :
        line = f.readline()
        if not line :
            break
        words = line.split(' ')
        for word in words :
            word = re.sub('[,\-\n!]', '', word)
            print(word)
            if word not in stop_word_set :
                splited_word = ''
                for character in [*word]:
                    splited_word = splited_word + character + ' '
                    vocabulary[character] = vocabulary[character] + 1 if (character in vocabulary) else 1
                splited_word = splited_word + '</w>'
                word_dict[splited_word] = word_dict[splited_word] + 1 if (splited_word in word_dict) else 1
                vocabulary['</w>'] += 1

    for i in range(count) :
        max_pair, max_count = search_max_pair(word_dict)
        subwords = max_pair.split(' ')
        frontword = subwords[0]
        backword = subwords[1]
        
        word_dict = merge_word_dict(word_dict, frontword=frontword, backword=backword)
        vocabulary[frontword + backword] = max_count
        vocabulary[frontword] = vocabulary[frontword] - max_count
        vocabulary[backword] = vocabulary[backword] - max_count
    print(vocabulary)
        
        
file_path = 'C://Users/DMIS/project/transformer/data/de-en/train.de'
#file_path = 'C://Users/DMIS/project/transformer/data/de-en/bpe_ex.en' ## for test

byte_pair_encoding(file_path, 10)
        

  letztes jahr habe ich diese beiden folien gezeigt, um zu veranschaulichen, dass die arktische eiskappe, die für annähernd drei millionen jahre die grösse der unteren 48 staaten hatte, um 40 prozent geschrumpft ist.  



letztes
jahr
habe
ich
diese
beiden
folien
gezeigt
um
zu
veranschaulichen
dass
die
arktische
eiskappe
die
für
annähernd
drei
millionen
jahre
die
grösse
der
unteren
48
staaten
hatte
um
40
prozent
geschrumpft
ist.


{'</w>': 14, 'l': 2, 'e': 2, 't': 9, 'z': 4, 's': 9, 'j': 2, 'a': 12, 'h': 5, 'r': 8, 'b': 2, 'i': 4, 'c': 0, 'd': 5, 'n': 6, 'f': 3, 'o': 3, 'g': 4, 'u': 6, 'm': 4, 'v': 1, 'k': 2, 'p': 4, 'ü': 1, 'ä': 1, 'ö': 1, '4': 2, '8': 1, '0': 1, '.': 1, 'e</w>': 10, 'en': 1, 'en</w>': 6, 'ch': 5, 'di': 4, 'ei': 4, 'er': 4, 'es': 3, 'li': 3, 't</w>': 3}


In [2]:
import torch
import numpy as np
positions = list(range(0, 10))


p = list(map((lambda elem: np.sin(elem/np.power(10000, elem/512))),  positions[:,0::2] ))
print(p)

TypeError: list indices must be integers or slices, not tuple

In [24]:
import numpy as np

mat = np.zeros((3, 5))

print(mat)  # 3 rows and 5 columns of zeros
mat[0::2, :] = 1
print(mat)  # all of second row is now on
print(mat.shape)

[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]
[[1. 1. 1. 1. 1.]
 [0. 0. 0. 0. 0.]
 [1. 1. 1. 1. 1.]]
(3, 5)


In [1]:
import numpy as np

mat = np.zeros(5)

mat2 = np.ones(3)

print(mat + mat2)

ValueError: operands could not be broadcast together with shapes (5,) (3,) 

In [29]:
def positional_vector(i) :
    return [pos/(i+1) for pos in range(10)]
np.array([positional_vector(i) for i in range(10)])

array([[0.        , 1.        , 2.        , 3.        , 4.        ,
        5.        , 6.        , 7.        , 8.        , 9.        ],
       [0.        , 0.5       , 1.        , 1.5       , 2.        ,
        2.5       , 3.        , 3.5       , 4.        , 4.5       ],
       [0.        , 0.33333333, 0.66666667, 1.        , 1.33333333,
        1.66666667, 2.        , 2.33333333, 2.66666667, 3.        ],
       [0.        , 0.25      , 0.5       , 0.75      , 1.        ,
        1.25      , 1.5       , 1.75      , 2.        , 2.25      ],
       [0.        , 0.2       , 0.4       , 0.6       , 0.8       ,
        1.        , 1.2       , 1.4       , 1.6       , 1.8       ],
       [0.        , 0.16666667, 0.33333333, 0.5       , 0.66666667,
        0.83333333, 1.        , 1.16666667, 1.33333333, 1.5       ],
       [0.        , 0.14285714, 0.28571429, 0.42857143, 0.57142857,
        0.71428571, 0.85714286, 1.        , 1.14285714, 1.28571429],
       [0.        , 0.125     , 0.25     

In [1]:
sequence_length = 10
model_dimension = 512
p_encoding_value = torch.zeros(sequence_length, model_dimension)
positions = np.arange(model_dimension)[:, np.newaxis]
p_encoding_value[:, 0::2] += torch.tensor(list(map((lambda elem: np.sin(elem/np.power(10000, elem/model_dimension))),  positions[0::2])))
p_encoding_value[:, 1::2] += torch.tensor(list(map((lambda elem: np.sin(elem/np.power(10000, (elem-1)/model_dimension))),  positions[0::2])))


print(p_encoding_value)

NameError: name 'torch' is not defined

In [52]:
model_dimension = 512
sequence_length = 10

def __make_positional_vector (pos, model_dimension) :
        return [pos/np.power(10000, 2*(hidden_i//2)/ model_dimension) for hidden_i in range(model_dimension)]

positional_encodings = np.array([__make_positional_vector(i, model_dimension) for i in range(sequence_length)])
positional_encodings[:, 0::2] = np.sin(positional_encodings[:, 0::2])
positional_encodings[:, 1::2] = np.cos(positional_encodings[:, 1::2])

print(positional_encodings.shape)
        

(10, 512)


In [78]:

m1 = torch.tensor([[1,1],
          [2,2],
          [3,3]])

value = torch.tensor([[1.,1.],
          [0, 0],
          [0, 0]])
attention = torch.mm(m1, torch.transpose(m1, 0, 1)) / 2
print(attention)

attention_score = torch.nn.functional.softmax(attention, dim=1)
print(attention_score)

attention_value = torch.mm(attention_score, value)
print(attention_value)

tensor([[1., 2., 3.],
        [2., 4., 6.],
        [3., 6., 9.]])
tensor([[0.0900, 0.2447, 0.6652],
        [0.0159, 0.1173, 0.8668],
        [0.0024, 0.0473, 0.9503]])
tensor([[0.0900, 0.0900],
        [0.0159, 0.0159],
        [0.0024, 0.0024]])


In [7]:
import torch
import torch.nn as nn
import torch.nn.functional as F

from torch.utils.data import TensorDataset # 텐서데이터셋
from torch.utils.data import DataLoader # 데이터로더

x_train  =  torch.FloatTensor([[73,  80,  75], 
                               [93,  88,  93], 
                               [89,  91,  90], 
                               [96,  98,  100],   
                               [73,  66,  70]])  
y_train  =  torch.FloatTensor([[152],  [185],  [180],  [196],  [142]])

dataset = TensorDataset(x_train, y_train)

dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

In [8]:
model = nn.Linear(3,1)
optimizer = torch.optim.SGD(model.parameters(), lr=1e-5) 

In [9]:
model(torch.FloatTensor([1, 1, 1]))

tensor([-0.4383], grad_fn=<AddBackward0>)

In [18]:
torch.triu(torch.ones(3,4), diagonal=0)

tensor([[1., 1., 1., 1.],
        [0., 1., 1., 1.],
        [0., 0., 1., 1.]])

In [19]:
x_train.size()

torch.Size([5, 3])

In [14]:
torch.matmul(x_train, x_train.transpose(1,2))

IndexError: Dimension out of range (expected to be in range of [-2, 1], but got 2)

In [22]:
nb_epochs = 0
for epoch in range(nb_epochs + 1):
  for batch_idx, samples in enumerate(dataloader):
    print(batch_idx)
    # print(samples)
    x, y = samples
    print(x)
    #print(y)
    # H(x) 계산
    prediction = model(x)
    print(prediction)

    # cost 계산
    cost = F.mse_loss(prediction, y)
    print(cost)
    print("---")

    # cost로 H(x) 계산
    #optimizer.zero_grad()
    #cost.backward()
    #optimizer.step()

    #print('Epoch {:4d}/{} Batch {}/{} Cost: {:.6f}'.format(
    #    epoch, nb_epochs, batch_idx+1, len(dataloader),
    #    cost.item()
    #    ))

0
tensor([[ 96.,  98., 100.],
        [ 73.,  66.,  70.]])
tensor([[-28.6199],
        [-22.9941]], grad_fn=<AddmmBackward0>)
tensor(38838.5664, grad_fn=<MseLossBackward0>)
---
1
tensor([[89., 91., 90.],
        [73., 80., 75.]])
tensor([[-24.9711],
        [-18.3826]], grad_fn=<AddmmBackward0>)
tensor(35521.6875, grad_fn=<MseLossBackward0>)
---
2
tensor([[93., 88., 93.]])
tensor([[-29.2420]], grad_fn=<AddmmBackward0>)
tensor(45899.6172, grad_fn=<MseLossBackward0>)
---
