In [1]:
import datasets
from datasets import load_dataset
import numpy as np
import pandas as pd
import os

  from .autonotebook import tqdm as notebook_tqdm


### Dataset Load

In [None]:
load_dataset("iwslt2017", 'iwslt2017-de-en', cache_dir='./data/')

### Extract text from XML Data

In [2]:
import xml.etree.ElementTree as ET


def make_dataset(file_path, docs) :
    f = open(file_path, "w", encoding='utf-8')
    for doc in docs :
        segs = doc.findall('seg')
        for seg in segs :
            text = seg.text.lower()
            try :
                f.write(text)
                f.write('\n')
            except : 
                print(text)
    f.close()

train_folder_path = "./data/de-en/training_and_development/"
test_folder_path = "./data/de-en/test/"

## train english
tree_en = ET.parse(train_folder_path + 'IWSLT17.TED.dev2010.de-en.en.xml')
root_en = tree_en.getroot()
refset = root_en.find('refset')
docs = refset.findall('doc')
make_dataset("./data/de-en/train.en", docs)

## train deutsch
tree_de = ET.parse(train_folder_path + "IWSLT17.TED.dev2010.de-en.de.xml")
root_de = tree_de.getroot()
srcset = root_de.find('srcset')
docs = srcset.findall('doc')
make_dataset("./data/de-en/train.de", docs)

## test en
tree_en_test = ET.parse(test_folder_path + 'IWSLT17.TED.tst2017.mltlng.en-de.en.xml')
root_en_test = tree_en_test.getroot()
srcset = root_en_test.find('srcset')
docs = srcset.findall('doc')
make_dataset("./data/de-en/test.en", docs)

## test deutsch
tree_de_test = ET.parse(test_folder_path + 'IWSLT17.TED.tst2017.mltlng.de-en.de.xml')
root_de_test = tree_de_test.getroot()
srcset = root_de_test.find('srcset')
docs = srcset.findall('doc')
make_dataset("./data/de-en/test.de", docs)



### Split Data Set

In [5]:
import xml.etree.ElementTree as ET
train_folder_path = "./data/de-en/training_and_development/"
test_folder_path = "./data/de-en/test/"

## train english
tree_en = ET.parse(train_folder_path + 'IWSLT17.TED.dev2010.de-en.en.xml')
root_en = tree_en.getroot()
refset = root_en.find('refset')
docs = refset.findall('doc')

len(docs)

8

### Make Vocabulary : BPE Encoding

In [31]:
import re

## return (maximum counted pair, count)
## ex - (h st, 10)
def search_max_pair(dict) :
    pair_count = {}
    
    ## pair count
    for word in dict:
        unit_list = word.split(' ')
        count_of_word = dict[word] 
        print(unit_list)
        for i in range(len(unit_list) - 1):
            new_word = unit_list[i] + ' ' + unit_list[i+1] ## insert space for distinguish frontword and backword
            pair_count[new_word] = pair_count[new_word] + count_of_word if (new_word in pair_count) else count_of_word
    ## search maximum pair
    max_count = 0
    max_pair = ''
    for pair in pair_count:
        if pair_count[pair] > max_count :
            max_count = pair_count[pair]
            max_pair = pair
    return (max_pair, max_count)

def merge_word_dict(dict, frontword, backword) :
    new_word_dict = {}
    for word in dict :
        unit_list = word.split(' ')
        new_word = ''
        i=0
        while i < len(unit_list)-1 :
            if unit_list[i] != frontword or unit_list[i+1] != backword : 
                new_word = new_word + unit_list[i] + ' '
            else :
                new_word = new_word + unit_list[i] + unit_list[i+1] + ' '
                i += 1 ## skip next unit
            i += 1
        new_word_dict[new_word] = dict[word] ## change word with new key
    return new_word_dict
                
                
        

def byte_pair_encoding(file_path, count=10) : 
    f = open(file_path, "r", encoding='utf-8')
    stop_word_set = set([',', ' ', '-', '', '\n', '.', '!'])
    word_dict = {}  ## key-bpe encoded words / value-count : word set for calculate maximun counted pair
    vocabulary = {} ## key-vacab word / value-count : vocabulary for encoding, 
    while True :
        line = f.readline()
        if not line :
            break
        
        words = line.split(' ')
        for word in words :
            word = re.sub('[,\-\n!]', '', word)
            if word not in stop_word_set :
                splited_word = ''
                for character in [*word]:
                    splited_word = splited_word + character + ' '
                    vocabulary[character] = vocabulary[character] + 1 if (character in vocabulary) else 1
                splited_word = splited_word + '</w>'
                word_dict[splited_word] = word_dict[splited_word] + 1 if (splited_word in word_dict) else 1

    for i in range(count) :
        #print(vocabulary)
        #print(word_dict)
        max_pair, max_count = search_max_pair(word_dict)
        subwords = max_pair.split(' ')
        frontword = subwords[0]
        backword = subwords[1]
        
        word_dict = merge_word_dict(word_dict, frontword=frontword, backword=backword)
        vocabulary[frontword + backword] = max_count
        vocabulary[frontword] = vocabulary[frontword] - max_count
        vocabulary[backword] = vocabulary[backword] - max_count
        
        
        
#file_path = 'C://Users/DMIS/project/transformer/data/de-en/train.en'
file_path = 'C://Users/DMIS/project/transformer/data/de-en/bpe_ex.en'

byte_pair_encoding(file_path, 3)
        

['l', 'o', 'w', '</w>']
['l', 'o', 'w', 'e', 'r', '</w>']
['n', 'e', 'w', 'e', 's', 't', '</w>']
['w', 'i', 'd', 'e', 's', 't', '</w>']
['l', 'o', 'w', '']
['l', 'o', 'w', 'e', 'r', '']
['n', 'e', 'w', 'es', 't', '']
['w', 'i', 'd', 'es', 't', '']
['l', 'o', 'w', '']
['l', 'o', 'w', 'e', 'r', '']
['n', 'e', 'w', 'est', '']
['w', 'i', 'd', 'est', '']


KeyError: ''

NameError: name 'vocabulary' is not defined