# Prepare IWSLT2012 data

Some of the code is adapted from https://github.com/attilanagy234/neural-punctuator

## Create clean text from unparsed files

In [1]:
import os
import xml.etree.ElementTree as ET
import re
import numpy as np
import pickle
from transformers import BertTokenizer
from tqdm import tqdm

In [2]:
# directory of the prepared datasets; the text files and pickles produced by
# this notebook are written here and read back from here.
# File names are appended by plain string concatenation, so keep the trailing "/".
dataPath = "IWSLT2012/Data/"

# directory of the raw, unparsed IWSLT2012 files (keep the trailing "/" as well)
rawPath = "IWSLT2012/RAW/"

In [3]:
def prepareData(sourcePath, targetPath):
    """Extract the segment text of an XML file into a single-line text file.

    The first child of the XML root is treated as the container of the
    documents. For every document the text of all ``<seg>`` elements is
    collected and whitespace runs are collapsed to single spaces. All
    documents are then written to ``targetPath`` as one space-separated line
    terminated by a newline.

    Args:
        sourcePath: Path of the XML file to read (parsed as UTF-8).
        targetPath: Path of the text file to write (UTF-8, overwritten).
    """
    # parse xml
    xmlp = ET.XMLParser(encoding="utf-8")
    root = ET.parse(sourcePath, parser=xmlp).getroot()

    texts = []
    for doc in root[0]:
        # seg.text is None for an empty <seg/>; treat it as empty text so the
        # join below cannot fail
        segs = [seg.text or '' for seg in doc.iter('seg')]

        # join with a space so that neighbouring segments can never fuse into
        # one word; the surplus whitespace is collapsed right afterwards
        texts.append(re.sub(r'\s+', ' ', ' '.join(segs)).strip())

    # convert to single line text and write to file
    with open(targetPath, 'w', encoding='utf-8') as f:
        f.write(' '.join(texts) + '\n')

In [4]:
# convert the dev and the test XML files to single-line text files
for _xmlName, _txtName in (
    ("IWSLT12.TALK.dev2010.en-fr.en.xml", "dev_texts.txt"),
    ("IWSLT12.TED.MT.tst2012.en-fr.en.xml", "test_texts_2012.txt"),
):
    prepareData(rawPath + _xmlName, dataPath + _txtName)

In [5]:
# The train data comes in a slightly different format. Wrap the content of
# the original file as follows to make it processable with the default
# Python XML parser:
#
#   <?xml version="1.0" encoding="UTF-8"?>
#   <mteval>
#   ...
#   </mteval>

trainPath = rawPath + "train.tags.en-fr.en"
targetPath = dataPath + "train_texts.txt"

xmlp = ET.XMLParser(encoding="utf-8")
tree = ET.parse(trainPath, parser=xmlp)
root = tree.getroot()

# collect the text of every <transcript> element; an empty element has the
# text None, which is treated as empty text instead of crashing below
docs = [doc.text or '' for doc in root.iter('transcript')]

# collapse every whitespace run (newlines included) to a single space
texts = [re.sub(r'\s+', ' ', d).strip() for d in docs]

# write to file as one single line
with open(targetPath, 'w', encoding='utf-8') as f:
    text = ' '.join(texts)
    f.write(text + '\n')

## Create Pickle data

In [6]:
# names of the punctuation classes, listed in label-id order
labelNames = ["O", "COMMA", "PERIOD", "QUESTION"]

# numeric label ids (0..3), one per class name above
LABEL_NOTHING, LABEL_COMMA, LABEL_PERIOD, LABEL_QUESTION = range(4)

# lookup table from the class name to its numeric label id
punctEncode = dict(zip(
    labelNames,
    (LABEL_NOTHING, LABEL_COMMA, LABEL_PERIOD, LABEL_QUESTION),
))

# name of the pretrained BERT model the data is prepared for
modelName = "bert-base-uncased"

# tokenizer belonging to that model
tokenizer = BertTokenizer.from_pretrained(modelName)

In [7]:
# read the three prepared text files (train, validation, test) in that order
_loadedTexts = []
for _fileName in ("train_texts.txt", "dev_texts.txt", "test_texts_2012.txt"):
    with open(dataPath + _fileName, 'r', encoding="utf-8") as f:
        _loadedTexts.append(f.read())
train_text, valid_text, test_text = _loadedTexts

# keep the datasets together so they can be processed in one go
datasets = (train_text, valid_text, test_text)

In [8]:
# prepare data for the model
# code comes mostly from neural-punctuator by attilanagy234
def clean_text(text):
    """Normalize punctuation and whitespace of a raw text and lowercase it.

    The text is passed through a fixed, ordered list of rewrite rules. Each
    rule is either a literal replacement or a regular-expression
    substitution. The order matters because later rules operate on the
    output of earlier ones. Finally the text is stripped and lowercased.

    Args:
        text: The raw text.

    Returns:
        The cleaned, lowercased text.
    """
    LITERAL, REGEX = 0, 1

    rules = (
        # map special punctuation onto comma / period
        (LITERAL, '!', '.'),
        (LITERAL, ':', ','),
        (LITERAL, '--', ','),
        # hyphen between a letter and at least two letters becomes a space
        (REGEX, "(?<=[a-zA-Z])-(?=[a-zA-Z]{2,})", ' '),
        # free-standing dash becomes a comma
        (REGEX, r'\s-\s', ' , '),
        (LITERAL, ';', '.'),
        (LITERAL, ' ,', ','),
        (LITERAL, '♫', ''),
        (LITERAL, '...', ''),
        (LITERAL, '.\"', ','),
        (LITERAL, '"', ','),
        (REGEX, r'--\s?--', ''),
        (REGEX, r'\s+', ' '),
        # merge punctuation marks that ended up next to each other
        (REGEX, r',\s?,', ','),
        (REGEX, r',\s?\.', '.'),
        (REGEX, r'\?\s?\.', '?'),
        (REGEX, r'\s+', ' '),
        # remove whitespace in front of punctuation marks
        (REGEX, r'\s+\?', '?'),
        (REGEX, r'\s+,', ','),
        (REGEX, r'\.[\s+\.]+', '. '),
        (REGEX, r'\s+\.', '.'),
    )

    cleaned = text
    for kind, pattern, replacement in rules:
        if kind == LITERAL:
            cleaned = cleaned.replace(pattern, replacement)
        else:
            cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned.strip().lower()

# tokenizer id of each punctuation mark; encode() wraps the mark in special
# tokens, so the mark itself is the second to last id
_punctMarks = ",.?"
target_token2id = dict(zip(
    _punctMarks,
    (tokenizer.encode(mark)[-2] for mark in _punctMarks),
))
target_ids = list(target_token2id.values())

# maps a tokenizer id to a label id: the punctuation marks get 1, 2, 3 in the
# order above, while 0 and -1 map to themselves
id2target = {0: 0, -1: -1}
id2target.update(
    (tokenId, label) for label, tokenId in enumerate(target_ids, start=1)
)

def create_target(text):
    """Encode a text into token ids plus one punctuation label per token.

    The text is split into whitespace-separated words. For each of the
    punctuation marks in ``target_token2id`` a trailing occurrence is
    stripped from the word and turned into the word's label through
    ``id2target``; a word without trailing punctuation gets label 0. Each
    word is then encoded with the module-level ``tokenizer`` without special
    tokens, so the returned sequence carries no CLS/SEP ids.

    Args:
        text: The text to encode. In this file it is the output of
            ``clean_text``.

    Returns:
        A tuple ``(encoded_words, targets)`` of two lists of equal length.
        ``encoded_words`` holds the token ids. ``targets`` holds -1 for
        every token of a word except the last one, which carries the
        word's label.

    Raises:
        ValueError: If a word yields no tokens once its trailing punctuation
            is removed (for example a word consisting of punctuation only).
    """
    encoded_words, targets = [], []

    # split() without an argument copes with empty text and repeated
    # whitespace, which split(' ') would turn into empty "words"
    words = text.split()

    for word in tqdm(words):
        target = 0
        # every mark is checked in turn, so a later match overrides an
        # earlier one
        for target_token, target_id in target_token2id.items():
            if word.endswith(target_token):
                word = word.rstrip(target_token)
                target = id2target[target_id]

        encoded_word = tokenizer.encode(word, add_special_tokens=False)

        # validate before the output lists are touched; unlike an assert,
        # an explicit raise is still active under "python -O", so tokens
        # and labels can never get silently misaligned
        if not encoded_word:
            raise ValueError(
                f"word {word!r} produced no tokens after removing punctuation"
            )

        encoded_words.extend(encoded_word)
        # only the last token of a word carries the label
        targets.extend([-1] * (len(encoded_word) - 1))
        targets.append(target)

    return encoded_words, targets

In [9]:
# normalize punctuation and whitespace of every dataset
datasets = list(map(clean_text, datasets))

# token ids and labels, one entry per dataset
encoded_texts, targets = [], []

for currentDataset, ds in enumerate(datasets, start=1):
    print("PROCESSING DATASET", currentDataset, "/", len(datasets))
    x, y = create_target(ds)
    encoded_texts.append(x)
    targets.append(y)

# the prepared data of each BERT model goes into a folder of its own
os.makedirs(dataPath + modelName, exist_ok=True)

# one pickle per split, holding the tuple (token ids, labels)
for name, x, y in zip(('train', 'valid', 'test'), encoded_texts, targets):
    with open(dataPath + f'{modelName}/{name}_data.pkl', 'wb') as f:
        pickle.dump((x, y), f)

PROCESSING DATASET 1 / 3


100%|██████████████████████████████| 2339457/2339457 [02:03<00:00, 18985.34it/s]


PROCESSING DATASET 2 / 3


100%|██████████████████████████████████| 17346/17346 [00:00<00:00, 18850.76it/s]


PROCESSING DATASET 3 / 3


100%|██████████████████████████████████| 18474/18474 [00:00<00:00, 18997.63it/s]
