# Open Subtitles

## Acquiring the Data

### Download the dataset

In [None]:
%%bash

# Stop at the first failing command so that a failed download is never
# unpacked, and the archives are only removed after a successful extraction.
set -euo pipefail

# -p keeps the cell re-runnable when the directory already exists.
mkdir -p /home/datasets/srl/opensubtitles/dataset
cd /home/datasets/srl/opensubtitles/dataset

wget http://opus.lingfil.uu.se/OpenSubtitles2016/en-he.txt.zip
wget http://opus.lingfil.uu.se/OpenSubtitles2016/en.raw.tar.gz
wget http://opus.lingfil.uu.se/OpenSubtitles2016/he.raw.tar.gz

# -o overwrites files left by a previous run instead of prompting.
unzip -o en-he.txt.zip
tar -xzvf en.raw.tar.gz
tar -xzvf he.raw.tar.gz

rm -f en-he.txt.zip en.raw.tar.gz he.raw.tar.gz

### Remove symmetric difference between English and Hebrew

In [None]:
%%python

import os

from collections import defaultdict

# Anchor on the absolute dataset location used by the download cell: the `cd`
# done there does not carry over, so a relative path would depend on wherever
# the notebook kernel happens to be running.
DATASET_DIR = '/home/datasets/srl/opensubtitles/dataset'
RAW_DIR = os.path.join(DATASET_DIR, 'OpenSubtitles2016', 'raw')
IDS_FILE = os.path.join(DATASET_DIR, 'OpenSubtitles2016.en-he.ids')

# os.walk below is given the bare language name, so the walked paths have the
# same `<lang>/...` form as the file names listed in the IDs file.
os.chdir(RAW_DIR)

# Files referenced by at least one alignment line, per language.
referenced = defaultdict(set)
with open(IDS_FILE, encoding='utf-8') as ids:
    for line in ids:
        # Strip only the line terminator; strip() would also remove a
        # leading/trailing tab and break the four-field unpack.
        en, he, _, _ = line.rstrip('\r\n').split('\t')
        referenced['en'].add(en)
        referenced['he'].add(he)

# Delete every subtitle file that takes part in no English-Hebrew alignment.
for lang in ('en', 'he'):
    for dirpath, dirnames, filenames in os.walk(lang):
        for name in filenames:
            path = os.path.join(dirpath, name)
            if path not in referenced[lang]:
                os.remove(path)

### Remove empty directories

In [None]:
%%bash

# Absolute path: the working directory of earlier cells does not carry over.
# -type d restricts the deletion to directories, as the section title states.
find /home/datasets/srl/opensubtitles/dataset/OpenSubtitles2016/raw -depth -type d -empty -delete

## Imports

In [1]:
import csv
import gzip
import json
import os
import pickle
import regex as re

from collections import Counter, defaultdict, namedtuple
from functools import partial
from glob import glob
from multiprocessing import cpu_count
from operator import attrgetter, itemgetter

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import spacy

from bs4 import BeautifulSoup
from cytoolz import compose
from joblib import Parallel, delayed
from tqdm import tqdm_notebook as tqdm

%matplotlib inline

## Paths

In [2]:
ROOT = r'/home/datasets/srl/opensubtitles/'

# Raw download and the root of everything this notebook produces.
DATASET = os.path.join(ROOT, 'dataset')
ARTEFACTS = os.path.join(ROOT, 'artefacts')

# File-level artefacts derived from the OPUS alignment-IDs file.
XML_PAIRS = os.path.join(ARTEFACTS, 'en2he.json')
ID_ALIGNMENTS = os.path.join(ARTEFACTS, 'alignments.pkl')

# Per-stage output directories.
ALIGNED_SENTS = os.path.join(ARTEFACTS, '01_aligned_sents')
ENGLISH_SENTS = os.path.join(ARTEFACTS, '02_english_sents')
TOKENIZED_SENTS = os.path.join(ARTEFACTS, '02_tokenized_sents')
YAP_INPUTS = os.path.join(ARTEFACTS, '03_hebrew_processing', 'inputs')
YAP_MA = os.path.join(ARTEFACTS, '03_hebrew_processing', 'ma')
YAP_MD = os.path.join(ARTEFACTS, '03_hebrew_processing', 'md')
YAP_OUTPUTS = os.path.join(ARTEFACTS, '03_hebrew_processing', 'outputs')
PREPROCESSED_SENTS = os.path.join(ARTEFACTS, '04_preprocessed_sents')
FASTALIGN_INPUTS = os.path.join(ARTEFACTS, '05_aligner_inputs', 'fastalign')
EFLOMAL_INPUTS = os.path.join(ARTEFACTS, '05_aligner_inputs', 'eflomal')
EFMARAL_INPUTS = os.path.join(ARTEFACTS, '05_aligner_inputs', 'efmaral')
ALIGNERS_OUTPUT = os.path.join(ARTEFACTS, '06_aligners_output')
ALIGNERS_ALIGNMENT = os.path.join(ARTEFACTS, '06_aligners_output_words')

# The stages below open files inside these directories but never create them,
# so a run on a fresh machine would fail on the first write. Creating them
# here is a no-op when they already exist.
for _directory in (ARTEFACTS, ALIGNED_SENTS, ENGLISH_SENTS, TOKENIZED_SENTS,
                   YAP_INPUTS, YAP_MA, YAP_MD, YAP_OUTPUTS,
                   PREPROCESSED_SENTS,
                   FASTALIGN_INPUTS, EFLOMAL_INPUTS, EFMARAL_INPUTS):
    os.makedirs(_directory, exist_ok=True)

## Preprocessing

We begin by using the ID alignments file `OpenSubtitles2016.en-he.ids`.

The file is formatted as `<English file>\t<Hebrew file>\t<English sentence IDs>\t<Hebrew sentence IDs>`. We use this data to create a mapping from English files to Hebrew files, and a mapping from English files to pairs of English-Hebrew IDs. In this dataset, the mapping is always one-to-many or many-to-one, so there are no cases where several sentences in English are aligned to several sentences in Hebrew.

In [3]:
def create_en_he_alignment_ids(ids_file):
    """Split the alignment-IDs file into the two artefacts later cells load.

    Each line of `ids_file` is expected to hold four tab-separated fields:
    English file, Hebrew file, space-separated English sentence IDs and
    space-separated Hebrew sentence IDs.

    Writes:
        XML_PAIRS: JSON object mapping each English file to a Hebrew file.
        ID_ALIGNMENTS: pickled mapping from each English file to a list of
            `(english_ids, hebrew_ids)` tuples of integer lists.

    Returns:
        None.

    Raises:
        ValueError: if a line does not have exactly four fields, or an ID is
            not an integer.
    """
    en2he = {}
    alignments = defaultdict(list)
    with open(ids_file, encoding='utf-8') as ids:
        for line in tqdm(ids):
            # Strip only the line terminator. A plain strip() also removes a
            # leading or trailing tab, so a line whose first or last field is
            # empty would lose a field and fail the four-way unpack.
            en, he, en_sents, he_sents = line.rstrip('\r\n').split('\t')
            en_split = [int(x) for x in en_sents.split()]
            he_split = [int(x) for x in he_sents.split()]
            alignments[en].append((en_split, he_split))
            # NOTE(review): a later line with the same English file replaces
            # its Hebrew partner while `alignments` keeps accumulating;
            # presumably each English file has one Hebrew partner - confirm.
            en2he[en] = he
    with open(XML_PAIRS, 'w', encoding='utf-8') as f:
        json.dump(en2he, f, ensure_ascii=False, indent=4)
    with open(ID_ALIGNMENTS, 'wb') as f:
        pickle.dump(alignments, f, protocol=pickle.HIGHEST_PROTOCOL)

# Build both artefacts (XML_PAIRS and ID_ALIGNMENTS) from the IDs file that
# sits directly inside the dataset directory.
create_en_he_alignment_ids(os.path.join(DATASET, 'OpenSubtitles2016.en-he.ids'))




Each subtitle file is an XML file where each `s` tag has one sentence, and has an `id` attribute.

Once we know which files are subtitles for the same movie, we replace the sentence IDs with the corresponding sentences. Only three characters are removed in this step: newlines and tabs, since there's no reason to have those in the subtitles, and the dash (`-`) character, which is very often used to signal that a different person is talking.

In [3]:
def read_id_alignments_from_file(en2he_file, alignments_file):
    """Load the file-pair mapping and the sentence-ID alignments from disk.

    Args:
        en2he_file: path to a UTF-8 JSON file.
        alignments_file: path to a pickle file. Only load pickles produced by
            this notebook; unpickling untrusted data can execute code.

    Returns:
        A `(en2he, alignments)` tuple holding the decoded JSON document and
        the unpickled object, in that order.
    """
    with open(en2he_file, encoding='utf-8') as json_handle:
        file_pairs = json.load(json_handle)
    with open(alignments_file, 'rb') as pickle_handle:
        id_alignments = pickle.load(pickle_handle)
    return file_pairs, id_alignments


def id2sent(xml_root):
    """Map sentence IDs to cleaned sentence text for one subtitle document.

    Args:
        xml_root: markup handed to BeautifulSoup with the "xml" parser.

    Returns:
        A dict keyed by the integer `id` attribute of every `s` tag. Each
        value is the tag's text with all newlines, tabs and `-` characters
        removed and surrounding whitespace stripped.
    """
    sentences = {}
    for tag in BeautifulSoup(xml_root, "xml").find_all('s'):
        cleaned = re.sub(r'[\n\t-]', '', tag.text)
        sentences[int(tag['id'])] = cleaned.strip()
    return sentences


def create_sentence_pairs(en_xml, he_xml, alignment):
    """Write the aligned English/Hebrew sentence texts of one file pair.

    Args:
        en_xml: English subtitle file, relative to the raw dataset directory.
        he_xml: Hebrew subtitle file, relative to the raw dataset directory.
        alignment: iterable of `(english_ids, hebrew_ids)` pairs.

    Both gzip-compressed files are parsed with `id2sent`. For every alignment
    entry the sentences on each side are joined with single spaces. The
    resulting list of `{'en': ..., 'he': ...}` dicts is written to
    ALIGNED_SENTS as JSON. The output name is the English path up to its first
    dot, with `/` replaced by `_`, plus `.json`.
    """
    raw_dir = os.path.join(DATASET, 'OpenSubtitles2016', 'raw')
    en_path = os.path.join(raw_dir, en_xml)
    he_path = os.path.join(raw_dir, he_xml)
    with gzip.open(en_path) as en_file, gzip.open(he_path) as he_file:
        en_by_id = id2sent(en_file)
        he_by_id = id2sent(he_file)

    pairs = [
        {
            'en': ' '.join(en_by_id[i] for i in en_ids).strip(),
            'he': ' '.join(he_by_id[j] for j in he_ids).strip()
        }
        for en_ids, he_ids in alignment
    ]

    stem = en_xml[:en_xml.index('.')]
    out_name = stem.replace('/', '_') + '.json'
    with open(os.path.join(ALIGNED_SENTS, out_name), 'w', encoding='utf-8') as out:
        json.dump(pairs, out, ensure_ascii=False, indent=4)

In [4]:
# Load the file pairs and ID alignments written earlier, then build one
# aligned-sentence JSON per English file in parallel. The worker count is
# fixed at 6 here, whereas the other stages use cpu_count().
en2he, alignments = read_id_alignments_from_file(XML_PAIRS, ID_ALIGNMENTS)
Parallel(n_jobs=6)(delayed(create_sentence_pairs)(en_xml, he_xml, alignments[en_xml]) for en_xml, he_xml in tqdm(en2he.items()))
del en2he, alignments




## Tokenization, POS Tagging, Dependency Parsing

In [None]:
def extract_english_sents(sub):
    """Dump the non-empty English sentences of one aligned file as plain text.

    Args:
        sub: file name inside ALIGNED_SENTS.

    Writes one sentence per line to ENGLISH_SENTS, in a file named after
    `sub` up to its first dot plus `.txt`.
    """
    with open(os.path.join(ALIGNED_SENTS, sub), encoding='utf-8') as source:
        pairs = json.load(source)
    target_path = os.path.join(ENGLISH_SENTS, sub[:sub.index('.')] + '.txt')
    with open(target_path, 'w', encoding='utf-8') as target:
        for text in (pair['en'] for pair in pairs):
            if not text:
                continue
            print(text, file=target)


# One job per aligned-sentence file; the trailing `;` hides the list of
# per-job return values.
Parallel(n_jobs=cpu_count())(delayed(extract_english_sents)(sub) for sub in tqdm(os.listdir(ALIGNED_SENTS)));

In [3]:
# Keyword arguments forwarded to every `pipeline.pipe(...)` call. The key must
# be the string 'n_threads'; a bare `n_threads` is an undefined name and
# raises NameError when this cell runs.
SPACY_KWARGS = {'n_threads': cpu_count()}
# Output keys, in the same order as the attributes read by `get_attrs_of`.
TOKEN_FIELDS = ('id', 'text', 'pos', 'iob', 'head', 'deprel')

# Collapse every run of whitespace into a single space and trim both ends.
normalize_spaces = compose(' '.join, str.split)
he_pipeline = spacy.load('he')
en_pipeline = spacy.load('en_core_web_md')

# Reads, per token: index, text, POS tag, entity IOB tag, index of the head
# token, and dependency label.
get_attrs_of = attrgetter('i', 'text', 'pos_', 'ent_iob_', 'head.i', 'dep_')

def doc_to_dict(doc):
    """Turn an iterable of tokens into a list of plain dicts.

    Each dict pairs the names in TOKEN_FIELDS with the values that
    `get_attrs_of` reads from the token, in order.
    """
    records = []
    for token in doc:
        values = get_attrs_of(token)
        records.append(dict(zip(TOKEN_FIELDS, values)))
    return records

def extract_subtitle_to_json(sub):
    """Run both spaCy pipelines over one aligned file and store the tokens.

    Args:
        sub: file name inside ALIGNED_SENTS. The same name is used for the
            output file inside TOKENIZED_SENTS.

    Sentence pairs where either side is empty are dropped, so the 'en' and
    'he' lists in the output stay parallel. Whitespace is normalised before
    the text reaches the pipelines. The output JSON has the keys 'file', 'en'
    and 'he'; the last two are lists of sentences, each a list of token dicts
    as produced by `doc_to_dict`.

    If no pair survives the filter, nothing is written and later stages,
    which list TOKENIZED_SENTS, simply never see this file.
    """
    with open(os.path.join(ALIGNED_SENTS, sub), encoding='utf-8') as f:
        sents = json.load(f)
    pairs = [(normalize_spaces(sent['en']), normalize_spaces(sent['he']))
             for sent in sents
             if len(sent['en']) > 0 and len(sent['he']) > 0]
    if not pairs:
        # zip(*[]) yields nothing, so the two-way unpack below would raise
        # ValueError and abort the whole parallel run.
        return
    ens, hes = zip(*pairs)
    en_tokens = [doc_to_dict(doc) for doc in en_pipeline.pipe(ens, **SPACY_KWARGS)]
    he_tokens = [doc_to_dict(doc) for doc in he_pipeline.pipe(hes, **SPACY_KWARGS)]
    with open(os.path.join(TOKENIZED_SENTS, sub), 'w', encoding='utf-8') as t:
        json.dump({
            'file': sub,
            'en': en_tokens,
            'he': he_tokens
        }, t, ensure_ascii=False, indent=4)

# Tokenize every aligned-sentence file in parallel, then drop the notebook's
# references to both pipelines.
Parallel(n_jobs=cpu_count())(delayed(extract_subtitle_to_json)(sub) for sub in tqdm(os.listdir(ALIGNED_SENTS)));
del en_pipeline, he_pipeline



    Only loading the 'he' tokenizer.






## Hebrew Processing

We use [YAP](https://github.com/habeanf/yap) for Hebrew morphological analysis, disambiguation, and dependency parsing. First, we create input files in the format expected by YAP.

In [4]:
def create_input_for_yap(sub):
    """Write the Hebrew tokens of one tokenized file in YAP's input layout.

    Args:
        sub: file name inside TOKENIZED_SENTS; it must contain '.json'.

    The output goes to YAP_INPUTS under the name of `sub` up to '.json'. It
    has one token per line and a blank line after every sentence.
    """
    with open(os.path.join(TOKENIZED_SENTS, sub), encoding='utf-8') as source:
        hebrew_sentences = json.load(source)['he']
    target_path = os.path.join(YAP_INPUTS, sub[:sub.index('.json')])
    with open(target_path, 'w', encoding='utf-8') as target:
        for sentence in hebrew_sentences:
            words = (str(token['text']) for token in sentence)
            target.write('\n'.join(words) + '\n\n')

# One YAP input file per tokenized subtitle file.
Parallel(n_jobs=cpu_count())(delayed(create_input_for_yap)(sub) for sub in tqdm(os.listdir(TOKENIZED_SENTS)));




Then we run YAP, saving all intermediate results, just in case.

**Note**: This took one month on a 24-core computer.

In [None]:
%%file /home/datasets/srl/opensubtitles/artefacts/step2/run_yap.py
#!/usr/bin/env python

import os
import subprocess

from joblib import delayed, Parallel
from tqdm import tqdm

# Must be the directory the notebook uses for YAP_INPUTS / YAP_MA / YAP_MD /
# YAP_OUTPUTS: the notebook writes the inputs there and later reads the
# outputs from there.
ROOT = r'/home/datasets/srl/opensubtitles/artefacts/03_hebrew_processing'
INPUTS = os.path.join(ROOT, 'inputs')
MA_DIR = os.path.join(ROOT, 'ma')
MD_DIR = os.path.join(ROOT, 'md')
OUTPUTS = os.path.join(ROOT, 'outputs')

# Each stage writes into its own directory; make sure they all exist.
for _stage_dir in (MA_DIR, MD_DIR, OUTPUTS):
    os.makedirs(_stage_dir, exist_ok=True)

# YAP is invoked through relative paths (./yap, data/..., conf/...), so the
# working directory has to be its checkout under $GOPATH.
_gopath = os.getenv('GOPATH')
if not _gopath:
    # Without this check os.path.join(None, ...) fails with an opaque TypeError.
    raise RuntimeError('GOPATH is not set; cannot locate YAP under $GOPATH/src/yap')
os.chdir(os.path.join(_gopath, 'src', 'yap'))


def ma(filename):
    """Run `./yap hebma` on one file from INPUTS, writing to MA_DIR.

    The output file has the same name as the input. The child's stdout and
    stderr are discarded and its exit status is not inspected.
    """
    command = [
        './yap', 'hebma',
        '-prefix', 'data/bgulex/bgupreflex_withdef.utf8.hr',
        '-lexicon', 'data/bgulex/bgulex.utf8.hr',
        '-raw', os.path.join(INPUTS, filename),
        '-out', os.path.join(MA_DIR, filename),
    ]
    # Blocks until the child process exits.
    subprocess.call(command, stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT)


def md(filename):
    """Run `./yap md` on one file from MA_DIR, writing to MD_DIR.

    The output file has the same name as the input. The child's stdout and
    stderr are discarded and its exit status is not inspected.
    """
    command = [
        './yap', 'md',
        '-m', 'data/hebmd',
        '-f', 'conf/standalone.md.yaml',
        '-b', '32',
        '-in', os.path.join(MA_DIR, filename),
        '-om', os.path.join(MD_DIR, filename),
    ]
    # Blocks until the child process exits.
    subprocess.call(command, stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT)

def dp(filename):
    """Run `./yap dep` on one file from MD_DIR, writing to OUTPUTS.

    The output file has the same name as the input. The child's stdout and
    stderr are discarded and its exit status is not inspected.
    """
    command = [
        './yap', 'dep',
        '-m', 'data/dep',
        '-f', 'conf/zhangnivre2011.yaml',
        '-l', 'conf/hebtb.labels.conf',
        '-inl', os.path.join(MD_DIR, filename),
        '-oc', os.path.join(OUTPUTS, filename),
    ]
    # Blocks until the child process exits.
    subprocess.call(command, stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT)


# The three stages run one after another; each lists the directory the
# previous stage wrote to, so intermediate results stay on disk.
Parallel(n_jobs=8)(delayed(ma)(filename) for filename in tqdm(os.listdir(INPUTS)))
Parallel(n_jobs=8)(delayed(md)(filename) for filename in tqdm(os.listdir(MA_DIR)))
Parallel(n_jobs=8)(delayed(dp)(filename) for filename in tqdm(os.listdir(MD_DIR)))

Upon completion, we add the newly segmented Hebrew subtitles to our JSON files.

In [11]:
def filename2segmented_hebrew(of):
    """Read one YAP output file and regroup its rows into sentences.

    Args:
        of: file name inside YAP_OUTPUTS, a headerless tab-separated table.

    Returns:
        A list of sentences, each a list of token dicts with the keys 'id',
        'text', 'pos', 'iob', 'head' and 'deprel'. 'id' and 'head' are the
        file's values minus one, and 'iob' is always an empty string. A row
        whose id is 1 starts a new sentence; rows that appear before the
        first such row are dropped.
    """
    raw = pd.read_csv(os.path.join(YAP_OUTPUTS, of),
                      sep='\t', header=None,
                      na_filter=False, quoting=csv.QUOTE_NONE)
    table = raw[[0, 1, 4, 5, 6, 7]]
    table.columns = ('id', 'form', 'postag', 'feats', 'head', 'deprel')

    sentences = []
    current = None  # stays None until the first row with id == 1 is seen
    for row in table.itertuples():
        if row.id == 1:
            current = []
            sentences.append(current)
        token = {
            'id': int(row.id - 1),
            'text': row.form,
            'pos': row.postag,
            'iob': '',
            'head': int(row.head - 1),
            'deprel': row.deprel
        }
        if current is not None:
            current.append(token)
    return sentences


def add_segmented_hebrew_to_json(sub, heb_sents):
    """Copy one tokenized file to PREPROCESSED_SENTS with a 'seghe' entry.

    Args:
        sub: file name inside TOKENIZED_SENTS, reused for the output file.
        heb_sents: value stored under the 'seghe' key of the output JSON.
    """
    source_path = os.path.join(TOKENIZED_SENTS, sub)
    target_path = os.path.join(PREPROCESSED_SENTS, sub)
    with open(source_path, encoding='utf-8') as source:
        document = json.load(source)
    document['seghe'] = heb_sents
    with open(target_path, 'w', encoding='utf-8') as target:
        json.dump(document, target, ensure_ascii=False, indent=4)


def process(f):
    """Attach the YAP-segmented Hebrew to one tokenized file.

    Args:
        f: file name inside TOKENIZED_SENTS. The matching YAP output is
            looked up under the same name cut at its first dot.
    """
    yap_name = f[:f.index('.')]
    segmented = filename2segmented_hebrew(yap_name)
    add_segmented_hebrew_to_json(f, segmented)


# Merge the YAP output into every tokenized file, one job per file.
Parallel(n_jobs=cpu_count())(delayed(process)(sub) for sub in tqdm(os.listdir(TOKENIZED_SENTS)));

## Alignment

Now that we have both English and Hebrew tokens aligned by sentence, we want to align them by token.
We used three aligners, and inspected their outputs manually. `fast_align` gave the best results.

In [13]:
def create_input_for_aligners(sub):
    """Write the inputs of all three word aligners for one preprocessed file.

    Args:
        sub: file name inside PREPROCESSED_SENTS. Its name without the
            extension is used as the base name of every output file.

    Outputs:
        FASTALIGN_INPUTS/<name>.fa and EFMARAL_INPUTS/<name>.fa: one
            `english ||| hebrew` line per sentence pair.
        EFLOMAL_INPUTS/<name>.en and <name>.he: one
            `<s snum=i>...</s>` line per sentence.

    Tokens are joined with single spaces. The Hebrew side is the segmented
    'seghe' entry.
    """
    with open(os.path.join(PREPROCESSED_SENTS, sub), encoding='utf-8') as fjson:
        d = json.load(fjson)
    # Join each sentence into a string once. Keeping lazy `map` objects here
    # was a bug: the first loop consumed them, so the eflomal files written
    # afterwards contained only empty sentences.
    en = [' '.join(tok['text'] for tok in sent) for sent in d['en']]
    seghe = [' '.join(tok['text'] for tok in sent) for sent in d['seghe']]
    fn = os.path.splitext(sub)[0]
    with open(os.path.join(FASTALIGN_INPUTS, fn) + '.fa', 'w', encoding='utf-8') as ffa, open(os.path.join(EFMARAL_INPUTS, fn) + '.fa', 'w', encoding='utf-8') as fef:
        # zip stops at the shorter side if the two lists differ in length.
        for e, h in zip(en, seghe):
            s = '{} ||| {}'.format(e, h)
            print(s, file=ffa)
            print(s, file=fef)
    with open(os.path.join(EFLOMAL_INPUTS, fn) + '.en', 'w', encoding='utf-8') as fen, open(os.path.join(EFLOMAL_INPUTS, fn) + '.he', 'w', encoding='utf-8') as fhe:
        for i, e in enumerate(en):
            print('<s snum={i}>{sent}</s>'.format(i=i, sent=e), file=fen)
        for i, h in enumerate(seghe):
            print('<s snum={i}>{sent}</s>'.format(i=i, sent=h), file=fhe)

# Produce the aligner inputs for every preprocessed file, one job per file.
Parallel(n_jobs=cpu_count())(delayed(create_input_for_aligners)(sub) for sub in tqdm(os.listdir(PREPROCESSED_SENTS)));

In [24]:
%%file /home/datasets/srl/opensubtitles/artefacts/step4/run_aligners.py
#!/usr/bin/env python

import os
import subprocess

from glob import glob
from multiprocessing import cpu_count

from joblib import Parallel, delayed

# Directory holding the three aligner tools.
ROOT = r'/home/datasets/srl/tools/aligners/'
# Programs launched by the functions below, one per aligner.
FASTALIGN = os.path.join(ROOT, 'fast_align', 'build', 'fast_align')
EFLOMAL = os.path.join(ROOT, 'eflomal', 'align.py')
EFMARAL = os.path.join(ROOT, 'efmaral', 'align.py')


def fastalign(fn):
    """Run fast_align twice on one input file.

    The first run writes its stdout to `fn + '.fwd'`. The second run adds
    `-r` and writes to `fn + '.rev'`. Exit statuses are not inspected.
    """
    base_command = [FASTALIGN, '-i', fn, '-d', '-o', '-v']
    for suffix, extra_flags in (('.fwd', []), ('.rev', ['-r'])):
        with open(fn + suffix, 'w', encoding='utf-8') as out:
            # Blocks until the aligner exits.
            subprocess.call(base_command + extra_flags, stdout=out)


def efmaral(fn):
    """Run efmaral twice on one input file.

    The first run writes its stdout to `fn + '.fwd'`. The second run adds
    `-r` and writes to `fn + '.rev'`. Exit statuses are not inspected.
    """
    base_command = [EFMARAL, '-i', fn]
    for suffix, extra_flags in (('.fwd', []), ('.rev', ['-r'])):
        with open(fn + suffix, 'w', encoding='utf-8') as out:
            # Blocks until the aligner exits.
            subprocess.call(base_command + extra_flags, stdout=out)


def eflomal(en, he):
    """Run eflomal once on an English/Hebrew file pair.

    Args:
        en: passed as `-s`. Its last three characters are cut off to form
            the base name of the outputs.
        he: passed as `-t`.

    The `-f` and `-r` options receive `<base>.fwd` and `<base>.rev`. The exit
    status is not inspected.
    """
    stem = en[:-3]
    command = [
        EFLOMAL,
        '-s', en,
        '-t', he,
        '-f', stem + '.fwd',
        '-r', stem + '.rev',
    ]
    # Blocks until the aligner exits.
    subprocess.call(command)


if __name__ == '__main__':
    # Directory the notebook writes the aligner inputs to (FASTALIGN_INPUTS,
    # EFLOMAL_INPUTS and EFMARAL_INPUTS in its Paths cell).
    inputs_root = r'/home/datasets/srl/opensubtitles/artefacts/05_aligner_inputs'
    eflomal_dir = os.path.join(inputs_root, 'eflomal')

    # Only '*.fa' files are inputs. The aligners write '.fwd'/'.rev' files
    # next to them, and a plain os.listdir('.') would feed those back in as
    # inputs on a second run.
    os.chdir(os.path.join(inputs_root, 'fastalign'))
    print('Running fast_align...', end='', flush=True)
    Parallel(n_jobs=cpu_count())(delayed(fastalign)(fn) for fn in sorted(glob('*.fa')))
    print('Done!')

    os.chdir(r'/home/datasets/srl/tools/aligners/eflomal/')
    # Pair each '.en' file with the '.he' file of the same base name. Zipping
    # two independently sorted lists would shift every later pair if one file
    # were missing.
    pairs = [(en, en[:-3] + '.he')
             for en in sorted(glob(os.path.join(eflomal_dir, '*.en')))
             if os.path.exists(en[:-3] + '.he')]
    print('Running eflomal...', end='', flush=True)
    Parallel(n_jobs=cpu_count())(delayed(eflomal)(en, he) for en, he in pairs)
    print('Done!')

    os.chdir(os.path.join(inputs_root, 'efmaral'))
    print('Running efmaral...', end='', flush=True)
    Parallel(n_jobs=cpu_count())(delayed(efmaral)(fn) for fn in sorted(glob('*.fa')))
    print('Done!')


Overwriting /home/datasets/srl/opensubtitles/artefacts/step4/run_aligners.py


## SRL with Semafor

We now have at our disposal English tokens, Hebrew tokens (segmented), and alignments between them. We used the parsed English tokens as input to Semafor and obtained SRL for all sentences.

## Final Dataset

After all the work done in this notebook (+SRL as per Semafor's instructions), we have four folders which comprise the entire processed dataset:

* `english_parsed` - Parsed English sentences
* `hebrew_parsed` - Parsed and segmented Hebrew sentences
* `english_srl` - SRL for each English sentence
* `fastalign_outputs` - The English-Hebrew token alignment