# Extracting aligned samples from the TED2020 corpus

(The corpus data is from https://opus.nlpl.eu/TED2020.php which must be cited when using the data.)

The corpus contains sentence-aligned data from many languages. The alignment has been done automatically; each sentence is on one line. Using a custom script (based on a script by Tatjana Scheffler), we extracted all instances of 'wohl' from the German (de) part of the Czech-German aligned TED2020 corpus. We included a line above and below the matching line for context. In some cases, two lines containing a match follow each other; they are then listed as one hit (the way 'grep' does). 

To avoid unwanted matches (such as 'obwohl'), we used the first code cell below to clean the 'grep' results.

Finally, we extracted all aligned lines from the Czech corpus that match the German extracted lines. Thus, 'wohl_match.txt' lists the German instances, and 'wohl-cs.txt' lists the aligned Czech sentences line-by-line. 


------------
Howto:

get original hits by:
grep -n -B1 -A1 'wohl' TED2020.cs-de.de > wohl.txt

Then clean the hits with the first code cell below and get the matching lines in the other language with the cells after it.

In [None]:
import codecs  # not used in this cell any more; kept in case other cells rely on it
import re

#tok = "snad" / "asi"
tok = "wohl"

#instead of full tokenization of the files, we get all instances of the token + punctuation
word_list = [f"{tok}", f"{tok}.", f"{tok},", f"{tok}:", f"{tok}?", f"{tok}!"]


def split_hits(text):
    """Split raw ``grep -n -B1 -A1`` output into one string per hit.

    Hits are separated by a line that is exactly ``--``.  Only a ``--`` at the
    start of a line is treated as a separator, so a corpus sentence that
    merely *ends* in ``--`` no longer cuts a hit in two.

    Args:
        text: Full content of the grep output file.

    Returns:
        List of hit strings (each keeps its own line breaks).  The last
        element may be an empty string if the text ends with a separator.
    """
    return re.split(r"^--\n", text, flags=re.MULTILINE)


def filter_hits(hits, words):
    """Keep the hits that contain one of ``words`` as a whitespace-delimited token.

    Args:
        hits: Iterable of hit strings as returned by ``split_hits``.
        words: Accepted token forms (the bare token and token + punctuation).

    Returns:
        Tuple ``(kept, count)``: ``kept`` lists every matching hit exactly
        once, in input order; ``count`` is the total number of matching
        tokens over all hits (a hit with two matching tokens counts twice).
    """
    accepted = frozenset(words)
    kept = []
    count = 0
    for hit in hits:
        occurrences = sum(1 for token in hit.split() if token in accepted)
        if occurrences:
            count += occurrences
            # Write the hit once, however many matching tokens it contains.
            kept.append(hit)
    return kept, count


# Always true in a notebook cell or when run as a script; the guard only keeps
# the file I/O from running when the helpers above are imported.
if __name__ == "__main__":
    with open(f"{tok}.txt", mode="r", encoding="UTF-8") as infile:
        all_tok = split_hits(infile.read())

    kept_hits, c_tok = filter_hits(all_tok, word_list)

    # newline="\n" disables newline translation, so the output bytes are the
    # same on every platform.
    with open(f"{tok}_match.txt", mode="w", encoding="UTF-8", newline="\n") as outfile:
        for hit in kept_hits:
            # The final hit may lack a trailing newline; keep "--" on its own line.
            outfile.write(hit if hit.endswith("\n") else hit + "\n")
            outfile.write("--\n")

    print(c_tok)

#token counts (c_tok) recorded from earlier runs:
#c_tok wohl = 433
#c_tok asi = 1488
#c_tok snad = 262

In [36]:
#get german matches for czech hits
# sys, codecs and itertools are not used in this cell any more; the imports are
# kept in case other cells rely on them.
import sys
import codecs
import itertools

import re
from pathlib import Path

# "<line number><separator>" prefix that `grep -n -B1 -A1` puts before each line:
# ':' marks a matching line, '-' a context line.  Only ONE separator character is
# consumed so that a sentence starting with '-' or ':' is not mistaken for part
# of the prefix.
HIT_PREFIX = re.compile(r"(\d+)([:-]?)")


def align_hits(hit_lines, corpus_lines):
    """Yield, for every numbered hit line, the same-numbered line of the other corpus.

    Args:
        hit_lines: Iterable of lines from a ``*_match.txt`` file.
        corpus_lines: List of all lines of the aligned corpus file; index 0
            is corpus line 1.

    Yields:
        ``--`` separator lines unchanged, and for each numbered line the
        string ``<line number><separator><aligned corpus line>``, always
        ending in a newline.  Lines without a leading line number (e.g.
        blank lines) are skipped.

    Raises:
        IndexError: If a line number lies outside ``corpus_lines``.
    """
    for line in hit_lines:
        if line.rstrip("\r\n") == "--":
            yield line
            continue
        prefix = HIT_PREFIX.match(line)
        if prefix is None:
            # No line number to look up (blank or malformed line).
            continue
        linenum, sep = prefix.groups()
        index = int(linenum) - 1
        if not 0 <= index < len(corpus_lines):
            # Line number 0 would otherwise silently wrap to the last line.
            raise IndexError(f"line number {linenum} is outside the corpus")
        aligned = corpus_lines[index]
        if not aligned.endswith("\n"):
            # The last corpus line may lack a trailing newline.
            aligned += "\n"
        yield linenum + sep + aligned


# Always true in a notebook cell or when run as a script; the guard only keeps
# the file I/O from running when the helper above is imported.
if __name__ == "__main__":
    # Built-in open() with newline="\n" breaks lines on "\n" only, like
    # `grep -n` does.  codecs.open() also breaks on other Unicode line
    # boundaries (e.g. U+2028), which would shift every later line number.
    with open(Path("TED2020") / "TED2020.cs-de.de", encoding="utf-8", newline="\n") as de_file:
        de = de_file.readlines()

    with open("asi_match.txt", encoding="utf-8", newline="\n") as cs_hits, \
            open("asi-de.txt", "w", encoding="utf-8", newline="\n") as outfile:
        outfile.writelines(align_hits(cs_hits, de))
    


In [34]:
#get czech matches for german hits
# sys, codecs and itertools are not used in this cell any more; the imports are
# kept in case other cells rely on them.
import sys
import codecs
import itertools

import re
from pathlib import Path

# "<line number><separator>" prefix that `grep -n -B1 -A1` puts before each line:
# ':' marks a matching line, '-' a context line.  Only ONE separator character is
# consumed so that a sentence starting with '-' or ':' is not mistaken for part
# of the prefix.
HIT_PREFIX = re.compile(r"(\d+)([:-]?)")


def align_hits(hit_lines, corpus_lines):
    """Yield, for every numbered hit line, the same-numbered line of the other corpus.

    Args:
        hit_lines: Iterable of lines from a ``*_match.txt`` file.
        corpus_lines: List of all lines of the aligned corpus file; index 0
            is corpus line 1.

    Yields:
        ``--`` separator lines unchanged, and for each numbered line the
        string ``<line number><separator><aligned corpus line>``, always
        ending in a newline.  Lines without a leading line number (e.g.
        blank lines) are skipped.

    Raises:
        IndexError: If a line number lies outside ``corpus_lines``.
    """
    for line in hit_lines:
        if line.rstrip("\r\n") == "--":
            yield line
            continue
        prefix = HIT_PREFIX.match(line)
        if prefix is None:
            # No line number to look up (blank or malformed line).
            continue
        linenum, sep = prefix.groups()
        index = int(linenum) - 1
        if not 0 <= index < len(corpus_lines):
            # Line number 0 would otherwise silently wrap to the last line.
            raise IndexError(f"line number {linenum} is outside the corpus")
        aligned = corpus_lines[index]
        if not aligned.endswith("\n"):
            # The last corpus line may lack a trailing newline.
            aligned += "\n"
        yield linenum + sep + aligned


# Always true in a notebook cell or when run as a script; the guard only keeps
# the file I/O from running when the helper above is imported.
if __name__ == "__main__":
    # Built-in open() with newline="\n" breaks lines on "\n" only, like
    # `grep -n` does.  codecs.open() also breaks on other Unicode line
    # boundaries (e.g. U+2028), which would shift every later line number.
    with open(Path("TED2020") / "TED2020.cs-de.cs", encoding="utf-8", newline="\n") as cs_file:
        cs = cs_file.readlines()

    with open("wohl_match.txt", encoding="utf-8", newline="\n") as ger_wohl, \
            open("wohl-cs.txt", "w", encoding="utf-8", newline="\n") as outfile:
        outfile.writelines(align_hits(ger_wohl, cs))
    
