In [3]:
import numpy as np
import json
import os
from bs4 import BeautifulSoup
import pandas as pd

This notebook provides to code to preprocess the DA datasets, in order to create train/valid splits and add context to each sentence.

In [4]:
input_dir = "/Users/geovern/Downloads/"
output_dir = "/Users/geovern/Documents/"

# Create original Train/Valid data

In [996]:
campaign = "wmt19"

In [997]:
da = pd.read_csv(filepath_or_buffer="{}/20{}-da.csv".format(input_dir, campaign.replace("wmt", "")), header=0)

In [204]:
if campaign == "wmt19":
    lps = ['en-de', 'de-en', 'en-cs', 'en-fi', 'en-gu', 'en-kk', 'en-lt', 'en-ru', 'en-zh', 'zh-en']
elif campaign == "wmt20":
    lps = list(da["lp"].unique())
else:
    print("Invalid campaign")

In [None]:
print(len(da))

In [None]:
da.dropna(inplace=True)
print(len(da))

In [None]:
if campaign == "wmt19":
    selected19 = da[da['lp'].isin(['en-de', 'de-en', 'en-cs', 'en-fi', 'en-gu', 'en-kk', 'en-lt', 'en-ru', 'en-zh', 'zh-en'])].reset_index(drop=True)
else:
    selected20 = da[~da['lp'].isin(['iu-en'])].reset_index(drop=True)
    selected20["score"] = selected20["z_score"]

In [1002]:
selected = pd.concat([selected19, selected20]).reset_index(drop=True)

In [1003]:
sampled = pd.DataFrame()
for lp in selected["lp"].unique():
    selected_lp = selected[selected["lp"] == lp]
    sampled = pd.concat([sampled, selected_lp.sample(n=int(len(selected_lp)*0.05), random_state=1)])

In [1004]:
train = pd.concat([sampled,selected]).drop_duplicates(keep=False)

In [1005]:
len(selected) - len(sampled) == len(train)

True

In [1007]:
train.to_csv("{}/1920-da-train.csv".format(output_dir), columns = ["src", "mt", "ref", "score"], index=False)

In [1009]:
sampled.to_csv("{}/1920-da-valid.csv".format(output_dir), columns = ["src", "mt", "ref", "score"], index=False)

# Add only source context

In [None]:
campaign = "wmt20"

In [None]:
da = pd.read_csv(filepath_or_buffer="{}/20{}-da.csv".format(input_dir, campaign.replace("wmt", "")), header=0)

In [None]:
if campaign == "wmt19":
    lps = ['en-de', 'de-en', 'en-cs', 'en-fi', 'en-gu', 'en-kk', 'en-lt', 'en-ru', 'en-zh', 'zh-en']
elif campaign == "wmt20":
    lps = list(da["lp"].unique())
else:
    print("Invalid campaign")

In [None]:
for lp in lps:
    filepath = '{}/{}/newstest20{}-{}-src.{}.sgm'.format(input_dir,  "sgm 2" if campaign == "wmt19" else "sgm", campaign.replace("wmt", ""), lp.replace("-", ""), lp.split("-")[0])
    with open(filepath, "r") as f:
        lines = f.readlines()
    print("lp : {}".format(lp))
    
    remove = ["<p>", "<srcset setid=", '</doc>\n', "</p>", "</srcset>"]
    docs = []
    ref = []
    context = {}
    doc_len = 0
    for line in lines:
        if not any(x in line for x in remove):
            if "<doc" in line:
                docs.append(doc_len)
                doc_len = 0
            else:
                doc_len += 1
                new_line = line.replace('</seg>', '').replace('<seg id="{}">'.format(doc_len), '').rstrip()
                if new_line in context:
                    print("{} already in context".format(new_line))
                if "<seg id=\"1\">" in line: 
                    context[new_line] = new_line
                elif "<seg id=\"2\">" in line:
                    context[new_line] = " </s> ".join([ref[-1], new_line])
                else:
                    context[new_line] = " </s> ".join([ref[-2], ref[-1], new_line])
                ref.append(new_line)
    docs.append(doc_len)
    docs = docs[1:]
    print("dataset size : {}".format(sum(docs)))
    for index in da[da["lp"]==lp].index:
        src = da.loc[index, 'src'] 
        da.loc[index, 'src'] = context[src]

lp : ps-en
خو بهتره دا ده چې د نويو کلمو په استعمال کې ځينې خبرې په پام کې وي چې د تيروتنې چانس کم شي: already in context
پورې يې پکې د شاه رخ خان ژوند ، فن او په فلمونو خبرې کړي دي. already in context
که نه د ابراهيم خداى څه بل نوم درلود ؟ په دې اړه د تورات څرگندونې سره ډېر توپير لري. already in context
يوه ويل د ژمي ميلمه زور غواړي ، ها بل ويل په دوبي کې خورا اسانه دى چې ته يې د ژمي خبري کوې. already in context
نامتو مورخ آرنلډ ټاين بي ، چې بشري تاريخ يې لوستى او لس ټوکه کتاب يې پرې ليکلى ، وايي ؛ چې مذهب د بشري ټولنو د جوړښت زړى دى. already in context
ان په دې دوره کې هم د ابن ميمون په ټنډه کې سوله او نېکمرغي نه وه ليکل شوې. already in context
ځينې شنونکي وايي د دغو مشترکو متلونو د خپرېدو يو علت مرکزي اسيا ته د هند او اروپايي ملتونو کډه کېدل دي ، چې بېلابېل فرهنگونه يې سره نژدې کړل. already in context
دى لکه د هندوانو مومن (کوچنى) د دوږخ او جنت تر منځ پروت دى. already in context
ځکه ابن ميمون هغه څوک و ، چې ټولو ليد ، ډېر کلونه بې له دې چې مزد واخلي د هغوى د ناروغانو درملنه يې کړې و

In [None]:
da 

Unnamed: 0,lp,src,mt,ref,z_score,score,annotators
0,ps-en,دجوماتونوپه لوډسپيکرونوکې مخکې له مخکې شعارونه...,Let's search for the joint points between Hind...,Let's analyse the differences between Hinduism...,-2.125683,25.0,1
1,ps-en,دا لسټ له ځان سره وساته کله هم چې له ستونزمن و...,"By coming home, the waiting room is full of Je...","Upon arrival at home, the waiting room would b...",-1.136993,50.0,1
2,ps-en,کار وخت کې داسې قدم ووهه چې قدم وهل ستا د ځان ...,"Adjustment of all the economy, and to eradicat...",The assessment of global economy and eradicati...,-0.148303,75.0,1
3,ps-en,د پايلې په توګه په الاسکا کې د روسي ارتوډوکس ک...,"This structure is made of group wounds, with t...",This structure is organized into a hierarchy o...,-0.108756,76.0,1
4,ps-en,خو ډېر خلک چې کله له یو منفي وضعېت سره مخ شي د...,"On this night, they published a radio and Bara...",On the same night a Soviet radio station aired...,-1.117219,50.5,2
...,...,...,...,...,...,...,...
168869,zh-en,9月27日越盾对美元汇率中间价上调5越盾,"On September 27, 27th, Vietnam Shield vs. US D...","On September 27, the middle rate of the VND ag...",-0.411959,73.0,1
168870,zh-en,9月27日越盾对美元汇率中间价上调5越盾 </s> 在汇率波动幅度为 + / - 3%的情况...,With Exchange rate fluctuations at + /-3 today...,In the case of exchange rate fluctuations of +...,0.759167,89.0,1
168871,zh-en,9月27日越盾对美元汇率中间价上调5越盾 </s> 在汇率波动幅度为 + / - 3%的情况...,The Bank's buying price and selling bid for rm...,The bank's RMB buying and selling prices were ...,-2.022257,51.0,1
168872,zh-en,在汇率波动幅度为 + / - 3%的情况下，今天各家银行美元兑越盾汇率上限为23854越盾，...,Techcombank of Vietnam (Techcombank) has set t...,Techcombank has set the US dollar buying and s...,-2.754210,41.0,1


In [34]:
if campaign == "wmt20":
    selected19 = da[da['lp'].isin(['en-de', 'de-en', 'en-cs', 'en-fi', 'en-gu', 'en-kk', 'en-lt', 'en-ru', 'en-zh', 'zh-en'])].reset_index(drop=True)
else:
    selected20 = da[~da['lp'].isin(['iu-en'])].reset_index(drop=True)
    selected20["score"] = selected20["z_score"]

KeyError: 'z_score'

In [None]:
selected = pd.concat([selected19, selected20]).reset_index(drop=True)

In [None]:
sampled = pd.DataFrame()
for lp in selected["lp"].unique():
    selected_lp = selected[selected["lp"] == lp]
    sampled = pd.concat([sampled, selected_lp.sample(n=int(len(selected_lp)*0.05), random_state=1)])

In [None]:
train = pd.concat([sampled,selected]).drop_duplicates(keep=False)

In [None]:
len(selected) - len(sampled) == len(train)

True

In [None]:
train.to_csv("{}/1920-da-srcctxtrain.csv".format(output_dir), columns = ["src", "mt", "ref", "score"], index=False)

In [None]:
sampled.to_csv("{}/1920-da-srcctxvalid.csv".format(output_dir), columns = ["src", "mt", "ref", "score"], index=False)

In [None]:
len(train)

336647

In [None]:
len(valid)

17708

# Add only source context as a separate column

In [41]:
campaign = "wmt20"

In [42]:
da = pd.read_csv(filepath_or_buffer="{}/20{}-da.csv".format(input_dir, campaign.replace("wmt", "")), header=0)

In [43]:
if campaign == "wmt19":
    lps = ['en-de', 'de-en', 'en-cs', 'en-fi', 'en-gu', 'en-kk', 'en-lt', 'en-ru', 'en-zh', 'zh-en']
elif campaign == "wmt20":
    lps = list(da["lp"].unique())
else:
    print("Invalid campaign")

In [44]:
for lp in lps:
    filepath = '{}/{}/newstest20{}-{}-src.{}.sgm'.format(input_dir,  "sgm 2" if campaign == "wmt19" else "sgm", campaign.replace("wmt", ""), lp.replace("-", ""), lp.split("-")[0])
    with open(filepath, "r") as f:
        lines = f.readlines()
    print("lp : {}".format(lp))
    
    remove = ["<p>", "<srcset setid=", '</doc>\n', "</p>", "</srcset>"]
    docs = []
    ref = []
    context = {}
    doc_len = 0
    for line in lines:
        if not any(x in line for x in remove):
            if "<doc" in line:
                docs.append(doc_len)
                doc_len = 0
            else:
                doc_len += 1
                new_line = line.replace('</seg>', '').replace('<seg id="{}">'.format(doc_len), '').rstrip()
                if new_line in context:
                    print("{} already in context".format(new_line))
                if "<seg id=\"1\">" in line: 
                    context[new_line] = " </s> "
                elif "<seg id=\"2\">" in line:
                    context[new_line] = ref[-1]
                else:
                    context[new_line] = " </s> ".join([ref[-2], ref[-1]])
                ref.append(new_line)
    docs.append(doc_len)
    docs = docs[1:]
    print("dataset size : {}".format(sum(docs)))
    for index in da[da["lp"]==lp].index:
        src = da.loc[index, 'src'] 
        da.loc[index, 'ctx'] = context[src]

lp : ps-en
خو بهتره دا ده چې د نويو کلمو په استعمال کې ځينې خبرې په پام کې وي چې د تيروتنې چانس کم شي: already in context
پورې يې پکې د شاه رخ خان ژوند ، فن او په فلمونو خبرې کړي دي. already in context
که نه د ابراهيم خداى څه بل نوم درلود ؟ په دې اړه د تورات څرگندونې سره ډېر توپير لري. already in context
يوه ويل د ژمي ميلمه زور غواړي ، ها بل ويل په دوبي کې خورا اسانه دى چې ته يې د ژمي خبري کوې. already in context
نامتو مورخ آرنلډ ټاين بي ، چې بشري تاريخ يې لوستى او لس ټوکه کتاب يې پرې ليکلى ، وايي ؛ چې مذهب د بشري ټولنو د جوړښت زړى دى. already in context
ان په دې دوره کې هم د ابن ميمون په ټنډه کې سوله او نېکمرغي نه وه ليکل شوې. already in context
ځينې شنونکي وايي د دغو مشترکو متلونو د خپرېدو يو علت مرکزي اسيا ته د هند او اروپايي ملتونو کډه کېدل دي ، چې بېلابېل فرهنگونه يې سره نژدې کړل. already in context
دى لکه د هندوانو مومن (کوچنى) د دوږخ او جنت تر منځ پروت دى. already in context
ځکه ابن ميمون هغه څوک و ، چې ټولو ليد ، ډېر کلونه بې له دې چې مزد واخلي د هغوى د ناروغانو درملنه يې کړې و

In [45]:
da 

Unnamed: 0,lp,src,mt,ref,z_score,score,annotators,ctx
0,ps-en,راځئ اوس د هندويزم او اسلام ترمنځ گډ ټکي وپلټو...,Let's search for the joint points between Hind...,Let's analyse the differences between Hinduism...,-2.125683,25.0,1,دجوماتونوپه لوډسپيکرونوکې مخکې له مخکې شعارونه...
1,ps-en,کور ته په راتلو سره ، د انتظار کوټه له يهودي ا...,"By coming home, the waiting room is full of Je...","Upon arrival at home, the waiting room would b...",-1.136993,50.0,1,دا لسټ له ځان سره وساته کله هم چې له ستونزمن و...
2,ps-en,د ټولييز اقتصاد تعديل ، او د اقتصادي ستونزو لم...,"Adjustment of all the economy, and to eradicat...",The assessment of global economy and eradicati...,-0.148303,75.0,1,کار وخت کې داسې قدم ووهه چې قدم وهل ستا د ځان ...
3,ps-en,دا جوړښت د ډلو ټپلو د مراتبو جوړ شوی دی، سره د...,"This structure is made of group wounds, with t...",This structure is organized into a hierarchy o...,-0.108756,76.0,1,د پايلې په توګه په الاسکا کې د روسي ارتوډوکس ک...
4,ps-en,په همدې شپه يې دشوروي له يوې راډيودببرک کارمل ...,"On this night, they published a radio and Bara...",On the same night a Soviet radio station aired...,-1.117219,50.5,2,خو ډېر خلک چې کله له یو منفي وضعېت سره مخ شي د...
...,...,...,...,...,...,...,...,...
168869,zh-en,9月27日越盾对美元汇率中间价上调5越盾,"On September 27, 27th, Vietnam Shield vs. US D...","On September 27, the middle rate of the VND ag...",-0.411959,73.0,1,</s>
168870,zh-en,在汇率波动幅度为 + / - 3%的情况下，今天各家银行美元兑越盾汇率上限为23854越盾，...,With Exchange rate fluctuations at + /-3 today...,In the case of exchange rate fluctuations of +...,0.759167,89.0,1,9月27日越盾对美元汇率中间价上调5越盾
168871,zh-en,该银行人民币买入价和卖出价分别为3211越盾和3295越盾，较前一交易日分别下降1越盾。,The Bank's buying price and selling bid for rm...,The bank's RMB buying and selling prices were ...,-2.022257,51.0,1,9月27日越盾对美元汇率中间价上调5越盾 </s> 在汇率波动幅度为 + / - 3%的情况...
168872,zh-en,越南技商股份商业银行（Techcombank）将美元买入价和卖出价分别定为23131越盾和2...,Techcombank of Vietnam (Techcombank) has set t...,Techcombank has set the US dollar buying and s...,-2.754210,41.0,1,在汇率波动幅度为 + / - 3%的情况下，今天各家银行美元兑越盾汇率上限为23854越盾，...


In [46]:
if campaign == "wmt19":
    selected19 = da[da['lp'].isin(['en-de', 'de-en', 'en-cs', 'en-fi', 'en-gu', 'en-kk', 'en-lt', 'en-ru', 'en-zh', 'zh-en'])].reset_index(drop=True)
else:
    selected20 = da[~da['lp'].isin(['iu-en'])].reset_index(drop=True)
    selected20["score"] = selected20["z_score"]

In [47]:
selected = pd.concat([selected19, selected20]).reset_index(drop=True)

In [48]:
sampled = pd.DataFrame()
for lp in selected["lp"].unique():
    selected_lp = selected[selected["lp"] == lp]
    sampled = pd.concat([sampled, selected_lp.sample(n=int(len(selected_lp)*0.05), random_state=1)])

In [49]:
train = pd.concat([sampled,selected]).drop_duplicates(keep=False)

In [50]:
len(selected) - len(sampled) == len(train)

True

In [54]:
train.to_csv("{}/1920-da-srcgatectxtrain.csv".format(output_dir), columns = ["src", "ctx", "mt", "ref", "score"], index=False)

In [55]:
sampled.to_csv("{}/1920-da-srcgatectxvalid.csv".format(output_dir), columns = ["src", "ctx", "mt", "ref", "score"], index=False)

In [53]:
len(train)

336647

In [None]:
len(valid)

17708

# Add referece and source context

In [238]:
campaign = "wmt20"

In [239]:
da = pd.read_csv(filepath_or_buffer="{}/20{}-da.csv".format(input_dir, campaign.replace("wmt", "")), header=0)

In [240]:
print(len(da))

168874


In [241]:
da.dropna(inplace=True)
print(len(da))

168872


In [242]:
if campaign == "wmt19":
    lps = ['en-de', 'de-en', 'en-cs', 'en-fi', 'en-gu', 'en-kk', 'en-lt', 'en-ru', 'en-zh', 'zh-en']
elif campaign == "wmt20":
    lps = list(da["lp"].unique())
else:
    print("Invalid campaign")

In [243]:
for lp in lps:
    filepath = '{}/{}/newstest20{}-{}-src.{}.sgm'.format(input_dir, "sgm 2" if campaign == "wmt19" else "sgm",
                                                                               campaign.replace("wmt", ""), 
                                                                               lp.replace("-", ""), 
                                                                               lp.split("-")[0])
    with open(filepath, "r") as f:
        lines = f.readlines()
    print("lp : {}".format(lp))
    
    remove = ["<p>", "<srcset setid=", '</doc>\n', "</p>", "</srcset>"]
    docs = []
    ref = []
    context = {}
    doc_len = 0
    for line in lines:
        if not any(x in line for x in remove):
            if "<doc" in line:
                docs.append(doc_len)
                doc_len = 0
            else:
                doc_len += 1
                new_line = line.replace('</seg>', '').replace('<seg id="{}">'.format(doc_len), '').rstrip()
                if new_line in context:
                    print("{} already in context".format(new_line))
                if "<seg id=\"1\">" in line: 
                    context[new_line] = new_line
                else: 
                    context[new_line] = " </s> ".join([ref[-1], new_line])
                # elif "<seg id=\"2\">" in line:
                #     context[new_line] = " </s> ".join([ref[-1], new_line])
                # else:
                #     context[new_line] = " </s> ".join([ref[-2], ref[-1], new_line])
                ref.append(new_line)
    docs.append(doc_len)
    docs = docs[1:]
    print("dataset size : {}".format(sum(docs)))
    for index in da[da["lp"]==lp].index:
        src = da.loc[index, 'src'] 
        da.loc[index, 'src'] = context[src]

lp : ps-en
خو بهتره دا ده چې د نويو کلمو په استعمال کې ځينې خبرې په پام کې وي چې د تيروتنې چانس کم شي: already in context
پورې يې پکې د شاه رخ خان ژوند ، فن او په فلمونو خبرې کړي دي. already in context
که نه د ابراهيم خداى څه بل نوم درلود ؟ په دې اړه د تورات څرگندونې سره ډېر توپير لري. already in context
يوه ويل د ژمي ميلمه زور غواړي ، ها بل ويل په دوبي کې خورا اسانه دى چې ته يې د ژمي خبري کوې. already in context
نامتو مورخ آرنلډ ټاين بي ، چې بشري تاريخ يې لوستى او لس ټوکه کتاب يې پرې ليکلى ، وايي ؛ چې مذهب د بشري ټولنو د جوړښت زړى دى. already in context
ان په دې دوره کې هم د ابن ميمون په ټنډه کې سوله او نېکمرغي نه وه ليکل شوې. already in context
ځينې شنونکي وايي د دغو مشترکو متلونو د خپرېدو يو علت مرکزي اسيا ته د هند او اروپايي ملتونو کډه کېدل دي ، چې بېلابېل فرهنگونه يې سره نژدې کړل. already in context
دى لکه د هندوانو مومن (کوچنى) د دوږخ او جنت تر منځ پروت دى. already in context
ځکه ابن ميمون هغه څوک و ، چې ټولو ليد ، ډېر کلونه بې له دې چې مزد واخلي د هغوى د ناروغانو درملنه يې کړې و

In [244]:
for lp in lps:
    filepath = '{}/{}/newstest20{}-{}-ref.{}.sgm'.format(input_dir, "sgm 2" if campaign == "wmt19" else "sgm",
                                                                               campaign.replace("wmt", ""), 
                                                                               lp.replace("-", ""), 
                                                                               lp.split("-")[-1])
    with open(filepath, "r") as f:
        lines = f.readlines()
    print("lp : {}".format(lp))
    
    remove = ["<p>", "<refset setid=", '</doc>\n', "</p>", "</refset>"]
    docs = []
    ref = []
    context = {}
    doc_len = 0
    for line in lines:
        if not any(x in line for x in remove):
            if "<doc" in line:
                docs.append(doc_len)
                doc_len = 0
            else:
                doc_len += 1
                new_line = line.replace('</seg>', '').replace('<seg id="{}">'.format(doc_len), '').rstrip()
                if new_line in context:
                    print("{} already in context".format(new_line))
                if "<seg id=\"1\">" in line: 
                    context[new_line] = ""
                else: 
                    context[new_line] = ref[-1]
                # elif "<seg id=\"2\">" in line:
                #     context[new_line] = ref[-1]
                # else:
                #     context[new_line] = " </s> ".join([ref[-2], ref[-1]])
                ref.append(new_line)
    docs.append(doc_len)
    docs = docs[1:]
    print("dataset size : {}".format(sum(docs)))
    for index in da[da["lp"]==lp].index:
        ref = da.loc[index, 'ref']
        mt = da.loc[index, 'mt']
        if len(context[ref]) > 0:
            da.loc[index, 'ref'] = " </s> ".join([context[ref], ref])
            da.loc[index, 'mt'] = " </s> ".join([context[ref], mt])

lp : ps-en
Personal liberties: All people in this system are obliged to work according to the central plan and they are considered as social service and do not have economic freedom. already in context
Above the core is the radiation zone, where the plasma conveys the energy flux by means of radiation. already in context
That we are willing to accept, one we are unwilling to postpone, and one we intend to win. already in context
Combinations are a series of tactical moves executed to achieve some gain. already in context
Separate women - only titles, such as woman grandmaster (WGM), are available. already in context
This allows it to have the lowest individual tax burden in the United States. already in context
The lack of direct access by workers to the means of production and consumption goods. already in context
All of the different cells of an animal are derived from the embryonic germ layers. already in context
The Reserve Army of labour refers to the unemployed and under - employ

In [245]:
if campaign == "wmt19":
    selected19 = da[da['lp'].isin(['en-de', 'de-en', 'en-cs', 'en-fi', 'en-gu', 'en-kk', 'en-lt', 'en-ru', 'en-zh', 'zh-en'])].reset_index(drop=True)
else:
    selected20 = da[~da['lp'].isin(['iu-en'])].reset_index(drop=True)
    selected20["score"] = selected20["z_score"]

In [246]:
selected = pd.concat([selected19, selected20]).reset_index(drop=True)

In [247]:
sampled = pd.DataFrame()
for lp in selected["lp"].unique():
    selected_lp = selected[selected["lp"] == lp]
    sampled = pd.concat([sampled, selected_lp.sample(n=int(len(selected_lp)*0.05), random_state=1)])

In [248]:
train = pd.concat([sampled,selected]).drop_duplicates(keep=False)

In [249]:
len(selected) - len(sampled) == len(train)

True

In [250]:
train.to_csv("{}/1920-da-ref1ctxtrain.csv".format(output_dir), columns = ["src", "mt", "ref", "score"], index=False)

In [251]:
sampled.to_csv("{}/1920-da-ref1ctxvalid.csv".format(output_dir), columns = ["src", "mt", "ref", "score"], index=False)

In [252]:
len(train)

336645

In [253]:
len(sampled)

17708

# Add source and reference contexts as separate columns

In [11]:
campaign = "wmt20"

In [12]:
da = pd.read_csv(filepath_or_buffer="{}/20{}-da.csv".format(input_dir, campaign.replace("wmt", "")), header=0)

In [13]:
if campaign == "wmt19":
    lps = ['en-de', 'de-en', 'en-cs', 'en-fi', 'en-gu', 'en-kk', 'en-lt', 'en-ru', 'en-zh', 'zh-en']
elif campaign == "wmt20":
    lps = list(da["lp"].unique())
else:
    print("Invalid campaign")

In [14]:
for lp in lps:
    filepath = '{}/{}/newstest20{}-{}-src.{}.sgm'.format(input_dir,  "sgm 2" if campaign == "wmt19" else "sgm", campaign.replace("wmt", ""), lp.replace("-", ""), lp.split("-")[0])
    with open(filepath, "r") as f:
        lines = f.readlines()
    print("lp : {}".format(lp))
    
    remove = ["<p>", "<srcset setid=", '</doc>\n', "</p>", "</srcset>"]
    docs = []
    ref = []
    context = {}
    doc_len = 0
    for line in lines:
        if not any(x in line for x in remove):
            if "<doc" in line:
                docs.append(doc_len)
                doc_len = 0
            else:
                doc_len += 1
                new_line = line.replace('</seg>', '').replace('<seg id="{}">'.format(doc_len), '').rstrip()
                if new_line in context:
                    print("{} already in context".format(new_line))
                if "<seg id=\"1\">" in line: 
                    context[new_line] = " </s> "
                elif "<seg id=\"2\">" in line:
                    context[new_line] = ref[-1]
                else:
                    context[new_line] = " </s> ".join([ref[-2], ref[-1]])
                ref.append(new_line)
    docs.append(doc_len)
    docs = docs[1:]
    print("dataset size : {}".format(sum(docs)))
    for index in da[da["lp"]==lp].index:
        src = da.loc[index, 'src'] 
        da.loc[index, 'src_ctx'] = context[src]

lp : ps-en
خو بهتره دا ده چې د نويو کلمو په استعمال کې ځينې خبرې په پام کې وي چې د تيروتنې چانس کم شي: already in context
پورې يې پکې د شاه رخ خان ژوند ، فن او په فلمونو خبرې کړي دي. already in context
که نه د ابراهيم خداى څه بل نوم درلود ؟ په دې اړه د تورات څرگندونې سره ډېر توپير لري. already in context
يوه ويل د ژمي ميلمه زور غواړي ، ها بل ويل په دوبي کې خورا اسانه دى چې ته يې د ژمي خبري کوې. already in context
نامتو مورخ آرنلډ ټاين بي ، چې بشري تاريخ يې لوستى او لس ټوکه کتاب يې پرې ليکلى ، وايي ؛ چې مذهب د بشري ټولنو د جوړښت زړى دى. already in context
ان په دې دوره کې هم د ابن ميمون په ټنډه کې سوله او نېکمرغي نه وه ليکل شوې. already in context
ځينې شنونکي وايي د دغو مشترکو متلونو د خپرېدو يو علت مرکزي اسيا ته د هند او اروپايي ملتونو کډه کېدل دي ، چې بېلابېل فرهنگونه يې سره نژدې کړل. already in context
دى لکه د هندوانو مومن (کوچنى) د دوږخ او جنت تر منځ پروت دى. already in context
ځکه ابن ميمون هغه څوک و ، چې ټولو ليد ، ډېر کلونه بې له دې چې مزد واخلي د هغوى د ناروغانو درملنه يې کړې و

In [15]:
for lp in lps:
    filepath = '{}/{}/newstest20{}-{}-ref.{}.sgm'.format(input_dir, "sgm 2" if campaign == "wmt19" else "sgm",
                                                                               campaign.replace("wmt", ""), 
                                                                               lp.replace("-", ""), 
                                                                               lp.split("-")[-1])
    with open(filepath, "r") as f:
        lines = f.readlines()
    print("lp : {}".format(lp))
    
    remove = ["<p>", "<refset setid=", '</doc>\n', "</p>", "</refset>"]
    docs = []
    ref = []
    context = {}
    doc_len = 0
    for line in lines:
        if not any(x in line for x in remove):
            if "<doc" in line:
                docs.append(doc_len)
                doc_len = 0
            else:
                doc_len += 1
                new_line = line.replace('</seg>', '').replace('<seg id="{}">'.format(doc_len), '').rstrip()
                if new_line in context:
                    print("{} already in context".format(new_line))
                if "<seg id=\"1\">" in line: 
                    context[new_line] = " </s> "
                elif "<seg id=\"2\">" in line:
                    context[new_line] = ref[-1]
                else:
                    context[new_line] = " </s> ".join([ref[-2], ref[-1]])
                ref.append(new_line)
    docs.append(doc_len)
    docs = docs[1:]
    print("dataset size : {}".format(sum(docs)))
    for index in da[da["lp"]==lp].index:
        ref = da.loc[index, 'ref']
        da.loc[index, 'ref_ctx'] = context[ref]

lp : ps-en
Personal liberties: All people in this system are obliged to work according to the central plan and they are considered as social service and do not have economic freedom. already in context
Above the core is the radiation zone, where the plasma conveys the energy flux by means of radiation. already in context
That we are willing to accept, one we are unwilling to postpone, and one we intend to win. already in context
Combinations are a series of tactical moves executed to achieve some gain. already in context
Separate women - only titles, such as woman grandmaster (WGM), are available. already in context
This allows it to have the lowest individual tax burden in the United States. already in context
The lack of direct access by workers to the means of production and consumption goods. already in context
All of the different cells of an animal are derived from the embryonic germ layers. already in context
The Reserve Army of labour refers to the unemployed and under - employ

In [16]:
if campaign == "wmt19":
    selected19 = da[da['lp'].isin(['en-de', 'de-en', 'en-cs', 'en-fi', 'en-gu', 'en-kk', 'en-lt', 'en-ru', 'en-zh', 'zh-en'])].reset_index(drop=True)
else:
    selected20 = da[~da['lp'].isin(['iu-en'])].reset_index(drop=True)
    selected20["score"] = selected20["z_score"]

In [17]:
selected = pd.concat([selected19, selected20]).reset_index(drop=True)

In [18]:
sampled = pd.DataFrame()
for lp in selected["lp"].unique():
    selected_lp = selected[selected["lp"] == lp]
    sampled = pd.concat([sampled, selected_lp.sample(n=int(len(selected_lp)*0.05), random_state=1)])

In [19]:
train = pd.concat([sampled,selected]).drop_duplicates(keep=False)

In [20]:
len(selected) - len(sampled) == len(train)

True

In [21]:
train.to_csv("{}/1920-da-refgatectxtrain.csv".format(output_dir), columns = ["src", "src_ctx", "mt", "ref", "ref_ctx", "score"], index=False)

In [22]:
sampled.to_csv("{}/1920-da-refgatectxvalid.csv".format(output_dir), columns = ["src", "src_ctx", "mt", "ref", "ref_ctx", "score"], index=False)

In [23]:
len(train)

336647

In [25]:
len(sampled)

17708

# Add referece, translated and source context

In [11]:
campaign = "wmt20"

In [12]:
da = pd.read_csv(filepath_or_buffer="{}/20{}-da.csv".format(input_dir, campaign.replace("wmt", "")), header=0)

In [13]:
print(len(da))

168874


In [14]:
da.dropna(inplace=True)
print(len(da))

168872


In [15]:
if campaign == "wmt19":
    lps = ['en-de', 'de-en', 'en-cs', 'en-fi', 'en-gu', 'en-kk', 'en-lt', 'en-ru', 'en-zh', 'zh-en']
elif campaign == "wmt20":
    lps = list(da["lp"].unique())
else:
    print("Invalid campaign")

In [16]:
for lp in lps:
    filepath = '{}/{}/newstest20{}-{}-src.{}.sgm'.format(input_dir, "sgm 2" if campaign == "wmt19" else "sgm",
                                                                               campaign.replace("wmt", ""), 
                                                                               lp.replace("-", ""), 
                                                                               lp.split("-")[0])
    with open(filepath, "r") as f:
        lines = f.readlines()
    print("lp : {}".format(lp))
    
    remove = ["<p>", "<srcset setid=", '</doc>\n', "</p>", "</srcset>"]
    docs = []
    ref = []
    context = {}
    doc_len = 0
    for line in lines:
        if not any(x in line for x in remove):
            if "<doc" in line:
                docs.append(doc_len)
                doc_len = 0
            else:
                doc_len += 1
                new_line = line.replace('</seg>', '').replace('<seg id="{}">'.format(doc_len), '').rstrip()
                if new_line in context:
                    print("{} already in context".format(new_line))
                if "<seg id=\"1\">" in line: 
                    context[new_line] = new_line
                else:
                    context[new_line] = " </s> ".join([ref[-1], new_line])
                # elif "<seg id=\"2\">" in line:
                #     context[new_line] = " </s> ".join([ref[-1], new_line])
                # else:
                #     context[new_line] = " </s> ".join([ref[-2], ref[-1], new_line])
                ref.append(new_line)
    docs.append(doc_len)
    docs = docs[1:]
    print("dataset size : {}".format(sum(docs)))
    for index in da[da["lp"]==lp].index:
        src = da.loc[index, 'src'] 
        da.loc[index, 'src'] = context[src]

lp : ps-en
خو بهتره دا ده چې د نويو کلمو په استعمال کې ځينې خبرې په پام کې وي چې د تيروتنې چانس کم شي: already in context
پورې يې پکې د شاه رخ خان ژوند ، فن او په فلمونو خبرې کړي دي. already in context
که نه د ابراهيم خداى څه بل نوم درلود ؟ په دې اړه د تورات څرگندونې سره ډېر توپير لري. already in context
يوه ويل د ژمي ميلمه زور غواړي ، ها بل ويل په دوبي کې خورا اسانه دى چې ته يې د ژمي خبري کوې. already in context
نامتو مورخ آرنلډ ټاين بي ، چې بشري تاريخ يې لوستى او لس ټوکه کتاب يې پرې ليکلى ، وايي ؛ چې مذهب د بشري ټولنو د جوړښت زړى دى. already in context
ان په دې دوره کې هم د ابن ميمون په ټنډه کې سوله او نېکمرغي نه وه ليکل شوې. already in context
ځينې شنونکي وايي د دغو مشترکو متلونو د خپرېدو يو علت مرکزي اسيا ته د هند او اروپايي ملتونو کډه کېدل دي ، چې بېلابېل فرهنگونه يې سره نژدې کړل. already in context
دى لکه د هندوانو مومن (کوچنى) د دوږخ او جنت تر منځ پروت دى. already in context
ځکه ابن ميمون هغه څوک و ، چې ټولو ليد ، ډېر کلونه بې له دې چې مزد واخلي د هغوى د ناروغانو درملنه يې کړې و

In [17]:
for lp in lps:
    filepath = '{}/{}/newstest20{}-{}-ref.{}.sgm'.format(input_dir, "sgm 2" if campaign == "wmt19" else "sgm",
                                                                               campaign.replace("wmt", ""), 
                                                                               lp.replace("-", ""), 
                                                                               lp.split("-")[-1])
    with open(filepath, "r") as f:
        lines = f.readlines()
    print("lp : {}".format(lp))
    
    remove = ["<p>", "<refset setid=", '</doc>\n', "</p>", "</refset>"]
    docs = []
    ref = []
    ref_context = {}
    doc_len = 0
    for line in lines:
        if not any(x in line for x in remove):
            if "<doc" in line:
                docs.append(doc_len)
                doc_len = 0
            else:
                doc_len += 1
                new_line = line.replace('</seg>', '').replace('<seg id="{}">'.format(doc_len), '').rstrip()
                #if new_line in ref_context:
                    #print("{} already in context".format(new_line))
                if "<seg id=\"1\">" in line: 
                    ref_context[new_line] = new_line
                else:
                    ref_context[new_line] = " </s> ".join([ref[-1], new_line])
                # elif "<seg id=\"2\">" in line:
                #     ref_context[new_line] = " </s> ".join([ref[-1], new_line])
                # else:
                #     ref_context[new_line] = " </s> ".join([ref[-2], ref[-1], new_line])
                ref.append(new_line)
    docs.append(doc_len)
    docs = docs[1:]
    print("dataset size : {}".format(sum(docs)))
    directory = '/Users/geovern/Documents/outs/{}/{}/'.format(campaign, lp)
    mt_context = {}
    for sysname in os.listdir(directory):
        if "ref" not in sysname:
            txt = open(directory + sysname, "r").read().splitlines()
            k = 0
            l = 0
            for i, line in enumerate(txt):
                k += 1
                if k <= docs[l]:
                    if k == 1:
                        mt_context[line] = txt[i]
                    else:
                        mt_context[line] = " </s> ".join(txt[i - 1:i + 1])
                    # elif k == 2:
                    #     mt_context[line] = " </s> ".join(txt[i - 1:i + 1])
                    # else:
                    #     mt_context[line] = " </s> ".join(txt[i - 2:i + 1])
                else:
                    k = 1
                    l += 1
                    mt_context[line] = txt[i]
    not_found = 0
    for index in da[da["lp"]==lp].index:
        ref = da.loc[index, 'ref']
        mt = da.loc[index, 'mt']
        if mt in mt_context:
            da.loc[index, 'ref'] = ref_context[ref]
            da.loc[index, 'mt'] = mt_context[mt]
        else:
            not_found += 1
    print("{} sents not found".format(not_found))

lp : ps-en
dataset size : 2719
2 sents not found
lp : en-cs
dataset size : 1418
0 sents not found
lp : en-de
dataset size : 1418
0 sents not found
lp : en-ja
dataset size : 1000
0 sents not found
lp : en-pl
dataset size : 1000
0 sents not found
lp : en-ru
dataset size : 2002
0 sents not found
lp : en-ta
dataset size : 1000
494 sents not found
lp : en-zh
dataset size : 1418
2 sents not found
lp : cs-en
dataset size : 664
0 sents not found
lp : de-en
dataset size : 785
0 sents not found
lp : ja-en
dataset size : 993
5 sents not found
lp : km-en
dataset size : 2320
0 sents not found
lp : pl-en
dataset size : 1001
0 sents not found
lp : ru-en
dataset size : 991
0 sents not found
lp : ta-en
dataset size : 997
467 sents not found
lp : zh-en
dataset size : 2000
1 sents not found


In [18]:
if campaign == "wmt19":
    selected19 = da[da['lp'].isin(['en-de', 'de-en', 'en-cs', 'en-fi', 'en-gu', 'en-kk', 'en-lt', 'en-ru', 'en-zh', 'zh-en'])].reset_index(drop=True)
else:
    selected20 = da[~da['lp'].isin(['iu-en'])].reset_index(drop=True)
    selected20["score"] = selected20["z_score"]

In [19]:
selected = pd.concat([selected19, selected20]).reset_index(drop=True)

In [20]:
sampled = pd.DataFrame()
for lp in selected["lp"].unique():
    selected_lp = selected[selected["lp"] == lp]
    sampled = pd.concat([sampled, selected_lp.sample(n=int(len(selected_lp)*0.05), random_state=1)])

In [21]:
train = pd.concat([sampled,selected]).drop_duplicates(keep=False)

In [22]:
len(selected) - len(sampled) == len(train)

True

In [23]:
train.to_csv("{}/1920-da-ctx1train.csv".format(output_dir), columns = ["src", "mt", "ref", "score"], index=False)

In [24]:
sampled.to_csv("{}/1920-da-ctx1valid.csv".format(output_dir), columns = ["src", "mt", "ref", "score"], index=False)

In [25]:
len(train)

336645

In [None]:
len(sampled)

17708

# WMT 20

In [2]:
lp = "en-zh"

In [3]:
filepath = '/Users/geovern/Downloads/sgm/newstest2020-{}-ref.{}.sgm'.format("".join(lp.split("-")), lp.split("-")[-1])

In [4]:
with open(filepath, "r") as f:
    lines = f.readlines()

In [5]:
remove = ["<p>", "<refset setid=", '</doc>\n', "</p>", "</refset>"]
docs = []
ref = []
doc_len = 0
for line in lines:
    if not any(x in line for x in remove):
        if "<doc" in line:
            docs.append(doc_len)
            doc_len = 0
        else:
            doc_len += 1
            line = line.replace('</seg>', '').replace('<seg id="{}">'.format(doc_len), '')
            ref.append(line.rstrip())
docs.append(doc_len)
docs = docs[1:]

In [6]:
sum(docs)

1418

In [7]:
outpath = "docs/wmt20/{}/".format(lp)
os.makedirs(outpath, exist_ok = True)
with open(outpath + "ids", "w") as fp:
    json.dump(docs, fp) 

# WMT 21 TED

In [119]:
lp = "zh-en"

In [120]:
filepath = '/Users/geovern/Downloads/test/newstest2021.{}.xml'.format(lp)

In [121]:
with open(filepath, 'r') as f:
    data = f.read()

Bs_data = BeautifulSoup(data, "xml")

In [122]:
Bs_data

<?xml version="1.0" encoding="utf-8"?>
<dataset id="newstest2021">
<doc id="xinhua-zh-01.120587" origlang="zh">
<src lang="zh">
<p>
<seg id="1">新华时评：把优秀返乡农民工打造成乡村振兴生力军-新华网</seg>
<seg id="2">乡村振兴，人才是关键。</seg>
<seg id="3">人才哪里找？</seg>
<seg id="4">优秀返乡农民工群体是不可或缺的生力军。</seg>
<seg id="5">新华社发 商海春 作</seg>
<seg id="6">新华社成都7月26日电 题：把优秀返乡农民工打造成乡村振兴生力军</seg>
<seg id="7">今年34岁的张雄家在四川眉山市果园村，2010年他辞掉北京月薪上万的工作返乡创业。</seg>
<seg id="8">经过几年打拼，如今他经营的葡萄种植家庭农场年收入超过百万元，成为小有名气的致富带头人，还被选拔进入村“两委”班子。</seg>
<seg id="9">近年来，在国家脱贫攻坚、乡村振兴等一系列支农惠农政策激励感召下，越来越多像张雄一样的优秀农民工选择返乡创业，激活了乡村一池春水。</seg>
<seg id="10">我国西部人口大省、劳务输出大省四川，因势利导实施“优秀农民工回引培养工程”，仅2019年，就从回引的优秀农民工中产生村党支部书记8000余名，培育村后备力量6.1万名，发展党员2万余名，为打赢脱贫攻坚战和实施乡村振兴战略提供有力保证。</seg>
<seg id="11">小康不小康，关键看老乡。</seg>
<seg id="12">改革开放以来，我国农村大量青壮年劳动力外出务工增收，有的通过打拼创办了自己的企业。</seg>
<seg id="13">但客观上也导致农村“空心化”不同程度地出现，特别是农村基层组织弱化，村党支部书记和党员队伍均呈现出“一老一低”特征，即年龄老化、学历偏低，村党组织带头人队伍不强、青黄不接问题突出。</seg>
<seg id="14">如果乡村人才等要素一直单向流往城市，乡村长期处于“失血” “贫血”状态，振兴就是一句空话。</seg>
<seg id="15">把那些有能力、有觉悟、有

In [117]:
lines = []
for talk in Bs_data.find_all('doc',  testsuite="tsuite-tedtalks-{}".format(lp.replace("-", ""))):
    lines += talk.find('ref', {"translator": "A"}).find_all('seg')

In [118]:
len(lines)

843

In [96]:
len(set(lines))

843

In [97]:
docs = []
ref = []
doc_len = 1
for line in lines:
    line = str(line)
    if '<seg id="1">' in line:
        docs.append(doc_len)
        doc_len = 1
    else:
        doc_len += 1
    line = line.replace('</seg>', '').replace('<seg id="{}">'.format(doc_len), '')
    ref.append(line.rstrip())
docs.append(doc_len)
docs = docs[1:]

In [98]:
np.sum(docs)

843

In [99]:
len(ref)

843

In [115]:
outpath = "docs/wmt21.tedtalks/{}/".format(lp)
os.makedirs(outpath, exist_ok = True)
with open(outpath + "ids", "w") as fp:
    json.dump(docs, fp) 