## Load BSD Dev Set

In [1]:
import pandas as pd

# Read the BSD dev split and show which scenario tags it contains.
df = pd.read_json('bsd_dev.json')
print(df['tag'].unique())

# Flatten each row's 'conversation' entry into its own frame, then stack
# all of them into a single table with a fresh 0..n-1 index.
conversation_frames = [pd.json_normalize(conversation) for conversation in df['conversation']]
df_convs = pd.concat(conversation_frames, ignore_index=True)

# Parallel lists: english[k] and japanese[k] come from the same row of df_convs.
english = df_convs["en_sentence"].tolist()
japanese = df_convs["ja_sentence"].tolist()

['training' 'meeting' 'phone call' 'general chatting'
 'face-to-face conversation' 'presentation']


## Translate with M2M100

In [9]:
# Smoke-test M2M100 on one Japanese sentence before running it on the dev set.
# Reference: https://huggingface.co/docs/transformers/model_doc/m2m_100
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer

m2m100_checkpoint = "facebook/m2m100_418M"
tokenizer = M2M100Tokenizer.from_pretrained(m2m100_checkpoint)
model = M2M100ForConditionalGeneration.from_pretrained(m2m100_checkpoint)

japanese_text = "私はアンドレです。"

# Translate Japanese to English: the tokenizer is told the source language,
# and generation is passed the English language id as the forced first token.
tokenizer.src_lang = "ja"
encoded_ja = tokenizer(japanese_text, return_tensors="pt")
english_lang_id = tokenizer.get_lang_id("en")
generated_tokens = model.generate(**encoded_ja, forced_bos_token_id=english_lang_id)
tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)

['I am Andrea.']

In [21]:
# Translate every Japanese dev-set sentence to English with M2M100,
# one sentence per generate() call, collecting the outputs in `preds`.
from tqdm import tqdm

# Set the source language in this cell too, so the loop does not depend on
# the single-sentence demo cell having been the last thing to touch `tokenizer`.
tokenizer.src_lang = "ja"

# The English language id never changes inside the loop; look it up once.
en_lang_id = tokenizer.get_lang_id("en")

preds = []
# Iterate the list directly: wrapping it in enumerate() hid its length from
# tqdm (no total / ETA in the progress bar) and the index was never used.
for ja in tqdm(japanese):
    encoded_ja = tokenizer(ja, return_tensors="pt")
    generated_tokens = model.generate(**encoded_ja, forced_bos_token_id=en_lang_id)
    # One input sentence per call, so the decoded batch has a single entry.
    preds.append(tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0])

2051it [37:46,  1.10s/it]


In [22]:
# Save source, reference and M2M100 output side by side.
# The file is tab-separated (TSV), written without the row index.
sample = dict(
    Source_Ja=japanese,
    Target_En=english,
    Translation_En=preds,
)

# One column per key; all three lists must have the same length.
df = pd.DataFrame(sample)
df.to_csv('example.tsv', sep="\t", index=False)

## Translate with MarianMT - Helsinki-NLP/opus-mt-ja-en

In [5]:
# Smoke-test the MarianMT ja->en checkpoint on one sentence.
# Reference: https://huggingface.co/docs/transformers/model_doc/marian
from transformers import MarianMTModel, MarianTokenizer

model_name = "Helsinki-NLP/opus-mt-ja-en"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

src_text = ["私はアンドレです。"]

# Tokenize the batch (padded, as PyTorch tensors), generate, then decode
# each generated sequence back to text without special tokens.
encoded_src = tokenizer(src_text, return_tensors="pt", padding=True)
translated = model.generate(**encoded_src)
[tokenizer.decode(sequence, skip_special_tokens=True) for sequence in translated]

["I'm Andre."]

In [None]:
# Translate every Japanese dev-set sentence to English with MarianMT
# (Helsinki-NLP/opus-mt-ja-en), collecting the outputs in `preds1`.
from tqdm import tqdm

# `model` and `tokenizer` are also bound by the M2M100 section of this
# notebook. Running this loop with those objects would not raise, it would
# just produce the wrong translations, so fail fast if the MarianMT cell was
# not the last one to define them.
assert isinstance(model, MarianMTModel) and isinstance(tokenizer, MarianTokenizer), (
    "model/tokenizer are not the MarianMT objects; re-run the MarianMT loading cell first"
)

preds1 = []
# Iterate the list directly: wrapping it in enumerate() hid its length from
# tqdm (no total / ETA in the progress bar) and the index was never used.
for ja in tqdm(japanese):
    translated = model.generate(**tokenizer(ja, return_tensors="pt", padding=True))
    # One input sentence per call, so only the first generated sequence is decoded.
    preds1.append(tokenizer.decode(translated[0], skip_special_tokens=True))

In [None]:
# Save source, reference and MarianMT output side by side.
# The file is tab-separated (TSV), written without the row index.
sample1 = dict(
    Source_Ja=japanese,
    Target_En=english,
    Translation_En=preds1,
)

# One column per key; all three lists must have the same length.
df1 = pd.DataFrame(sample1)
df1.to_csv('bsd_dev_translations_marianmt.tsv', sep="\t", index=False)