In [114]:
import os
import json
import pandas as pd
import numpy as np
import nltk.data
from tqdm.notebook import tqdm
import time
from pathlib import Path
# Sentence tokenizer loaded from NLTK's pre-trained English Punkt pickle; used
# as a module-level global by the splitting/translation helpers below.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
# NOTE(review): absolute, machine-specific path to a service-account key file.
# Presumably picked up by the Google Cloud client created in translate_text /
# translate_texts -- consider setting this variable outside the notebook.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/home/yuchen/kaggle/chaii/working/translate/chaii-325014-09f84f361067.json"

def retry(fun, max_tries=10, delay=0.3):
    """Call ``fun`` repeatedly until it succeeds or ``max_tries`` is reached.

    Args:
        fun: Zero-argument callable to invoke.
        max_tries: Maximum number of attempts.
        delay: Seconds to sleep before every attempt (including the first).

    Returns:
        The value returned by the first successful ``fun()`` call, or ``None``
        when every attempt raised. Failures are deliberately swallowed
        (best-effort semantics), so a ``None`` result is ambiguous if ``fun``
        itself can return ``None``.
    """
    for _ in range(max_tries):
        # Throttle before each attempt so rapid retries do not hammer the callee.
        time.sleep(delay)
        try:
            # Hand the result back to the caller instead of discarding it.
            return fun()
        except Exception:
            continue
    return None

def translate_text(target, text):
    """Translates English text into the target language.

    Target must be an ISO 639-1 language code.
    See https://g.co/cloud/translate/v2/translate-reference#supported_languages

    Args:
        target: Language code passed as ``target_language``.
        text: A ``str``, UTF-8 encoded ``bytes``, or a sequence of strings.

    Returns:
        The translated string for a single input, or a list of translated
        strings (one per input, in the order the client returns them) when a
        sequence is passed.
    """
    from google.cloud import translate_v2 as translate

    # NOTE: a new client is created on every call.
    translate_client = translate.Client()

    if isinstance(text, bytes):
        text = text.decode("utf-8")

    # Text can also be a sequence of strings, in which case the client
    # returns a sequence of results, one per text.
    result = translate_client.translate(text, format_='text', source_language='en', target_language=target)

    if isinstance(result, dict):
        return result["translatedText"]
    # Sequence input: indexing the list with a string key would raise
    # TypeError, so unwrap each per-text result instead.
    return [item["translatedText"] for item in result]

def translate_texts(target, text):
    """Translate English text and return the client's raw response.

    Unlike ``translate_text`` this does not unwrap ``translatedText``: the
    caller receives whatever ``Client.translate`` returns (for a list of
    inputs, the batch loop in this notebook reads ``'input'`` and
    ``'translatedText'`` from each returned item).

    Args:
        target: Language code passed as ``target_language``.
        text: A string or a sequence of strings to translate.
    """
    from google.cloud import translate_v2 as translate

    # A fresh client is built for every call.
    client = translate.Client()
    return client.translate(
        text,
        format_='text',
        source_language='en',
        target_language=target,
    )

def translate_text_use_cache(target, text, only_cache=False):
    """Return the translation of ``text``, consulting ``translated_cache`` first.

    The cache is the module-level ``translated_cache`` mapping and is keyed by
    the source text only (``target`` is not part of the key).

    Args:
        target: Language code forwarded to ``translate_text`` on a cache miss.
        text: Source text to translate.
        only_cache: When true, never call the translation service.

    Raises:
        KeyError: ``only_cache`` is true and ``text`` is not cached.
    """
    is_cached = text in translated_cache
    if only_cache and not is_cached:
        raise KeyError
    if not is_cached:
        # Miss: translate once and remember the result for later lookups.
        translated_cache[text] = translate_text(target, text)
    return translated_cache[text]

def split2sentence(paragraph):
    """Split ``paragraph`` into sentences and compute each sentence's start offset.

    Offsets assume consecutive sentences are separated by exactly one
    character. If the sentence lengths plus those separators do not add up to
    ``len(paragraph)``, the split is rejected.

    Returns:
        ``(sentences, starts)`` where ``starts[i]`` is the character offset of
        ``sentences[i]``, or ``(None, None)`` when the length check fails.
    """
    sentences = tokenizer.tokenize(paragraph)
    # Width of each sentence including one trailing separator character.
    widths = [len(sentence) + 1 for sentence in sentences]
    if np.sum(widths) - 1 != len(paragraph):
        return None, None
    starts = []
    offset = 0
    for width in widths:
        starts.append(offset)
        offset += width
    return sentences, starts

def get_sentence_idx(sentence_starts, answer_start_char_idx):
    """Return the index of the sentence that contains a character offset.

    Sentence ``i`` covers ``[sentence_starts[i], sentence_starts[i + 1])``.
    Any offset not matched by an earlier sentence (including offsets before
    the first start) is attributed to the last sentence. Returns ``None`` for
    an empty ``sentence_starts``.
    """
    last = len(sentence_starts) - 1
    for i in range(last):
        if sentence_starts[i] <= answer_start_char_idx < sentence_starts[i + 1]:
            return i
    return last if last >= 0 else None

def join_slist(slist, sentence_idx, char_idx):
    """Join sentences with single spaces and map an in-sentence offset to the result.

    Args:
        slist: List of sentence strings.
        sentence_idx: Index of the sentence that ``char_idx`` refers to.
        char_idx: Character offset inside that sentence.

    Returns:
        ``(joined_text, absolute_offset)`` where ``absolute_offset`` is
        ``char_idx`` shifted by the lengths of the preceding sentences plus
        one separator each.
    """
    preceding = sum(len(slist[k]) + 1 for k in range(sentence_idx))
    joined = ' '.join(slist)
    return joined, preceding + char_idx

def translate_impossible(context, question, target="ta", only_cache=True):
    """Translate a question that has no answer, sentence by sentence.

    The context is split with the module-level ``tokenizer``; each sentence
    and then the question are translated through
    ``translate_text_use_cache`` and the sentences are re-joined with spaces.

    Returns:
        A dict with the translated ``'context'`` and ``'question'``, an empty
        ``'answer_text'`` and ``'answer_start'`` of ``-1``.
    """
    translated_sentences = [
        translate_text_use_cache(target, sentence, only_cache)
        for sentence in tokenizer.tokenize(context)
    ]
    question_translated = translate_text_use_cache(target, question, only_cache)
    return {
            'context': ' '.join(translated_sentences),
            'question': question_translated,
            'answer_text': '',
            'answer_start': -1,
    }


def translate_and_map(context, question, answer, answer_start_char_idx, answer_end_char_idx, target="ta", verbose=False, only_cache=True):
    """Translate an answerable example and locate the answer in the translation.

    The context is translated sentence by sentence. The answer is translated
    separately and must appear exactly once in the translated sentence that
    contained it, so its new character offset can be computed.

    Returns:
        On success, a dict with ``'context'``, ``'question'``,
        ``'answer_text'`` and ``'answer_start'`` (offset of the translated
        answer inside the translated context). On rejection, a string naming
        the reason; with ``verbose`` the same string is also printed.
    """
    def _reject(reason):
        # Rejections are reported as plain strings rather than exceptions.
        if verbose: print(reason)
        return reason

    sentences, starts = split2sentence(context)
    if sentences is None:
        return _reject('tokenized length not match')

    answer_sentence_idx = get_sentence_idx(starts, answer_start_char_idx)
    end_sentence_idx = get_sentence_idx(starts, answer_end_char_idx)
    if answer_sentence_idx != end_sentence_idx:
        return _reject('start and end not in same sentence')

    source_sentence = sentences[answer_sentence_idx]
    if source_sentence.count(answer) != 1:
        return _reject('answer does not occur once in the original sentence')

    # Translate the answer-bearing sentence and the answer first, so an
    # unusable example is rejected before the rest of the context is touched.
    target_sentence = translate_text_use_cache(target, source_sentence, only_cache)
    answer_translated = translate_text_use_cache(target, answer, only_cache)
    if target_sentence.count(answer_translated) != 1:
        return _reject('answer does not occur once in the translated sentence')
    offset_in_sentence = target_sentence.find(answer_translated)

    translated_sentences = [
        target_sentence if i == answer_sentence_idx
        else translate_text_use_cache(target, sentence, only_cache)
        for i, sentence in enumerate(sentences)
    ]
    context_translated, answer_start_translated = join_slist(
        translated_sentences, answer_sentence_idx, offset_in_sentence
    )
    question_translated = translate_text_use_cache(target, question, only_cache)
    return {
            'context': context_translated,
            'question': question_translated,
            'answer_text': answer_translated,
            'answer_start': answer_start_translated,
    }

def estimate_price(translated_cache):
    """Estimate the cost of the translations held in ``translated_cache``.

    Sums the character length of every cache key (the source texts) and
    multiplies by a flat rate of 20 per 1,000,000 characters.
    NOTE(review): the rate is hard-coded; confirm it matches current pricing.
    """
    price_per_char = 20 / 1000000
    billed_chars = np.sum([len(source_text) for source_text in translated_cache.keys()])
    return billed_chars * price_per_char

def read_squad(path):
    """Load a SQuAD-format JSON file into a flat DataFrame, one row per question.

    Only the first listed answer of each question is used. A question is
    treated as unanswerable when ``is_impossible`` is true or when it has no
    answers; a missing ``is_impossible`` key (SQuAD v1.1-style files) is read
    as ``False``. Unanswerable rows get an empty answer and offsets of ``-1``.

    Args:
        path: Path to the SQuAD JSON file.

    Returns:
        DataFrame with columns ``id`` (running row number), ``context``,
        ``question``, ``answer``, ``answer_start_char_idx`` and
        ``answer_end_char_idx`` (start offset plus answer length).

    Side effects:
        Prints the number of unanswerable (``imp``) and answerable (``p``)
        questions.
    """
    path = Path(path)
    with open(path, 'rb') as f:
        squad_dict = json.load(f)

    contexts = []
    questions = []
    answer_texts = []
    answer_starts = []
    imp = 0
    p = 0
    for group in squad_dict['data']:
        for passage in group['paragraphs']:
            context = passage['context']
            for qa in passage['qas']:
                contexts.append(context)
                questions.append(qa['question'])
                answers = qa.get('answers') or []
                # Tolerate files without the flag and guard against an empty
                # answer list, which would otherwise raise on answers[0].
                if qa.get('is_impossible', False) or not answers:
                    imp += 1
                    answer_texts.append('')
                    answer_starts.append(-1)
                else:
                    p += 1
                    first_answer = answers[0]
                    answer_texts.append(first_answer['text'])
                    answer_starts.append(first_answer['answer_start'])
    print(f'imp {imp}, p {p}')
    data = {
        'id': list(range(len(contexts))),
        'context': contexts,
        'question': questions,
        'answer': answer_texts,
        'answer_start_char_idx': answer_starts,
        # End offset is exclusive: start + len(answer); stays -1 for
        # unanswerable rows because their answer is the empty string.
        'answer_end_char_idx': [start + len(text) for start, text in zip(answer_starts, answer_texts)],
    }
    return pd.DataFrame(data)

In [70]:
# Flatten the SQuAD 2.0 training file into one row per question and display it.
squad2 = read_squad('../../input/squad2/train-v2.0.json')
squad2

imp 43498, p 86821


Unnamed: 0,id,context,question,answer,answer_start_char_idx,answer_end_char_idx
0,0,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce start becoming popular?,in the late 1990s,269,286
1,1,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,What areas did Beyonce compete in when she was...,singing and dancing,207,226
2,2,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce leave Destiny's Child and bec...,2003,526,530
3,3,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,In what city and state did Beyonce grow up?,"Houston, Texas",166,180
4,4,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,In which decade did Beyonce become famous?,late 1990s,276,286
...,...,...,...,...,...,...
130314,130314,"The term ""matter"" is used throughout physics i...",Physics has broadly agreed on the definition o...,,-1,-1
130315,130315,"The term ""matter"" is used throughout physics i...",Who coined the term partonic matter?,,-1,-1
130316,130316,"The term ""matter"" is used throughout physics i...",What is another name for anti-matter?,,-1,-1
130317,130317,"The term ""matter"" is used throughout physics i...",Matter usually does not need to be used in con...,,-1,-1


In [91]:
# Collect the distinct strings that need translating: every sentence of every
# distinct paragraph, every distinct question and every distinct answer.
unique_contexts = squad2['context'].unique()
context_sentences = list({
    sentence
    for paragraph in unique_contexts
    for sentence in tokenizer.tokenize(paragraph)
})
unique_questions = squad2['question'].unique().tolist()
unique_answers = squad2['answer'].unique().tolist()
tuple(map(len, (unique_contexts, context_sentences, unique_questions, unique_answers)))

(19029, 93697, 130217, 64764)

In [101]:
# Keep only the strings that are not already in the translation cache.
# Strings shared between the three source lists are not de-duplicated here.
to_translate = [
    sen
    for sen in context_sentences + unique_questions + unique_answers
    if sen not in translated_cache
]
len(to_translate)

194768

In [104]:
# Translate the pending strings in fixed-size batches and store every result
# in the cache; inputs that are already cached are recorded in `dups` instead.
dups = []
bs = 128
for start in tqdm(range(0, len(to_translate), bs)):
    for item in translate_texts('ta', to_translate[start:start + bs]):
        source_text = item['input']
        if source_text in translated_cache:
            dups.append(source_text)
            print('dup..')
        else:
            translated_cache[source_text] = item['translatedText']

  0%|          | 0/1522 [00:00<?, ?it/s]

In [110]:
# Persist the translation cache as JSON.
Path('squad2_en2ta_0905.json').write_text(json.dumps(translated_cache))

In [111]:
# Pull the fields of one example row for a manual spot check.
idx = 2075
context, question, answer, answer_start_char_idx, answer_end_char_idx = (
    squad2.loc[idx, column]
    for column in ('context', 'question', 'answer', 'answer_start_char_idx', 'answer_end_char_idx')
)

In [112]:
# Display the English paragraph of the sample row.
context

'The Legend of Zelda: Twilight Princess (Japanese: ゼルダの伝説 トワイライトプリンセス, Hepburn: Zeruda no Densetsu: Towairaito Purinsesu?) is an action-adventure game developed and published by Nintendo for the GameCube and Wii home video game consoles. It is the thirteenth installment in the The Legend of Zelda series. Originally planned for release on the GameCube in November 2005, Twilight Princess was delayed by Nintendo to allow its developers to refine the game, add more content, and port it to the Wii. The Wii version was released alongside the console in North America in November 2006, and in Japan, Europe, and Australia the following month. The GameCube version was released worldwide in December 2006.[b]'

In [113]:
# Spot check on the sample row. The answer-mapping variant is kept for reference:
# res = translate_and_map(context, question, answer, answer_start_char_idx, answer_end_char_idx, target="ta")
# translate_impossible defaults to only_cache=True, so this reads from the cache only.
res = translate_impossible(context, question)
res

{'context': 'தி லெஜண்ட் ஆஃப் செல்டா: ட்விலைட் இளவரசி (ஜப்பானிய: ゼ ル ダ の 伝 ト ワ イ ラ イ ト プ ン ン ス ス He He, ஹெப்பர்ன்: ஜெருடா நோ டென்செட்சு: டோவைரைடோ புரிஞ்சே?) கேம் கியூப் மற்றும் வீ ஹோம் வீடியோ கேம் கன்சோல்களுக்காக நிண்டெண்டோவால் உருவாக்கப்பட்டு வெளியிடப்பட்ட ஒரு அதிரடி-சாகச விளையாட்டு. இது தி லெஜண்ட் ஆஃப் செல்டா தொடரின் பதின்மூன்றாவது தவணை. முதலில் நவம்பர் 2005 இல் கேம் கியூபில் வெளியிட திட்டமிடப்பட்டது, ட்விலைட் இளவரசி நிண்டெண்டோவால் தாமதப்படுத்தப்பட்டது, அதன் டெவலப்பர்கள் விளையாட்டைச் செம்மைப்படுத்தவும், அதிக உள்ளடக்கத்தைச் சேர்க்கவும் மற்றும் வைக்கு போர்ட் செய்யவும் அனுமதித்தது. வீ பதிப்பு நவம்பர் 2006 இல் வட அமெரிக்காவில் கன்சோலுடன் வெளியிடப்பட்டது, அடுத்த மாதம் ஜப்பான், ஐரோப்பா மற்றும் ஆஸ்திரேலியாவில் வெளியிடப்பட்டது. கேம் கியூப் பதிப்பு டிசம்பர் 2006 இல் உலகளவில் வெளியிடப்பட்டது. [b]',
 'question': 'லெஜண்ட் ஆஃப் ஜெல்டாவின் எந்த வகை விளையாட்டு: ஆஸ்திரேலியா ட்விலைட்?',
 'answer_text': '',
 'answer_start': -1}

In [115]:
def main():
    """Translate every row of ``squad2`` from the cache and record the outcome.

    For each row, unanswerable questions (``answer_start_char_idx == -1``) go
    through ``translate_impossible`` and the rest through
    ``translate_and_map``, both in cache-only mode. Results are written back
    into ``squad2`` as ``error_<target>``, ``context_<target>``,
    ``question_<target>``, ``answer_text_<target>`` and
    ``answer_start_<target>`` columns.

    Side effects:
        Every 50 rows, ``squad2`` is written to ``squad2_translated.csv`` and
        ``translated_cache`` to ``translated_cache.json``. Every 1000 rows a
        progress line is printed.
    """
    max_retry = 3
    target = 'ta'
    err_counter = 0
    imp_counter = 0
    for idx in tqdm(range(len(squad2))):
        context = squad2.loc[idx, 'context']
        question = squad2.loc[idx, 'question']
        answer = squad2.loc[idx, 'answer']
        answer_start_char_idx = squad2.loc[idx, 'answer_start_char_idx']
        answer_end_char_idx = squad2.loc[idx, 'answer_end_char_idx']
        # Default outcome, so `res` is always bound for the checks below.
        res = 'not in cache'
        for i in range(max_retry):
            try:
                if answer_start_char_idx == -1:
                    # Pass `target` explicitly so both branches translate into
                    # the same language instead of relying on the default.
                    res = translate_impossible(context, question, target=target, only_cache=True)
                    imp_counter += 1
                else:
                    res = translate_and_map(context, question, answer, answer_start_char_idx, answer_end_char_idx, target=target, verbose=False, only_cache=True)
                break
            except Exception:
                # `Exception` rather than a bare except, so Ctrl-C
                # (KeyboardInterrupt) can still stop this long loop.
                print(f'error encountered at step {idx}, retry counter {i}')
                res = 'not in cache'
        if isinstance(res, str):
            # A string result is a rejection/error reason, not a translation.
            squad2.loc[idx, f'error_{target}'] = res
            err_counter += 1
        else:
            squad2.loc[idx, f'error_{target}'] = 'success'
            squad2.loc[idx, f'context_{target}'] = res['context']
            squad2.loc[idx, f'question_{target}'] = res['question']
            squad2.loc[idx, f'answer_text_{target}'] = res['answer_text']
            squad2.loc[idx, f'answer_start_{target}'] = res['answer_start']
        if idx % 50 == 0:
            # Periodic checkpoint of both the frame and the cache.
            squad2.to_csv('squad2_translated.csv', index=False)
            with open('translated_cache.json', 'w') as f:
                json.dump(translated_cache, f, indent=4)
        if idx % 1000 == 0:
            print(idx, 'price$', int(estimate_price(translated_cache)), 'error%', int(err_counter/(idx+1)*100), 'imp%', int(imp_counter/(idx+1)*100))

In [116]:
# Run the full pass over squad2; this updates squad2 in place and writes checkpoints to disk.
main()

  0%|          | 0/130319 [00:00<?, ?it/s]

0 price$ 461 error% 0 imp% 0
1000 price$ 461 error% 56 imp% 0
2000 price$ 461 error% 54 imp% 0
3000 price$ 461 error% 49 imp% 11
4000 price$ 461 error% 48 imp% 8
5000 price$ 461 error% 49 imp% 6
6000 price$ 461 error% 51 imp% 5
7000 price$ 461 error% 50 imp% 4
8000 price$ 461 error% 48 imp% 8
9000 price$ 461 error% 46 imp% 13
10000 price$ 461 error% 43 imp% 17
11000 price$ 461 error% 42 imp% 20
12000 price$ 461 error% 41 imp% 20
13000 price$ 461 error% 40 imp% 23
14000 price$ 461 error% 39 imp% 24
15000 price$ 461 error% 40 imp% 24
16000 price$ 461 error% 39 imp% 24
17000 price$ 461 error% 39 imp% 25
18000 price$ 461 error% 39 imp% 25
19000 price$ 461 error% 38 imp% 26
20000 price$ 461 error% 38 imp% 27
21000 price$ 461 error% 38 imp% 27
22000 price$ 461 error% 38 imp% 27
23000 price$ 461 error% 38 imp% 27
24000 price$ 461 error% 38 imp% 25
25000 price$ 461 error% 39 imp% 25
26000 price$ 461 error% 40 imp% 24
27000 price$ 461 error% 39 imp% 25
28000 price$ 461 error% 40 imp% 24
29000 p

In [118]:
# Final save of the translated frame and the translation cache.
squad2.to_csv('squad2_translated.csv', index=False)
Path('translated_cache.json').write_text(json.dumps(translated_cache, indent=4))