In [62]:
translated_cache = {}

In [122]:
import os
import json
import pandas as pd
import numpy as np
import nltk.data
from tqdm.notebook import tqdm
import time
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/home/yuchen/kaggle/chaii/working/translate/indic-trans-0ba75612c41b.json"

def retry(fun, max_tries=10):
    for i in range(max_tries):
        try:
            time.sleep(0.3) 
            fun()
            break
        except Exception:
            continue

def translate_text(target, text):
    """Translates text into the target language.

    Target must be an ISO 639-1 language code.
    See https://g.co/cloud/translate/v2/translate-reference#supported_languages
    """
    import six
    from google.cloud import translate_v2 as translate

    translate_client = translate.Client()

    if isinstance(text, six.binary_type):
        text = text.decode("utf-8")

    # Text can also be a sequence of strings, in which case this method
    # will return a sequence of results for each text.
    result = translate_client.translate(text, target_language=target)

#     print(u"Text: {}".format(result["input"]))
#     print(u"Translation: {}".format(result["translatedText"]))
#     print(u"Detected source language: {}".format(result["detectedSourceLanguage"]))
    return result["translatedText"]

def translate_text_use_cache(target, text):
    if text in translated_cache:
        return translated_cache[text]
    text_translated = translate_text(target, text)
    translated_cache[text] = text_translated
    return text_translated

def split2sentence(paragraph):
    slist = tokenizer.tokenize(paragraph)
    sentence_lengths = [len(x)+1 for x in slist]
    if np.sum(sentence_lengths)-1 != len(paragraph):
        return None, None
    sentence_starts = [0]
    for sl in sentence_lengths[:-1]:
        sentence_starts.append(sentence_starts[-1]+sl)
    return slist, sentence_starts

def get_sentence_idx(sentence_starts, answer_start_char_idx):
    n_sentence = len(sentence_starts)
    for i, sidx in enumerate(sentence_starts):
        if i == n_sentence-1:
            return i
        if answer_start_char_idx >= sidx and answer_start_char_idx < sentence_starts[i+1]:
            return i

def join_slist(slist, sentence_idx, char_idx):
    cursor = 0
    sentence_lengths = [len(x)+1 for x in slist]
    for i in range(sentence_idx):
        cursor += sentence_lengths[i]
    return ' '.join(slist), cursor+char_idx
        
def translate_and_map(context, question, answer, answer_start_char_idx, answer_end_char_idx, target="ta", verbose=False):
    slist, sentence_starts = split2sentence(context)
    if slist is None:
        if verbose: print('tokenized length not match')
        return 'tokenized length not match'
    start_char_sentence_idx = get_sentence_idx(sentence_starts, answer_start_char_idx)
    end_char_sentence_idx = get_sentence_idx(sentence_starts, answer_end_char_idx)
    if start_char_sentence_idx != end_char_sentence_idx:
        if verbose: print('start and end not in same sentence')
        return 'start and end not in same sentence'
    sentence_containing_answer = slist[start_char_sentence_idx]
    if sentence_containing_answer.count(answer) != 1:
        if verbose: print('answer does not occur once in the original sentence')
        return 'answer does not occur once in the original sentence'
    sentence_containing_answer_translated = translate_text_use_cache(target, sentence_containing_answer)
    answer_translated = translate_text_use_cache(target, answer)
    answer_occurence = sentence_containing_answer_translated.count(answer_translated)
    if answer_occurence != 1:
        if verbose: print('answer does not occur once in the translated sentence')
        return 'answer does not occur once in the translated sentence'
    answer_insentence_idx = sentence_containing_answer_translated.find(answer_translated)
    slist_translated = ['' for _ in range(len(slist))]
    slist_translated[start_char_sentence_idx] = sentence_containing_answer_translated
    for i, s in enumerate(slist):
        if i == start_char_sentence_idx: continue
        slist_translated[i] = translate_text_use_cache(target, s)
    context_translated, answer_start_char_idx_translated = join_slist(slist_translated, start_char_sentence_idx, answer_insentence_idx)
    question_translated = translate_text_use_cache(target, question)
    return {
            'context': context_translated,
            'question': question_translated,
            'answer_text': answer_translated,
            'answer_start': answer_start_char_idx_translated,
    }

def estimate_price(translated_cache):
    unit_price = 20 / 1000000
    token_count = np.sum([len(x) for x in list(translated_cache.keys())])
    return token_count * unit_price

In [138]:
squad = pd.read_csv('../../input/squad1/SQuAD-v1.2.csv')
squad

Unnamed: 0,title,context,question,answer,answer_start_char_idx,answer_end_char_idx,answer_start_word_idx,answer_end_word_idx
0,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",To whom did the Virgin Mary allegedly appear i...,Saint Bernadette Soubirous,515,541,102.0,104.0
1,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",What is in front of the Notre Dame Main Building?,a copper statue of Christ,188,213,37.0,41.0
2,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",The Basilica of the Sacred heart at Notre Dame...,the Main Building,279,296,57.0,59.0
3,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",What is the Grotto at Notre Dame?,a Marian place of prayer and reflection,381,420,76.0,82.0
4,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",What sits on top of the Main Building at Notre...,a golden statue of the Virgin Mary,92,126,17.0,23.0
...,...,...,...,...,...,...,...,...
87594,Kathmandu,"Kathmandu Metropolitan City (KMC), in order to...",In what US state did Kathmandu first establish...,Oregon,229,235,38.0,38.0
87595,Kathmandu,"Kathmandu Metropolitan City (KMC), in order to...",What was Yangon previously known as?,Rangoon,414,421,71.0,71.0
87596,Kathmandu,"Kathmandu Metropolitan City (KMC), in order to...",With what Belorussian city does Kathmandu have...,Minsk,476,481,85.0,85.0
87597,Kathmandu,"Kathmandu Metropolitan City (KMC), in order to...",In what year did Kathmandu create its initial ...,1975,199,203,31.0,31.0


In [139]:
idx = 4
context = squad.loc[idx, 'context']
question = squad.loc[idx, 'question']
answer = squad.loc[idx, 'answer']
answer_start_char_idx = squad.loc[idx, 'answer_start_char_idx']
answer_end_char_idx = squad.loc[idx, 'answer_end_char_idx']

In [140]:
res = translate_and_map(context, question, answer, answer_start_char_idx, answer_end_char_idx, target="ta")
res

{'context': 'கட்டிடக்கலைப்படி, பள்ளியில் கத்தோலிக்கத் தன்மை உள்ளது. பிரதான கட்டிடத்தின் தங்க குவிமாடத்தின் மேல் கன்னி மேரியின் தங்க சிலை உள்ளது. பிரதான கட்டிடத்தின் முன்னால் மற்றும் அதன் எதிரே, கிறிஸ்துவின் செப்பு சிலை உள்ளது &quot;வெனிட் ஆட் மீ ஓம்னெஸ்&quot; என்ற புராணக்கதையுடன் கைகளை உயர்த்தி உள்ளது. பிரதான கட்டிடத்திற்கு அடுத்தபடியாக புனித இதயத்தின் பசிலிக்கா உள்ளது. பசிலிக்காவுக்குப் பின்னால் கிரோட்டோ உள்ளது, இது மரியன் பிரார்த்தனை மற்றும் பிரதிபலிப்பு இடம். இது பிரான்சின் லூர்துஸில் உள்ள கிரோட்டோவின் பிரதி ஆகும், அங்கு கன்னி மேரி 1858 இல் புனித பெர்னாடெட் சbபிரஸுக்கு புகழ்பெற்றார். பிரதான உந்துதலின் முடிவில் (மற்றும் 3 சிலைகள் மற்றும் கோல்டு டோம் மூலம் இணைக்கும் நேரடி வரியில்), மேரியின் எளிய, நவீன கல் சிலை உள்ளது.',
 'question': 'நோட்ரே டேமில் உள்ள பிரதான கட்டிடத்தின் மேல் என்ன இருக்கிறது?',
 'answer_text': 'கன்னி மேரியின் தங்க சிலை',
 'answer_start': 99}

In [141]:
def main():
    max_retry = 3
    target = 'ta'
    err_counter = 0
    for idx in tqdm(range(len(squad))):
        context = squad.loc[idx, 'context']
        question = squad.loc[idx, 'question']
        answer = squad.loc[idx, 'answer']
        answer_start_char_idx = squad.loc[idx, 'answer_start_char_idx']
        answer_end_char_idx = squad.loc[idx, 'answer_end_char_idx']
        for i in range(max_retry):
            try:
                res = translate_and_map(context, question, answer, answer_start_char_idx, answer_end_char_idx, target=target, verbose=False)
                break
            except:
                print(f'error encountered at step {idx}, retry counter {i}')
                res = 'network error'
                continue
        if isinstance(res, str):
            squad.loc[idx, f'error_{target}'] = res
            err_counter += 1
        else:
            squad.loc[idx, 'error'] = 'success'
            squad.loc[idx, f'context_{target}'] = res['context']
            squad.loc[idx, f'question_{target}'] = res['question']
            squad.loc[idx, f'answer_text_{target}'] = res['answer_text']
            squad.loc[idx, f'answer_start_{target}'] = res['answer_start']
        if idx % 50 == 0:
            squad.to_csv('squad1.2_translated.csv', index=False)
            with open('translated_cache.json', 'w') as f:
                json.dump(translated_cache, f, indent=4)
        if idx % 1000 == 0:
            print(idx, 'price$', int(estimate_price(translated_cache)), 'error%', int(err_counter/(idx+1)*100))

In [None]:
main()

  0%|          | 0/87599 [00:00<?, ?it/s]

0 price$ 248 error% 100
1000 price$ 248 error% 54
2000 price$ 248 error% 56
3000 price$ 248 error% 56
4000 price$ 248 error% 55
5000 price$ 248 error% 53
6000 price$ 248 error% 54
7000 price$ 248 error% 55
8000 price$ 248 error% 54
9000 price$ 248 error% 55
10000 price$ 248 error% 54
11000 price$ 248 error% 55
12000 price$ 248 error% 54
13000 price$ 248 error% 53
14000 price$ 248 error% 54
15000 price$ 248 error% 53
16000 price$ 248 error% 54
17000 price$ 248 error% 54
18000 price$ 248 error% 53
19000 price$ 248 error% 53
20000 price$ 248 error% 53
21000 price$ 248 error% 54
22000 price$ 248 error% 54
23000 price$ 248 error% 54
24000 price$ 248 error% 54
25000 price$ 248 error% 54
26000 price$ 248 error% 54
