# Data Translation Pipeline

## Translation

In [None]:
!pip install --upgrade pip
!pip install requests --upgrade
import requests
import getpass
import pandas as pd
import os
import json
import time
from functools import lru_cache
# Remove the column-width limit so long sentences are displayed in full
# (None disables truncation in pandas' display options).
pd.set_option('display.max_colwidth', None)

In [None]:
# Data directories used throughout the notebook.
# NOTE(review): these look like a Colab Google Drive mount — adjust when running elsewhere.
# Pre-processed CSVs that are read as the translation input.
PREPROC_CSV_PATH = '/content/drive/MyDrive/Dissertation/data/preproc_csv'
# Line-per-sentence text files the translations are appended to (also used to resume).
TRANS_TXT_PATH = '/content/drive/MyDrive/Dissertation/data/trans_txt'
# Destination directory for the translated CSV exports.
TRANS_CSV_PATH = '/content/drive/MyDrive/Dissertation/data/trans_csv'

In [None]:
def _read_preproc_csv(filename):
  """Load one ';'-delimited CSV from PREPROC_CSV_PATH into a DataFrame."""
  return pd.read_csv(os.path.join(PREPROC_CSV_PATH, filename), delimiter=";")

# One frame per dataset split; these names are reused by the cells below.
df_snli_dev = _read_preproc_csv('df_snli_dev.csv')
df_snli_train = _read_preproc_csv('df_snli_train.csv')
df_snli_test = _read_preproc_csv('df_snli_test.csv')
df_mnli_dev = _read_preproc_csv('df_mnli_dev.csv')
df_mnli_train = _read_preproc_csv('df_mnli_train.csv')

In [None]:
def _premise_and_hypothesis(frame):
  """Return the frame's 'premise' and 'hypothesis' columns as two Python lists."""
  return frame['premise'].tolist(), frame['hypothesis'].tolist()

# Parallel sentence lists for every split (same row order as the source frame).
snli_dev_premise, snli_dev_hypothesis = _premise_and_hypothesis(df_snli_dev)
snli_train_premise, snli_train_hypothesis = _premise_and_hypothesis(df_snli_train)
snli_test_premise, snli_test_hypothesis = _premise_and_hypothesis(df_snli_test)
mnli_dev_premise, mnli_dev_hypothesis = _premise_and_hypothesis(df_mnli_dev)
mnli_train_premise, mnli_train_hypothesis = _premise_and_hypothesis(df_mnli_train)

In [None]:
# Report the number of sentences in each list, one "<name> [<count>]" line per list.
print(
  f"snli_dev_premise [{len(snli_dev_premise)}]",
  f"snli_train_premise [{len(snli_train_premise)}]",
  f"snli_test_premise [{len(snli_test_premise)}]",
  f"mnli_dev_premise [{len(mnli_dev_premise)}]",
  f"mnli_train_premise [{len(mnli_train_premise)}]",
  f"snli_dev_hypothesis [{len(snli_dev_hypothesis)}]",
  f"snli_train_hypothesis [{len(snli_train_hypothesis)}]",
  f"snli_test_hypothesis [{len(snli_test_hypothesis)}]",
  f"mnli_dev_hypothesis [{len(mnli_dev_hypothesis)}]",
  f"mnli_train_hypothesis [{len(mnli_train_hypothesis)}]",
  sep="\n",
)

snli_dev_premise [9842]
snli_train_premise [549361]
snli_test_premise [9824]
mnli_dev_premise [9815]
mnli_train_premise [392659]
snli_dev_hypothesis [9842]
snli_train_hypothesis [549361]
snli_test_hypothesis [9824]
mnli_dev_hypothesis [9815]
mnli_train_hypothesis [392659]


In [None]:
# Get Current progress of file
def get_progress(filepath):
  """Return how many lines have already been written to a translation file.

  Args:
    filepath: File name relative to TRANS_TXT_PATH.

  Returns:
    The number of lines in the file, or 0 when the file does not exist yet.

  Raises:
    OSError: If the file exists but cannot be read. This is deliberately not
      reported as 0: the count is used as a resume offset, so treating an
      unreadable file as "nothing done" would re-translate everything and
      append duplicates to the existing file.
  """
  try:
    with open(os.path.join(TRANS_TXT_PATH, filepath), 'r', encoding='utf-8') as current_file:
      # Stream the file instead of materialising every line in memory.
      return sum(1 for _ in current_file)
  except FileNotFoundError:
    # No file yet means no progress.
    return 0

# Flush the buffer to the file
def write_list_to_file(list_to_write, out_filepath):
  """Append every item of a list to a text file, exactly one line per item.

  Args:
    list_to_write: Iterable of items; each is converted with str().
    out_filepath: Path of the file to append to (created if missing).

  Line breaks inside an item are replaced by a space so that one item always
  occupies one line; the line count of the file is used elsewhere as a resume
  offset into the sentence list. The file is written as UTF-8 regardless of
  the platform default, since the text contains non-ASCII characters.
  """
  with open(out_filepath, 'a', encoding='utf-8') as file:
    file.writelines(
      str(x).replace("\r\n", " ").replace("\r", " ").replace("\n", " ") + "\n"
      for x in list_to_write
    )

# Call the TranslatePlus API
@lru_cache(maxsize=1)
def _translateplus_api_key():
  """Return the TranslatePlus API key without storing it in the notebook.

  The key is read from the TRANSLATEPLUS_API_KEY environment variable; when
  that is unset the user is prompted once via getpass. The value is cached for
  the rest of the session.

  NOTE(review): a literal key used to be embedded here; it should be treated
  as compromised and rotated.
  """
  api_key = os.environ.get("TRANSLATEPLUS_API_KEY") or getpass.getpass("TranslatePlus API key: ")
  if not api_key:
    raise RuntimeError("No TranslatePlus API key provided (set TRANSLATEPLUS_API_KEY).")
  return api_key


@lru_cache(maxsize=5)
def _translate_once(session, text, api_key):
  """Perform a single translation request; raises on any failure.

  Only successful results are memoised: lru_cache does not store a call that
  raised, so a transient failure is never replayed for a repeated sentence.
  """
  translate_url = "https://api.translateplus.io/v1/translate"
  header = {"X-API-KEY": api_key}
  json_obj = {
    "text": text,
    "source": "en",
    "target": "mt"
  }
  # A timeout keeps one stalled connection from hanging the whole run.
  response = session.post(translate_url, json=json_obj, headers=header, timeout=30)
  return response.json()['translations']['translation']


def translate_api(session, text, retry = True, retry_delay = 1):
  """Translate one English text to Maltese through the TranslatePlus API.

  Args:
    session: Object exposing a requests-style ``post`` method (e.g. requests.Session).
    text: The text to translate.
    retry: When True, one more attempt is made after a failed request.
    retry_delay: Seconds to wait before that second attempt.

  Returns:
    The translated text, or ``COULD NOT TRANSLATE: [<text>]`` when the request
    (and its retry, if enabled) failed.

  Raises:
    RuntimeError: If no API key is available.
  """
  # Resolved outside the try block so a missing key fails loudly instead of
  # producing a failure marker for every sentence.
  api_key = _translateplus_api_key()
  try:
    return _translate_once(session, text, api_key)
  except Exception:
    # `Exception` (not a bare except) lets KeyboardInterrupt stop a long run.
    if retry:
      time.sleep(retry_delay)
      return translate_api(session, text, False)
    return f"COULD NOT TRANSLATE: [{text}]"

# Translate and flush buffer
def buffer_translate(input_list, buffer_size, out_filepath):
  """Translate sentences one by one, appending results to a file in batches.

  Args:
    input_list: Sentences to translate, in output order.
    buffer_size: Number of translations collected before each flush to disk.
    out_filepath: Text file the translations are appended to.

  A progress line is printed after every full batch. Whatever is still
  buffered is written when the loop ends, including when it ends early through
  an exception or a kernel interrupt, so completed translations are not lost.
  The HTTP session is closed when the function returns.
  """
  total_sentences = len(input_list)
  buffers_flushed = 0
  buffer_list = []

  with requests.Session() as session:
    try:
      for sentence in input_list:
        buffer_list.append(translate_api(session, sentence))
        if len(buffer_list) >= buffer_size:
          # Detach the batch before writing so a failed write is never
          # attempted a second time by the final flush below.
          pending, buffer_list = buffer_list, []
          write_list_to_file(pending, out_filepath)
          buffers_flushed += 1
          print(out_filepath + ": " + str(int((100*buffers_flushed*buffer_size)/total_sentences)) + "% complete")
    finally:
      # Final (possibly partial or empty) batch.
      write_list_to_file(buffer_list, out_filepath)

In [None]:
# Resume offsets: the number of lines already present in each translation file.
prog_snli_dev_premise, prog_snli_dev_hypothesis = map(
  get_progress, ('snli_dev_premise.txt', 'snli_dev_hypothesis.txt'))
prog_snli_train_premise, prog_snli_train_hypothesis = map(
  get_progress, ('snli_train_premise.txt', 'snli_train_hypothesis.txt'))
prog_snli_test_premise, prog_snli_test_hypothesis = map(
  get_progress, ('snli_test_premise.txt', 'snli_test_hypothesis.txt'))
prog_mnli_dev_premise, prog_mnli_dev_hypothesis = map(
  get_progress, ('mnli_dev_premise.txt', 'mnli_dev_hypothesis.txt'))
prog_mnli_train_premise, prog_mnli_train_hypothesis = map(
  get_progress, ('mnli_train_premise.txt', 'mnli_train_hypothesis.txt'))

In [None]:
# Translate every sentence list, appending to its line-per-sentence text file.
# The resume offset is read from the file immediately before each run, so
# re-executing this cell after an interruption continues where the file ends
# instead of reusing an offset computed earlier (which would append duplicates).
for _sentences, _filename in (
  (snli_dev_premise, 'snli_dev_premise.txt'),
  (snli_train_premise, 'snli_train_premise.txt'),
  (snli_test_premise, 'snli_test_premise.txt'),
  (mnli_dev_premise, 'mnli_dev_premise.txt'),
  (mnli_train_premise, 'mnli_train_premise.txt'),
  (snli_dev_hypothesis, 'snli_dev_hypothesis.txt'),
  (snli_train_hypothesis, 'snli_train_hypothesis.txt'),
  (snli_test_hypothesis, 'snli_test_hypothesis.txt'),
  (mnli_dev_hypothesis, 'mnli_dev_hypothesis.txt'),
  (mnli_train_hypothesis, 'mnli_train_hypothesis.txt'),
):
  buffer_translate(
    _sentences[get_progress(_filename):],
    500,  # translations buffered between flushes to disk
    os.path.join(TRANS_TXT_PATH, _filename),
  )

In [None]:
def get_list_from_file(filepath):
  """Read a translation file into a list with one entry per line.

  Args:
    filepath: File name relative to TRANS_TXT_PATH.

  Returns:
    The file's lines in order, with the trailing newline removed from each.

  The file is decoded as UTF-8 explicitly (matching the UTF-8 CSV export)
  rather than with the platform default encoding.
  """
  with open(os.path.join(TRANS_TXT_PATH, filepath), 'r', encoding='utf-8') as current_file:
    # Iterate lazily; only the line terminator is removed, other whitespace is kept.
    return [line.rstrip("\n") for line in current_file]


In [None]:
# Load the translated sentences back from their line-per-sentence text files.
snli_dev_premise_trans, snli_dev_hypothesis_trans = map(
  get_list_from_file, ('snli_dev_premise.txt', 'snli_dev_hypothesis.txt'))
snli_train_premise_trans, snli_train_hypothesis_trans = map(
  get_list_from_file, ('snli_train_premise.txt', 'snli_train_hypothesis.txt'))
snli_test_premise_trans, snli_test_hypothesis_trans = map(
  get_list_from_file, ('snli_test_premise.txt', 'snli_test_hypothesis.txt'))
mnli_dev_premise_trans, mnli_dev_hypothesis_trans = map(
  get_list_from_file, ('mnli_dev_premise.txt', 'mnli_dev_hypothesis.txt'))
mnli_train_premise_trans, mnli_train_hypothesis_trans = map(
  get_list_from_file, ('mnli_train_premise.txt', 'mnli_train_hypothesis.txt'))

In [None]:
# Overwrite the text columns of every split with their translations
# (the frames are modified in place; list lengths must match the frame).
for _frame, _premises, _hypotheses in (
  (df_snli_dev, snli_dev_premise_trans, snli_dev_hypothesis_trans),
  (df_snli_train, snli_train_premise_trans, snli_train_hypothesis_trans),
  (df_snli_test, snli_test_premise_trans, snli_test_hypothesis_trans),
  (df_mnli_dev, mnli_dev_premise_trans, mnli_dev_hypothesis_trans),
  (df_mnli_train, mnli_train_premise_trans, mnli_train_hypothesis_trans),
):
  _frame['premise'] = _premises
  _frame['hypothesis'] = _hypotheses

In [None]:
# Export each translated split as a ';'-separated UTF-8 CSV under TRANS_CSV_PATH.
for _split_csv, _split_frame in (
  ('df_snli_dev.csv', df_snli_dev),
  ('df_snli_train.csv', df_snli_train),
  ('df_snli_test.csv', df_snli_test),
  ('df_mnli_dev.csv', df_mnli_dev),
  ('df_mnli_train.csv', df_mnli_train),
):
  _split_frame.to_csv(os.path.join(TRANS_CSV_PATH, _split_csv), encoding='utf-8', sep=';')

In [None]:
# Stack the splits row-wise into one frame per corpus; each split keeps its own row labels.
df_snli_merged = pd.concat(objs=(df_snli_dev, df_snli_test, df_snli_train))
df_mnli_merged = pd.concat(objs=(df_mnli_dev, df_mnli_train))

In [None]:
# Remove the 'Unnamed: 0' column (presumably an index column saved by an earlier
# CSV export — TODO confirm). errors='ignore' plus rebinding keeps this cell safe
# to re-run: a second execution is a no-op instead of raising KeyError.
df_snli_merged = df_snli_merged.drop(columns='Unnamed: 0', errors='ignore')
df_mnli_merged = df_mnli_merged.drop(columns='Unnamed: 0', errors='ignore')

In [None]:
# Spot-check the first 20 rows of the merged SNLI frame.
df_snli_merged.head(20)

Unnamed: 0,premise,hypothesis,classification
0,Żewġ nisa qed iħaddnu waqt li jżommu pakketti biex imorru.,Is-sorijiet qed jgħannqu addio waqt li jżommu pakketti biex imorru wara li jkunu għadhom kif jieklu.,neutral
1,Żewġ nisa qed iħaddnu waqt li jżommu pakketti biex imorru.,Żewġ nisa qed iżommu pakketti.,entailment
2,Żewġ nisa qed iħaddnu waqt li jżommu pakketti biex imorru.,L-irġiel qed jiġġieldu barra deli.,contradiction
3,"Żewġt itfal żgħar bil-flokkijiet blu, wieħed bin-numru 9 u wieħed bin-numru 2 qegħdin fuq tarġien tal-injam f’kamra tal-banju u jaħslu idejhom f’sink.",Żewġt itfal bil-flokkijiet numerati jaħslu idejhom.,entailment
4,"Żewġt itfal żgħar bil-flokkijiet blu, wieħed bin-numru 9 u wieħed bin-numru 2 qegħdin fuq tarġien tal-injam f’kamra tal-banju u jaħslu idejhom f’sink.",Żewġt itfal waqt logħba tal-ballun jaħslu idejhom.,neutral
5,"Żewġt itfal żgħar bil-flokkijiet blu, wieħed bin-numru 9 u wieħed bin-numru 2 qegħdin fuq tarġien tal-injam f’kamra tal-banju u jaħslu idejhom f’sink.",Żewġt itfal bil-ġkieket jimxu lejn l-iskola.,contradiction
6,Raġel ibigħ doughnuts lil klijent waqt avveniment ta’ wirja dinjija li saret fil-belt ta’ Angeles,Mara tixrob il-kafè tagħha f'kafetterija żgħira.,contradiction
7,Raġel ibigħ doughnuts lil klijent waqt avveniment ta’ wirja dinjija li saret fil-belt ta’ Angeles,Raġel ibigħ doughnuts lil klijent waqt avveniment ta’ wirja dinjija waqt li n-nies jistennew fil-kju warajh.,neutral
8,Raġel ibigħ doughnuts lil klijent waqt avveniment ta’ wirja dinjija li saret fil-belt ta’ Angeles,Raġel ibiegħ doughnuts lil klijent.,entailment
9,"Żewġ subien ta’ timijiet opposti jilagħbu l-futbol, ​​filwaqt li lebsin uniformijiet ta’ protezzjoni sħiħa u elmi.",subien jilagħbu futbol,entailment


In [None]:
# Spot-check the last 50 rows of the merged SNLI frame.
df_snli_merged.tail(50)

Unnamed: 0,premise,hypothesis,classification
549311,Diversi adulti qed jieħdu ħsieb lil uliedhom barra f’ġurnata xemxija.,L-adulti għandhom it-tfal.,entailment
549312,Diversi adulti qed jieħdu ħsieb lil uliedhom barra f’ġurnata xemxija.,In-nies qed jagħmlu picnic.,neutral
549313,"Grupp ta’ tfal, subien u bniet, lebsin libsa tal-kor blu qegħdin bilwieqfa quddiem udjenza.",żewġ subien jippritkaw lil grupp ta’ nagħaġ,contradiction
549314,"Grupp ta’ tfal, subien u bniet, lebsin libsa tal-kor blu qegħdin bilwieqfa quddiem udjenza.",isir prestazzjoni tal-kor tal-knisja,neutral
549315,"Grupp ta’ tfal, subien u bniet, lebsin libsa tal-kor blu qegħdin bilwieqfa quddiem udjenza.",xi tfal bil-libes tal-kor joqogħdu ħdejn nies oħra,entailment
549316,Grupp ta’ tfal żgħar lebsin libsa blu qegħdin quddiem folla b’idejhom maħżuna.,It-tfal qed jorqdu.,contradiction
549317,Grupp ta’ tfal żgħar lebsin libsa blu qegħdin quddiem folla b’idejhom maħżuna.,It-tfal qegħdin fil-knisja.,neutral
549318,Grupp ta’ tfal żgħar lebsin libsa blu qegħdin quddiem folla b’idejhom maħżuna.,Grupp ta’ adulti jkantaw lill-folla.,contradiction
549319,Grupp ta’ tfal żgħar lebsin libsa blu qegħdin quddiem folla b’idejhom maħżuna.,it-tfal huma d-dar jorqdu,contradiction
549320,Grupp ta’ tfal żgħar lebsin libsa blu qegħdin quddiem folla b’idejhom maħżuna.,xi tfal bilwieqfa,entailment


In [None]:
# Spot-check the first 5 rows of the merged MNLI frame.
df_mnli_merged.head(5)

Unnamed: 0,premise,hypothesis,classification
0,Id-drittijiet il-ġodda huma sbieħ biżżejjed,Kulħadd verament jħobb l-aktar benefiċċji ġodda,neutral
1,Dan is-sit jinkludi lista tar-rebbieħa kollha tal-premjijiet u database ta’ tiftix ta’ artikli tal-Eżekuttiv tal-Gvern.,L-artikoli tal-Eżekuttiv tal-Gvern li jinsabu fuq il-websajt ma jistgħux jiġu mfittxija.,contradiction
2,uh ma nafx għandi emozzjonijiet imħallta dwaru uh kultant jogħġobni imma fl-istess ħin inħobb nara lil xi ħadd iħabbatlu,"Jogħġobni fil-biċċa l-kbira tiegħu, iżda xorta nieħu gost nara lil xi ħadd isawwatu.",entailment
3,iva naħseb li r-restorant favorit tiegħi dejjem kien l-eqreb li taf l-eqreb sakemm ikun jissodisfa l-kriterji minimi li taf ta' ikel tajjeb,Ir-ristoranti favoriti tiegħi huma dejjem mill-inqas mitt mil bogħod mid-dar tiegħi.,contradiction
4,ma nafx um tagħmel ħafna kampeġġ,Naf eżattament.,contradiction


In [None]:
# Spot-check the last 5 rows of the merged MNLI frame.
df_mnli_merged.tail(5)

Unnamed: 0,premise,hypothesis,classification
392654,"B'mod ċar, California tista' - u trid - tagħmel aħjar.",California ma tistax tagħmel aħjar.,contradiction
392655,"Darba kienet meqjusa bħala l-isbaħ triq fl-Ewropa, dikjarazzjoni li diffiċli tifhem illum peress li tant mill-bini oriġinali ġie sostitwit.",Allura ħafna mill-bini oriġinali kienu ġew sostitwiti minn ħwienet tal-konvenjenza.,neutral
392656,Il-houseboats huma tradizzjoni preservata b'mod sabiħ tal-aqwa żmien tar-Raj Brittaniku.,It-tradizzjoni tal-houseboats oriġinat waqt li l-British Raj kien għadu għaddej b'saħħtu.,entailment
392657,L-obituaries fakkru b’mod qalb id-dibattiti tiegħu fuq l-arja u żewġ tislima b’riżerva kbira mar-reviżur sħabu Roger Ebert fuq il-programm televiżiv sindakat eponimu tagħhom.,L-obituaries kienu sbieħ u miktuba in natura dwar il-kisbiet tiegħu fl-industrija tad-divertiment.,neutral
392658,f'dik l-oħra taf uh li għandi nagħmilha jew dik jew sempliċement biex naħseb biex nagħmilha ratha milli jkolli xi ħadd jgħidlu biex jagħmel dan naf li kienet ħaġa kbira fid-dar tagħna għal żmien twil kienet li jekk ridt tiegħi ir-raġel biex jagħmel xi ħaġa biex jgħin,Dan l-aħħar żewġi tant kien xogħol żżejjed li ma nistax nistaqsih biex jagħmel ħafna madwar hawn.,neutral


In [None]:
# Export the merged corpora as ';'-separated UTF-8 CSVs under TRANS_CSV_PATH.
for _merged_csv, _merged_frame in (
  ('df_mnli_translated.csv', df_mnli_merged),
  ('df_snli_translated.csv', df_snli_merged),
):
  _merged_frame.to_csv(os.path.join(TRANS_CSV_PATH, _merged_csv), encoding='utf-8', sep=';')