# Reproducing API Results and Model Results

In this notebook we go over how to reproduce our results for the [Google API translation](#google-api-translation), [Azure translation](#azure-translation) and [AWS API translation](#aws-translation). To recreate the model results, see [Recreating Model Results](#recreating-model-results). For a summary of all results, go to the [bottom of the notebook](#summary-of-results).

In [None]:
#datasets / evaluate: test sets and BLEU metric
#google-cloud-translate: provides the google.cloud.translate module imported below
#openpyxl: engine pandas needs to write/read the .xlsx files used in the AWS section
%pip install datasets evaluate google-cloud-translate openpyxl

In [None]:
import os
from google.cloud import translate
import pickle
import evaluate
from datasets import load_dataset
import requests, uuid, json
import pandas as pd
#need to set the os environment variable with the google application credentials
#(replace the placeholder path below with the path to your own credentials file)
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "path/to/credentials.json"

In [None]:
#fetch the test splits from huggingface datasets
#(downloading them once and loading them locally is recommended)
dataset_wmt_enfr = load_dataset("wmt14", "fr-en", split="test")
dataset_wmt_ende = load_dataset("wmt16", "de-en", split="test")

#sanity check: show the first German and the first French reference sentence
first_pair_ende = dataset_wmt_ende[0]["translation"]
first_pair_enfr = dataset_wmt_enfr[0]["translation"]
print(first_pair_ende["de"])
print(first_pair_enfr["fr"])


In [None]:
#flatten the English source side of each test set into a plain list (simpler view)
to_translate_wmt14_en = [row['translation']['en'] for row in dataset_wmt_enfr]
to_translate_wmt16_en = [row['translation']['en'] for row in dataset_wmt_ende]


## Google API Translation

In [None]:
#creating our translation function from google api advanced translation v3
def translate_text(source_lang="en-US", target_lang="fr", text="YOUR_TEXT_TO_TRANSLATE", project_id="YOUR_PROJECT_ID", client=None):
    """Translate one piece of plain text with the Google Cloud Translation v3 API.

    Args:
        source_lang: Language code of ``text``.
        target_lang: Language code to translate into.
        text: The text to translate; it is sent as a single-item ``contents`` list.
        project_id: Google Cloud project id used to build the request ``parent``.
        client: Optional object exposing ``translate_text(request=...)``. When
            omitted, a ``TranslationServiceClient`` is created on the first call
            (it uses the google credentials from the environment variable) and
            reused by later calls, so a loop over thousands of sentences does
            not build a new client for every sentence.

    Returns:
        The ``translated_text`` of the first translation in the response.
    """
    if client is None:
        # Lazily create the default client once and cache it on the function.
        client = getattr(translate_text, "_default_client", None)
        if client is None:
            client = translate.TranslationServiceClient()
            translate_text._default_client = client

    location = "global"
    parent = f"projects/{project_id}/locations/{location}"

    # Translate text from source_lang to target_lang.
    # Detail on supported types can be found here:
    # https://cloud.google.com/translate/docs/supported-formats
    response = client.translate_text(
        request={
            "parent": parent,
            "contents": [text],
            "mime_type": "text/plain",  # mime types: text/plain, text/html
            "source_language_code": source_lang,
            "target_language_code": target_lang,
        }
    )

    # Only one string was sent, so only the first translation is relevant.
    return response.translations[0].translated_text

In [None]:
#empty containers: Google candidate translations and the reference translations
#(the reference lists are reused by the later BLEU cells)
candidate_wmt14_fr_google, candidate_wmt16_de_google = [], []
reference_wmt14_fr, reference_wmt16_de = [], []

In [None]:
#this next block will take time as we are making one api call per sentence
#is it possible to send batch requests?
#NOTE: translate_text is called without project_id, so its default value must be set to your project

for i in range(len(to_translate_wmt14_en)):
    candidate_wmt14_fr_google.append(translate_text(target_lang="fr", text=to_translate_wmt14_en[i]))
    #the reference translations are collected here and reused by every later BLEU cell
    reference_wmt14_fr.append(dataset_wmt_enfr[i]['translation']['fr'])
    #log progress
    if i % 100 == 0:
        #f-string instead of "+": concatenating an int with a str raises TypeError
        print(f"{i} out of {len(to_translate_wmt14_en)} done")

In [None]:
#same as above for wmt16 (English to German)
for i in range(len(to_translate_wmt16_en)):
    candidate_wmt16_de_google.append(translate_text(target_lang="de", text=to_translate_wmt16_en[i]))
    #the reference translations are collected here and reused by every later BLEU cell
    reference_wmt16_de.append(dataset_wmt_ende[i]['translation']['de'])
    #log progress
    if i % 100 == 0:
        #f-string instead of "+": concatenating an int with a str raises TypeError
        print(f"{i} out of {len(to_translate_wmt16_en)} done")

In [None]:
#get bleu from evaluate and get scores
#the evaluate library exposes metrics through evaluate.load(...); there is no evaluate.bleu function
bleu_metric = evaluate.load("bleu")

#each prediction gets a list holding its single reference translation
bleu_wmt14_fr_google = bleu_metric.compute(
    predictions=candidate_wmt14_fr_google,
    references=[[ref] for ref in reference_wmt14_fr],
)
bleu_wmt16_de_google = bleu_metric.compute(
    predictions=candidate_wmt16_de_google,
    references=[[ref] for ref in reference_wmt16_de],
)

#print bleu scores (the *_google names are the ones defined above and used in the summary)
print("BLEU score for WMT14 English to French: ", bleu_wmt14_fr_google)
print("BLEU score for WMT16 English to German: ", bleu_wmt16_de_google)

## Azure Translation

In [None]:
#our azure translation function
def azure_translate_text(text, target_language):
    """Translate English ``text`` into ``target_language`` with the Azure Translator v3 REST API.

    Args:
        text: The English text to translate.
        target_language: Language code sent as the ``to`` query parameter (e.g. 'fr', 'de').

    Returns:
        The text of the first translation in the service response.

    Raises:
        requests.HTTPError: If the service answers with an HTTP error status
            (for example a wrong key or region).
    """
    #the APi used https://azure.microsoft.com/en-us/products/cognitive-services/translator/
    #the free tier is sufficient for our needs
    #the key and region are read from the environment when set, so they need not be
    #written into the notebook; otherwise the placeholders below MUST BE CHANGED
    key = os.environ.get("AZURE_TRANSLATOR_KEY", "your key")
    endpoint = "https://api.cognitive.microsofttranslator.com/"
    location = os.environ.get("AZURE_TRANSLATOR_REGION", "enter location")

    path = '/translate'
    #strip the trailing slash so the url does not end up as ".com//translate"
    constructed_url = endpoint.rstrip('/') + path

    params = {
        'api-version': '3.0',
        'from': 'en', #can be changed if needed
        'to': target_language
    }

    headers = {
        'Ocp-Apim-Subscription-Key': key,
        'Ocp-Apim-Subscription-Region': location,
        'Content-type': 'application/json',
        'X-ClientTraceId': str(uuid.uuid4())
    }

    body = [{
        'text': text
    }]
    http_response = requests.post(constructed_url, params=params, headers=headers, json=body)
    #fail with the HTTP error instead of an obscure KeyError when indexing an error payload
    http_response.raise_for_status()
    response = http_response.json()

    return response[0]['translations'][0]['text']

In [None]:
#creating our candidate corpuses
#the references are not rebuilt here: reference_wmt14_fr filled in the Google section is reused
candidate_wmt14_fr_azure = []

for i in range(len(to_translate_wmt14_en)):
    candidate_wmt14_fr_azure.append(azure_translate_text(to_translate_wmt14_en[i], 'fr'))
    #log progress
    if i % 100 == 0:
        #f-string instead of "+": concatenating an int with a str raises TypeError
        print(f"{i} out of {len(to_translate_wmt14_en)} done")



In [None]:
#same as above for wmt16
#the references are not rebuilt here: reference_wmt16_de filled in the Google section is reused
candidate_wmt16_de_azure = []
for i in range(len(to_translate_wmt16_en)):
    candidate_wmt16_de_azure.append(azure_translate_text(to_translate_wmt16_en[i], 'de'))
    #log progress
    if i % 100 == 0:
        #f-string instead of "+": concatenating an int with a str raises TypeError
        print(f"{i} out of {len(to_translate_wmt16_en)} done")


In [None]:
#get our bleu scores
#the evaluate library exposes metrics through evaluate.load(...); there is no evaluate.bleu function
bleu_metric = evaluate.load("bleu")

#each prediction gets a list holding its single reference translation
bleu_wmt14_fr_azure = bleu_metric.compute(
    predictions=candidate_wmt14_fr_azure,
    references=[[ref] for ref in reference_wmt14_fr],
)
bleu_wmt16_de_azure = bleu_metric.compute(
    predictions=candidate_wmt16_de_azure,
    references=[[ref] for ref in reference_wmt16_de],
)

#print bleu scores
print("BLEU score for WMT14 English to French: ", bleu_wmt14_fr_azure)
print("BLEU score for WMT16 English to German: ", bleu_wmt16_de_azure)

## AWS Translation

This section is mostly done on AWS itself, so we will just go over how to reproduce the results. We use Amazon's batch translation, so we must store our data in an S3 bucket.

We must set up our data in a batch-translation-friendly format. In our case we simply convert our data into an Excel file.

In [None]:
#converting the to-translate arrays to a df
#we put each to-translate array in a separate column of the same df
#wrapping each list in a Series lets the columns have different lengths: the shorter
#one is padded with NaN instead of raising "Length of values does not match length of index"
df_to_translate = pd.DataFrame({
    'wmt14_en': pd.Series(to_translate_wmt14_en),
    'wmt16_en': pd.Series(to_translate_wmt16_en),
})

#the frame is called df_to_translate; a bare `df` is not defined in this notebook
df_to_translate.head()

In [None]:
#now we save as an excel file
#the frame built above is called df_to_translate; a bare `df` is not defined in this notebook
df_to_translate.to_excel("to_translate.xlsx")


Once we have saved the file, we must create a new S3 bucket and upload the Excel file to it.

I recommend viewing [this](https://www.youtube.com/watch?v=uS_2GJh3TsY&t=542s) video for a step by step guide on how to complete this task.


Now we load the translated data back in and complete similar steps as before.

In [None]:
#load in the xlsx file that was translated
#NOTE(review): presumably the AWS batch-translation output downloaded from S3 and saved as translated.xlsx - confirm
df_translated = pd.read_excel("translated.xlsx")

In [None]:
#now we create our candidate corpuses
#NOTE(review): the column name must match the header in translated.xlsx - confirm
#blank spreadsheet cells are read back as NaN (a float); drop them so only sentences reach BLEU
candidate_wmt14_fr_translated_aws = df_translated['wmt14_fr_aws'].dropna().tolist()

In [None]:
#same as above
#NOTE(review): this header has no "_aws" suffix, unlike 'wmt14_fr_aws' - confirm it matches translated.xlsx
#blank spreadsheet cells are read back as NaN (a float); drop them so only sentences reach BLEU
candidate_wmt16_de_translated_aws = df_translated['wmt16_de'].dropna().tolist()


In [None]:
#calculate bleu scores
#the evaluate library exposes metrics through evaluate.load(...); there is no evaluate.bleu function
bleu_metric = evaluate.load("bleu")

#each prediction gets a list holding its single reference translation
bleu_wmt14_fr_translated_aws = bleu_metric.compute(
    predictions=candidate_wmt14_fr_translated_aws,
    references=[[ref] for ref in reference_wmt14_fr],
)
bleu_wmt16_de_translated_aws = bleu_metric.compute(
    predictions=candidate_wmt16_de_translated_aws,
    references=[[ref] for ref in reference_wmt16_de],
)

#print bleu scores
print("BLEU score for WMT14 English to French: ", bleu_wmt14_fr_translated_aws)
print("BLEU score for WMT16 English to German: ", bleu_wmt16_de_translated_aws)

# Recreating Model Results

For this we will need facebook's fairseq library. We will clone the repo and cd into it.

In [None]:
#git is a shell command, not an IPython magic, so it is run with "!" instead of "%"
!git clone https://github.com/pytorch/fairseq.git
#note: %cd changes the working directory for every cell that runs after this one
%cd fairseq/
%pip install --editable ./
%pip install sacremoses
%pip install subword-nmt

In [None]:
#I would recommend using a torch with cuda support
import torch

## Model 1 
This model can be found [here](https://arxiv.org/abs/1705.03122)

In [None]:
#model 1: the fairseq convolutional seq2seq model for wmt14 en-fr
#paper: https://arxiv.org/abs/1705.03122
m1_en2fr = torch.hub.load(
    'pytorch/fairseq',
    'conv.wmt14.en-fr',
    tokenizer='moses',
    bpe='subword_nmt',
)
#switch the loaded model to evaluation mode
m1_en2fr.eval()

In [None]:
#move the model to the GPU only when one is available; an unconditional .cuda() fails on CPU-only machines
if torch.cuda.is_available():
    m1_en2fr.cuda()

In [None]:
#create our candidate corpus
#the references are not rebuilt here: reference_wmt14_fr filled in the Google section is reused
candidate_wmt14_fr_m1 = []

for i in range(len(to_translate_wmt14_en)):
    candidate_wmt14_fr_m1.append(m1_en2fr.translate(to_translate_wmt14_en[i]))
    #log progress
    if i % 100 == 0:
        #f-string instead of "+": concatenating an int with a str raises TypeError
        print(f"{i} out of {len(to_translate_wmt14_en)} done")

In [None]:
#calculate bleu score
#the evaluate library exposes metrics through evaluate.load(...); there is no evaluate.bleu function
bleu_metric = evaluate.load("bleu")

#each prediction gets a list holding its single reference translation
bleu_wmt14_fr_m1 = bleu_metric.compute(
    predictions=candidate_wmt14_fr_m1,
    references=[[ref] for ref in reference_wmt14_fr],
)

#print bleu score
print("BLEU score for WMT14 English to French: ", bleu_wmt14_fr_m1)

## Model 2
This model can be found [here](https://arxiv.org/abs/1806.00187)

In [None]:
#model 2: a seq2seq model with fast training abilities (wmt14 en-fr checkpoint)

m2_en2fr = torch.hub.load(
    'pytorch/fairseq',
    'transformer.wmt14.en-fr',
    tokenizer='moses',
    bpe='subword_nmt',
)
#switch the loaded model to evaluation mode
m2_en2fr.eval()

In [None]:
#move the model to the GPU only when one is available; an unconditional .cuda() fails on CPU-only machines
if torch.cuda.is_available():
    m2_en2fr.cuda()

In [None]:
#create our candidate corpus
#the references are not rebuilt here: reference_wmt14_fr filled in the Google section is reused
candidate_wmt14_fr_m2 = []

for i in range(len(to_translate_wmt14_en)):
    candidate_wmt14_fr_m2.append(m2_en2fr.translate(to_translate_wmt14_en[i]))
    #log progress
    if i % 100 == 0:
        #f-string instead of "+": concatenating an int with a str raises TypeError
        print(f"{i} out of {len(to_translate_wmt14_en)} done")

In [None]:
#calculate bleu score
#the evaluate library exposes metrics through evaluate.load(...); there is no evaluate.bleu function
bleu_metric = evaluate.load("bleu")

#each prediction gets a list holding its single reference translation
bleu_wmt14_fr_m2 = bleu_metric.compute(
    predictions=candidate_wmt14_fr_m2,
    references=[[ref] for ref in reference_wmt14_fr],
)

#print bleu score
print("BLEU score for WMT14 English to French: ", bleu_wmt14_fr_m2)

This model was also trained for wmt16 en to de

In [None]:
#model 2 again, this time the wmt16 en-de checkpoint
m2_en2de = torch.hub.load(
    'pytorch/fairseq',
    'transformer.wmt16.en-de',
    tokenizer='moses',
    bpe='subword_nmt',
)
#switch the loaded model to evaluation mode
m2_en2de.eval()

In [None]:
#move the model to the GPU only when one is available; an unconditional .cuda() fails on CPU-only machines
if torch.cuda.is_available():
    m2_en2de.cuda()

In [None]:
#create our candidate corpus
#the references are not rebuilt here: reference_wmt16_de filled in the Google section is reused
candidate_wmt16_de_m2 = []

for i in range(len(to_translate_wmt16_en)):
    candidate_wmt16_de_m2.append(m2_en2de.translate(to_translate_wmt16_en[i]))
    #log progress
    if i % 100 == 0:
        #f-string instead of "+": concatenating an int with a str raises TypeError
        print(f"{i} out of {len(to_translate_wmt16_en)} done")

In [None]:
#calculate bleu score
#the evaluate library exposes metrics through evaluate.load(...); there is no evaluate.bleu function
bleu_metric = evaluate.load("bleu")

#each prediction gets a list holding its single reference translation
bleu_wmt16_de_m2 = bleu_metric.compute(
    predictions=candidate_wmt16_de_m2,
    references=[[ref] for ref in reference_wmt16_de],
)

#print bleu score
print("BLEU score for WMT16 English to German: ", bleu_wmt16_de_m2)

# Summary of Results
The output for all of the results in one place.

In [None]:
#summarizing our results: one (label, score) pair per system and test set,
#printed in the same order as the sections above
summary_rows = [
    ("BLEU score for Google API on WMT14 English to French: ", bleu_wmt14_fr_google),
    ("BLEU score for Google API on WMT16 English to German: ", bleu_wmt16_de_google),
    ("BLEU score for Azure API on WMT14 English to French: ", bleu_wmt14_fr_azure),
    ("BLEU score for Azure API on WMT16 English to German: ", bleu_wmt16_de_azure),
    ("BLEU score for AWS API on WMT14 English to French: ", bleu_wmt14_fr_translated_aws),
    ("BLEU score for AWS API on WMT16 English to German: ", bleu_wmt16_de_translated_aws),
    ("BLEU score for Fairseq Convolutional (Model 1) on WMT14 English to French: ", bleu_wmt14_fr_m1),
    ("BLEU score for Fairseq Transformer (Model 2) on WMT14 English to French: ", bleu_wmt14_fr_m2),
    ("BLEU score for Fairseq Transformer (Model 2) on WMT16 English to German: ", bleu_wmt16_de_m2),
]

for label, score in summary_rows:
    print(label, score)