In [34]:
import pandas as pd

df_wmt14_en = pd.read_csv('data/wmt14_english_data.csv', header=None)
df_wmt14_en.head()

Unnamed: 0,0
0,Spectacular Wingsuit Jump Over Bogota
1,Sportsman Jhonathan Florez jumped from a helic...
2,"Wearing a wingsuit, he flew past over the famo..."
3,A black box in your car?
4,As America's road planners struggle to find th...


In [35]:
df_wmt16_en = pd.read_csv('data/wmt16_english_data.csv', header=None)
df_wmt16_en.head()

Unnamed: 0,0
0,Obama receives Netanyahu
1,The relationship between Obama and Netanyahu i...
2,The two wanted to talk about the implementatio...
3,The meeting was also planned to cover the conf...
4,Relations between Obama and Netanyahu have bee...


In [3]:
"""
Retreived from https://github.com/kjanjua26/Pyphones/tree/master
Wen have modified this code for our needs
Python wrapper for the website: https://www.homophone.com/
Gets the homophones of a word.
"""

from urllib.request import urlopen
import requests
from bs4 import BeautifulSoup
from typing import Dict, List
import re

class Pyphones:
    
    def __init__(self, word):
        self.word = word
        self.url = "https://www.homophone.com/search?page={}&type=&q={}"
        self.homophones = {self.word: []}
        
    def get_the_page(self, page_no=1):
        """
        Get the page content.

        Returns
            str: the content of the page.
        """
        url = self.url.format(page_no, self.word)
        r = requests.get(url)
        soup = BeautifulSoup(r.content, "html.parser")
        return soup

    def get_the_page_nos(self):
        """
        Get the total number of pages

        Returns
            int: the total number of the pages.
        """
        soup = self.get_the_page()
        pages = soup.find_all('div', attrs={'class':'col-sm-9'})
        total_pages = pages[0].find('h5').text.split('/')[-1].strip()
        return int(total_pages)

    def get_the_homophones(self):
        """
        Get the homophones of the word.

        Returns
            dict: {word: [list_of_homophones]} against each word.
        """
        total_pages = self.get_the_page_nos()
        for ix in range(total_pages):
            page_no = ix + 1
            soup = self.get_the_page(page_no)
            raw_homophones = soup.find_all('div', attrs={'class': 'well well-lg'})
            for elem in range(len(raw_homophones)):
                raw_homophones_2 = raw_homophones[elem].find_all('a', attrs={'class': 'btn word-btn'})
                list_of_homophones = list(raw_homophones_2)
                if any(list_of_homophones):
                    local_homophones = []
                    for tag_of_homophone in list_of_homophones:
                        homophone = tag_of_homophone.text
                        local_homophones.append(homophone)
                    self.homophones[self.word].append(local_homophones)

        #we want to modify the ouput to only the list within the list that the firsrt word is the same as the input word. 
        for key in self.homophones.keys():
            for i in range(len(self.homophones[key])):
                if self.homophones[key][i][0] == key:
                    return self.homophones[key].pop(i)
        return []

In [4]:
out = Pyphones("there")
homophones = out.get_the_homophones()
print(homophones)

['there', 'their', "they're"]


In [5]:
#lets make a dictornay of all unique words in both datasets and their homophones
#we will use the wmt14 dataset nad wmt16 as the base

#first we will make a dict of all unique words in both datasets

words = {}

for i in range(len(df_wmt14_en)):
    for j in range(len(df_wmt14_en.iloc[i,0].split())):
        #make sure to lower case all words and remove punctuation at the end of the word and the beginning
        words[df_wmt14_en.iloc[i,0].split()[j].lower().strip('".,?!:;()')] = []       

for i in range(len(df_wmt16_en)):
    for j in range(len(df_wmt16_en.iloc[i,0].split())):
        #make sure to lower case all words and remove punctuation at the end of the word and the beginning
        words[df_wmt16_en.iloc[i,0].split()[j].lower().strip('".,?!:;()')] = []   

print(len(words))
#print some keys of words
print(list(words.keys())[0:15])


14899
['spectacular', 'wingsuit', 'jump', 'over', 'bogota', 'sportsman', 'jhonathan', 'florez', 'jumped', 'from', 'a', 'helicopter', 'above', 'the', 'capital']


In [7]:
out = Pyphones("a").get_the_homophones()

print(out)


['a', 'uh']


In [7]:
#now we will add the homophones to the words dict
counter = 0
for key in words.keys():
    # we need to try and except because some words dont have homophones
    try:
        words[key] = Pyphones(key).get_the_homophones()
        print("Homophones for " + key + ": " + str(words[key]))
    except:
        words[key] = []
        #print("No homophones for: " + str(key) +str(words[key]))
    #lets update the progress every 1000 words
    if counter % 100 == 0:
        #also lets print the word we are on and its homophones
        print("----" + str(counter) + "/" + str(len(words))+ "----") 
    counter += 1


----0/14899----
Homophones for over: []
Homophones for a: ['a', 'uh']
Homophones for the: ['the', 'thee']
Homophones for capital: ['capital', 'capitol']
Homophones for of: []
Homophones for on: []
Homophones for he: []
Homophones for flew: ['flew', 'flu', 'flue']
Homophones for past: ['past', 'passed']
Homophones for at: []
Homophones for is: []
Homophones for an: []
Homophones for and: []
Homophones for there: ['there', 'their', "they're"]
Homophones for to: ['to', 'too', 'two']
Homophones for his: []
Homophones for box: ['box', 'bocks']
Homophones for in: ['in', 'inn']
Homophones for your: ['your', 'yore', "you're"]
Homophones for car: []
Homophones for as: ['as', 'ass', 'asse']
Homophones for road: ['road', 'rode', 'rowed']
Homophones for find: ['find', 'fined']
Homophones for cash: ['cash', 'cache']
Homophones for are: ['are', 'air', 'aire', 'ayre', 'ere', 'err', 'eyre', 'heir']
Homophones for see: ['see', 'c', 'cee', 'sea']
Homophones for by: ['by', 'bye', 'bi', 'buy']
Homophones 

In [8]:
#save the words dict to a pickle file
import pickle
with open('data/homophones_dict.pickle', 'wb') as handle:
    pickle.dump(words, handle, protocol=pickle.HIGHEST_PROTOCOL)


In [9]:
words

{'spectacular': [],
 'wingsuit': [],
 'jump': [],
 'over': [],
 'bogota': [],
 'sportsman': [],
 'jhonathan': [],
 'florez': [],
 'jumped': [],
 'from': [],
 'a': ['a', 'uh'],
 'helicopter': [],
 'above': [],
 'the': ['the', 'thee'],
 'capital': ['capital', 'capitol'],
 'of': [],
 'colombia': [],
 'on': [],
 'thursday': [],
 'wearing': [],
 'he': [],
 'flew': ['flew', 'flu', 'flue'],
 'past': ['past', 'passed'],
 'famous': [],
 'monserrate': [],
 'sanctuary': [],
 'at': [],
 '160km/h': [],
 'is': [],
 'located': [],
 'an': [],
 'altitude': [],
 '3000': [],
 'meters': [],
 'and': [],
 'numerous': [],
 'spectators': [],
 'had': [],
 'gathered': [],
 'there': ['there', 'their', "they're"],
 'to': ['to', 'too', 'two'],
 'watch': [],
 'his': [],
 'exploit': [],
 'black': [],
 'box': ['box', 'bocks'],
 'in': ['in', 'inn'],
 'your': ['your', 'yore', "you're"],
 'car': [],
 'as': ['as', 'ass', 'asse'],
 "america's": [],
 'road': ['road', 'rode', 'rowed'],
 'planners': [],
 'struggle': [],
 'fi

In [10]:
#lets create a dictionary of all letters and the letters that they could be confused with
#for example, the letter "a" could be confused with the letter "e" because they look similar, same with b and d, etc.

confusing_letters = {"a": ["e", "i", "o", "u"],
            "b": ["d", "p", "q"],
            "c": ["e", "o", "s"],
            "d": ["b", "p", "q"],
            "e": ["a", "c", "o"],
            "f": ["t"],
            "g": ["q"],
            "h": ["n"],
            "i": ["j", "l", "o", "u"],
            "j": ["i", "l"],
            "k": ["x"],
            "l": ["i", "j"],
            "m": ["n"],
            "n": ["h", "m", "u"],
            "o": ["a", "c", "e"],
            "p": ["b", "d", "q"],
            "q": ["b", "d", "g", "p"],
            "r": ["n"],
            "s": ["c"],
            "t": ["f"],
            "u": ["i", "v", "n"],
            "v": ["u"],
            "w": ["v"],
            "x": ["k"],
            "y": ["v"],
            "z": ["s"]}
#this list was created by me and copied from the following website: https://www.grammarly.com/blog/confusing-letters/

In [16]:
#lets save the confusing letters dict to a pickle file
with open('data/confusing_letters_dict.pickle', 'wb') as handle:
    pickle.dump(confusing_letters, handle, protocol=pickle.HIGHEST_PROTOCOL)




In [18]:
#lets load in our data from a csv file
import pandas as pd
df_wmt14_en = pd.read_csv("data/wmt14_english_data.csv", header=None)
df_wmt16_en = pd.read_csv("data/wmt16_english_data.csv", header=None)

df_wmt14_en.head()

Unnamed: 0,0
0,Spectacular Wingsuit Jump Over Bogota
1,Sportsman Jhonathan Florez jumped from a helic...
2,"Wearing a wingsuit, he flew past over the famo..."
3,A black box in your car?
4,As America's road planners struggle to find th...


In [55]:
#we now need to define a function that with a probability p1, we will swap a word with a homophone and a probability p2, we will swap a letter with a confusing letter

#we also want to return the amount of words that were swapped, and the amount of letters that were swapped, and the total amount of words changed
import random

def injection_swap(sentence, homophones_dict, confusing_letters_dict, p1=0, p2=0):
    #first we need to split the sentence into a list of words
    words = sentence.split()
    #we need to keep track of the amount of words that were swapped
    words_swapped = 0
    #we also need to keep track of the amount of letters that were swapped
    letters_swapped = 0
    #we also need to keep track of the amount of words that were changed
    words_changed = 0
    for i in range(len(words)):
        #if the word has any punctuation or capitalization, we skip it
        if words[i].isalpha() == False:
            continue
        if words[i].islower() == False:
            continue
        word = words[i].lower().strip('".,?!:;()')
        #we need to check if the word is in the homophones dict
        if word in homophones_dict.keys():
            #we need to check if the word has any homophones
            if len(homophones_dict[word]) > 0:
                #we need to check if we will swap the word with a homophone with probability p1
                if random.random() < p1:
                    #we need to pick a random homophone from the list of homophones
                    homophone = random.choice(homophones_dict[word])
                    #we need to swap the word with the homophone
                    words[i] = homophone
                    #we need to update the amount of words that were swapped
                    words_swapped += 1
                    #we need to update the amount of words that were changed
                    words_changed += 1
        for j in range(len(word)):
            #we need to check if we will swap a letter with a confusing letter
            if random.random() < p2:
                #we need to check if the word is in the confusing letters dict
                if word[j] in confusing_letters_dict.keys():
                    #we need to pick a random letter from the list of confusing letters
                    confusing_letter = random.choice(confusing_letters_dict[word[j]])
                    
                    #replace the letter with the confusing letter at index j
                    word = word[:j] + confusing_letter + word[j+1:]

                    words[i] = word
                    #we need to update the amount of letters that were swapped
                    letters_swapped += 1
                    #we need to update the amount of words that were changed
                    words_changed += 1
    #we need to join the list of words back into a sentence
    sentence = " ".join(words)
    return sentence, (words_swapped, letters_swapped, words_changed)




In [68]:
#lets test our function
sentence = "I am a sentence that has some words that will be swapped with homophones and some letters that will be swapped with confusing letters."
print(sentence)
sentence, results = injection_swap(sentence, words, confusing_letters, p1=0.5, p2=0.5)
print(sentence)
print("Words swapped: " + str(results[0]))
print("Letters swapped: " + str(results[1]))
print("Words changed: " + str(results[2]))


I am a sentence that has some words that will be swapped with homophones and some letters that will be swapped with confusing letters.
I am uh scnfeuce fnef nuc came wonds tnet viil b svappod vlfn homeqnomoc unq same lcttorc fnif wiji bo cvappcq wutn confisung letters.
Words swapped: 3
Letters swapped: 50
Words changed: 53


In [62]:
#function that takes in a dataframe and uses the injection_swap function to swap words and letters

def injection_swap_df(df, homophones_dict, confusing_letters_dict, p1=0, p2=0):

    #we need to keep track of the amount of words that were swapped
    words_swapped = 0
    #we also need to keep track of the amount of letters that were swapped
    letters_swapped = 0
    #we also need to keep track of the amount of words that were changed
    words_changed = 0
    #we need to keep track of the amount of sentences that were changed
    sentences_changed = 0

    for i in range(len(df)):
        #we need to get the sentence
        sentence = df.iloc[i][0]
        #we need to swap the sentence
        sentence, results = injection_swap(sentence, homophones_dict, confusing_letters_dict, p1, p2)
        #we need to update the amount of words that were swapped
        words_swapped += results[0]
        #we need to update the amount of letters that were swapped
        letters_swapped += results[1]
        #we need to update the amount of words that were changed
        words_changed += results[2]
        #we need to update the amount of sentences that were changed
        if results[2] > 0:
            sentences_changed += 1
        #we need to update the sentence in the dataframe
        df.iloc[i][0] = sentence

    print("Words swapped: " + str(words_swapped))
    print("Letters swapped: " + str(letters_swapped))
    print("Words changed: " + str(words_changed))
    print("Sentences changed: " + str(sentences_changed))
    return df, (words_swapped, letters_swapped, words_changed, sentences_changed)

In [67]:
#now lets apply our function to the wmt14 and wmt16 data and save the new data to a csv file

#lets start with p1 = 0.005 and p2 = 0.005 then we increase p1 and p2 by 0.005 until we get to p1 = 0.05 and p2 = 0.05
#also create a df to store the results of the injection swap for each iteration

df_swap_results = pd.DataFrame(columns=["dataset","p1", "p2", "words_swapped", "letters_swapped", "words_changed", "sentences_changed"])

for i in range(5, 55, 5):
    p1 = i/1000
    p2 = i/1000
    print("p1: " + str(p1))
    print("p2: " + str(p2))

    
    #also save as xlsx for amazon formating
    df_wmt14_en_temp, results = injection_swap_df(df_wmt14_en.copy(deep=True), words, confusing_letters, p1=p1, p2=0)
    df_wmt14_en_temp.to_csv("data/injected/wmt14/wmt14_english_data_injection_swap_p1_" + str(p1) + ".csv", header=False, index=False)
    df_swap_results.loc[len(df_swap_results)] = {"dataset":"wmt14_en", "p1":p1, "p2":0, "words_swapped":results[0], "letters_swapped":results[1], "words_changed":results[2], "sentences_changed":results[3]}
    df_wmt14_en_temp.to_excel("data/injected/for_amazon/wmt14_english_data_injection_swap_p1_" + str(p1) + ".xlsx", header=False, index=False)

    df_wmt14_en_temp, results = injection_swap_df(df_wmt14_en.copy(deep=True), words, confusing_letters, p1=0, p2=p2)
    df_wmt14_en_temp.to_csv("data/injected/wmt14/wmt14_english_data_injection_swap_p2_" + str(p2) + ".csv", header=False, index=False)
    df_swap_results.loc[len(df_swap_results)] = {"dataset":"wmt14_en", "p1":0, "p2":p2, "words_swapped":results[0], "letters_swapped":results[1], "words_changed":results[2], "sentences_changed":results[3]}
    df_wmt14_en_temp.to_excel("data/injected/for_amazon/wmt14_english_data_injection_swap_p2_" + str(p2) + ".xlsx", header=False, index=False)

    df_wmt14_en_temp, results = injection_swap_df(df_wmt14_en.copy(deep=True), words, confusing_letters, p1=p1, p2=p2)
    df_wmt14_en_temp.to_csv("data/injected/wmt14/wmt14_english_data_injection_swap_p1_" + str(p1) + "_p2_" + str(p2) + ".csv", header=False, index=False)
    df_swap_results.loc[len(df_swap_results)] = {"dataset":"wmt14_en", "p1":p1, "p2":p2, "words_swapped":results[0], "letters_swapped":results[1], "words_changed":results[2], "sentences_changed":results[3]}
    df_wmt14_en_temp.to_excel("data/injected/for_amazon/wmt14_english_data_injection_swap_p1_" + str(p1) + "_p2_" + str(p2) + ".xlsx", header=False, index=False)

    df_wmt16_en_temp, results = injection_swap_df(df_wmt16_en.copy(deep=True), words, confusing_letters, p1=p1, p2=0)
    df_wmt16_en_temp.to_csv("data/injected/wmt16/wmt16_english_data_injection_swap_p1_" + str(p1) + ".csv", header=False, index=False)
    df_swap_results.loc[len(df_swap_results)] = {"dataset":"wmt16_en", "p1":p1, "p2":0, "words_swapped":results[0], "letters_swapped":results[1], "words_changed":results[2], "sentences_changed":results[3]}
    df_wmt16_en_temp.to_excel("data/injected/for_amazon/wmt16_english_data_injection_swap_p1_" + str(p1) + ".xlsx", header=False, index=False)

    df_wmt16_en_temp, results = injection_swap_df(df_wmt16_en.copy(deep=True), words, confusing_letters, p1=0, p2=p2)
    df_wmt16_en_temp.to_csv("data/injected/wmt16/wmt16_english_data_injection_swap_p2_" + str(p2) + ".csv", header=False, index=False)
    df_swap_results.loc[len(df_swap_results)] = {"dataset":"wmt16_en", "p1":0, "p2":p2, "words_swapped":results[0], "letters_swapped":results[1], "words_changed":results[2], "sentences_changed":results[3]}
    df_wmt16_en_temp.to_excel("data/injected/for_amazon/wmt16_english_data_injection_swap_p2_" + str(p2) + ".xlsx", header=False, index=False)

    df_wmt16_en_temp, results = injection_swap_df(df_wmt16_en.copy(deep=True), words, confusing_letters, p1=p1, p2=p2)
    df_wmt16_en_temp.to_csv("data/injected/wmt16/wmt16_english_data_injection_swap_p1_" + str(p1) + "_p2_" + str(p2) + ".csv", header=False, index=False)
    df_swap_results.loc[len(df_swap_results)] = {"dataset":"wmt16_en", "p1":p1, "p2":p2, "words_swapped":results[0], "letters_swapped":results[1], "words_changed":results[2], "sentences_changed":results[3]}
    df_wmt16_en_temp.to_excel("data/injected/for_amazon/wmt16_english_data_injection_swap_p1_" + str(p1) + "_p2_" + str(p2) + ".xlsx", header=False, index=False)
    

    
df_swap_results.to_csv("data/injected/wmt14_wmt16_english_data_injection_swap_results.csv", index=False)




p1: 0.005
p2: 0.005
Words swapped: 77
Letters swapped: 0
Words changed: 77
Sentences changed: 74
Words swapped: 0
Letters swapped: 977
Words changed: 977
Sentences changed: 783
Words swapped: 74
Letters swapped: 1028
Words changed: 1102
Sentences changed: 883
Words swapped: 92
Letters swapped: 0
Words changed: 92
Sentences changed: 91
Words swapped: 0
Letters swapped: 955
Words changed: 955
Sentences changed: 787
Words swapped: 85
Letters swapped: 948
Words changed: 1033
Sentences changed: 819
p1: 0.01
p2: 0.01
Words swapped: 166
Letters swapped: 0
Words changed: 166
Sentences changed: 155
Words swapped: 0
Letters swapped: 2095
Words changed: 2095
Sentences changed: 1394
Words swapped: 147
Letters swapped: 2095
Words changed: 2242
Sentences changed: 1440
Words swapped: 165
Letters swapped: 0
Words changed: 165
Sentences changed: 158
Words swapped: 0
Letters swapped: 1868
Words changed: 1868
Sentences changed: 1285
Words swapped: 144
Letters swapped: 1932
Words changed: 2076
Sentences c

In [30]:
#lets test the injection_swap function on a sentence with p2 = 0.025
sentence = "I am a sentence that has some words that will be swapped with homophones and some letters that will be swapped with confusing letters."
print(sentence)

sentence, words_swapped, letters_swapped, words_changed = injection_swap(sentence, words, confusing_letters, p1=0, p2=0.025)
print(sentence)
print("Words swapped: " + str(words_swapped))
print("Letters swapped: " + str(letters_swapped))
print("Words changed: " + str(words_changed))


I am a sentence that has some words that will be swapped with homophones and some letters that will be swapped with confusing letters.
I am a sentence that has some words that vill be swapped with homophones and some letters that will be swapped with confusong letters.
Words swapped: 0
Letters swapped: 2
Words changed: 2


In [39]:
df_wmt14_en.head()

Unnamed: 0,0
0,Spectacular Wingsuit Jump Over Bogota
1,Sportsman Jhonathan Florez jumped from a helic...
2,"Wearing a wingsuit, he flew past over the famo..."
3,A black box in your car?
4,As America's road planners struggle to find th...


In [46]:
"Testing".islower()

False

In [69]:
import os
from google.cloud import translate
import pickle
import evaluate
from datasets import load_dataset
import requests, uuid, json
import pandas as pd
#need to set os environment variable with google applioation credentials
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "your key"

  from .autonotebook import tqdm as notebook_tqdm


In [70]:
#creating our translation function from google api advanced translation v3
def translate_text(source_lang="en-US", target_lang="fr", text="YOUR_TEXT_TO_TRANSLATE", project_id="YOUR_PROJECT_ID"):
    """Translating Text."""
    #this uses google credentials from the environment variable
    client = translate.TranslationServiceClient()

    location = "global"

    parent = f"projects/{project_id}/locations/{location}"

    # Translate text from English to French
    # Detail on supported types can be found here:
    # https://cloud.google.com/translate/docs/supported-formats
    response = client.translate_text(
        request={
            "parent": parent,
            "contents": [text],
            "mime_type": "text/plain",  # mime types: text/plain, text/html
            "source_language_code": source_lang, #was "en-US"
            "target_language_code": target_lang, #was "fr"
        }
    )

    return response.translations[0].translated_text

In [82]:
import evaluate
bleu = evaluate.load("bleu")

In [83]:
#create a results for the bleu scores of each file
df_bleu_results = pd.DataFrame(columns=["dataset_name", "bleu_score"])

df_wmt14_reference = pd.read_csv('data/wmt14_french_data.csv', header=None)
df_wmt14_reference.head()
#convert to list
wmt14_reference = df_wmt14_reference[0].tolist()
wmt14_reference[0:5]

['Spectaculaire saut en "wingsuit" au-dessus de Bogota',
 "Le sportif Jhonathan Florez a sauté jeudi d'un hélicoptère au-dessus de Bogota, la capitale colombienne.",
 "Equipé d'un wingsuit (une combinaison munie d'ailes), il est passé à 160 km/h au-dessus du célèbre sanctuaire Monserrate, situé à plus de 3 000 mètres d'altitude, où de nombreux badauds s'étaient rassemblés pour observer son exploit.",
 'Une boîte noire dans votre voiture\xa0?',
 "Alors que les planificateurs du réseau routier des États-Unis ont du mal à trouver l'argent nécessaire pour réparer l'infrastructure autoroutière en décrépitude, nombreux sont ceux qui entrevoient une solution sous forme d'une petite boîte noire qui se fixe au-dessus du tableau de bord de votre voiture."]

In [84]:
#for every csv in the "C:\Users\gerge\OneDrive\Desktop\AImpower\data\injected\wmt14" folder 
#we run it through the google api translation
#we loop through the csvs in the folder
counter = 1
for filename in os.listdir("C:\\Users\\gerge\\OneDrive\\Desktop\\AImpower\\data\\injected\\wmt14"):
    #we open the csv with pandas and convert it to a list
    df = pd.read_csv("C:\\Users\\gerge\\OneDrive\\Desktop\\AImpower\\data\\injected\\wmt14\\" + filename, header=None)
    df = df[0].tolist()
    candidate_corpus = []
    for i in range(len(df)):
        candidate_corpus.append(translate_text(target_lang="fr",text=df[i], project_id="dyslexia-translation"))

    #calculate the bleu score
    bleu_score = bleu.compute(predictions=candidate_corpus, references=wmt14_reference)
    #save the candidate corpus to pickle file
    with open("C:\\Users\\gerge\\OneDrive\\Desktop\\AImpower\\data\\injected\\\google_candidate_corpus\\wmt14\\" + filename[:-4] + ".pkl", "wb") as f:
        pickle.dump(candidate_corpus, f)
    
    #add the bleu score to the results dataframe
    df_bleu_results.loc[len(df_bleu_results)] = {"dataset_name":filename, "bleu_score":bleu_score}
    #print the results and filename
    print(filename + " bleu score: " + str(bleu_score))

    #print the number of files left to process
    print(str(counter) + "/"+ str(len(os.listdir("C:\\Users\\gerge\\OneDrive\\Desktop\\AImpower\\data\\injected\\wmt14"))) + " files")
    counter += 1
    


df_bleu_results.to_csv("data/injected/wmt14_bleu_results.csv", index=False)



wmt14_english_data_injection_swap_p1_0.005.csv bleu score: {'bleu': 0.4558755588054787, 'precisions': [0.7186877163537186, 0.5224061915616768, 0.3974921005437103, 0.3056633510266589], 'brevity_penalty': 0.9864298508784737, 'length_ratio': 0.986521097974284, 'translation_length': 76264, 'reference_length': 77306}
1/30 files
wmt14_english_data_injection_swap_p1_0.005_p2_0.005.csv bleu score: {'bleu': 0.45097839852677496, 'precisions': [0.715205222657735, 0.5175354803493449, 0.39213398409152356, 0.30068672334859387], 'brevity_penalty': 0.986678931815459, 'length_ratio': 0.9867668744987452, 'translation_length': 76283, 'reference_length': 77306}
2/30 files
wmt14_english_data_injection_swap_p1_0.01.csv bleu score: {'bleu': 0.45564322922873857, 'precisions': [0.7185250265542428, 0.5221961341050563, 0.3972784080395143, 0.30547790400285496], 'brevity_penalty': 0.986364293089946, 'length_ratio': 0.9864564199415311, 'translation_length': 76259, 'reference_length': 77306}
3/30 files
wmt14_english

In [89]:
#do exactly the same for wmt16
df_bleu_results_wmt16_google = pd.DataFrame(columns=["dataset_name", "bleu_score"])

df_wmt16_reference = pd.read_csv('data/wmt16_german_data.csv', header=None)
df_wmt16_reference.head()
#convert to list
wmt16_reference = df_wmt16_reference[0].tolist()
wmt16_reference[0:5]


['Obama empfängt Netanyahu',
 'Das Verhältnis zwischen Obama und Netanyahu ist nicht gerade freundschaftlich.',
 'Die beiden wollten über die Umsetzung der internationalen Vereinbarung sowie über Teherans destabilisierende Maßnahmen im Nahen Osten sprechen.',
 'Bei der Begegnung soll es aber auch um den Konflikt mit den Palästinensern und die diskutierte Zwei-Staaten-Lösung gehen.',
 'Das Verhältnis zwischen Obama und Netanyahu ist seit Jahren gespannt.']

In [90]:
#same as above but for wmt16
counter = 1
for filename in os.listdir("C:\\Users\\gerge\\OneDrive\\Desktop\\AImpower\\data\\injected\\wmt16"):
    df = pd.read_csv("C:\\Users\\gerge\\OneDrive\\Desktop\\AImpower\\data\\injected\\wmt16\\" + filename, header=None)
    df = df[0].tolist()
    candidate_corpus = []
    for i in range(len(df)):
        candidate_corpus.append(translate_text(target_lang="de",text=df[i], project_id="dyslexia-translation"))

    bleu_score = bleu.compute(predictions=candidate_corpus, references=wmt16_reference)
    with open("C:\\Users\\gerge\\OneDrive\\Desktop\\AImpower\\data\\injected\\\google_candidate_corpus\\wmt16\\" + filename[:-4] + ".pkl", "wb") as f:
        pickle.dump(candidate_corpus, f)
    df_bleu_results_wmt16_google.loc[len(df_bleu_results_wmt16_google)] = {"dataset_name":filename, "bleu_score":bleu_score}
    print(filename + " bleu score: " + str(bleu_score))
    print(str(counter) + "/"+ str(len(os.listdir("C:\\Users\\gerge\\OneDrive\\Desktop\\AImpower\\data\\injected\\wmt16"))) + " files")
    counter += 1

df_bleu_results_wmt16_google.to_csv("data/injected/wmt16_bleu_results_google.csv", index=False)

wmt16_english_data_injection_swap_p1_0.005.csv bleu score: {'bleu': 0.3957579447636291, 'precisions': [0.6990445598911814, 0.4687004481213375, 0.3366351029606877, 0.24781839849306117], 'brevity_penalty': 0.9733215702731506, 'length_ratio': 0.9736711930938742, 'translation_length': 61019, 'reference_length': 62669}
1/30 files
wmt16_english_data_injection_swap_p1_0.005_p2_0.005.csv bleu score: {'bleu': 0.39019375319766253, 'precisions': [0.6942975490757871, 0.4628099173553719, 0.33062217461556614, 0.24213800253427026], 'brevity_penalty': 0.9743040474194508, 'length_ratio': 0.9746286042540969, 'translation_length': 61079, 'reference_length': 62669}
2/30 files
wmt16_english_data_injection_swap_p1_0.01.csv bleu score: {'bleu': 0.3952790545504284, 'precisions': [0.6987681420568096, 0.46808620600547834, 0.3359795087923267, 0.2470750965362227], 'brevity_penalty': 0.9737638013122918, 'length_ratio': 0.9741020281159743, 'translation_length': 61046, 'reference_length': 62669}
3/30 files
wmt16_eng

In [87]:
#print len of german reference
print(len(wmt16_reference))


2999


In [88]:
#print len of sample candidate corpus
print(len(candidate_corpus))


2999
