In [1]:
import pandas as pd
import os

In [2]:
import re
import pandas as pd
import copy
import numpy as np
import evaluate
from docx import Document
class DataLoader:
    """
    Loader for benchmarking datasets to ensure universal formatting. To be used in conjunction with DyslexiaInjector.

    Every sentence is normalised with ``fix_format`` when the instance is built.

    Parameters
    ----------
    path: str, optional
        Path to a txt, csv or docx file holding the data (extension matched
        case-insensitively). A csv file should contain a single column; only
        column 0 is read. Ignored when ``data`` is given.
    data: list, optional
        A list of strings. Takes precedence over ``path``.
    dataset_name: str
        Name of the dataset, used in the messages printed when saving.

    Attributes
    ----------
    data: list
        A list of formatted strings
    dataset_name: str
        Name of the dataset that is used when saving the data

    Methods
    -------
    save_as_txt(path):
        Saves data as a text file to specified path
    save_as_csv(path):
        Saves data as a csv file to specified path
    save_as_docx(path):
        Saves data as a docx file to specified path (one paragraph per sentence)
    get_data():
        Returns data
    create_deepcopy():
        Returns a deep copy of the class instance
    get_name():
        returns name of the dataset (dataset_name)
    get_number_of_sentences(), get_number_of_words(), get_number_of_letters():
        Simple size statistics of the data
    get_bleue_score(reference), get_wer(reference), get_bleurt_score(reference):
        Scores the data against a reference using the `evaluate` package

    Usage
    -------
    >>> from datasets import load_dataset
    >>> from DataLoader import DataLoader
    >>> dataset_wmt_enfr = load_dataset("wmt14",'fr-en', split='test')
    >>> to_translate = []
    >>> for i in range(len(dataset_wmt_enfr)):
    >>>     to_translate.append(dataset_wmt_enfr[i]['translation']['en'])
    >>> loader = DataLoader(data=to_translate, dataset_name="wmt14_enfr")
    >>> loader.save_as_txt("wmt14_enfr.txt")
    We can also use the text file to create a new DataLoader instance
    >>> loader2 = DataLoader(path="wmt14_enfr.txt", dataset_name="wmt14_enfr")
    """
    # Constructor
    def __init__(self, path=None, data=None, dataset_name=""):
        """
        Build the loader from an in-memory list or from a file.

        Raises
        ------
        TypeError
            If ``data`` is given but is not a list.
        ValueError
            If the file extension is not txt/csv/docx, or if neither ``path``
            nor ``data`` is given.
        """
        self.dataset_name = dataset_name
        if data is not None:
            if not isinstance(data, list):
                raise TypeError("Invalid data type, please pass in a list of sentences")
            #format each sentence in data
            self.data = [self.fix_format(sentence) for sentence in data]
        elif path is not None:
            #the text after the last dot decides the parser; lower-cased so "X.TXT" works too
            file_type = path.rsplit(".", 1)[-1].lower()
            if file_type == "txt":
                #parse_txt already formats every line once. fix_format is not idempotent
                #(e.g. nested quotes lose one layer per pass), so the second pass is kept
                #on purpose to leave the result for existing txt files unchanged.
                self.data = [self.fix_format(sentence) for sentence in self.parse_txt(path)]
            elif file_type == "csv":
                #dtype=str / keep_default_na=False: keep cells such as "2014" or "NA" as the
                #literal text instead of letting pandas turn them into numbers / NaN,
                #which fix_format cannot process
                frame = pd.read_csv(path, header=None, dtype=str, keep_default_na=False)
                self.data = [self.fix_format(sentence) for sentence in frame[0].tolist()]
            elif file_type == "docx":
                doc = Document(path)
                self.data = [self.fix_format(paragraph.text) for paragraph in doc.paragraphs]
            else:
                raise ValueError("Invalid file type")
        else:
            raise ValueError("Please pass in a path or data")

    def parse_txt(self, path):
        """Read a UTF-8 text file and return one formatted sentence per line."""
        with open(path, "r", encoding="utf-8") as f:
            return [self.fix_format(line) for line in f]

    def fix_format(self, sentence):
        """
        Normalise spacing and quotation marks of a single sentence.

        Empty or whitespace-only input yields an empty string.
        """
        #remove spacing before punctuation
        sentence = re.sub(r'\s([?.!,"](?:\s|$))', r'\1', sentence)
        #replace any run of whitespace with single space
        sentence = re.sub(r'\s+', ' ', sentence)
        #remove any leading or trailing spaces
        sentence = sentence.strip()
        #make all quotes (german and french) english double quotes
        sentence = re.sub(r'«|»|„|“', '"', sentence)
        #make all single quotes english single quotes
        sentence = re.sub(r'‘|’', "'", sentence)
        #make all french guillemets english double quotes
        sentence = re.sub(r'‹|›', '"', sentence)
        #if sentence begins and ends with quotes and there are only two, remove them.
        #startswith/endswith (instead of indexing) keep this safe for empty strings.
        for quote in ('"', "'"):
            if sentence.startswith(quote) and sentence.endswith(quote) and sentence.count(quote) == 2:
                #at most one layer of wrapping quotes is removed per call
                return sentence[1:-1]
        return sentence

    def save_as_txt(self, path):
        """Write the data to ``path`` as UTF-8 text, one sentence per line."""
        with open(path, "w", encoding="utf-8") as f:
            for sentence in self.data:
                f.write(f"{sentence}\n")
        print(f"Saved {self.dataset_name} to {path}")
        return

    def save_as_csv(self, path):
        """Write the data to ``path`` as a single-column csv without header or index."""
        df = pd.DataFrame(self.data)
        df.to_csv(path, index=False, header=False, encoding='utf-8')
        print(f"Saved {self.dataset_name} to {path}")
        return

    def save_as_docx(self, path):
        """Write the data to ``path`` as a docx document, one paragraph per sentence."""
        document = Document()
        for sentence in self.data:
            document.add_paragraph(sentence)
        document.save(path)
        print(f"Saved {self.dataset_name} to {path}")
        return

    def get_data(self):
        """Return the list of formatted sentences (the live list, not a copy)."""
        return self.data

    def create_deepcopy(self):
        """
        Return an independent copy of this instance.

        The instance is copied directly rather than rebuilt through the
        constructor: the constructor would run fix_format again, and because
        fix_format is not idempotent the "copy" could differ from the source.
        """
        return copy.deepcopy(self)

    def get_name(self):
        """Return the dataset name."""
        return self.dataset_name

    def get_number_of_sentences(self):
        """Return the number of entries in the data."""
        return len(self.data)

    def get_number_of_words(self):
        """Return the total number of whitespace-separated tokens over all sentences."""
        return sum(len(sentence.split()) for sentence in self.data)

    def get_number_of_letters(self):
        """
        Return the number of characters left after removing punctuation.

        Everything that is neither a word character nor whitespace is dropped
        before counting, so whitespace, digits and underscores are included.
        """
        #NOTE(review): the name suggests letters only, but spaces are counted too -
        #confirm whether downstream figures rely on this before changing it
        return sum(len(re.sub(r'[^\w\s]','',sentence)) for sentence in self.data)

    def _compute_metric(self, metric_name, reference):
        """Load ``metric_name`` via `evaluate` and score the data against ``reference``."""
        if isinstance(reference, DataLoader):
            references = reference.get_data()
        elif isinstance(reference, list):
            references = reference
        else:
            raise TypeError("Invalid reference type, please pass in a list or DataLoader instance")
        #validated first so an invalid reference does not trigger a metric load
        metric = evaluate.load(metric_name)
        return metric.compute(predictions=self.data, references=references)

    def get_bleue_score(self, reference):
        """
        Return the bleu score of the data against a reference.

        ``reference`` is a list or a DataLoader instance; anything else raises TypeError.
        """
        return self._compute_metric("bleu", reference)

    #correctly spelled alias; the original name is kept for existing callers
    get_bleu_score = get_bleue_score

    def get_wer(self, reference):
        """
        Return the wer score of the data against a reference.

        ``reference`` is a list or a DataLoader instance; anything else raises TypeError.
        """
        return self._compute_metric("wer", reference)

    def get_bleurt_score(self, reference):
        """
        Return the bleurt score of the data against a reference.

        ``reference`` is a list or a DataLoader instance; anything else raises TypeError.
        """
        return self._compute_metric("bleurt", reference)
            

In [3]:
from datasets import load_dataset
dataset_wmt_enfr = load_dataset("wmt14",'fr-en', split='test')

# Pull the translation pair of every test row once, then split it per language.
translation_pairs = [
    dataset_wmt_enfr[row]['translation'] for row in range(len(dataset_wmt_enfr))
]
to_translate_wmt14_en = [pair['en'] for pair in translation_pairs]
reference_wmt14_fr = [pair['fr'] for pair in translation_pairs]

# Wrap both sides in DataLoader so they receive the same formatting as the system outputs.
reference_corpus_fr = DataLoader(data=reference_wmt14_fr, dataset_name="wmt14_fr")
reference_corpus_en = DataLoader(data=to_translate_wmt14_en, dataset_name="wmt14_en")

Resolving data files:   0%|          | 0/30 [00:00<?, ?it/s]

In [4]:
def load_service_runs(version, service):
    """
    Load every translation file of one service/version into DataLoader objects.

    Reads the directory ``to_test/<version>/<service>`` and returns one
    DataLoader per file, named ``<service>_<filename>``.

    The listing is sorted because os.listdir returns entries in arbitrary
    order, while later cells select runs by position ([0], [1], [2]); sorting
    makes those positions the same on every machine and every run. Hidden
    entries (names starting with "."), e.g. ".ipynb_checkpoints", are skipped
    since DataLoader rejects them as an invalid file type.
    """
    directory = f"to_test/{version}/{service}"
    runs = []
    for filename in sorted(os.listdir(directory)):
        if filename.startswith("."):
            continue
        runs.append(
            DataLoader(path=f"{directory}/{filename}", dataset_name=f"{service}_{filename}")
        )
    return runs


# NOTE(review): positions used further down now follow alphabetical file order -
# confirm they still point at the intended files.
aws_data_v1 = load_service_runs("v1", "aws")
aws_data_v2 = load_service_runs("v2", "aws")

azure_data_v1 = load_service_runs("v1", "azure")
azure_data_v2 = load_service_runs("v2", "azure")

google_data_v1 = load_service_runs("v1", "google")
google_data_v2 = load_service_runs("v2", "google")

gpt_data_v1 = load_service_runs("v1", "gpt")
gpt_data_v2 = load_service_runs("v2", "gpt")


In [19]:
import spacy
# Load the spaCy pipeline 'fr_core_news_lg'; `nlp` is called on sentences in later
# cells to build the documents whose `.similarity` is compared against the reference.
nlp = spacy.load('fr_core_news_lg')

In [20]:
# The first loaded v1 file of each service is used as that service's baseline.
baseline_aws, baseline_azure, baseline_google, baseline_gpt = (
    service_runs[0]
    for service_runs in (aws_data_v1, azure_data_v1, google_data_v1, gpt_data_v1)
)

In [21]:
# Show the first sentence of every baseline next to the reference translation.
first_sentence_sources = (
    ("Baseline AWS", baseline_aws),
    ("Baseline Azure", baseline_azure),
    ("Baseline Google", baseline_google),
    ("Baseline GPT", baseline_gpt),
    ("Reference", reference_corpus_fr),
)
for label, corpus in first_sentence_sources:
    print(f"{label} Sentence 1: {corpus.data[0]}")

Baseline AWS Sentence 1: Saut spectaculaire en wingsuit au-dessus de Bogota
Baseline Azure Sentence 1: Saut spectaculaire en wingsuit au-dessus de Bogota
Baseline Google Sentence 1: Saut spectaculaire en wingsuit au-dessus de Bogota
Baseline GPT Sentence 1: Saut en wingsuit spectaculaire au-dessus de Bogota
Reference Sentence 1: Spectaculaire saut en "wingsuit" au-dessus de Bogota


In [25]:
# Per-sentence similarity of each service's baseline translation to the reference.
baseline_runs = {
    "aws": aws_data_v1[0],
    "azure": azure_data_v1[0],
    "google": google_data_v1[0],
    "gpt": gpt_data_v1[0],
}
baseline_scores = {service: [] for service in baseline_runs}

# The AWS run defines how many sentences are compared.
for sentence_idx in range(len(aws_data_v1[0].data)):
    # Parse the reference once per sentence and reuse it for all four services.
    reference_doc = nlp(reference_corpus_fr.data[sentence_idx])
    for service, run in baseline_runs.items():
        candidate_doc = nlp(run.data[sentence_idx])
        baseline_scores[service].append(candidate_doc.similarity(reference_doc))

aws_similarity = baseline_scores["aws"]
azure_similarity = baseline_scores["azure"]
google_similarity = baseline_scores["google"]
gpt_similarity = baseline_scores["gpt"]


In [26]:
import numpy as np

In [27]:
# Convert the per-sentence scores to arrays and report the mean per service.
aws_similarity, azure_similarity, google_similarity, gpt_similarity = (
    np.array(scores)
    for scores in (aws_similarity, azure_similarity, google_similarity, gpt_similarity)
)

baseline_means = (
    ("AWS", aws_similarity),
    ("Azure", azure_similarity),
    ("Google", google_similarity),
    ("GPT", gpt_similarity),
)
for service, scores in baseline_means:
    print(f"Average similarity score for {service}: {scores.mean()}")


Average similarity score for AWS: 0.9228869437498419
Average similarity score for Azure: 0.9237606656338498
Average similarity score for Google: 0.9255936066565403
Average similarity score for GPT: 0.9163111094771934


In [28]:
# Position of the least similar sentence for each service (reused by later cells).
aws_lowest = aws_similarity.argmin()
azure_lowest = azure_similarity.argmin()
google_lowest = google_similarity.argmin()
gpt_lowest = gpt_similarity.argmin()

lowest_report = (
    ("AWS", aws_similarity, aws_data_v1[0], aws_lowest),
    ("Azure", azure_similarity, azure_data_v1[0], azure_lowest),
    ("Google", google_similarity, google_data_v1[0], google_lowest),
    ("GPT", gpt_similarity, gpt_data_v1[0], gpt_lowest),
)
for position, (service, scores, run, lowest) in enumerate(lowest_report):
    # Separator between services, not before the first one.
    if position:
        print('--------------------------------------')
    print(f"Lowest similarity score for {service}: {scores[lowest]}")
    print(f"The sentence is: {run.data[lowest]}")
    print(f"The reference is: {reference_corpus_fr.data[lowest]}")




Lowest similarity score for AWS: 0.11297058356217592
The sentence is: Les comédies musicales, les grandes performances
The reference is: La comédie musicale, cette bête de scène
--------------------------------------
Lowest similarity score for Azure: 0.11297058356217592
The sentence is: Les comédies musicales, les grandes performances
The reference is: La comédie musicale, cette bête de scène
--------------------------------------
Lowest similarity score for Google: 0.15041544678238367
The sentence is: Les comédies musicales, les grands spectacles
The reference is: La comédie musicale, cette bête de scène
--------------------------------------
Lowest similarity score for GPT: 0.2139059927568614
The sentence is: Style documentaire
The reference is: Un air de documentaire


In [29]:
# How GPT does on the sentence where the AWS baseline scored lowest.
gpt_comparison_lines = [
    f"GPT similarity where other services fail: {gpt_similarity[aws_lowest]}",
    f"GPT Sentence: {gpt_data_v1[0].data[aws_lowest]}",
    f"Reference Sentence: {reference_corpus_fr.data[aws_lowest]}",
]
print("\n".join(gpt_comparison_lines))


GPT similarity where other services fail: 0.21757888566692005
GPT Sentence: Comédies musicales, les grandes performances
Reference Sentence: La comédie musicale, cette bête de scène


In [30]:
# Pick the v2 run used for the word-confusion comparison of each service.
# NOTE(review): GPT is taken from position 2 while the other services use position 1 -
# presumably the GPT directory holds an extra file; confirm these are matching runs.
confusing_word_35_aws, confusing_word_35_azure, confusing_word_35_google = (
    service_runs[1] for service_runs in (aws_data_v2, azure_data_v2, google_data_v2)
)
confusing_word_35_gpt = gpt_data_v2[2]

In [31]:
# Per-sentence similarity of each service's word-confusion run to the reference.
wordconfusion_35_runs = {
    "aws": confusing_word_35_aws,
    "azure": confusing_word_35_azure,
    "google": confusing_word_35_google,
    "gpt": confusing_word_35_gpt,
}
wordconfusion_35_scores = {service: [] for service in wordconfusion_35_runs}

# The AWS run defines how many sentences are compared.
for sentence_idx in range(len(confusing_word_35_aws.data)):
    # Parse the reference once per sentence and reuse it for all four services.
    reference_doc = nlp(reference_corpus_fr.data[sentence_idx])
    for service, run in wordconfusion_35_runs.items():
        candidate_doc = nlp(run.data[sentence_idx])
        wordconfusion_35_scores[service].append(candidate_doc.similarity(reference_doc))

aws_wordconfusion_35_similarity = wordconfusion_35_scores["aws"]
azure_wordconfusion_35_similarity = wordconfusion_35_scores["azure"]
google_wordconfusion_35_similarity = wordconfusion_35_scores["google"]
gpt_wordconfusion_35_similarity = wordconfusion_35_scores["gpt"]



In [32]:
# Convert the per-sentence scores to arrays and report the mean per service.
(
    aws_wordconfusion_35_similarity,
    azure_wordconfusion_35_similarity,
    google_wordconfusion_35_similarity,
    gpt_wordconfusion_35_similarity,
) = (
    np.array(scores)
    for scores in (
        aws_wordconfusion_35_similarity,
        azure_wordconfusion_35_similarity,
        google_wordconfusion_35_similarity,
        gpt_wordconfusion_35_similarity,
    )
)

wordconfusion_35_means = (
    ("AWS", aws_wordconfusion_35_similarity),
    ("Azure", azure_wordconfusion_35_similarity),
    ("Google", google_wordconfusion_35_similarity),
    ("GPT", gpt_wordconfusion_35_similarity),
)
for service, scores in wordconfusion_35_means:
    print(f"Average similarity score for {service}: {scores.mean()}")


Average similarity score for AWS: 0.8934521507860191
Average similarity score for Azure: 0.8795947019955627
Average similarity score for Google: 0.8868173562561964
Average similarity score for GPT: 0.8977187407796485


In [33]:
# Position of the least similar sentence for each service (reused by later cells).
aws_wordconfusion_35_lowest = aws_wordconfusion_35_similarity.argmin()
azure_wordconfusion_35_lowest = azure_wordconfusion_35_similarity.argmin()
google_wordconfusion_35_lowest = google_wordconfusion_35_similarity.argmin()
gpt_wordconfusion_35_lowest = gpt_wordconfusion_35_similarity.argmin()

wordconfusion_35_report = (
    ("AWS", aws_wordconfusion_35_similarity, confusing_word_35_aws, aws_wordconfusion_35_lowest),
    ("Azure", azure_wordconfusion_35_similarity, confusing_word_35_azure, azure_wordconfusion_35_lowest),
    ("Google", google_wordconfusion_35_similarity, confusing_word_35_google, google_wordconfusion_35_lowest),
    ("GPT", gpt_wordconfusion_35_similarity, confusing_word_35_gpt, gpt_wordconfusion_35_lowest),
)
for position, (service, scores, run, lowest) in enumerate(wordconfusion_35_report):
    # Separator between services, not before the first one.
    if position:
        print('--------------------------------------')
    print(f"Lowest similarity score for {service}: {scores[lowest]}")
    print(f"The sentence is: {run.data[lowest]}")
    print(f"The reference is: {reference_corpus_fr.data[lowest]}")



Lowest similarity score for AWS: 0.11297058356217592
The sentence is: Les comédies musicales, les grandes performances
The reference is: La comédie musicale, cette bête de scène
--------------------------------------
Lowest similarity score for Azure: 0.11297058356217592
The sentence is: Les comédies musicales, les grandes performances
The reference is: La comédie musicale, cette bête de scène
--------------------------------------
Lowest similarity score for Google: 0.15041544678238367
The sentence is: Les comédies musicales, les grands spectacles
The reference is: La comédie musicale, cette bête de scène
--------------------------------------
Lowest similarity score for GPT: -0.08243895696964308
The sentence is: I'm sorry, but the sentence you provided is not grammatically correct in English. Could you please rephrase it?
The reference is: Ils se sont présentés envers et contre tous, mais les deux ont raflé la coupe en or et la rosette.


In [39]:
# Show the worst-scoring GPT sentences (lowest similarity first).
N_WORST = 5  # how many of the lowest-scoring sentences to display
gpt_wordconfusion_35_worst = gpt_wordconfusion_35_similarity.argsort()[:N_WORST]

# Iterate over the selected positions themselves rather than range(5): the slice
# holds fewer than N_WORST entries when the run has fewer sentences, and a fixed
# range would then index past its end.
for worst_idx in gpt_wordconfusion_35_worst:
    print(f"Lowest similarity score for GPT: {gpt_wordconfusion_35_similarity[worst_idx]}")
    print(f"The sentence is: {confusing_word_35_gpt.data[worst_idx]}")
    print(f"The reference is: {reference_corpus_fr.data[worst_idx]}")
    print('--------------------------------------')


Lowest similarity score for GPT: -0.08243895696964308
The sentence is: I'm sorry, but the sentence you provided is not grammatically correct in English. Could you please rephrase it?
The reference is: Ils se sont présentés envers et contre tous, mais les deux ont raflé la coupe en or et la rosette.
--------------------------------------
Lowest similarity score for GPT: 0.16735218107989086
The sentence is: Il y avait une fontaine vendredi après-midi.
The reference is: Ils ont été localisés ce vendredi-après-midi.
--------------------------------------
Lowest similarity score for GPT: 0.2109677262970048
The sentence is: Les liens du bateau étaient toujours indiscrets.
The reference is: Mais ce n'était jamais indiscret.
--------------------------------------
Lowest similarity score for GPT: 0.2139059927568614
The sentence is: Style documentaire
The reference is: Un air de documentaire
--------------------------------------
Lowest similarity score for GPT: 0.21757888566692005
The sentence 

In [34]:
# How GPT does on the sentences where each of the other services scored lowest.
gpt_wordconfusion_run = gpt_data_v2[2]
other_service_lowest = (
    ("aws", aws_wordconfusion_35_lowest),
    ("azure", azure_wordconfusion_35_lowest),
    ("google", google_wordconfusion_35_lowest),
)
for position, (service, lowest) in enumerate(other_service_lowest):
    # Separator between services, not before the first one.
    if position:
        print('----------------------------------')
    print(f"GPT similarity where {service} fails: {gpt_wordconfusion_35_similarity[lowest]}")
    print(f"GPT Sentence: {gpt_wordconfusion_run.data[lowest]}")
    print(f"Reference Sentence: {reference_corpus_fr.data[lowest]}")



GPT similarity where aws fails: 0.21757888566692005
GPT Sentence: Comédies musicales, les grandes performances
Reference Sentence: La comédie musicale, cette bête de scène
----------------------------------
GPT similarity where azure fails: 0.21757888566692005
GPT Sentence: Comédies musicales, les grandes performances
Reference Sentence: La comédie musicale, cette bête de scène
----------------------------------
GPT similarity where google fails: 0.21757888566692005
GPT Sentence: Comédies musicales, les grandes performances
Reference Sentence: La comédie musicale, cette bête de scène


In [35]:
# Collect the positions of all sentences in gpt_data_v2[2] that contain "I'm sorry".
sorry_sentences = [
    position
    for position, sentence in enumerate(gpt_data_v2[2].data)
    if "I'm sorry" in sentence
]

print(f"Number of sentences containing I'm sorry: {len(sorry_sentences)}")

Number of sentences containing I'm sorry: 1


In [37]:
# Dump every sentence of gpt_data_v2[2], one per line.
for sentence in gpt_data_v2[2].data:
    print(sentence)


Saut en wingsuit spectaculaire au-dessus de Bogota
Le sportif Jhonathan Florez a sauté d'un hélicoptère au-dessus de Bogota, la capitale de la Colombie, jeudi.
Portant une combinaison ailée pour les yeux, il a survolé en parachute au-dessus du célèbre sanctuaire de Monserrate à une vitesse de 160 km/h. Le sanctuaire est situé à une altitude de plus de 3000 mètres et de nombreux spectateurs s'étaient rassemblés pour regarder son exploit.
Boîte noire des yeux dans votre voiture?
Alors que les planificateurs américains des transports luttent pour trouver les fonds nécessaires pour entretenir un réseau routier en déclin, beaucoup commencent à voir une solution dans une petite boîte noire qui s'installe discrètement sur le tableau de bord de votre voiture.
Les dispositifs, qui suivent chaque conducteur masculin et transmettent ces informations aux bureaucrates, sont au centre d'une tentative controversée dans les bureaux de planification de Washington et de l'État pour réformer le système o