In [21]:
import ast

import gensim
import nltk
import numpy as np
import pandas as pd
from gensim.models import KeyedVectors
from huggingface_hub import hf_hub_download
from jiwer import wer
from nltk.tokenize import word_tokenize
from scipy.spatial.distance import cosine

In [22]:
def clean_translation(text):
    """Extract the translated sentence from a stringified pipeline result.

    The expected input looks like ``[{'translation_text': '...'}]``, i.e. the
    ``str()`` of a one-element list holding a dict.

    Args:
        text: Cell value to clean. Non-string values (e.g. NaN, None) are
            returned untouched.

    Returns:
        The bare translation text when ``text`` is a string, otherwise
        ``text`` unchanged.
    """
    if not isinstance(text, str):
        return text

    candidate = text.strip()
    if candidate.startswith("[{") and candidate.endswith("}]"):
        # Parse the value as a Python literal instead of stripping fixed
        # substrings: Python switches to double quotes when the text contains
        # an apostrophe, and escapes characters inside the literal, neither of
        # which a plain substring replacement can undo.
        try:
            parsed = ast.literal_eval(candidate)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            parsed = None
        if (
            isinstance(parsed, list)
            and len(parsed) == 1
            and isinstance(parsed[0], dict)
            and isinstance(parsed[0].get('translation_text'), str)
        ):
            return parsed[0]['translation_text']

    # Fallback for partially wrapped or unparseable strings: same substring
    # stripping as before.
    return text.replace("[{'translation_text': '", "").replace("'}]", "")

# clean_translation is applied to the machine-translation columns once the CSV files are loaded below.


In [23]:
def get_wer(series1, series2):
    """Compute one word error rate per row pair.

    Args:
        series1: Series whose values are passed as the first argument to
            ``wer`` (the callers in this notebook pass the human translations).
        series2: Series whose values are passed as the second argument to
            ``wer`` (the callers pass a machine translation column).

    Returns:
        list with one ``wer`` result per pair, in row order. Pairs are formed
        positionally with ``zip``, so the result is as long as the shorter
        series.
    """
    references = series1.tolist()
    hypotheses = series2.tolist()

    scores = []
    for reference, hypothesis in zip(references, hypotheses):
        scores.append(wer(reference, hypothesis))
    return scores

In [24]:
# Human reference: first 50 rows of the 'human_translation' column.
df = pd.read_csv('translations.csv')[:50]['human_translation']

# Machine outputs are read with header=None, i.e. the first line is data.
opus = pd.read_csv('translations/opus_translations.csv', header=None)
t5 = pd.read_csv('translations/t5_translations.csv', header=None)

# Put the three sources side by side (aligned on the row index) and name the columns.
header = ['human_translation', 'opus_translation', 't5_translation']
translations = pd.concat([df, opus, t5], axis=1)
translations.columns = header

# Strip the "[{'translation_text': ...}]" wrapper from both machine columns.
for column in ('opus_translation', 't5_translation'):
    translations[column] = translations[column].apply(clean_translation)

translations

Unnamed: 0,human_translation,opus_translation,t5_translation
0,Un tornado es una columna giratoria de aire de...,Un tornado es una columna giratoria de aire de...,Un tornado es una columna de aire muy bajo de ...
1,"El ex Presidente estadounidense de la Cámara, ...","El ex presidente estadounidense de la casa, Ne...","El antiguo orador estadounidense de la casa, N..."
2,La isla fue habitada inicialmente vez por los ...,La isla fue habitada por primera vez por los T...,La isla fue habitada por los trópicos y los Ca...
3,Estes impulsos nerviosos pueden ser enviados t...,Este nervio se puede enviar tan rápidamente po...,Este músculo se puede enviar tan rápidamente a...
4,En el 24 de septiembre de 1759 Arthur Guinness...,El 24 de septiembre de 1759 Arthur Guinness fi...,"El 24 de septiembre de 1759, Arthur Guinness f..."
5,"Hoy en día, Tombuctú es una ciudad empobrecida...","Hoy en día, Tombuctú es una ciudad empobrecida...","Hoy en día, Timbuktu es una ciudad desfavoreci..."
6,"con la misma zona horaria que Hawái, las islas...","con la misma zona horaria que Hawái, la isla o...","con la misma zona horaria que Hawaii, la isla ..."
7,La compañía eléctrica Hokuriku informó que no ...,Su tortuoso parkour eléctrico informó que no h...,Su parkour eléctrico crookie no informó de que...
8,Masa debe estar fuera por lo menos el resto de...,Masa debe estar fuera por lo menos el resto de...,Masa se espera que se mantenga al menos por el...
9,Pittman sugirió que las condiciones no mejorar...,Pittman sugirió que las condiciones no mejorar...,Pittman sugirió que las condiciones no mejorar...


In [25]:
# OPUS: mean word error rate, human translation vs. OPUS output
human_translation, opus_translation = (
    translations['human_translation'],
    translations['opus_translation'],
)
wer_scores = get_wer(human_translation, opus_translation)
mean_wer = sum(wer_scores) / len(wer_scores)
mean_wer

0.15333000759579127

In [26]:
# T5: mean word error rate, human translation vs. T5 output
human_translation, t5_translation = (
    translations['human_translation'],
    translations['t5_translation'],
)
wer_scores = get_wer(human_translation, t5_translation)
mean_wer = sum(wer_scores) / len(wer_scores)
mean_wer

0.4547647596981374

In [27]:
# Fetch the word-vector text file from the Hugging Face Hub and load it as KeyedVectors.
spanish_model = KeyedVectors.load_word2vec_format(
    hf_hub_download(
        repo_id="Word2vec/wikipedia2vec_eswiki_20180420_300d",
        filename="eswiki_20180420_300d.txt",
    )
)

# Tokenizer data used by word_tokenize in the cells below.
# NOTE(review): newer NLTK releases may look for 'punkt_tab' instead of 'punkt' —
# confirm against the installed version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/averyfield/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [28]:
# Drop rows where either translation is missing in a single step so the two
# lists stay aligned row for row. Dropping NaNs per column would shift every
# later pair when only one side is missing, and the distances below pair the
# lists positionally.
opus_pairs = translations[['human_translation', 'opus_translation']].dropna()
human_text = opus_pairs['human_translation'].tolist()
opus_text = opus_pairs['opus_translation'].tolist()

# Tokenize and preprocess
def preprocess_text(text, language="spanish"):
    """Lowercase ``text`` and split it into word tokens.

    Args:
        text: Sentence to tokenize.
        language: Language name forwarded to ``word_tokenize``. Defaults to
            "spanish" because the translations scored in this notebook are
            Spanish, whereas ``word_tokenize`` itself defaults to "english".

    Returns:
        The token list produced by ``word_tokenize``.
    """
    return word_tokenize(text.lower(), language=language)

# One token list per sentence, in the same order as the input lists.
human_tokenized = list(map(preprocess_text, human_text))
opus_tokenized = list(map(preprocess_text, opus_text))

# Function to get embeddings for a sentence by averaging word embeddings
def get_sentence_embedding(sentence, model):
    """Average the vectors of the tokens that ``model`` knows.

    Args:
        sentence: Iterable of tokens.
        model: Object supporting ``token in model``, ``model[token]`` and a
            ``vector_size`` attribute.

    Returns:
        Element-wise mean of the vectors of all in-vocabulary tokens, or a
        zero vector of length ``model.vector_size`` when no token is in the
        vocabulary.
    """
    known_vectors = []
    for token in sentence:
        if token in model:
            known_vectors.append(model[token])

    if not known_vectors:
        return np.zeros(model.vector_size)
    return np.mean(known_vectors, axis=0)

# Get sentence embeddings for human and opus translated text
human_embeddings = [get_sentence_embedding(sentence, spanish_model) for sentence in human_tokenized]
opus_embeddings = [get_sentence_embedding(sentence, spanish_model) for sentence in opus_tokenized]

# Cosine distance between corresponding human and opus translation embeddings.
# get_sentence_embedding falls back to an all-zero vector when no token is in
# the vocabulary; cosine distance is undefined for a zero vector, so such pairs
# are recorded as None rather than passed to cosine().
opus_cosine_distances = []
for human_emb, opus_emb in zip(human_embeddings, opus_embeddings):
    usable = all(emb is not None and np.any(emb) for emb in (human_emb, opus_emb))
    opus_cosine_distances.append(cosine(human_emb, opus_emb) if usable else None)

# Average only over the pairs that produced a distance; report NaN instead of
# raising when there are none.
valid_distances = [dist for dist in opus_cosine_distances if dist is not None]
mean_cd = sum(valid_distances) / len(valid_distances) if valid_distances else float("nan")
print(f"Mean Cosine Distance: {mean_cd}")

Mean Cosine Distance: 0.010252138527626903


In [29]:
# Drop rows where either translation is missing in a single step so the two
# lists stay aligned row for row. Dropping NaNs per column would shift every
# later pair when only one side is missing, and the distances below pair the
# lists positionally.
t5_pairs = translations[['human_translation', 't5_translation']].dropna()
human_text = t5_pairs['human_translation'].tolist()
t5_text = t5_pairs['t5_translation'].tolist()

# Tokenize and preprocess
def preprocess_text(text, language="spanish"):
    """Lowercase ``text`` and split it into word tokens.

    NOTE(review): this redefines the function of the same name from the
    previous cell; the duplicate definition is redundant.

    Args:
        text: Sentence to tokenize.
        language: Language name forwarded to ``word_tokenize``. Defaults to
            "spanish" because the translations scored in this notebook are
            Spanish, whereas ``word_tokenize`` itself defaults to "english".

    Returns:
        The token list produced by ``word_tokenize``.
    """
    return word_tokenize(text.lower(), language=language)

# One token list per sentence, in the same order as the input lists.
human_tokenized = list(map(preprocess_text, human_text))
t5_tokenized = list(map(preprocess_text, t5_text))

# Function to get embeddings for a sentence by averaging word embeddings
def get_sentence_embedding(sentence, model):
    """Average the vectors of the tokens that ``model`` knows.

    NOTE(review): this redefines the function of the same name from the
    previous cell; the duplicate definition is redundant.

    Args:
        sentence: Iterable of tokens.
        model: Object supporting ``token in model``, ``model[token]`` and a
            ``vector_size`` attribute.

    Returns:
        Element-wise mean of the vectors of all in-vocabulary tokens, or a
        zero vector of length ``model.vector_size`` when no token is in the
        vocabulary.
    """
    known_vectors = []
    for token in sentence:
        if token in model:
            known_vectors.append(model[token])

    if not known_vectors:
        return np.zeros(model.vector_size)
    return np.mean(known_vectors, axis=0)

# Get sentence embeddings for human and t5 translated text
human_embeddings = [get_sentence_embedding(sentence, spanish_model) for sentence in human_tokenized]
t5_embeddings = [get_sentence_embedding(sentence, spanish_model) for sentence in t5_tokenized]

# Cosine distance between corresponding human and t5 translation embeddings.
# get_sentence_embedding falls back to an all-zero vector when no token is in
# the vocabulary; cosine distance is undefined for a zero vector, so such pairs
# are recorded as None rather than passed to cosine().
t5_cosine_distances = []
for human_emb, t5_emb in zip(human_embeddings, t5_embeddings):
    usable = all(emb is not None and np.any(emb) for emb in (human_emb, t5_emb))
    t5_cosine_distances.append(cosine(human_emb, t5_emb) if usable else None)

# Average only over the pairs that produced a distance; report NaN instead of
# raising when there are none.
valid_distances = [dist for dist in t5_cosine_distances if dist is not None]
mean_cd = sum(valid_distances) / len(valid_distances) if valid_distances else float("nan")
print(f"Mean Cosine Distance: {mean_cd}")

Mean Cosine Distance: 0.0363625209250241
