In [1]:
from datasets import load_dataset, Audio
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import torch
import librosa
import random
import os

In [None]:
common_voice_test = load_dataset("common_voice", "hu", split="test[1000:1101]")

In [None]:
def speech_file_to_array_fn(batch):
    mp3_path = os.path.join(os.getcwd(), "dataset/cv-corpus-8.0-2022-01-19/hu/clips/", batch["path"])
    speech_array, sampling_rate = librosa.load(mp3_path, sr=16_000)
    batch["speech"] = speech_array
    batch["sentence"] = batch["sentence"].upper()
    return batch

test_dataset = common_voice_test.map(speech_file_to_array_fn)

In [4]:
MODEL_ID_IT = "gchhablani/wav2vec2-large-xlsr-it"
MODEL_ID_FI = "aapot/wav2vec2-xlsr-1b-finnish-v2"
MODEL_ID_CS = "sammy786/wav2vec2-xlsr-czech"

model_ids = [MODEL_ID_IT, MODEL_ID_FI, MODEL_ID_CS]

In [5]:
import gc
from tqdm import tqdm

In [None]:
DEVICE = "cuda"

for model_id in model_ids:
    predicted_sentences = []

    processor = Wav2Vec2Processor.from_pretrained(model_id)
    model = Wav2Vec2ForCTC.from_pretrained(model_id)
    model.to(DEVICE)

    for i in tqdm(range(int(len(test_dataset["speech"]) / 5))):
        batch = test_dataset["speech"][i*5:(i+1)*5]

        inputs = processor(batch, sampling_rate=16000, return_tensors="pt", padding=True)
        inputs.to(DEVICE)

        with torch.no_grad():
            logits = model(inputs.input_values).logits

        predicted_ids = torch.argmax(logits, dim=-1)
        predicted_sentences += processor.batch_decode(predicted_ids)

    gc.collect()
    torch.cuda.empty_cache()

    if model_id == MODEL_ID_IT:
        predicted_sentences_it = predicted_sentences
    if model_id == MODEL_ID_FI:
        predicted_sentences_fi = predicted_sentences
    if model_id == MODEL_ID_CS:
        predicted_sentences_cs = predicted_sentences

In [8]:
it_to_hu_dict = {
                 'sho' : 'so',
                 'ot' : 'olt',

                 "i l" : "ill",
                 "s e" : "sze",
                 "la v" : "levél",
                 "las" : "l s",
                 "t a g" : "te g",
                 " ave" : "neve",
                 "ra s" : "res",
                 "at at" : "etet",
                 "ba na" : "báná",
                 " pal" : "pál",
                 "ida " : "ide ",
                 "nas" : "nás",
                 "na" : "ne",
                 "mar" : "már",
                 "ma" : "me",
                 "lan" : "len",
                 "lal" : "lől",
                 "lad" : "lód",
                 "lat" : "lát",
                 "har" : "hár",
                 "hall" : "holl",
                 "fal" : "fel",
                 "fas" : "fés",
                 "rag" : "rág",
                 "ran" : "ren",
                 "tan " : "tán ",
                 "kat" : "ket",
                 "gat" : "get",
                 "al" : "ál",
                 " co" : " a ko",
                 "nci" : "n ki",
                 "eci" : "e és i",
                 "ece" : "ese",
                 "ice" : "ise",
                 "tad " : "tat ",
                 "edi" : "egyi",
                 "ke" : "ké",
                 "fe" : "fé",
                 "te" : "té",
                 "of" : "og",
                 " is" : " és",
                 " ti" : " tü",
                 "ti" : "té",
                 "mi" : "mű",
                 "ni" : "ny",
                 "di" : "dí",
                 "ria" : "rja",
                 "vi" : "vé",
                 "bí" : "by",
                 "on " : "om",
                 "tan" : "tam",
                 "elv" : "enj v",
                 " mo" : " bo",
                 "ami" : "a vi",
                 " to" : " tú",
                 "lo w" : "lú",
                 " om" : "am",
                 "lo" : "la",
                 "bo" : "ba",
                 "to" : "ta",
                 "no" : "na",
                 "mo" : "ma",
                 "ho" : "ha",
                 "do" : "da",
                 "jo" : "ja",
                 "vo" : "va",
                 "tò" : "ta",
                 "nò" : "na",
                 " pe" : " be",
                 "pt" : "bt",
                 "ars" : "a es",
                 "pr " : "pe",
                 "or" : "ol",
                 "ase" : "az e",
                 "at " : "ad ",
                 "ata" : "ada",
                 "u y" : "ol",
                 "nut" : "nőt",
                 "nu" : "yű ",
                 " u" : " ü",
                 "sun" : "szűn",
                 "ku" : "kö",
                 "tu" : "tó",
                 "yu" : "yó",
                 "hu" : "hó",
                 "svi" : "s i",
                 "ava" : "aba",
                 "two" : "t vo",
                 "twa" : "tóa",
                 "axa" : "a ka",
                 "ey" : "ei",
                 "by" : "bi",
                 'sh ' : 's ',
                 #'e' : 'é',
                 #'ar' : 'á',
                 'ia' : 'e',
                 #' a' : 'e',
                 #'c' : 's',
                 #'p' : 'b',
                 #'n' : 'm',
                 " ge" : " le",
                 "qué" : "ke",
                "x" : "ks",
                "q" : "k",
                " è " : " e",

                "sse" : "s se",
                "ss" : "s",
                "vv" : "v",
                "vige" : "vide",

                "gn" : "ny",
                "zs" : "s",
                "ghi" : "gi",
                "ge" : "dzse",
                "ghe" : "ge",
                "chia" : "tya",
                "cha" : "sa",
                "san " : "sam ",
                "chi" : "ki",
                "che " : "s ",
                "che" : "ke",

                "cr" : "kr",
                "ca" : "ka",
                "co" : "ko",
                "ci" : "cs",
                "sice" : "sk",

                "ce" : "s",
                "cu" : "ku",
                "sz" : "zs",
                " z" : " dz",

                "ra" : "rá",
                " gr" : " l",

                "mo" : "mú",
                "úré" : "úlé",
                "úre" : "úlé",
                "aro" : "ajlo",
                "edi" : "egy",
                "khe" : "ker",
                "sui" : "ső",
                "ck" : "t",
                "ti " : "t ",
                " y" : " j",
                "st" : "zt",
                "sh " : "s ",
                " e" : " é",
                " ov " : " ov",
                " cso " : "cso ",
                " apa" : "apa",

                "sp" : "scap",
                " ká" : " ha",
                " me" : " nem",

                " ke" : " ki",
                "ey" : "é",
                "ai m" : "ány ",
                "eno" : "ána",
                "lz" : "lc",
                "dop" : "dob",
                "é sch" : "és",
                "sel" : "szél",
                "li" : "lé",
                "de " : "d ",
                "viz" : "vis",
                "n'" : "nt",
                "erel" : "erül",
                "ig" : "ég",
                " of " : " ov ",
                "bi" : "bő",
                "be" : "bi",
                " oz" : " az",
                " the" : " de",
                "ria" : "re",
                " is " : " és ",
                "bo" : "ba",
                " ob" : " ab",
                " ka" : " ha",
                "qel" : "or",
                "koni" : "vagy",
                " la" : " ra",
                "raika" : "rajta",

                "egya " : "egya",
                "aan" : "aan ",
                " ker " : " ker",
                "rső " : "rső",
                "ò s " : "òs ",
                "ò" : "a",
                "dri " : "dri",
                " dov" : "do v",
                " v " : " v",
                " ve " : " ve",
                " di " : "di ",
                " dod " : "dod ",
                " an "  : "an",
                "zto " : "zto",
                "sanok" : "san ok",
                "kitn" : "kit n",
                " né " : " né",
                " lon " : "lon ",
                "baba" : " baba ",
                "ugi " : "ugi",
                " rel" : "rel",
                 }

fi_to_hu_dict = {" e " : " egy ",

                 "sii" : "zé",
                 "iis": "íz ",
                 "tää " : "tem",
                 "aan" : "eg",

                 "obro" : "ogo",
                 "lui" : "gy",
                 "sat" : "set",
                 "a m" : "em",
                 "esa" : "ese",
                 "oma" : "omo",
                 "sal" : "sál",
                 " j" : " ly",
                 "ik " : "ig ",
                 "aja" : "agya",
                 "eje" : "egye",
                 "iki" : "iti",
                 "eli" : "edi",
                 "to " : "ta ",
                 "t o" : "t a",
                 "a o" : "a a",
                 #" t" : " d",
                 "kt" : "k v",
                 "otk" : "ogy k",
                 "lys" : "lyő",
                 "un " :  "om ",
                 "aus" : "ajóz",
                 "ele" : "ere",
                 "uji" : "ogyi",
                 "uju" : "olyo",
                 "ä t" : "alt",
                 'shi' : 'si',
                 'vuo' : 'vo',
                 "kon" : "kön",
                 "ja " : "je ",

                 'ää' : 'á',
                 "aa" : "e",
                 "oi" : "a e",
                 'uu' : 'ó ',
                 'ii' : 'é',
                 "oo" : "ó",
                 "ee" : "é",
                 "ää" : "á",
                 "yy" : "ő",
                 "nn" : "nd",

                 'ä ' : 'e',
                 ' s' : ' cs',
                 'e ' : 'egy ',
                 ' m' : ' am',
                 't ' : 'tt ',
                 'sh ' : 's ',
                 'a ' : 'ak ',
                 'e ': 'ett ',
                 'i ' : 'is ',
                 "mi " : "mű ",
                 "io " : "ia ",
                 "ot " : "od",

                 'hi' : 'é',
                 'sa' : 'sza',
                 'lm' : 'lam',
                 'ot' : 'olt',
                 #'j' : 'gy',

                 "äl" : "el",
                 "lä" : "le",
                 "tä" : "te",
                 "jä" : "já",
                 "ät" : "alt",
                 "mä" : "me",
                 "nä" : "ne",
                 "kä" : "ke",
                 "än" : "en",
                 "hä" : "he",
                 "sä" : "se",
                 "dä" : "de",
                 "fä" : "fe",
                 "bä" : "be",
                 "ah" : "oh",
                 "la" : "lá",
                 "ma" : "má",
                 "ra" : "rá",
                 "as" : "es",
                 "al" : "el",
                 "ta" : "tá",
                 "ke" : "kö",
                 "te" : "tá",
                 "me" : "mé",
                 "ve" : "vö",
                 #" d" : "t",
                 "se" : "sé",
                 "hi" : "hí",
                 "il" : "él",
                 "li" : "lé",
                 "ki" : "ké",
                 "ik" : "ék",
                 "ti" : "té",
                 "si" : "sí",
                 "vi" : "ve",
                 "ri" : "ré",
                 "ai" : "aj",
                 "ig" : "ég",
                 "gi" : "gé",
                 "fi" : "fé",
                 "ek" : "eg",
                 "n " : "m",
                 "no" : "na",
                 "ko" : "ka",
                 "mo" : "ma",
                 "bo" : "ba",
                 "ob" : "öb",
                 "ol" : "al",
                 "lo" : "la",
                 "os" : "oz",
                 "as" : "az",
                 "is" : "iz",
                 "ts" : "cs",
                 "nt" : "nd",
                 "ut" : "ud",
                 "tu" : "to",
                 "lu" : "lo",
                 "iu" : "ió",
                 "ku" : "ko",
                 "ru" : "ro",
                 "ly" : "lő",
                 "ky" : "kö",
                 "sy" : "szű",
                 "yi " : "ő",
                 "vä" : "be",
                 "ah" : "á",
                 "ha" : "á",

                 }

cz_to_hu_dict = {"mň" : "meň",
                 "č" : "cs",
                 "ď'" : "gy",
                 "ě" : "é",
                 "ň" : "ny",
                 "š" : "s",
                 "ř" : "cs",
                 "ů" : "ú",
                 "ý" : "i",
                 "ž" : "s",
                 "a c" : "ák",
                 "rla" : "rlá",
                 "ice" : "ike",
                 "aca" : "ajla",
                 "to b" : "több",
                 "vo n" : "van",
                 "avi" : "aki",
                 "gvr" : "gér",
                 " c" : " k",
                 " co" : " so",
                 " js" : " cs",
                 "ic " : "ik ",
                 "d " : "t ",
                 "e " : "a ",
                 "j " : "i ",
                 " j" : "gy",
                 "f " : "k ",
                 " w" : " v",
                 "ky" : "k i",
                 "y " : "ig ",
                 "tw" : "t v",
                 " r" : " f",
                 " s" : " z",
                 "ar" : "ár",
                 "dá" : "da",
                 "ma" : "me",
                 "al" : "el",
                 "la" : "le",
                 "na" : "ne",
                 "pa" : "pá",
                 "sa" : "se",
                 "ka" : "ke",
                 "ta" : "te",
                 "ha" : "he",
                 "ra" : "re",
                 "ga" : "ge",
                 "ic" : "ik",
                 "ch" : "gh",
                 "mc" : "mk",
                 "nc" : "nk",
                 "uc " : "udsz ",
                 "ld" : "lt",
                 "nd" : "nt",
                 "ke" : "cé",
                 "ce" : "cá",
                 "ni" : "né",
                 "ei" : "é",
                 "lh " : "lt ",
                 "vi" : "vé",
                 "il" : "él",
                 "li" : "lé",
                 "ti" : "té",
                 "mi" : "mé",
                 "ei" : "eí",
                 "ji" : "é",
                 "ok" : "ók",
                 "jo" : "ja",
                 "vo" : "va",
                 "ov" : "av",
                 "lo" : "la",
                 "to" : "ta",
                 "bo" : "ba",
                 "do" : "dó",
                 "ho" : "ha",
                 "no" : "na",
                 "go" : "go",
                 "em" : "en",
                 "om" : "on",
                 "lu" : "lő",
                 "du" : "dó",
                 "ru" : "ro",
                 "yu" : "yó",
                 "ny" : "ni",
                 "os" : "oz",
                 "su" : "ső",
                 "tu" : "ú",
                 "ku" : "ko",
                 "nu" : "nő",
                 'o ' : 'a',
                 'a ' : 'e',
                 'l ' : 'lt ',
                 'ď' : 'gy',
                 'ot' : 'olt',
                 'nm' : 'nem',
                 #'c' : 'k',
                 'č' : 'cs',
                 'ň' : 'ny',
                 'se' : 'sze',
                 'š' : 's',
                 #'j' : 'i',
                 #'w' : 'v',
                 }

In [9]:
it_spellcheck = []
fi_spellcheck = []
cz_spellcheck = []

In [10]:
for i, predicted_sentence in enumerate(zip(predicted_sentences_it, predicted_sentences_fi, predicted_sentences_cs)):
    nsent_it = " " + predicted_sentence[0] + " "
    for ind_it, k_it in enumerate(it_to_hu_dict.keys()):
      if k_it in (" " + predicted_sentence[0] + " "): #avoids changes being made from changes
        nsent_it = nsent_it.replace(k_it, it_to_hu_dict[k_it])
        #print(f"Change{ind_it}: {nsent_it} --- {k_it} -> {it_to_hu_dict[k_it]}")

    nsent_fi = " " + predicted_sentence[1] + " "
    for ind_fi, k_fi in enumerate(fi_to_hu_dict.keys()):
      if k_fi in (" " + predicted_sentence[1] + " "):
        nsent_fi = nsent_fi.replace(k_fi, fi_to_hu_dict[k_fi])
        #print(f"Change{ind_fi}: {nsent_fi} --- {k_fi} -> {fi_to_hu_dict[k_fi]}")

    nsent_cz = " " + predicted_sentence[2] + " "
    for ind_cz, k_cz in enumerate(cz_to_hu_dict.keys()):
      if k_cz in (" " + predicted_sentence[2] + " "):
        nsent_cz = nsent_cz.replace(k_cz, cz_to_hu_dict[k_cz])
        #print(f"Change{ind_cz}: {nsent_cz} --- {k_cz} -> {cz_to_hu_dict[k_cz]}")

    it_spellcheck.append(nsent_it)
    fi_spellcheck.append(nsent_fi)
    cz_spellcheck.append(nsent_cz)

    # print("-" * 100)
    # print("Reference:", test_dataset[i]["sentence"].lower())
    # print("Italian Prediction1:", predicted_sentence[0])
    # print("Italian Prediction2:", nsent_it)
    # print("Finnish Prediction1:", predicted_sentence[1])
    # print("Finnish Prediction2:", nsent_fi)
    # print("Czech Prediction1:", predicted_sentence[2])
    # print("Czech Prediction2:", nsent_cz)
    # print("-" * 100)

In [59]:
import numpy as np

predicted_sentences_fi_list = np.empty(len(predicted_sentences_fi)).tolist()
predicted_sentences_it_list = np.empty(len(predicted_sentences_it)).tolist()
predicted_sentences_cs_list = np.empty(len(predicted_sentences_cs)).tolist()

for i in range(len(predicted_sentences_it)):
  predicted_sentences_fi_list[i] = predicted_sentences_fi[i].lower().split()
  predicted_sentences_it_list[i] = predicted_sentences_it[i].lower().split()
  predicted_sentences_cs_list[i] = predicted_sentences_cs[i].lower().split()

In [55]:
from itertools import islice

import pkg_resources
from symspellpy import SymSpell, Verbosity

In [56]:
sym_spell = SymSpell(max_dictionary_edit_distance=5, prefix_length=8, count_threshold=10)
dictionary_path = "lm/hu_50k.txt"
bigram_path = "lm/final_bigrams1.txt"

In [None]:
sym_spell.load_dictionary(dictionary_path, 0, 1)
sym_spell.load_bigram_dictionary(bigram_path, 0, 2, encoding="utf8")

In [69]:
import itertools

lanugage_model_input_fi = []

for sentence in predicted_sentences_fi_list:
  spell_checked_list_fi = [[] for _ in range(len(sentence))]

  for no, word in enumerate(sentence):
    suggestions = sym_spell.lookup(word, Verbosity.CLOSEST, max_edit_distance=3, include_unknown=True)
    for suggestion in suggestions:
      spell_checked_list_fi[no].append(suggestion.term)

  all_combinations_fi = list(itertools.product(*spell_checked_list_fi))

  spell_checked_sentences_fi = []

  for combination in all_combinations_fi:
    spell_checked_sentences_fi.append(' '.join(word for word in combination))

  lanugage_model_input_fi.append(spell_checked_sentences_fi)

In [70]:
lanugage_model_input_it = []

for sentence in predicted_sentences_it_list:
  spell_checked_list_it = [[] for _ in range(len(sentence))]

  for no, word in enumerate(sentence):
    suggestions = sym_spell.lookup(word, Verbosity.CLOSEST, max_edit_distance=3, include_unknown=True)
    for suggestion in suggestions:
      spell_checked_list_it[no].append(suggestion.term)

  all_combinations_it = list(itertools.product(*spell_checked_list_it))

  spell_checked_sentences_it = []

  for combination in all_combinations_it:
    spell_checked_sentences_it.append(' '.join(word for word in combination))

  lanugage_model_input_it.append(spell_checked_sentences_it)

In [76]:
lanugage_model_input_cs = []

In [None]:
for sentence in predicted_sentences_cs_list:
  spell_checked_list_cs = [[] for _ in range(len(sentence))]

  for no, word in enumerate(sentence):
    suggestions = sym_spell.lookup(word, Verbosity.CLOSEST, max_edit_distance=2, include_unknown=True)
    for suggestion in suggestions:
      spell_checked_list_cs[no].append(suggestion.term)

  all_combinations_cs = list(itertools.product(*spell_checked_list_cs))

  spell_checked_sentences_cs = []

  for combination in all_combinations_cs:
    spell_checked_sentences_cs.append(' '.join(word for word in combination))

  lanugage_model_input_cs.append(spell_checked_sentences_cs)

In [64]:
import kenlm

model = kenlm.LanguageModel("lm/hu_5gram_lm.bin")

In [84]:
final_transcript_fi = []

for permuted_sentences in lanugage_model_input_fi:
  scores_fi = []

  for spell_checked_sentence in permuted_sentences:
    scores_fi.append(model.score(spell_checked_sentence))

  best_sentence_index_fi = np.argmax(scores_fi)

  final_transcript_fi.append(permuted_sentences[best_sentence_index_fi])

final_transcript_it = []

for permuted_sentences in lanugage_model_input_it:
  scores_it = []

  for spell_checked_sentence in permuted_sentences:
    scores_it.append(model.score(spell_checked_sentence))

  best_sentence_index_it = np.argmax(scores_it)

  final_transcript_it.append(permuted_sentences[best_sentence_index_it])

final_transcript_cs = []

for permuted_sentences in lanugage_model_input_cs:
  scores_cs = []

  for spell_checked_sentence in permuted_sentences:
    scores_cs.append(model.score(spell_checked_sentence))

  best_sentence_index_cs = np.argmax(scores_cs)

  final_transcript_cs.append(permuted_sentences[best_sentence_index_cs])

In [None]:
# for i, predicted_sentence in enumerate(zip(final_transcript_fi, final_transcript_it, final_transcript_cs)):
#   print("-" * 100)
#   print("Reference:", final_ref_proccesed[i].lower())
#   print("FI Prediction:", predicted_sentence[0].lower())
#   print("IT Prediction:", predicted_sentence[1].lower())
#   print("CS Prediction:", predicted_sentence[2].lower())

# End Try

In [11]:
def sort_dict(dictionary):
    sorted_dict = {k: dictionary[k] for k in
                   sorted(dictionary, key=dictionary.get, reverse=True)}
    return sorted_dict

In [12]:
def damerau_levenshtein_distance(word1, word2):
    distances = {}  # creates array, in dictionary form, in order to calculate d-l distance
    for w1 in range(-1, len(word1) + 1):  # creates first row values
        distances[(w1, -1)] = w1 + 1
    for w2 in range(-1, len(word2) + 1):  # creates first column values
        distances[(-1, w2)] = w2 + 1
    for w1 in range(len(word1)):  # creates values for the rest of the "array"
        for w2 in range(len(word2)):
            if word1[w1] == word2[w2]:
                point = 0
            else:
                point = 1
            a = distances[(w1, w2 - 1)] + 1
            b = distances[(w1 - 1, w2)] + 1
            c = distances[(w1 - 1, w2 - 1)] + point
            distances[(w1, w2)] = min(min(a, b), min(b, c))
            if w1 and w2 and word1[w1] == word2[w2 - 1] and word1[w1 - 1] == word2[w2]:
                d = distances[(w1, w2)]
                e = distances[(w1 - 2, w2 - 2)] + point
                distances[(w1, w2)] = min(d, e)
    return int(distances[(len(word1) - 1, len(word2) - 1)])  # returns final "array" value which gives the d-l distance

In [13]:
def check_in_gram_dict(sorted_potentials, gram_dictionary, final_sentence):
  new_potentials = dict()
  gram_keys = list(gram_dictionary.keys())
  for potential_k in sorted_potentials.keys():
    for gram_key in gram_keys:
      if potential_k == gram_key[-1] and final_sentence[-1] == gram_key[-2]:
        new_potentials[potential_k] = gram_dictionary[gram_key]
  if len(new_potentials) > 0:
    sorted_new_potentials = sort_dict(new_potentials)
    sorted_pot_keys = list(sorted_new_potentials.keys())
    return sorted_pot_keys[0]
  else:
    return False

In [14]:
unigram_dict = {}
with open("lm/hu_50k.txt") as f:
    for line in f:
       (key, val) = line.split()
       unigram_dict[key] = val

In [15]:
bigram_dict = {}
with open("lm/final_bigrams1.txt", encoding="utf8") as f:
    for line in f:
       temp = line.split()
       bigram_dict[temp[0], temp[1]] = temp[2]

In [16]:
def spellcheck(sent):
    nsent = []

    for input_word in sent:
      if input_word in unigram_dict.keys():
          nsent.append(input_word)
      else:
        potential_words = dict()
        for hu_word in unigram_dict.keys():
          dist = damerau_levenshtein_distance(input_word, hu_word)
          if dist < 3: #we can edit this as necessary
            potential_words[hu_word] = unigram_dict[hu_word]

        sorted_potents = sort_dict(potential_words)
        sorted_pot_keys = list(sorted_potents.keys())

        if len(potential_words) > 0:
          if len(nsent) > 1:
            tri_bis = check_in_gram_dict(sorted_potents, bigram_dict, nsent)
            if tri_bis == False:
              nsent.append(sorted_pot_keys[0])
            else:
              nsent.append(tri_bis)
          else:
            nsent.append(sorted_pot_keys[0])
        else:
          nsent.append(input_word)

    return " ".join(nsent)

In [17]:
it = []
fi = []
cs = []

In [19]:
for sentence in it_spellcheck:
    sent_it = sentence.split()
    it.append(spellcheck(sent_it))

for sentence in fi_spellcheck:
    sent_fi = sentence.split()
    fi.append(spellcheck(sent_fi))

for sentence in cz_spellcheck:
    sent_cs = sentence.split()
    cs.append(spellcheck(sent_cs))

In [18]:
ref = [x.lower() for x in test_dataset["sentence"]]

In [20]:
from difflib import SequenceMatcher
import random

In [21]:
from collections import defaultdict

def choice_dict(choices):
  count_dict = defaultdict(int)

  for item in choices:
    if isinstance(item, str):
      count_dict[item] += 1
    else:
      for c in item:
        count_dict[c] += 1

  return count_dict

In [22]:
def make_choice(the_dict):
  sorted_count = sort_dict(the_dict)
  keys = list(sorted_count.keys())
  values = list(sorted_count.values())

  if len(values) > 1:
    for i, v in enumerate(values):
      if i == 0:
        if v == values[i+1]:
          return str(random.choice((keys[0], keys[1]))) #currently returns random choice from the tied keys
        else:
          return keys[0]
  else:
    return keys[0]

In [23]:
def sentence_processing(it_sent, fi_sent, cz_sent=None, reference=None): #takes strings as input, only req 2 langs #USE THIS ONE
  l12_sent = []

  final_output = ' '

  if cz_sent: #when 3 usable sents
    it_fi = SequenceMatcher(None, it_sent, fi_sent)
    fi_cz = SequenceMatcher(None, fi_sent, cz_sent)
    cz_it = SequenceMatcher(None, cz_sent, it_sent)

    it_fi_sent = []
    fi_cz_sent = []
    cz_it_sent = []

    for op, a_start, a_end, b_start, b_end in it_fi.get_opcodes():
      frag_it = {it_sent[a_start:a_end]}
      frag_fi = {fi_sent[b_start:b_end]}

      if op == "equal":
        for item in frag_it:
          it_fi_sent.append(item)
      elif op == "replace":
        for i in frag_it:
          for j in frag_fi:
            it_fi_sent.append((i,j))

    for op, a_start, a_end, b_start, b_end in fi_cz.get_opcodes():
      frag_fi = {fi_sent[a_start:a_end]}
      frag_cz = {cz_sent[b_start:b_end]}

      if op == "equal":
        for item in frag_fi:
          fi_cz_sent += item
      elif op == "replace":
        for i in frag_fi:
          for j in frag_cz:
            fi_cz_sent.append((i,j))

    for op, a_start, a_end, b_start, b_end in cz_it.get_opcodes():
      frag_cz = {cz_sent[a_start:a_end]}
      frag_it = {it_sent[b_start:b_end]}

      if op == "equal":
        for item in frag_cz:
          cz_it_sent += item
      elif op == "replace":
        for i in frag_cz:
          for j in frag_it:
            cz_it_sent.append((i,j))

    for i, item in enumerate(zip(it_fi_sent, fi_cz_sent, cz_it_sent)):
      it_fi_item = item[0]
      fi_cz_item = item[1]
      cz_it_item = item[2]


      if isinstance(it_fi_item, str) and isinstance(
        fi_cz_item, str) and isinstance(cz_it_item, str): #when all agree
          final_output += it_fi_item


      elif isinstance(it_fi_item, tuple) and isinstance(
          fi_cz_item, tuple) and isinstance(cz_it_item, tuple): #when none agree
          it_choice = it_fi_item[0]
          fi_choice = fi_cz_item[0]
          cz_choice = cz_it_item[0]

          c_dict = choice_dict([it_choice, fi_choice, cz_choice])
          final_choice = make_choice(c_dict)

          final_output += final_choice

      elif isinstance(it_fi_item, tuple) and isinstance(fi_cz_item, tuple):
        it_choice = it_fi_item[0]
        fi_choice = fi_cz_item[0]
        cz_choice = cz_it_item

        c_dict = choice_dict([it_choice, fi_choice, cz_choice])
        final_choice = make_choice(c_dict)

        final_output += final_choice

      elif isinstance(fi_cz_item, tuple) and isinstance(cz_it_item, tuple):
        it_choice = it_fi_item
        fi_choice = fi_cz_item[0]
        cz_choice = cz_it_item[0]

        c_dict = choice_dict([it_choice, fi_choice, cz_choice])
        final_choice = make_choice(c_dict)

        final_output += final_choice

      elif isinstance(cz_it_item, tuple) and isinstance(it_fi_item, tuple):
        it_choice = it_fi_item[0]
        fi_choice = fi_cz_item
        cz_choice = cz_it_item[0]

        c_dict = choice_dict([it_choice, fi_choice, cz_choice])
        final_choice = make_choice(c_dict)

        final_output += final_choice

      elif isinstance(cz_it_item, tuple):
        it_choice = it_fi_item
        fi_choice = fi_cz_item
        cz_choice = cz_it_item[0]

        c_dict = choice_dict([it_choice, fi_choice, cz_choice])
        final_choice = make_choice(c_dict)

        final_output += final_choice

      elif isinstance(fi_cz_item, tuple):
        it_choice = it_fi_item
        fi_choice = fi_cz_item[0]
        cz_choice = cz_it_item

        c_dict = choice_dict([it_choice, fi_choice, cz_choice])
        final_choice = make_choice(c_dict)

        final_output += final_choice

      elif isinstance(it_fi_item, tuple):
        it_choice = it_fi_item[0]
        fi_choice = fi_cz_item
        cz_choice = cz_it_item

        c_dict = choice_dict([it_choice, fi_choice, cz_choice])
        final_choice = make_choice(c_dict)

        final_output += final_choice

  else: #only 2 usabe sents
      it_fi = SequenceMatcher(None, it_sent, fi_sent)
      it_fi_sent = []

      for op, a_start, a_end, b_start, b_end in it_fi.get_opcodes():
        frag_it = {it_sent[a_start:a_end]}
        frag_fi = {fi_sent[b_start:b_end]}

        if op == "equal":
          for item in frag_it:
            final_output += item

        elif op == "replace":
          for i in frag_it:
            for j in frag_fi:
              final_output += str(random.choice([i, j]))


  return final_output

In [87]:
all_final = []
final_ref = []

# final_transcript_fi, final_transcript_it, final_transcript_cs

# for item in enumerate(zip(it, fi, cs, ref)): #this takes lists (of output)
for item in enumerate(zip(final_transcript_it, final_transcript_fi, final_transcript_cs, ref)): #this takes lists (of output)
    it_sent = item[1][0]
    fi_sent = item[1][1]
    cz_sent = item[1][2]
    ref_sent = item[1][3]

    if it_sent[0] == " ":
        it_sent = it_sent[1:]
    if it_sent[-1] == " ":
        it_sent = it_sent[:-1]

    if fi_sent[0] == " ":
        fi_sent = fi_sent[1:]
    if fi_sent[-1] == " ":
        fi_sent = fi_sent[:-1]

    if cz_sent[0] == " ":
        cz_sent = cz_sent[1:]
    if cz_sent[-1] == " ":
        cz_sent = cz_sent[:-1]

    a = len(it_sent)
    b = len(fi_sent)
    c = len(cz_sent)

    percent_diff = (max(a, b, c)-min(a, b, c))/max(a, b, c) #between longest and shortest input

    if percent_diff > .5:
        shortest = min(a, b, c)
        if a == shortest:
            final_output = sentence_processing(fi_sent, cz_sent)
            #print("Italian")
        if b == shortest:
            final_output = sentence_processing(it_sent, cz_sent)
            #print("Finnish")
        if c == shortest:
            final_output = sentence_processing(it_sent, fi_sent)
            #print("Czech")

    else:
        final_output = sentence_processing(it_sent, fi_sent, cz_sent, ref_sent)


    final_ref.append(ref_sent)
    all_final.append(final_output)
    print(f"Reference: {ref_sent}")
    print(f"Final: {final_output}")
    print(f"It: {it_sent}")
    print(f"Fi: {fi_sent}")
    print(f"Cz: {cz_sent}")
    print("*"*25)

Reference: könyökhajlásba kapta a maláj nyakát, és a dereka mellé szorította.
Final:  konak kokat ail yscome ally sorpzata
It: fognak hokage rock ott alig north il scederecome ally sorozata
Fi: könjukhoilays bok loptak milan sokat aisha derekomälli sor torta
Cz: konok honlap bo costa hol sokat i saderecomely sor torta
*************************
Reference: de uram teremtőm, hogy megtépázta a zuhogó eső meg a vad szél!
Final:  ha uram turemt htärtag p t ihogy s ogombot sag
It: ha uram teremt hold mik tag part sehogy sh gombot said
Fi: jo uram tärämtööm hit miatti post ahogy ss maga vonni
Cz: da uram tartom hoz mak te part oshogo ashumagowod sem
*************************
Reference: azt te tudod a legjobban, apjuk!
Final:  astan apeuk
It: ho start da nagyon ap uk
Fi: asteetutotäläkiopan apjuk
Cz: o este tu oldala gibson of jut
*************************
Reference: a kerék is jobban forog, ha kenik.
Final:  ak i fogok hakni
It: a ke rak six fogok haladni
Fi: akäriakis jobban fogok hatni
Cz: o

In [42]:
CHARS_TO_IGNORE = [",", "?", "¿", ".", "!", "¡", ";", "；", ":", '""', "%", '"', "�", "ʿ", "·", "჻", "~", "՞",
                   "؟", "،", "।", "॥", "«", "»", "„", "“", "”", "「", "」", "‘", "’", "《", "》", "(", ")", "[", "]",
                   "{", "}", "=", "`", "_", "+", "<", ">", "…", "–", "°", "´", "ʾ", "‹", "›", "©", "®", "—", "→", "。",
                   "、", "﹂", "﹁", "‧", "～", "﹏", "，", "｛", "｝", "（", "）", "［", "］", "【", "】", "‥", "〽",
                   "『", "』", "〝", "〟", "⟨", "⟩", "〜", "：", "！", "？", "♪", "؛", "/", "\\", "º", "−", "^", "ʻ", "ˆ"]

In [43]:
import re

chars_to_ignore_regex = f"[{re.escape(''.join(CHARS_TO_IGNORE))}]"

In [37]:
def extract_text(batch):
  text = batch["text"]
  batch["text"] = re.sub(chars_to_ignore_regex, "", text.lower())
  return batch

In [93]:
final_ref_proccesed = []

for sent in final_ref:
    final_ref_proccesed.append(re.sub(chars_to_ignore_regex, "", sent))

In [25]:
from datasets import load_metric

In [26]:
wer = load_metric("wer")
cer = load_metric("cer")

In [None]:
final_transcript_fi, final_transcript_it, final_transcript_cs

In [94]:
wer_score = 100 * wer.compute(predictions=all_final, references=final_ref_proccesed)
cer_score = 100 * cer.compute(predictions=all_final, references=final_ref_proccesed)

In [None]:
print(f"WER: {wer_score:.2f}%")
print(f"CER: {cer_score:.2f}%")

In [None]:
wer_score = 100 * wer.compute(predictions=final_transcript_it, references=final_ref_proccesed)
cer_score = 100 * cer.compute(predictions=final_transcript_it, references=final_ref_proccesed)

print(f"WER IT: {wer_score}%")
print(f"CER IT: {cer_score:.2f}%")

In [None]:
wer_score = 100 * wer.compute(predictions=predicted_sentences_fi[:40], references=final_ref_proccesed)
cer_score = 100 * cer.compute(predictions=predicted_sentences_fi[:40], references=final_ref_proccesed)

print(f"WER FI: {wer_score:.2f}%")
print(f"CER FI: {cer_score:.2f}%")

In [None]:
wer_score = 100 * wer.compute(predictions=final_transcript_cs, references=final_ref_proccesed)
cer_score = 100 * cer.compute(predictions=final_transcript_cs, references=final_ref_proccesed)

print(f"WER CS: {wer_score}%")
print(f"CER CS: {cer_score:.2f}%")