# Imports & setup

### Imports - general

In [1]:
import os
import re

import pandas as pd
import warnings
warnings.filterwarnings('ignore')

from sklearn.metrics import f1_score

In [2]:
from create_Tatoeba_train_test import get_language_mappings, get_sentence_word_char_len

### Imports & setup - langid

In [3]:
from langid.langid import LanguageIdentifier, model

In [4]:
# Build the langid classifier from the model string imported from langid.
# norm_probs=True is passed so the second element of each classify() result
# (stored below as `langid_score`) is a normalized probability
# (presumably in [0, 1] -- see the langid documentation).
lang_identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)

### Imports & setup - langdetect

In [5]:
from langdetect import DetectorFactory, detect_langs
# Pin langdetect's seed. Per the langdetect documentation its detection is
# otherwise non-deterministic, so a fixed seed keeps predictions reproducible
# between runs of this notebook.
DetectorFactory.seed = 7

In [6]:
def get_langdetect_prediction(this_string: str):
    """
    Run langdetect on a sentence and get the results.

    Args:
    this_string - str; the sentence to run through langdetect.

    Returns:
    result - the first entry of langdetect's candidate list for the sentence
             (a langdetect.language.Language); None when the sentence is
             empty; the string "Error" when langdetect failed on the sentence.

    Raises:
    AssertionError - if this_string is not a str.
    """
    assert isinstance(this_string, str), "this_string is not of type str."

    # Nothing to detect on an empty sentence.
    if not this_string:
        return None

    try:
        return detect_langs(this_string)[0]
    except Exception:
        # Best-effort: a sentence langdetect cannot handle must not abort a
        # multi-hour run, so it is reported and flagged with a sentinel.
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate, so the run can still be stopped by hand.
        print(f"Error processing sentence: {this_string}")
        return "Error"

def get_langdetect_lang(this_result):
    """
    Get the language-component from the langdetect results.

    Args:
    this_result - langdetect.language.Language; the langdetect output. None,
                  the "Error" sentinel, or any object without a string `lang`
                  attribute is also accepted.

    Returns:
    result - str; identifier for the predicted language with any "-region"
             suffix removed (e.g. "zh-cn" -> "zh"), or the placeholder "!!"
             when no language can be read from this_result.
    """
    no_language = "!!"

    if not this_result:
        return no_language

    # getattr with a default only absorbs the "attribute is missing" case
    # (e.g. the "Error" sentinel string) instead of hiding every exception.
    lang = getattr(this_result, "lang", None)
    if not isinstance(lang, str):
        return no_language

    return lang.split("-")[0]  # mainly for zh-cn

def get_langdetect_prob(this_result):
    """
    Get the probability-component from the langdetect results.

    Args:
    this_result - langdetect.language.Language; the langdetect output. None,
                  the "Error" sentinel, or any object without a `prob`
                  attribute is also accepted.

    Returns:
    result - float; the probability for the predicted language, or 0.0 when
             this_result is empty or carries no `prob` attribute.
    """
    if not this_result:
        return 0.0

    # getattr with a default only absorbs the "attribute is missing" case
    # (e.g. the "Error" sentinel string) instead of hiding every exception.
    return getattr(this_result, "prob", 0.0)

In [7]:
language_mappings = get_language_mappings("language_mappings.csv")

# Lookups keyed by the "ISO 639-2" column: English name and ISO 639-1 code.
iso_639_2_English = language_mappings.set_index("ISO 639-2")["English Name"].to_dict()
iso_639_2_iso_639_1 = language_mappings.set_index("ISO 639-2")["ISO 639-1"].to_dict()

# The same "ISO 639-2" -> English name lookup under a second name.
ISO_639_2 = language_mappings.set_index("ISO 639-2")["English Name"].to_dict()

# English name keyed by the "ISO 639-1" column.
ISO_639_1 = language_mappings.set_index("ISO 639-1")["English Name"].to_dict()

In [8]:
# Sentence-length bin labels: one label per length from 1 to 10, followed by
# four wider ranges.
bin_labels = [str(length) for length in range(1, 11)]
bin_labels += ["11 to 16", "17 to 27", "28 to 48", "49 to 99"]

results = []  # accumulates (bin, language, source, F1) tuples
counter = 0   # number of sample files processed so far

# Get predictions & calculate F1 scores

In [9]:
# %%time
# 11min for 2 samples
# overnight run (8+ hours) for 100 samples


def _add_predictions(sample: pd.DataFrame) -> None:
    """
    Add the sentence-length bin and the langid / langdetect predictions to a sample.

    Args:
    sample - pd.DataFrame; one training sample. The "Language" and "Sentence"
             columns are read; the new columns are added in place.
    """
    sample["Language_English_Name"] = sample["Language"].map(iso_639_2_English)
    sample["Sentence_len"] = sample.apply(get_sentence_word_char_len, axis=1)
    sample["Sentence_len_bin"] = pd.cut(
        x=sample["Sentence_len"],
        bins=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 16, 27, 48, 99],
        labels=bin_labels,
    )

    ### LANGID ###
    ##############

    # classify() yields a (language, score) pair per sentence. The pair is
    # split with explicit positional access: unpacking the `.str` accessor
    # itself relies on iterating it, which pandas 2.x rejects with TypeError.
    langid_predictions = sample["Sentence"].map(lang_identifier.classify)
    sample["langid_language"] = langid_predictions.str[0]
    sample["langid_score"] = langid_predictions.str[1]
    sample["langid_language_English_Name"] = sample["langid_language"].map(ISO_639_1)

    ### LANGDETECT ###
    ##################

    langdetect_predictions = sample["Sentence"].map(get_langdetect_prediction)
    sample["langdetect_language"] = langdetect_predictions.map(get_langdetect_lang)
    sample["langdetect_score"] = langdetect_predictions.map(get_langdetect_prob)
    sample["langdetect_language_English_Name"] = sample["langdetect_language"].map(ISO_639_1)


def _f1_by_bin_and_language(sample: pd.DataFrame) -> list:
    """
    Score both predictors for every (length bin, language) pair of a sample.

    Args:
    sample - pd.DataFrame; a sample already processed by _add_predictions.

    Returns:
    scores - list of (bin label, language, predictor name, F1 rounded to 2
             decimals) tuples, in bin_labels order.
    """
    scores = []
    for this_bin in bin_labels:
        bin_results = sample[sample["Sentence_len_bin"] == this_bin]
        for lang in bin_results["Language_English_Name"].unique().tolist():
            these_results = bin_results[bin_results["Language_English_Name"] == lang]
            these_labels = these_results["Language_English_Name"].unique().tolist()
            # NOTE(review): each call only sees rows whose true language is
            # `lang`, so no false positives for `lang` can occur within the
            # subset -- confirm this is the intended way to compute the F1.
            for prediction in ("langid", "langdetect"):
                this_f1 = f1_score(
                    these_results["Language_English_Name"],
                    these_results[f"{prediction}_language_English_Name"],
                    labels=these_labels,
                    average="weighted",
                )
                scores.append((this_bin, lang, prediction, round(this_f1, 2)))
    return scores


def _score_training_file(path: str) -> list:
    """
    Read one sample CSV, add the predictions and return its F1 tuples.

    The DataFrame stays local to this function, so it is released as soon as
    the file has been scored and no manual `del` of loop variables is needed.

    Args:
    path - str; path of the sample CSV to read.

    Returns:
    scores - list of tuples as produced by _f1_by_bin_and_language.
    """
    sample = pd.read_csv(path)
    _add_predictions(sample)
    return _f1_by_bin_and_language(sample)


# Only the files directly inside "output/" are considered: the first item
# yielded by os.walk describes that directory itself. The default covers a
# missing directory, for which os.walk yields nothing.
output_files = next(os.walk("output/"), (None, [], []))[2]

# Sorted so files are numbered and processed in a reproducible order.
for train_file in sorted(name for name in output_files if "train" in name):
    counter += 1
    print(f"Processing - file:{counter} - {train_file}")
    results.extend(_score_training_file(f"output/{train_file}"))
    print(f"Finished - file:{counter} - len(results):{len(results)}\n")

Processing - file:1 - Tatoeba_stratify_train_2021-8-3_2122.csv
Finished - file:1 - len(results):336

Processing - file:2 - Tatoeba_stratify_train_2021-8-3_2123.csv
Finished - file:2 - len(results):672

Processing - file:3 - Tatoeba_stratify_train_2021-8-3_2124.csv
Error processing sentence: 咦！？
Finished - file:3 - len(results):1008

Processing - file:4 - Tatoeba_stratify_train_2021-8-3_2125.csv
Error processing sentence: 啥？
Finished - file:4 - len(results):1344

Processing - file:5 - Tatoeba_stratify_train_2021-8-3_2126.csv
Finished - file:5 - len(results):1680

Processing - file:6 - Tatoeba_stratify_train_2021-8-3_2127.csv
Error processing sentence: 嗯..
Finished - file:6 - len(results):2016

Processing - file:7 - Tatoeba_stratify_train_2021-8-3_2128.csv
Finished - file:7 - len(results):2352

Processing - file:8 - Tatoeba_stratify_train_2021-8-3_2129.csv
Finished - file:8 - len(results):2688

Processing - file:9 - Tatoeba_stratify_train_2021-8-3_2130.csv
Finished - file:9 - len(results

In [11]:
# Turn the accumulated (bin, language, source, F1) tuples into one table.
results = pd.DataFrame(
    results,
    columns=["Bin", "Language", "Source", "F1"],
)

In [24]:
from datetime import datetime

In [25]:
now = datetime.now()
# Hour and minute are zero-padded so the time part is always four digits:
# without padding 1:23 and 12:03 would both render as "123", which is
# ambiguous and could overwrite an earlier results file. The date part keeps
# its unpadded year-month-day form.
datetime_stamp = f"{now.year}-{now.month}-{now.day}_{now.hour:02d}{now.minute:02d}"

In [28]:
# Write the F1 table into the output directory; the row index is left out.
results.to_csv(path_or_buf=f"output/results_df_{datetime_stamp}.csv", index=False)

# End