# Experiment Logs
All results are reported with the following settings:
- Model = sentence-transformers/LaBSE
- Only the F1 score is reported

## Baseline Datasets

### Nusax

- Command ```python bitext.py --src_lang eng --dataset nusax --seed 42 --cuda --model_checkpoint sentence-transformers/LaBSE ```
- Note that the source language code is 'eng', not 'en'

In [2]:
import os
import json
import pandas as pd

# Location of the NusaX evaluation files and the grid of results to collect
directory_path = "outputs/save_bitext/nusax/sentence-transformers/LaBSE/seed_42"
language_pairs = ['ace', 'ban', 'bbc', 'bjn', 'bug', 'ind', 'jav', 'mad', 'min', 'nij', 'sun']
k_values = ['1', '5', '10']

# One column of F1 scores per k; each column is filled in language_pairs order
f1_scores = {k: [] for k in k_values}

# Result files are named eval_eng_<lang>_<k>.json; only the "f1" entry is kept
for lang in language_pairs:
    for k in k_values:
        file_path = os.path.join(directory_path, f"eval_eng_{lang}_{k}.json")
        with open(file_path, "r") as handle:
            result = json.load(handle)
        f1_scores[k].append(result["f1"])

# Rows are languages, columns are k; the last row is the per-k mean over languages
df = pd.DataFrame(f1_scores, index=language_pairs)
df.loc['Average'] = df.mean()

print(df)

                1         5        10
ace      0.708867  0.808400  0.842333
ban      0.828267  0.919000  0.926333
bbc      0.574229  0.695465  0.733300
bjn      0.893000  0.952333  0.963000
bug      0.440918  0.558769  0.625303
ind      0.976000  0.986667  0.986667
jav      0.976000  0.981333  0.984000
mad      0.593802  0.723538  0.801700
min      0.897133  0.950333  0.958333
nij      0.711605  0.793238  0.843400
sun      0.968333  0.979000  0.979000
Average  0.778923  0.849825  0.876670


### BUCC

- The BUCC dataset in MINERS has its source language inverted (English is the target side), so three commands are required, one per source language:
    - ```python bitext.py --src_lang de --dataset bucc --seed 42 --cuda --model_checkpoint sentence-transformers/LaBSE ```
    - ```python bitext.py --src_lang fr --dataset bucc --seed 42 --cuda --model_checkpoint sentence-transformers/LaBSE ```
    - ```python bitext.py --src_lang zh --dataset bucc --seed 42 --cuda --model_checkpoint sentence-transformers/LaBSE ```

In [3]:
# Location of the BUCC evaluation files and the grid of results to collect
directory_path = "outputs/save_bitext/bucc/sentence-transformers/LaBSE/seed_42"
language_pairs = ['de','fr','zh']
k_values = ['1', '5', '10']

# One column of F1 scores per k; each column is filled in language_pairs order
f1_scores = {k: [] for k in k_values}

# Result files are named eval_<lang>_en_<k>.json; only the "f1" entry is kept
for lang in language_pairs:
    for k in k_values:
        file_path = os.path.join(directory_path, f"eval_{lang}_en_{k}.json")
        with open(file_path, "r") as handle:
            result = json.load(handle)
        f1_scores[k].append(result["f1"])

# Rows are languages, columns are k; the last row is the per-k mean over languages
df = pd.DataFrame(f1_scores, index=language_pairs)
df.loc['Average'] = df.mean()

print(df)

                1         5        10
de       0.994294  0.997495  0.998330
fr       0.989489  0.995304  0.995598
zh       0.992277  0.994383  0.994383
Average  0.992020  0.995727  0.996103


### NollySenti

- Command ```python bitext.py --src_lang en --dataset nollysenti --seed 42 --cuda --model_checkpoint sentence-transformers/LaBSE```

In [6]:
# Location of the NollySenti evaluation files and the grid of results to collect
directory_path = "outputs/save_bitext/nollysenti/sentence-transformers/LaBSE/seed_42"
language_pairs = ['en400', 'ha', 'ig', 'pcm', 'yo400']
k_values = ['1', '5', '10']

# One column of F1 scores per k; each column is filled in language_pairs order
f1_scores = {k: [] for k in k_values}

# Result files are named eval_en_<lang>_<k>.json; only the "f1" entry is kept
for lang in language_pairs:
    for k in k_values:
        file_path = os.path.join(directory_path, f"eval_en_{lang}_{k}.json")
        with open(file_path, "r") as handle:
            result = json.load(handle)
        f1_scores[k].append(result["f1"])

# Rows are languages, columns are k; the last row is the per-k mean over languages
df = pd.DataFrame(f1_scores, index=language_pairs)
df.loc['Average'] = df.mean()

print(df)

                1         5        10
en400    0.227169  0.229447  0.232640
ha       0.227474  0.229752  0.229752
ig       0.208863  0.228777  0.232717
pcm      0.211215  0.218597  0.219969
yo400    0.146419  0.193499  0.204091
Average  0.204228  0.220015  0.223834


In [3]:
import os
import json
import pandas as pd
# Location of the NollySenti evaluation files and the grid of results to collect
directory_path = "outputs/save_bitext/nollysenti/sentence-transformers/LaBSE/seed_42"
language_pairs = ['en400', 'ha', 'ig', 'pcm', 'yo400']
k_values = ['1', '5', '10']

# One column of F1 scores per k; each column is filled in language_pairs order
f1_scores = {k: [] for k in k_values}

# Result files are named eval_en_<lang>_<k>.json; only the "f1" entry is kept
for lang in language_pairs:
    for k in k_values:
        file_path = os.path.join(directory_path, f"eval_en_{lang}_{k}.json")
        with open(file_path, "r") as handle:
            result = json.load(handle)
        f1_scores[k].append(result["f1"])

# Rows are languages, columns are k; the last row is the per-k mean over languages
df = pd.DataFrame(f1_scores, index=language_pairs)
df.loc['Average'] = df.mean()

print(df)

                1         5        10
en400    0.283684  0.286529  0.291980
ha       0.284254  0.287099  0.287099
ig       0.261704  0.287028  0.291948
pcm      0.265010  0.274640  0.277166
yo400    0.185283  0.244512  0.258241
Average  0.255987  0.275962  0.281287


## Added Dataset

### Open Subtitles

- Command ```python bitext.py --src_lang en --dataset opensub --seed 42 --cuda --model_checkpoint sentence-transformers/LaBSE ```
- Credit: [loicmagne-HuggingFace](https://huggingface.co/datasets/loicmagne/open-subtitles-bitext-mining)

In [5]:
# Location of the OpenSubtitles evaluation files and the grid of results to collect
directory_path = "outputs/save_bitext/opensub/sentence-transformers/LaBSE/seed_42"
language_pairs = ['af']
k_values = ['1', '5', '10']

# One column of F1 scores per k; each column is filled in language_pairs order
f1_scores = {k: [] for k in k_values}

# Result files are named eval_<lang>_en_<k>.json; only the "f1" entry is kept
for lang in language_pairs:
    for k in k_values:
        file_path = os.path.join(directory_path, f"eval_{lang}_en_{k}.json")
        with open(file_path, "r") as handle:
            result = json.load(handle)
        f1_scores[k].append(result["f1"])

# Rows are languages, columns are k; the last row is the per-k mean over languages
df = pd.DataFrame(f1_scores, index=language_pairs)
df.loc['Average'] = df.mean()

print(df)

                1         5        10
af       0.787919  0.863133  0.882233
Average  0.787919  0.863133  0.882233
