# Basic API

In [None]:
!pip install langdetect

In [None]:
!pip install datasets bert_score

In [None]:
# Basic API
import json
import os
from langdetect import detect
from datasets import load_metric

# BERTScore metric object shared by the retrieve_mismatched functions below.
# NOTE(review): `load_metric` is reported as deprecated in newer `datasets`
# releases in favour of the separate `evaluate` package — confirm against the
# installed version before upgrading.
bertscore = load_metric("bertscore")

def load_doc(doc):
    """Load and return the parsed JSON content of a file.

    Args:
        doc: Path of the JSON file to read.

    Returns:
        The object produced by ``json.load`` for the file's content.
    """
    # The context manager closes the handle even when parsing fails; the
    # previous version never closed the file. JSON text is UTF-8, so the
    # encoding is pinned instead of depending on the platform default.
    with open(doc, encoding="utf-8") as f:
        return json.load(f)

def load_dir(path, gdrive_path, dir):
    """Load several JSON files and merge their entries into one flat list.

    Args:
        path: Iterable of file names to load.
        gdrive_path: Leading part of every file path.
        dir: Middle part of every file path, placed between ``gdrive_path``
            and the file name.

    Returns:
        A single list with the entries of all files, in iteration order.
    """
    loaded_docs = []
    for file_name in path:
        print(file_name)
        # The path is built by plain concatenation, so the two prefix parts
        # must already carry their trailing separators.
        loaded_docs.append(load_doc(gdrive_path + dir + file_name))

    merged = []
    for entries in loaded_docs:
        merged.extend(entries)
    return merged

def extract_arrs(entries):
    """Split a list of entry dicts into five parallel lists.

    Args:
        entries: Sequence of dicts that each provide the keys
            ``english_concept``, ``english_example``, ``french_concept``,
            ``french_example`` and ``french_concept_tag``.

    Returns:
        A tuple ``(en_concepts, fr_concepts, fr_tags, en_examples,
        fr_examples)`` of lists, each ordered like ``entries``.

    Raises:
        KeyError: If an entry lacks one of the keys listed above.
    """
    # Keys are read in this order for every entry.
    fields = (
        "english_concept",
        "english_example",
        "french_concept",
        "french_example",
        "french_concept_tag",
    )
    columns = {field: [] for field in fields}
    for entry in entries:
        for field in fields:
            columns[field].append(entry[field])

    # The return order differs from the read order: concepts and tags first,
    # examples last.
    return (
        columns["english_concept"],
        columns["french_concept"],
        columns["french_concept_tag"],
        columns["english_example"],
        columns["french_example"],
    )

def lang_switcher(en_examples, fr_examples, detect_fn=None):
    """Put every example pair into (English, French) order.

    Both members of each pair are run through a language detector:

    * the pair is kept as is when the first member is detected as ``"en"``
      or the second as ``"fr"``;
    * otherwise it is swapped when the first member is detected as ``"fr"``
      or the second as ``"en"``;
    * otherwise neither language was recognised: the pair is kept as is and
      additionally recorded in the miscellaneous list.

    Args:
        en_examples: Examples expected to be English.
        fr_examples: Examples expected to be French, parallel to
            ``en_examples``. Pairing stops at the shorter of the two.
        detect_fn: Optional callable mapping a text to a language code.
            Defaults to the module-level ``detect``.

    Returns:
        A tuple ``(new_examples, miscellaneous_examples)``. Both are lists of
        dicts with the keys ``id`` (index of the pair), ``english_example``
        and ``french_example``.
    """
    # NOTE(review): langdetect's detect presumably raises on text without
    # usable features (e.g. an empty string) — confirm and decide whether such
    # pairs should be routed to the miscellaneous list instead.
    if detect_fn is None:
        detect_fn = detect

    new_examples = []
    miscellaneous_examples = []
    for i, (example_en, example_fr) in enumerate(zip(en_examples, fr_examples)):
        lang_en = detect_fn(example_en)
        lang_fr = detect_fn(example_fr)

        if lang_en == "en" or lang_fr == "fr":
            english, french = example_en, example_fr
        elif lang_en == "fr" or lang_fr == "en":
            # The two columns were swapped in the input.
            english, french = example_fr, example_en
        else:
            english, french = example_en, example_fr
            # Recorded as a plain dict. A stray trailing comma used to wrap
            # each of these records in a 1-tuple.
            miscellaneous_examples.append({
                "id": i,
                "english_example": example_en,
                "french_example": example_fr,
            })

        new_examples.append({
            "id": i,
            "english_example": english,
            "french_example": french,
        })

    return new_examples, miscellaneous_examples

def save_corrected(en_concepts, fr_concepts, french_concept_tags, new_examples, output_dir=None):
    """Assemble the corrected dataset entries and write them to a JSON file.

    The four inputs are combined position by position; combination stops at
    the shortest input. The file is named ``corrected_lang_<count>.json``,
    where ``<count>`` is the number of assembled entries.

    Args:
        en_concepts: English concepts.
        fr_concepts: French concepts.
        french_concept_tags: Tags of the French concepts.
        new_examples: Dicts providing ``id``, ``english_example`` and
            ``french_example`` (missing keys yield ``None``).
        output_dir: Directory that receives the file. When omitted, the
            ``corrected_2/`` folder under the global ``gdrive_path`` is used.

    Returns:
        The list of assembled entry dicts that was written to disk.
    """
    if output_dir is None:
        # Notebook default. gdrive_path is only looked up when no directory
        # is passed, so the function no longer requires that global.
        output_dir = gdrive_path + "corrected_2/"

    dataset_objets = [
        {
            "id": en_fr_example.get("id"),
            "english_concept": english_concept,
            "french_concept": french_concept,
            "french_concept_tag": french_concept_tag,
            "english_example": en_fr_example.get("english_example"),
            "french_example": en_fr_example.get("french_example"),
        }
        for english_concept, french_concept, french_concept_tag, en_fr_example
        in zip(en_concepts, fr_concepts, french_concept_tags, new_examples)
    ]

    file_name = 'corrected_lang_' + str(len(dataset_objets)) + '.json'
    # os.path.join adds no separator when output_dir already ends with one,
    # so the default location resolves to the same path as before.
    with open(os.path.join(output_dir, file_name), 'w') as outfile:
        json.dump(dataset_objets, outfile)

    return dataset_objets


def extract_faulty(doc):
    """Separate entries whose example starts with the "not a translation" notice.

    An entry is faulty when its ``english_example`` or its ``french_example``
    begins with the notice text; every other entry is valid.

    Args:
        doc: Iterable of entry dicts providing both example keys.

    Returns:
        A tuple ``(faulty_entries, valid_entries)``, each keeping the input
        order.
    """
    notice = "ⓘCette phrase n'est pas une traduction de la phrase originale. "
    faulty_entries, valid_entries = [], []
    for entry in doc:
        # The French example is only inspected when the English one is clean.
        flagged = (
            entry.get("english_example").startswith(notice)
            or entry.get("french_example").startswith(notice)
        )
        bucket = faulty_entries if flagged else valid_entries
        bucket.append(entry)

    return faulty_entries, valid_entries

def retrieve_mismatched(doc, threshold=0.85):
    """Split entries by the BERTScore F1 between their two examples.

    Each entry's ``french_example`` is scored against its ``english_example``
    with the module-level ``bertscore`` metric.

    Args:
        doc: Iterable of entry dicts providing ``french_example`` and
            ``english_example``.
        threshold: F1 value below which an entry counts as mismatched. The
            score is on a 0-1 scale; the former hard-coded ``85`` therefore
            classified every entry as mismatched.

    Returns:
        A tuple ``(mismatched_entries, matched_entries)``.
    """
    # NOTE(review): a later cell redefines this function with a 0.70 cut-off;
    # confirm which threshold is the intended one.
    # NOTE(review): the value passed as `lang` is a model name rather than a
    # language code; presumably `model_type` was meant — confirm against the
    # metric's documentation.
    matched_entries = []
    mismatched_entries = []
    for entry in doc:
        results = bertscore.compute(predictions=[entry.get("french_example")], references=[entry.get("english_example")], lang="bert-base-multilingual-cased")
        # One prediction is scored per call, so the F1 list has one element.
        f1 = results.get("f1")[0]
        if f1 < threshold:
            mismatched_entries.append(entry)
        else:
            matched_entries.append(entry)
    return mismatched_entries, matched_entries

In [None]:
# Mount Google Drive into the Colab runtime
from google.colab import drive

drive.mount('/content/gdrive')
# Base directory of the project data; the other cells build their file paths
# by concatenating onto it, hence the trailing slash.
gdrive_path = "/content/gdrive/MyDrive/GEMFR/"

Mounted at /content/gdrive


In [None]:
# Remove the automatically created .ipynb_checkpoints directories: they are not JSON files and must not show up in the directory listings below
!rm -rf `find -type d -name .ipynb_checkpoints`

# Make sure the removal was effective
# Every directory is addressed through the absolute gdrive_path, like the
# load_dir calls below, instead of a path relative to the working directory.
print(os.listdir(gdrive_path))
print(os.listdir(gdrive_path + 'concepts'))
print(os.listdir(gdrive_path + 'filtered'))
print(os.listdir(gdrive_path + 'scraped'))

# Get and extract dirs content
concepts_path_content = os.listdir(gdrive_path + 'concepts')
en_concepts_path_content = os.listdir(gdrive_path + 'en_concepts')
filtered_path_content = os.listdir(gdrive_path + 'filtered')
scraped_path_content = os.listdir(gdrive_path + 'scraped')
corrected_path_content = os.listdir(gdrive_path + 'corrected')

# Load and flatten the entries of every file in each directory
concepts_doc = load_dir(concepts_path_content, gdrive_path, "concepts/")
en_concepts_doc = load_dir(en_concepts_path_content, gdrive_path, "en_concepts/")
filtered_doc = load_dir(filtered_path_content, gdrive_path, "filtered/")
scraped_doc = load_dir(scraped_path_content, gdrive_path, "scraped/")
corrected_doc = load_dir(corrected_path_content, gdrive_path, "corrected/")
ref_doc = load_doc(gdrive_path + 'unique_entries_str_14.json')

# Extract concepts and phrases
en_concepts_filtered, fr_concepts_filtered, french_concept_tags_filtered, en_examples_filtered, fr_examples_filtered = extract_arrs(filtered_doc)

In [None]:
# Correct language switch: put every example pair into (English, French) order
new_examples, miscellaneous = lang_switcher(en_examples_filtered, fr_examples_filtered)

# Save corrected entries to disk; the saved entries are also returned
dataset_objs = save_corrected(en_concepts_filtered, fr_concepts_filtered, french_concept_tags_filtered, new_examples)

# Extract faulty entries where phrases begin with "ⓘCette phrase n'est pas une traduction de la phrase originale. "
faulty_entries, valid_entries = extract_faulty(dataset_objs)

# Get rid of too short concepts, which are faulty anyway:
# only English concepts longer than two characters are kept
en_concepts, fr_concepts, french_concept_tags, en_examples, fr_examples = extract_arrs(valid_entries)
filtered_concepts = [concept for concept in en_concepts if len(concept) > 2]

In [None]:
# Save extractions: one JSON file per list, named after the list and its length
for _prefix, _payload in (
    ('faulty_entries_', faulty_entries),
    ('valid_entries_', valid_entries),
    ('filtered_concepts_', filtered_concepts),
):
    file_name = _prefix + str(len(_payload)) + '.json'
    with open(gdrive_path + "corrected_2/" + file_name, 'w') as outfile:
        json.dump(_payload, outfile)

In [None]:
def retrieve_mismatched(doc):
    """Split entries by the BERTScore F1 between their two examples.

    Each entry's ``french_example`` is scored against its ``english_example``
    with the module-level ``bertscore`` metric. The score and the entry are
    printed for inspection. Entries scoring at least 0.70 are matched, all
    others mismatched.

    Args:
        doc: Iterable of entry dicts providing ``french_example`` and
            ``english_example``.

    Returns:
        A tuple ``(mismatched_entries, matched_entries)``.
    """
    matched_entries, mismatched_entries = [], []
    for entry in doc:
        scores = bertscore.compute(
            predictions=[entry.get("french_example")],
            references=[entry.get("english_example")],
            lang="bert-base-multilingual-cased",
        )
        f1_values = scores.get("f1")
        print(f1_values[0])
        print(entry)
        # One prediction is scored per call, so only the first F1 is relevant.
        target = matched_entries if f1_values[0] >= 0.70 else mismatched_entries
        target.append(entry)
    return mismatched_entries, matched_entries

# Split the valid entries by the BERTScore F1 of each French/English example pair
mismatched_entries, matched_entries = retrieve_mismatched(valid_entries)

In [None]:
# Save both partitions: one JSON file per list, named after the list and its length
for _prefix, _payload in (
    ('mismatched_entries_', mismatched_entries),
    ('matched_entries_', matched_entries),
):
    file_name = _prefix + str(len(_payload)) + '.json'
    with open(gdrive_path + "corrected_2/" + file_name, 'w') as outfile:
        json.dump(_payload, outfile)