# Elan ref/inf alignment

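This notebook takes the ELAN (`.eaf`) files produced by the Elpis + Hugging Face inference notebook, together with your own reference (ref) ELAN files, and aligns each reference annotation with the inferred (inf) text that falls inside its time span.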

The output is a set of plain-text files: one ref/inf pair of text files per reference annotation.

In [1]:
%%capture 
# Install the two third-party packages this notebook imports:
# pympi-ling (provides pympi.Elan) and gdown (Google Drive downloads).
# NOTE(review): versions are not pinned — consider pinning for reproducibility.
!pip install pympi-ling
!pip install gdown

In [2]:
import gdown
import glob
from typing import List, Dict, Tuple, Union
from pympi.Elan import Eaf
from pathlib import Path

# Working directories on the Colab VM: the downloaded .eaf files go into
# data_dir, the generated text files go into output_dir.
data_dir = Path("/content/eaf/")
output_dir = Path("/content/output/")

# parents=True lets this cell succeed even when /content does not exist yet
# (e.g. when run outside Colab); exist_ok=True keeps re-runs idempotent.
data_dir.mkdir(parents=True, exist_ok=True)
output_dir.mkdir(parents=True, exist_ok=True)

In [None]:
# Google Drive id of the eaf_ref_inf folder.
# Stored as folder_id rather than id so the builtin id() is not shadowed for
# the rest of the notebook session.
folder_id = "1D3Fs_UTcxIqBVU90VI2_Egr-PBm4Wbgq"

# Download the folder's contents into data_dir.
gdown.download_folder(id=folder_id, output=str(data_dir), quiet=False)

Rename the files that came from the inference notebook: strip the model name from each filename so that the ref and inf files share the same base name.

In [4]:
import os

# add_model_names_here
# Model names that appear in the filenames written by the inference notebook.
strip_names = ["FYTM_fb_all", "add-model-names-here"]

inf_eaf_files = sorted(Path(data_dir).glob('*_inf.eaf'))

for filename in inf_eaf_files:
    print(filename)

    # Only edit the file name itself, so a model name that happens to occur
    # in a parent directory is never touched.
    new_name = filename.name
    for model_name in strip_names:
        new_name = new_name.replace(f"_{model_name}", "")

    # Rename once, after every model name has been stripped. Renaming inside
    # the loop above would leave `filename` pointing at a path that no
    # longer exists when a second model name also matches.
    if new_name != filename.name:
        os.rename(filename, filename.with_name(new_name))


/content/eaf/ZMS_EIP_010_Pronoun_FYTM_fb_all_inf.eaf
/content/eaf/ZMS_EIP_011_Millions_FYTM_fb_all_inf.eaf


In [5]:
# Every ELAN file (ref and inf) found directly in data_dir, in sorted order.
eaf_files = sorted(Path(data_dir).glob("*.eaf"))


In [6]:
# Pair up ref and inf files that share a base name:
# "X_ref.eaf" and "X_inf.eaf" both end up in file_groups["X"],
# under the keys "ref" and "inf" respectively.
role_by_suffix = {"_ref": "ref", "_inf": "inf"}

file_groups = {}
for eaf_file in eaf_files:
    stem = eaf_file.stem

    # Decide the role from the filename suffix only. A substring test such as
    # `"ref" in stem` would misfile e.g. "Preference_inf" as a ref file.
    for suffix, role in role_by_suffix.items():
        if stem.endswith(suffix):
            basename = stem[: -len(suffix)]
            group = file_groups.setdefault(
                basename, {"ref_ann": None, "inf_ann": None}
            )
            group[role] = eaf_file
            break
    # Files that end in neither suffix are ignored rather than being grouped
    # under a truncated name.

file_groups

{'ZMS_EIP_010_Pronoun': {'ref_ann': None,
  'inf_ann': None,
  'inf': PosixPath('/content/eaf/ZMS_EIP_010_Pronoun_inf.eaf'),
  'ref': PosixPath('/content/eaf/ZMS_EIP_010_Pronoun_ref.eaf')},
 'ZMS_EIP_011_Millions': {'ref_ann': None,
  'inf_ann': None,
  'inf': PosixPath('/content/eaf/ZMS_EIP_011_Millions_inf.eaf'),
  'ref': PosixPath('/content/eaf/ZMS_EIP_011_Millions_ref.eaf')}}

In [7]:
# Build output dirs 
for file_group in file_groups:
    print(file_group)
    file_group_output_dir = output_dir / file_group
    file_group_output_dir.mkdir(exist_ok=True)


ZMS_EIP_010_Pronoun
ZMS_EIP_011_Millions


In [8]:
# Sanity check: list each group name followed by the ref file matched to it.
for group_name, group_files in file_groups.items():
    print(group_name)
    print(group_files["ref"])


ZMS_EIP_010_Pronoun
/content/eaf/ZMS_EIP_010_Pronoun_ref.eaf
ZMS_EIP_011_Millions
/content/eaf/ZMS_EIP_011_Millions_ref.eaf


In [None]:
# Names of the tiers to read from the ref and inf ELAN files.
ref_tier_name = "orthog"
inf_tier_name = "default"


def _read_tier(eaf_path: Path, tier_name: str) -> Union[List[tuple], None]:
    """Read one tier of an ELAN file as a list of (start, end, text) tuples.

    Returns None when the file has no tier called ``tier_name``, so the
    caller can skip the file instead of reusing annotations that were read
    from a previous file.
    """
    eaf = Eaf(eaf_path)
    if tier_name not in eaf.get_tier_names():
        return None
    # Keep the first three fields only: for reference tiers pympi returns
    # 4-tuples with the reference value appended.
    return [
        (annotation[0], annotation[1], annotation[2])
        for annotation in eaf.get_annotation_data_for_tier(tier_name)
    ]


for file_group in file_groups:
    print(file_group)

    group_files = file_groups[file_group]
    ref_eaf_file = group_files.get("ref")
    inf_eaf_file = group_files.get("inf")
    if ref_eaf_file is None or inf_eaf_file is None:
        # A group needs both files; an unpaired file cannot be aligned.
        print(f"Skipping {file_group}: ref or inf file is missing")
        continue

    print("REF")
    ref_annotations = _read_tier(ref_eaf_file, ref_tier_name)
    if ref_annotations is None:
        print(f"Skipping {file_group}: no '{ref_tier_name}' tier in {ref_eaf_file}")
        continue
    # Show me
    for ref_start, ref_end, ref_annotation in ref_annotations:
        print(ref_start, ref_end, ref_annotation)

    print("INF")
    inf_annotations = _read_tier(inf_eaf_file, inf_tier_name)
    if inf_annotations is None:
        print(f"Skipping {file_group}: no '{inf_tier_name}' tier in {inf_eaf_file}")
        continue

    # Make sure the destination exists even if the directory-building cell
    # was not run.
    file_group_output_dir = output_dir / file_group
    file_group_output_dir.mkdir(parents=True, exist_ok=True)

    for i, (ref_start, ref_end, ref_annotation) in enumerate(ref_annotations):
        # Lazy way of grouping the inference annotations in the ref utterance
        # group timings: keep only those lying entirely inside the ref span.
        joined_inf_annotation = " ".join(
            inf_annotation
            for inf_start, inf_end, inf_annotation in inf_annotations
            if inf_start >= ref_start and inf_end <= ref_end
        )

        # One file per side of the pair: suffix _0 is the ref text,
        # suffix _1 is the joined inf text.
        for j, text in enumerate((ref_annotation, joined_inf_annotation)):
            text_file_path = file_group_output_dir / f"{file_group}_{i}_{j}.txt"

            # Explicit UTF-8 so non-ASCII orthography is written the same way
            # regardless of the platform's default encoding.
            with open(text_file_path, "w", encoding="utf-8") as text_file:
                text_file.write(text)
                

In [10]:
%%capture
# Remove any archive left over from an earlier run first: `zip -r` updates an
# existing archive in place, so entries for files that no longer exist under
# /content/output would otherwise stay in the zip.
!rm -f /content/output.zip
!zip -r /content/output.zip /content/output

In [11]:
from google.colab import files

# Hand the zipped output to Colab's download helper.
output_archive = '/content/output.zip'
files.download(output_archive)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Copy the output back into Google Drive so that the qualitative review notebook can use it.

In [16]:
from google.colab import drive

# Mount Google Drive on the Colab VM so the output can be copied into it.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


In [17]:
# Delete any previous output directory in the Drive folder so that stale
# files from an earlier run do not linger there.
!rm -rf /content/drive/MyDrive/Zara/eaf_ref_inf/output

In [18]:
# Recursively copy the generated output directory into the eaf_ref_inf
# folder on the mounted Drive.
!cp -r /content/output /content/drive/MyDrive/Zara/eaf_ref_inf