### Imports

In [None]:
import glob
import os
import warnings
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt

from transformers import AutoTokenizer
from tokenizer_exploration_utils import (
    analyze_UD_file,
    get_meta_data_for_languages,
    plot_set_continuation,
    plot_continuation,
    plot_fertility,
    plot_proportion_continuation,
    plot_proportion_unks,
    plot_dist_length
)

warnings.filterwarnings("ignore")

### Load tokenizers

In [None]:
# Placeholders for the local MeCab setup; replace both before running this cell.
# They are only used to build the option string of the Japanese tokenizer below,
# where `mecab_dir` is passed after "-r" and `mecab_dic_dir` after "-d".
mecab_dir = "<path/to/mecab/etc/mecabrc>"
mecab_dic_dir = "<path/to/mecab-ipadic-20070801>"


def _load_tokenizers(specs):
    """Build one tokenizer per entry of ``specs``.

    ``specs`` maps a name to a ``(model_id, kwargs)`` pair; each tokenizer is
    created with ``AutoTokenizer.from_pretrained(model_id, **kwargs)``, in the
    insertion order of ``specs``. Returns a dict from name to tokenizer.
    """
    return {
        name: AutoTokenizer.from_pretrained(model_id, **kwargs)
        for name, (model_id, kwargs) in specs.items()
    }


# language code -> (Hugging Face model id, extra keyword arguments)
_monolingual_specs = {
    "ar": ("aubmindlab/bert-base-arabertv01", {"do_lower_case": False}),
    "en": ("bert-base-cased", {}),
    "fi": ("TurkuNLP/bert-base-finnish-cased-v1", {}),
    "id": ("indobenchmark/indobert-base-p2", {"do_lower_case": True}),
    "ja": (
        "cl-tohoku/bert-base-japanese-char",
        {"mecab_kwargs": {"mecab_option": f"-r {mecab_dir} -d {mecab_dic_dir}"}},
    ),
    "ko": ("snunlp/KR-BERT-char16424", {}),
    "ru": ("DeepPavlov/rubert-base-cased", {}),
    "tr": ("dbmdz/bert-base-turkish-cased", {}),
    "zh": ("bert-base-chinese", {}),
}
# model name -> (Hugging Face model id, extra keyword arguments)
_multilingual_specs = {
    "mBERT": ("bert-base-multilingual-cased", {}),
}

monolingual_tokenizers = _load_tokenizers(_monolingual_specs)
multilingual_tokenizers = _load_tokenizers(_multilingual_specs)

### Define UD dictionaries with the corresponding dataset files
The tokenized data will be loaded into these two dictionaries.  
The first, `language_ud_dict`, is for the language-specific (monolingual) tokenizers.  
The second, `mBERT_ud_dict`, is for the mBERT tokenizer.  

In [None]:
# Assumes path structure like: 'data/ud-data/ar/UD_Arabic-NYUAD/ar_nyuad-ud-train.conllu'
data_dir = "data/ud-data"
languages = ["ar", "en", "fi", "id", "ja", "ko", "ru", "tr", "zh"]

# Both dictionaries map a language code to {"files": [<.conllu paths>]}.
# A language without matching files gets an empty list.
language_ud_dict = {}
mBERT_ud_dict = {}
for lang in languages:
    # Collect all dev files, then all train files, of every treebank directory
    # of this language. glob returns matches in arbitrary, filesystem-dependent
    # order, so each group is sorted to keep the file order reproducible.
    lang_files = sorted(glob.glob(os.path.join(data_dir, lang, "*", "*dev.conllu")))
    lang_files += sorted(glob.glob(os.path.join(data_dir, lang, "*", "*train.conllu")))
    # Each dictionary gets its own copy of the list, so an in-place change made
    # through one dictionary cannot leak into the other.
    language_ud_dict[lang] = {"files": list(lang_files)}
    mBERT_ud_dict[lang] = {"files": list(lang_files)}

## Dataset loader for the UD datasets
Here we loop through all languages and their corresponding files and tokenize the data.

### Load the UD data for the monolingual models

In [None]:
# Process the files listed in language_ud_dict with the per-language tokenizers.
# The return value is not captured, so the helper presumably stores its results
# inside language_ud_dict in place — TODO confirm in tokenizer_exploration_utils.
get_meta_data_for_languages(language_ud_dict, monolingual_tokenizers)

### Load the UD data for mBERT

In [None]:
# Same call as for the monolingual tokenizers, but with the mBERT dictionary.
# NOTE(review): mBERT_ud_dict is keyed by language code, while
# multilingual_tokenizers has the single key "mBERT"; the helper must handle
# this key mismatch — verify in tokenizer_exploration_utils.
# The return value is not captured, so results are presumably written into
# mBERT_ud_dict in place — TODO confirm.
get_meta_data_for_languages(mBERT_ud_dict, multilingual_tokenizers)

# UD Plots 

## UD --- Proportion of Continued Words
Proportion of words that are split into at least two tokens

In [None]:
# Both result dictionaries (monolingual first, mBERT second) are passed as one list.
# The helper's return value is kept in continuation_df — presumably a DataFrame
# of the plotted values; confirm in tokenizer_exploration_utils.
continuation_df = plot_proportion_continuation([language_ud_dict, mBERT_ud_dict])

##  UD --- Fertility
Average number of tokens a single word was split into

In [None]:
# Both result dictionaries (monolingual first, mBERT second) are passed as one list.
# The helper's return value is kept in fertility_df — presumably a DataFrame
# of the plotted values; confirm in tokenizer_exploration_utils.
fertility_df = plot_fertility([language_ud_dict, mBERT_ud_dict])

## UD --- UNK Proportion
Proportion of tokens which are not represented in the vocabulary

In [None]:
# Both result dictionaries (monolingual first, mBERT second) are passed as one list.
# The helper's return value is kept in unk_df — presumably a DataFrame
# of the plotted values; confirm in tokenizer_exploration_utils.
unk_df = plot_proportion_unks([language_ud_dict, mBERT_ud_dict])

## UD --- Sentence length Plots 
Length of the original sentences in the UD dataset vs. their length after tokenization

In [None]:
# Unlike the other plotting helpers in this notebook, this one receives the two
# result dictionaries as separate positional arguments rather than as one list.
# Its return value (if any) is not captured.
plot_dist_length(language_ud_dict, mBERT_ud_dict)