# FLORES+

[FLORES+ on Hugging Face](https://huggingface.co/datasets/openlanguagedata/flores_plus)

In [1]:
from transformers import AutoTokenizer
from datasets import load_dataset
import pandas as pd
from tqdm import tqdm

In [2]:
# Load FLORES+ 
flores_plus = load_dataset("openlanguagedata/flores_plus")

# Load FLORES dev split and convert to Pandas DataFrame
flores_plus_dev = load_dataset("openlanguagedata/flores_plus", split='dev').to_pandas()

Resolving data files:   0%|          | 0/218 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/213 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/218 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/213 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/218 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/213 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/218 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/213 [00:00<?, ?it/s]

In [3]:
flores_plus_dev = flores_plus_dev.rename(columns={'iso_15924':'language'})
print(flores_plus_dev.shape)
flores_plus_dev.head(2)

(217346, 11)


Unnamed: 0,id,iso_639_3,language,glottocode,text,url,domain,topic,has_image,has_hyperlink,last_updated
0,0,ace,Arab,achi1257,يق أورو سنين، اوق علمون دري فکولتس کدوکترن يون...,https://en.wikinews.org/wiki/Scientists_say_ne...,wikinews,health,yes,yes,1.0
1,1,ace,Arab,achi1257,ڤنليتي اوتام خن اترا ڽو موڠکين محسى ديتيکسي فو...,https://en.wikinews.org/wiki/Scientists_say_ne...,wikinews,health,yes,yes,1.0


In [4]:
def parity(text, tokenizer):
    """ 
    Parity = # tokens in tokenized document / # number of words in original document 

    Note: The difference between parity and fertility is that partiy is at the DOCUMENT
          level, whereas fertility is at the CORPUS level. 

    Parameters
    ----------
        - text (str): Text for which you want to calculate parity score
        - tokenizer (tokenizer): Tokenizer you'd like to use 

    Returns
    -------
        - parity (float): Parity score 
        - tokenized (): Tokenized text 
    """
    tokenized = tokenizer.tokenize(text) # Note: Transformers typically doesn't remove stopwords 
    num_words = len(text.split()) 
    parity = len(tokenized) / num_words
    return parity, tokenized

def get_parities(dataset, model='microsoft/Phi-3.5-mini-instruct', name_for_csv='output'):
    """ 
    Get the parity score and tokens for each document (row) in the FLORES+ dataset.

    Parameters
    ----------
        - dataset (pd.DataFrame): Dataset for which you want to calculate parity scores 
        - model (str): Model for tokenization
        - name_for_csv (str): Name for csv 

    Returns
    -------
        - dataset_copy (pd.DataFrame): DataFrame of results
        - Makes dataframe of parity score and tokens for each language and outputs to csv file
          in current directory
    
    """
    tokenizer = AutoTokenizer.from_pretrained(model)
    parity_scores = []
    tokens = []
    for row_index in range(len(dataset)):
        text = dataset.loc[row_index, 'text']
        parity_score, tokenized = parity(text, tokenizer)

        parity_scores.append(parity_score)
        tokens.append(tokenized)

    dataset_copy = dataset.copy()
    dataset_copy['parity'] = pd.Series(parity_scores)
    dataset_copy['tokens'] = pd.Series(tokens)
    dataset_copy.to_csv(f'{name_for_csv}.csv', index=True)
    return dataset_copy 


In [5]:
# Test 
subset = flores_plus_dev.iloc[:3, ] # First 3 documents (rows)
testing = get_parities(subset)
testing.head()

Unnamed: 0,id,iso_639_3,language,glottocode,text,url,domain,topic,has_image,has_hyperlink,last_updated,parity,tokens
0,0,ace,Arab,achi1257,يق أورو سنين، اوق علمون دري فکولتس کدوکترن يون...,https://en.wikinews.org/wiki/Scientists_say_ne...,wikinews,health,yes,yes,1.0,5.977778,"[▁, ي, ق, ▁, أ, و, ر, و, ▁, س, ن, ي, ن, ،, ▁, ..."
1,1,ace,Arab,achi1257,ڤنليتي اوتام خن اترا ڽو موڠکين محسى ديتيکسي فو...,https://en.wikinews.org/wiki/Scientists_say_ne...,wikinews,health,yes,yes,1.0,5.615385,"[▁, <0xDA>, <0xA4>, ن, ل, ي, ت, ي, ▁, ا, و, ت,..."
2,2,ace,Arab,achi1257,جاس ۳۹سي ݢريڤين مڤوق لندسن ڤاچو ليڠک ڤوه ۹:۳۰ ...,https://en.wikinews.org/wiki/Fighter_jet_crash...,wikinews,accident,yes,yes,1.0,6.272727,"[▁, ج, ا, س, ▁, <0xDB>, <0xB3>, <0xDB>, <0xB9>..."


In [6]:
models = ["google/flan-t5-xxl",
          "bigscience/mt0-xxl-mt",
          "CohereForAI/aya-101",
          "bigscience/bloomz-7b1",
          "microsoft/Phi-3.5-mini-instruct",
          "neulab/Pangea-7B",
          "google/gemma-7b",
          "google/gemma-2-9b",
          "meta-llama/Llama-3.2-1B-Instruct"]
names_for_csv = ['flan-t5-xxl',
                 'mt0-xxl-mt',
                 'aya-101',
                 'bloomz-7b1',
                 'Phi-3.5-mini-instruct',
                 'Pangea-7B',
                 'gemma-7b',
                 'gemma-2-9b', 
                 'Llama-3.2-1B-Instruct']

for i in tqdm(range(len(models)), desc="Processing Models", unit="model"):
    get_parities(flores_plus_dev, model=models[i], name_for_csv=names_for_csv[i])
    print(f"Done with {models[i]}")

Processing Models:  11%|█         | 1/9 [00:21<02:51, 21.41s/model]

Done with google/flan-t5-xxl


Processing Models:  22%|██▏       | 2/9 [00:45<02:39, 22.75s/model]

Done with bigscience/mt0-xxl-mt


Processing Models:  33%|███▎      | 3/9 [01:08<02:17, 22.98s/model]

Done with CohereForAI/aya-101


Processing Models:  44%|████▍     | 4/9 [01:29<01:51, 22.27s/model]

Done with bigscience/bloomz-7b1


Processing Models:  56%|█████▌    | 5/9 [01:47<01:22, 20.55s/model]

Done with microsoft/Phi-3.5-mini-instruct


Processing Models:  67%|██████▋   | 6/9 [02:12<01:06, 22.26s/model]

Done with neulab/Pangea-7B


Processing Models:  78%|███████▊  | 7/9 [02:28<00:40, 20.17s/model]

Done with google/gemma-7b


Processing Models:  89%|████████▉ | 8/9 [02:44<00:18, 18.74s/model]

Done with google/gemma-2-9b


Processing Models: 100%|██████████| 9/9 [03:07<00:00, 20.81s/model]

Done with meta-llama/Llama-3.2-1B-Instruct





In [15]:
dataframes = []
for model in names_for_csv:
    path = f'{model}.csv'
    dataframe = pd.read_csv(path)
    dataframe['model'] = model
    dataframes.append(dataframe)
combined = pd.concat(dataframes, axis=0, ignore_index=True)
combined.to_csv(f'all_model_parities.csv', index=True)

In [16]:
combined.head(3)

Unnamed: 0.1,Unnamed: 0,id,iso_639_3,language,glottocode,text,url,domain,topic,has_image,has_hyperlink,last_updated,parity,tokens,model
0,0,0,ace,Arab,achi1257,يق أورو سنين، اوق علمون دري فکولتس کدوکترن يون...,https://en.wikinews.org/wiki/Scientists_say_ne...,wikinews,health,yes,yes,1.0,2.155556,"['▁', 'يق', '▁', 'أورو', '▁', 'سنين،', '▁', 'ا...",flan-t5-xxl
1,1,1,ace,Arab,achi1257,ڤنليتي اوتام خن اترا ڽو موڠکين محسى ديتيکسي فو...,https://en.wikinews.org/wiki/Scientists_say_ne...,wikinews,health,yes,yes,1.0,2.307692,"['▁', 'ڤنليتي', '▁', 'اوتام', '▁', 'خن', '▁', ...",flan-t5-xxl
2,2,2,ace,Arab,achi1257,جاس ۳۹سي ݢريڤين مڤوق لندسن ڤاچو ليڠک ڤوه ۹:۳۰ ...,https://en.wikinews.org/wiki/Fighter_jet_crash...,wikinews,accident,yes,yes,1.0,2.181818,"['▁', 'جاس', '▁', '۳۹سي', '▁', 'ݢريڤين', '▁', ...",flan-t5-xxl
