# FLORES+

[FLORES+ on HuggingFace](https://huggingface.co/datasets/openlanguagedata/flores_plus)

In [1]:
from transformers import AutoTokenizer
from datasets import load_dataset
from joblib import Parallel, delayed
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [2]:
# Load FLORES+ 
flores_plus = load_dataset("openlanguagedata/flores_plus")

# Load FLORES dev split and convert to Pandas DataFrame
flores_plus_dev = load_dataset("openlanguagedata/flores_plus", split='dev').to_pandas()

Resolving data files:   0%|          | 0/218 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/213 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/218 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/213 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/218 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/213 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/218 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/213 [00:00<?, ?it/s]

In [3]:
flores_plus_dev = flores_plus_dev.rename(columns={'iso_15924':'language'})
print(flores_plus_dev.shape)
flores_plus_dev.head(2)

(217346, 11)


Unnamed: 0,id,iso_639_3,language,glottocode,text,url,domain,topic,has_image,has_hyperlink,last_updated
0,0,ace,Arab,achi1257,يق أورو سنين، اوق علمون دري فکولتس کدوکترن يون...,https://en.wikinews.org/wiki/Scientists_say_ne...,wikinews,health,yes,yes,1.0
1,1,ace,Arab,achi1257,ڤنليتي اوتام خن اترا ڽو موڠکين محسى ديتيکسي فو...,https://en.wikinews.org/wiki/Scientists_say_ne...,wikinews,health,yes,yes,1.0


In [4]:
# Get unique languages
print(f"There are {len(flores_plus_dev['language'].unique())} in the FLORES+ dataset:")
print(flores_plus_dev['language'].unique())

There are 31 in the FLORES+ dataset:
['Arab' 'Latn' 'Ethi' 'Beng' 'Deva' 'Cyrl' 'Tibt' 'Hans' 'Hant' 'Grek'
 'Gujr' 'Hebr' 'Armn' 'Jpan' 'Knda' 'Geor' 'Khmr' 'Hang' 'Laoo' 'Mlym'
 'Mtei' 'Mymr' 'Nkoo' 'Orya' 'Guru' 'Olck' 'Sinh' 'Taml' 'Tfng' 'Telu'
 'Thai']


In [5]:
languages = list(flores_plus_dev['language'].unique())
for i in range(len(languages)):
    language = languages[i]
    text = flores_plus_dev[flores_plus_dev['language']==language]['text']
    joined = " ".join(text)
    with open(f"language_corpora/{language}_corpus.txt", "w") as f:
        f.write(joined)

In [6]:
def fertility(text, tokenizer):
    tokenized = tokenizer.tokenize(text) # Note: Transformers typically doesn't remove stopwords 
    num_words = len(text.split())

    fertility = len(tokenized) / num_words
    return fertility, tokenized

In [7]:
def get_fertilities(model='microsoft/Phi-3.5-mini-instruct', name_for_csv='output'):
    """ 
    Get the fertility score and tokens for each language in the FLORES+ dataset.

    Parameters
    ----------
        - model (str): Model for tokenization
        - name_for_csv (str): Name for csv 

    Returns
    -------
    None. Makes dataframe of fertility score and tokens for each language
    and outputs to csv file. 
    
    """
    tokenizer=AutoTokenizer.from_pretrained(model)
    directory_path = "language_corpora"
    languages = []
    fertility_scores = []
    tokens = []
    for file in os.listdir(directory_path):
        language = file.rstrip('_corpus.txt')
        file_path = os.path.join(directory_path, file)  
        with open(file_path, "r") as corpus:
            text = corpus.read()
        
        fertility_score, tokenized = fertility(text, tokenizer)

        languages.append(language)
        fertility_scores.append(fertility_score)
        tokens.append(tokenized)


        df = pd.DataFrame({'language': pd.Series(languages),
                        'fertility': pd.Series(fertility_scores),
                        'tokens': pd.Series(tokens)})
        df.to_csv(f'{name_for_csv}.csv', index=True)


In [8]:
models = ["google/flan-t5-xxl",
          "bigscience/mt0-xxl-mt",
          "CohereForAI/aya-101",
          "bigscience/bloomz-7b1",
          "microsoft/Phi-3.5-mini-instruct",
          "neulab/Pangea-7B",
          "google/gemma-7b",
          "google/gemma-2-9b",
          "meta-llama/Llama-3.2-1B-Instruct"]
names_for_csv = ['flan-t5-xxl',
                 'mt0-xxl-mt',
                 'aya-101',
                 'bloomz-7b1',
                 'Phi-3.5-mini-instruct',
                 'Pangea-7B',
                 'gemma-7b',
                 'gemma-2-9b', 
                 'Llama-3.2-1B-Instruct']

for i in range(len(models)):
    get_fertilities(model=models[i], name_for_csv=names_for_csv[i])
    print(f'Done with {models[i]}')

Token indices sequence length is longer than the specified maximum sequence length for this model (1609459 > 512). Running this sequence through the model will result in indexing errors


Done with google/flan-t5-xxl
Done with bigscience/mt0-xxl-mt
Done with CohereForAI/aya-101
Done with bigscience/bloomz-7b1


Token indices sequence length is longer than the specified maximum sequence length for this model (1101189 > 131072). Running this sequence through the model will result in indexing errors


Done with microsoft/Phi-3.5-mini-instruct


Token indices sequence length is longer than the specified maximum sequence length for this model (1129839 > 8192). Running this sequence through the model will result in indexing errors


Done with neulab/Pangea-7B
Done with google/gemma-7b
Done with google/gemma-2-9b


Token indices sequence length is longer than the specified maximum sequence length for this model (1097366 > 131072). Running this sequence through the model will result in indexing errors


Done with meta-llama/Llama-3.2-1B-Instruct


In [9]:
# Make new dataset with columns ['model', 'language', 'fertility', 'tokens']
# Basically combine everything done so far
names_for_csv = ['flan-t5-xxl',
                 'mt0-xxl-mt',
                 'aya-101',
                 'bloomz-7b1',
                 'Phi-3.5-mini-instruct',
                 'Pangea-7B',
                 'gemma-7b',
                 'gemma-2-9b', 
                 'Llama-3.2-1B-Instruct']

dfs = []
for file in os.listdir(os.getcwd()):
    if file.rstrip('.csv') in names_for_csv: # Get correct files
        df = pd.read_csv(file).drop(columns=['Unnamed: 0'])
        df['model'] = file.rstrip('.csv')
        dfs.append(df)


merged_df = pd.concat(dfs, ignore_index=True)
merged_df.head()

Unnamed: 0,language,fertility,tokens,model
0,Cyrl,5.402301,"['▁', 'Дүшә', 'м', 'б', 'е', '▁', 'С', 'т', 'э...",flan-t5-xxl
1,Khm,5.829346,"['▁', 'កាល', '▁', 'ពី', '▁', 'ថ្ងៃ', '▁', 'ច័ន...",flan-t5-xxl
2,Nk,2.077179,"['▁', 'ߟߐ߲ߞߏߕߌ߮', '▁', 'ߟߎ߬', '▁', 'ߓߘߊ߫', '▁'...",flan-t5-xxl
3,Ge,2.193156,"['▁', 'ორშაბათს', ',', '▁', 'სტენფორდის', '▁',...",flan-t5-xxl
4,Heb,2.150794,"['▁', 'ביום', '▁', 'שני', ',', '▁', 'מדענים', ...",flan-t5-xxl


In [10]:
merged_df.to_csv('model_fertilities.csv')