# FLORES+

[FLORES+ on HuggingFace](https://huggingface.co/datasets/openlanguagedata/flores_plus)

In [1]:
from transformers import AutoTokenizer
from datasets import load_dataset
from joblib import Parallel, delayed
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import sys

folder_path = '../../python_library'
sys.path.append(os.path.abspath(folder_path))
from token_scoring import *

In [2]:
# Load every split of FLORES+ once; without a `split` argument the result is
# indexed by split name.
flores_plus = load_dataset("openlanguagedata/flores_plus")

# Reuse the dev split that was just loaded (instead of resolving the dataset
# files a second time) and convert it to a pandas DataFrame.
flores_plus_dev = flores_plus["dev"].to_pandas()

Resolving data files:   0%|          | 0/218 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/213 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/218 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/213 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/218 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/213 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/218 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/213 [00:00<?, ?it/s]

In [3]:
# Expose the ISO 15924 script column under the name 'language'
# (presumably the column name the token_scoring helpers expect — confirm).
script_to_language = {'iso_15924': 'language'}
flores_plus_dev = flores_plus_dev.rename(columns=script_to_language)

# Quick sanity check: overall shape, then the first two rows.
print(flores_plus_dev.shape)
flores_plus_dev.head(2)

(217346, 11)


Unnamed: 0,id,iso_639_3,language,glottocode,text,url,domain,topic,has_image,has_hyperlink,last_updated
0,0,ace,Arab,achi1257,يق أورو سنين، اوق علمون دري فکولتس کدوکترن يون...,https://en.wikinews.org/wiki/Scientists_say_ne...,wikinews,health,yes,yes,1.0
1,1,ace,Arab,achi1257,ڤنليتي اوتام خن اترا ڽو موڠکين محسى ديتيکسي فو...,https://en.wikinews.org/wiki/Scientists_say_ne...,wikinews,health,yes,yes,1.0


In [4]:
# Distinct values of the 'language' column. These are ISO 15924 script codes
# (the column was renamed from 'iso_15924'), computed once and reused below.
scripts = flores_plus_dev['language'].unique()
print(f"There are {len(scripts)} unique scripts in the FLORES+ dev split:")
print(scripts)

There are 31 in the FLORES+ dataset:
['Arab' 'Latn' 'Ethi' 'Beng' 'Deva' 'Cyrl' 'Tibt' 'Hans' 'Hant' 'Grek'
 'Gujr' 'Hebr' 'Armn' 'Jpan' 'Knda' 'Geor' 'Khmr' 'Hang' 'Laoo' 'Mlym'
 'Mtei' 'Mymr' 'Nkoo' 'Orya' 'Guru' 'Olck' 'Sinh' 'Taml' 'Tfng' 'Telu'
 'Thai']


In [5]:
# Hugging Face Hub ids of the tokenizers to compare.
models = [
    "google/flan-t5-xxl",
    "bigscience/mt0-xxl-mt",
    "CohereForAI/aya-101",
    "bigscience/bloomz-7b1",
    "microsoft/Phi-3.5-mini-instruct",
    "neulab/Pangea-7B",
    "google/gemma-7b",
    "google/gemma-2-9b",
    "meta-llama/Llama-3.2-1B-Instruct",
]

# Short model names (repo id without the organisation prefix), parallel to `models`.
names_for_csv = [
    'flan-t5-xxl',
    'mt0-xxl-mt',
    'aya-101',
    'bloomz-7b1',
    'Phi-3.5-mini-instruct',
    'Pangea-7B',
    'gemma-7b',
    'gemma-2-9b',
    'Llama-3.2-1B-Instruct',
]

# Score the dev split once per tokenizer and tag every result with its model id
# so the per-model frames can be stacked afterwards.
dfs = []
for model_id in models:
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    scored = get_parities(data=flores_plus_dev, tokenizer=tokenizer, visualize=False)
    scored['model'] = model_id
    dfs.append(scored)
    print(f'Done with {model_id}')

Done with google/flan-t5-xxl
Done with bigscience/mt0-xxl-mt
Done with CohereForAI/aya-101
Done with bigscience/bloomz-7b1
Done with microsoft/Phi-3.5-mini-instruct
Done with neulab/Pangea-7B
Done with google/gemma-7b
Done with google/gemma-2-9b
Done with meta-llama/Llama-3.2-1B-Instruct


In [6]:
# Stack the per-model parity frames into one long table with a fresh 0..n-1 index.
parities = pd.concat(objs=dfs, ignore_index=True)

# Peek at the first two rows.
parities.head(2)

Unnamed: 0,id,iso_639_3,language,glottocode,text,url,domain,topic,has_image,has_hyperlink,last_updated,parity,tokens,model
0,0,ace,Arab,achi1257,يق أورو سنين، اوق علمون دري فکولتس کدوکترن يون...,https://en.wikinews.org/wiki/Scientists_say_ne...,wikinews,health,yes,yes,1.0,2.155556,"[▁, يق, ▁, أورو, ▁, سنين،, ▁, اوق, ▁, علمون, ▁...",google/flan-t5-xxl
1,1,ace,Arab,achi1257,ڤنليتي اوتام خن اترا ڽو موڠکين محسى ديتيکسي فو...,https://en.wikinews.org/wiki/Scientists_say_ne...,wikinews,health,yes,yes,1.0,2.307692,"[▁, ڤنليتي, ▁, اوتام, ▁, خن, ▁, اترا, ▁, ڽو, ▁...",google/flan-t5-xxl


In [7]:
# Compute fertilities once per tokenizer and tag every result with its model id.
# NOTE(review): several tokenizers warn that the token sequence exceeds the
# model's maximum length; presumably harmless because this cell only tokenizes
# and never feeds a model — confirm against token_scoring.
dfs = []
for model_id in models:
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    scored = get_fertilities(data=flores_plus_dev, tokenizer=tokenizer, visualize=False)
    scored['model'] = model_id
    dfs.append(scored)
    print(f'Done with {model_id}')

# Stack the per-model fertility frames into one long table and peek at it.
fertilities = pd.concat(objs=dfs, ignore_index=True)
fertilities.head(2)

Token indices sequence length is longer than the specified maximum sequence length for this model (911368 > 512). Running this sequence through the model will result in indexing errors


Done with google/flan-t5-xxl
Done with bigscience/mt0-xxl-mt
Done with CohereForAI/aya-101
Done with bigscience/bloomz-7b1


Token indices sequence length is longer than the specified maximum sequence length for this model (2505498 > 131072). Running this sequence through the model will result in indexing errors


Done with microsoft/Phi-3.5-mini-instruct


Token indices sequence length is longer than the specified maximum sequence length for this model (1397776 > 8192). Running this sequence through the model will result in indexing errors


Done with neulab/Pangea-7B
Done with google/gemma-7b
Done with google/gemma-2-9b


Token indices sequence length is longer than the specified maximum sequence length for this model (1319075 > 131072). Running this sequence through the model will result in indexing errors


Done with meta-llama/Llama-3.2-1B-Instruct


Unnamed: 0,language,corpus,fertility,tokens,model
0,Arab,يق أورو سنين، اوق علمون دري فکولتس کدوکترن يون...,2.078433,"[▁, يق, ▁, أورو, ▁, سنين،, ▁, اوق, ▁, علمون, ▁...",google/flan-t5-xxl
1,Latn,"Bak uroe Senin, awak ilmuwan dari Fakultas Ked...",3.436214,"[▁Bak, ▁, ur, o, e, ▁Sen, in, ,, ▁, a, wak, ▁,...",google/flan-t5-xxl


In [8]:
# Persist both result tables as CSV files in the current working directory
# (the DataFrame index is written as the first column, pandas' default).
for frame, csv_path in (
    (parities, 'model_parities_using_library.csv'),
    (fertilities, 'model_fertilities_using_library.csv'),
):
    frame.to_csv(csv_path)

In [11]:
# Summary statistics (count, mean, std, quartiles, min/max) of fertility per model.
fertility_by_model = fertilities.groupby('model')['fertility']
fertility_by_model.describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
CohereForAI/aya-101,31.0,6.329727,7.314511,2.104427,2.600421,3.012729,6.10889,30.143799
bigscience/bloomz-7b1,31.0,11.939275,12.950681,1.514487,2.258471,6.058881,16.073491,44.211665
bigscience/mt0-xxl-mt,31.0,6.329727,7.314511,2.104427,2.600421,3.012729,6.10889,30.143799
google/flan-t5-xxl,31.0,2.602669,1.048444,2.040513,2.090918,2.14366,2.222407,5.829346
google/gemma-2-9b,31.0,8.4769,7.593213,2.12074,3.635907,5.59138,9.177739,28.918156
google/gemma-7b,31.0,8.4769,7.593213,2.12074,3.635907,5.59138,9.177739,28.918156
meta-llama/Llama-3.2-1B-Instruct,31.0,16.422128,14.790794,2.402349,8.412023,12.358785,16.518693,72.014839
microsoft/Phi-3.5-mini-instruct,31.0,19.731495,16.967171,2.618073,8.457879,12.159142,25.78927,62.21731
neulab/Pangea-7B,31.0,13.045482,10.661878,2.400613,6.380682,9.187998,16.370472,48.148746
