In [23]:
import huggingface_hub
from tqdm.auto import tqdm

# Total repository size, in bytes, of every dataset published under the
# "mteb-pt" organisation, keyed by dataset id.
size_data = {}

for dataset_info in tqdm(huggingface_hub.list_datasets(author="mteb-pt", full=False)):
    try:
        # First try the auto-converted parquet branch of the repository.
        full_data = huggingface_hub.dataset_info(
            dataset_info.id, revision='refs/convert/parquet', files_metadata=True
        )
    except Exception:
        # Best-effort fallback to the default revision when the parquet ref
        # cannot be fetched. `Exception` (rather than a bare `except:`) keeps
        # Ctrl-C / kernel interrupts working during the long loop.
        full_data = huggingface_hub.dataset_info(dataset_info.id, files_metadata=True)

    # `siblings` and each sibling's `size` are optional in the hub response;
    # treat missing values as zero instead of failing on `int + None`.
    total_size = sum(s.size or 0 for s in (full_data.siblings or []))

    # Sizes are reported in bytes:
    #   KB = total_size / 1024
    #   MB = total_size / 1024 / 1024
    size_data[dataset_info.id] = total_size

0it [00:00, ?it/s]

In [31]:
# Dataset ids ordered from the smallest to the largest recorded size.
x = sorted(size_data, key=size_data.get)

In [44]:
# Task names grouped by task family. TASK_LIST (bottom of the cell) is the
# concatenation of every group, in the order the groups are declared here.

# --- Classification ---
TASK_LIST_CLASSIFICATION = [
    "AmazonCounterfactualClassification",
    "AmazonPolarityClassification",
    "AmazonReviewsClassification",
    "Banking77Classification",
    "EmotionClassification",
    "ImdbClassification",
    "MassiveIntentClassification",
    "MassiveScenarioClassification",
    "MTOPDomainClassification",
    "MTOPIntentClassification",
    "ToxicConversationsClassification",
    "TweetSentimentExtractionClassification",
]

# --- Clustering ---
TASK_LIST_CLUSTERING = [
    "ArxivClusteringP2P",
    "ArxivClusteringS2S",
    "BiorxivClusteringP2P",
    "BiorxivClusteringS2S",
    "MedrxivClusteringP2P",
    "MedrxivClusteringS2S",
    "RedditClustering",
    "RedditClusteringP2P",
    "StackExchangeClustering",
    "StackExchangeClusteringP2P",
    "TwentyNewsgroupsClustering",
]

# --- Pair classification ---
TASK_LIST_PAIR_CLASSIFICATION = [
    "SprintDuplicateQuestions",
    "TwitterSemEval2015",
    "TwitterURLCorpus",
]

# --- Reranking ---
TASK_LIST_RERANKING = [
    "AskUbuntuDupQuestions",
    "MindSmallReranking",
    "SciDocsRR",
    "StackOverflowDupQuestions",
]

# --- Retrieval ---
TASK_LIST_RETRIEVAL = [
    "ArguAna",
    "ClimateFEVER",
    "CQADupstackAndroidRetrieval",
    "CQADupstackEnglishRetrieval",
    "CQADupstackGamingRetrieval",
    "CQADupstackGisRetrieval",
    "CQADupstackMathematicaRetrieval",
    "CQADupstackPhysicsRetrieval",
    "CQADupstackProgrammersRetrieval",
    "CQADupstackStatsRetrieval",
    "CQADupstackTexRetrieval",
    "CQADupstackUnixRetrieval",
    "CQADupstackWebmastersRetrieval",
    "CQADupstackWordpressRetrieval",
    "DBPedia",
    "FEVER",
    "FiQA2018",
    "HotpotQA",
    "MSMARCO",
    "NFCorpus",
    "NQ",
    "QuoraRetrieval",
    "SCIDOCS",
    "SciFact",
    "Touche2020",
    "TRECCOVID",
]

# --- Semantic textual similarity (the list also carries SummEval) ---
TASK_LIST_STS = [
    "BIOSSES",
    "SICK-R",
    "STS12",
    "STS13",
    "STS14",
    "STS15",
    "STS16",
    "STS17",
    "STS22",
    "STSBenchmark",
    "SummEval",
]

# --- Legal-domain retrieval ---
TASK_LIST_RETRIEVAL_LAW = [
    "LegalSummarization",
    "LegalBenchConsumerContractsQA",
    "LegalBenchCorporateLobbying",
    "AILACasedocs",
    "AILAStatutes",
    "LeCaRDv2",
    "LegalQuAD",
    "GerDaLIRSmall",
]

# --- Extra tasks grouped under the "JINAAI" label ---
TASK_JINAAI = [
    "BigPatentClustering",
    "WikiCitiesClustering",
    "NarrativeQA",
]

# --- LEMB-prefixed retrieval tasks ---
TASKS_LEMB = [
    "LEMBNarrativeQARetrieval",
    "LEMBNeedleRetrieval",
    "LEMBPasskeyRetrieval",
    "LEMBQMSumRetrieval",
    "LEMBSummScreenFDRetrieval",
    "LEMBWikimQARetrieval",
]

# Flat list of every task name above, preserving group order.
TASK_LIST = [
    *TASK_LIST_CLASSIFICATION,
    *TASK_LIST_CLUSTERING,
    *TASK_LIST_PAIR_CLASSIFICATION,
    *TASK_LIST_RERANKING,
    *TASK_LIST_RETRIEVAL,
    *TASK_LIST_STS,
    *TASK_LIST_RETRIEVAL_LAW,
    *TASK_JINAAI,
    *TASKS_LEMB,
]

In [46]:
import mteb

In [47]:
import inspect
# Registry of the MTEB tasks selected through TASK_LIST, keyed by the task's
# metadata name. Each entry records the metadata name, the Python class name
# and the source file that defines the class.
TASKS_TO_CONVERT = {}

# Python class name -> metadata name, for every task returned by mteb.
# The TASKS_PT / new_map cell looks class names up in this mapping, so it has
# to be filled here rather than left empty.
class_name_to_task_name = {}

# Set membership instead of scanning the TASK_LIST list twice per task.
selected_names = set(TASK_LIST)

for task in mteb.get_tasks():
    name = task.metadata.name
    class_name = type(task).__name__
    class_name_to_task_name[class_name] = name
    # A task is selected when either its metadata name or its class name is
    # listed, because the two spellings differ for some tasks.
    if name in selected_names or class_name in selected_names:
        TASKS_TO_CONVERT[name] = {
            "name": name,
            "class_name": class_name,
            "path": inspect.getfile(type(task)),
        }

In [43]:
# Task class names chosen for the Portuguese-translated set. Entries that are
# commented out were left out on purpose (marked "has pt" by the author).
TASKS_PT = [
    # STS
    'BiossesSTS',
    "SickrSTS",
    "STS15STS",
    "STS16STS",
    # "STSBenchmarkMultilingualSTS",  # has pt
    # Reranking
    "AskUbuntuDupQuestions",
    "StackOverflowDupQuestions",
    # Pair classification
    "SprintDuplicateQuestionsPC",
    "TwitterSemEval2015PC",
    # Summarization
    "SummEvalSummarization",
    # Clustering
    "TwentyNewsgroupsClustering",
    "MedrxivClusteringS2S",
    # Classification
    "Banking77Classification",
    # "MassiveIntentClassification",  # has pt
    # "MassiveScenarioClassification",  # has pt
    # Retrieval
    "SciFactRetrieval",
    "CQADupstackStatsRetrieval",
    # "MultiLongDocRetrieval",  # has pt
]

# For every selected class whose class name differs from its registered task
# name, map the "PTTrad"-prefixed class name to the "PTTrad"-prefixed task name.
new_map = {
    'PTTrad' + cls_name: 'PTTrad' + class_name_to_task_name[cls_name]
    for cls_name in TASKS_PT
    if cls_name in class_name_to_task_name
    and cls_name != class_name_to_task_name[cls_name]
}
new_map

{'PTTradBiossesSTS': 'PTTradBIOSSES',
 'PTTradSickrSTS': 'PTTradSICK-R',
 'PTTradSTS15STS': 'PTTradSTS15',
 'PTTradSTS16STS': 'PTTradSTS16',
 'PTTradSprintDuplicateQuestionsPC': 'PTTradSprintDuplicateQuestions',
 'PTTradTwitterSemEval2015PC': 'PTTradTwitterSemEval2015',
 'PTTradSummEvalSummarization': 'PTTradSummEval'}

In [58]:
import os
import shutil

# Root folder that holds one sub-folder per task type.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
basepath = "/mnt/e/ceia/mteb/mteb/mteb/tasks"

# Delete every existing <task type>/por_translated folder so that generated
# task files can be rebuilt from scratch.
for folder in os.listdir(basepath):
    lang_task_folder = os.path.join(basepath, folder, "por_translated")
    if os.path.exists(lang_task_folder):
        shutil.rmtree(lang_task_folder)

In [59]:
import glob
import os
import shutil

# Root folder that holds one sub-folder per task type.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
basepath = "/mnt/e/ceia/mteb/mteb/mteb/tasks"

# Start clean: drop every previously generated por_translated folder.
for folder in os.listdir(basepath):
    lang_task_folder = os.path.join(basepath, folder, "por_translated")
    if os.path.exists(lang_task_folder):
        shutil.rmtree(lang_task_folder)

# For every selected task: copy its source file into a sibling
# `por_translated` folder, rename the task/class with a "PTT_" prefix, switch
# language codes to Portuguese, retarget the dataset path/revision, and
# register the new module in the task type's __init__.py.
for task_name, data in TASKS_TO_CONVERT.items():
    original_filepath = data["path"]
    class_name = data["class_name"]

    # Only sources located under an `eng` or `multilingual` folder are
    # converted; any other task is reported by name and skipped.
    if '/eng/' in original_filepath:
        lang = 'eng'
    elif '/multilingual/' in original_filepath:
        lang = 'multilingual'
    else:
        print(task_name)
        continue

    original_dir = os.path.dirname(original_filepath)
    new_dir = original_dir.replace(f'/{lang}', '/por_translated')
    new_name = "PTT_" + task_name
    new_class_name = "PTT_" + class_name
    new_filepath = os.path.join(new_dir, new_class_name + '.py')
    os.makedirs(new_dir, exist_ok=True)

    # The log line keeps its shell-like form, but the copy itself is done with
    # shutil so paths containing spaces or shell metacharacters are handled
    # correctly and a failed copy raises instead of being silently ignored.
    print(f'cp {original_filepath} {new_filepath}')
    shutil.copy(original_filepath, new_filepath)
    with open(new_filepath, 'r', encoding='utf-8') as f:
        doc = f.read()

    # Rename only quoted task names and `class <Name>` declarations, so other
    # occurrences of the same text (e.g. inside longer identifiers) stay intact.
    name_prefix = '"'
    name_prefix2 = "'"
    class_prefix = 'class '
    doc = doc.replace(name_prefix + task_name, name_prefix + new_name)
    doc = doc.replace(name_prefix2 + task_name, name_prefix2 + new_name)
    doc = doc.replace(class_prefix + class_name, class_prefix + new_class_name)

    # Swap English language codes for Portuguese ones, unless the file already
    # declares Portuguese (in which case it is left as is).
    if 'por-Latn' not in doc and 'por_Latn' not in doc:
        doc = doc.replace('eng-Latn', 'por-Latn')
        doc = doc.replace('eng_Latn', 'por_Latn')
        doc = doc.replace('"eng', '"por')
        doc = doc.replace('eng"', 'por"')
        doc = doc.replace('"en"', '"pt"')
        doc = doc.replace('"en-', '"pt-')
        doc = doc.replace('-en"', '-pt"')

    lines = doc.split('\n')
    for i, line in enumerate(lines):
        # Pin the dataset revision to "main", keeping the line's indentation
        # (everything before the first double quote).
        if '"revision": "' in line:
            lines[i] = " " * line.index('"') + '"revision": "main",'
        # Point the dataset path at the translated copy.
        # NOTE(review): the target namespace here is "pt-mteb", while other
        # cells in this notebook use the "mteb-pt" organisation — confirm
        # which one is intended.
        if '"path": "' in line:
            if "mteb/" in line:
                lines[i] = lines[i].replace("mteb/", "pt-mteb/translated_")
            else:
                s1 = line.split(':')
                if '/' in s1[-1]:
                    # "<owner>/<repo>": keep only the repo part.
                    s2 = s1[-1].split('/')[-1]
                else:
                    # No owner: keep everything after the opening quote.
                    s2 = s1[-1][s1[-1].find('"') + 1:]
                path = ' "pt-mteb/translated_' + s2
                lines[i] = s1[0] + ':' + path

    with open(new_filepath, 'w', encoding='utf-8') as f:
        f.write('\n'.join(lines))

    # Register the generated module in the task type's __init__.py, once.
    init_filepath = os.path.join(os.path.dirname(original_dir), '__init__.py')
    with open(init_filepath, 'r', encoding='utf-8') as f:
        doc = f.read()
    import_line = f'from .por_translated.{new_class_name} import *'
    if import_line not in doc:
        doc = doc.strip()
        doc += '\n' + import_line + '\n'
        with open(init_filepath, 'w', encoding='utf-8') as f:
            f.write(doc)
    

cp /mnt/e/ceia/mteb/mteb/mteb/tasks/Classification/eng/AmazonPolarityClassification.py /mnt/e/ceia/mteb/mteb/mteb/tasks/Classification/por_translated/PTT_AmazonPolarityClassification.py
cp /mnt/e/ceia/mteb/mteb/mteb/tasks/Classification/eng/Banking77Classification.py /mnt/e/ceia/mteb/mteb/mteb/tasks/Classification/por_translated/PTT_Banking77Classification.py
cp /mnt/e/ceia/mteb/mteb/mteb/tasks/Classification/eng/EmotionClassification.py /mnt/e/ceia/mteb/mteb/mteb/tasks/Classification/por_translated/PTT_EmotionClassification.py
cp /mnt/e/ceia/mteb/mteb/mteb/tasks/Classification/eng/ImdbClassification.py /mnt/e/ceia/mteb/mteb/mteb/tasks/Classification/por_translated/PTT_ImdbClassification.py
cp /mnt/e/ceia/mteb/mteb/mteb/tasks/Classification/eng/ToxicConversationsClassification.py /mnt/e/ceia/mteb/mteb/mteb/tasks/Classification/por_translated/PTT_ToxicConversationsClassification.py
cp /mnt/e/ceia/mteb/mteb/mteb/tasks/Classification/eng/TweetSentimentExtractionClassification.py /mnt/e/ce

In [53]:
# Display the current value of `new_filepath` (the variable assigned for each
# task inside the conversion loop).
new_filepath

'mteb/tasks/STS/eng/PTTradSickrSTS.py'

In [50]:
# NOTE(review): `task_files` is not assigned in any visible cell — presumably
# left over from a removed cell; confirm before re-running on a fresh kernel.
task_files

{'BibleNLPBitextMining': 'mteb/tasks/BitextMining/multilingual/BibleNLPBitextMining.py',
 'BUCCBitextMining': 'mteb/tasks/BitextMining/multilingual/BUCCBitextMining.py',
 'DiaBLaBitextMining': 'mteb/tasks/BitextMining/multilingual/DiaBLaBitextMining.py',
 'FloresBitextMining': 'mteb/tasks/BitextMining/multilingual/FloresBitextMining.py',
 'IN22ConvBitextMining': 'mteb/tasks/BitextMining/multilingual/IN22ConvBitextMining.py',
 'IN22GenBitextMining': 'mteb/tasks/BitextMining/multilingual/IN22GenBitextMining.py',
 'NorwegianCourtsBitextMining': 'mteb/tasks/BitextMining/multilingual/NorwegianCourtsBitextMining.py',
 'norwegian_courts_bitext_mining': 'mteb/tasks/BitextMining/multilingual/norwegian_courts_bitext_mining.py',
 'NTREXBitextMining': 'mteb/tasks/BitextMining/multilingual/NTREXBitextMining.py',
 'RomaTalesBitextMining': 'mteb/tasks/BitextMining/multilingual/RomaTalesBitextMining.py',
 'TatoebaBitextMining': 'mteb/tasks/BitextMining/multilingual/TatoebaBitextMining.py',
 'AmazonCou

In [43]:
# NOTE(review): `task_files` is not assigned in any visible cell — presumably
# left over from a removed cell; confirm before re-running on a fresh kernel.
task_files

{'__init__': 'mteb/tasks/Summarization/fra/__init__.py',
 'BornholmskBitextMining': 'mteb/tasks/BitextMining/dan/BornholmskBitextMining.py',
 'BibleNLPBitextMining': 'mteb/tasks/BitextMining/multilingual/BibleNLPBitextMining.py',
 'BUCCBitextMining': 'mteb/tasks/BitextMining/multilingual/BUCCBitextMining.py',
 'DiaBLaBitextMining': 'mteb/tasks/BitextMining/multilingual/DiaBLaBitextMining.py',
 'FloresBitextMining': 'mteb/tasks/BitextMining/multilingual/FloresBitextMining.py',
 'IN22ConvBitextMining': 'mteb/tasks/BitextMining/multilingual/IN22ConvBitextMining.py',
 'IN22GenBitextMining': 'mteb/tasks/BitextMining/multilingual/IN22GenBitextMining.py',
 'NorwegianCourtsBitextMining': 'mteb/tasks/BitextMining/multilingual/NorwegianCourtsBitextMining.py',
 'norwegian_courts_bitext_mining': 'mteb/tasks/BitextMining/multilingual/norwegian_courts_bitext_mining.py',
 'NTREXBitextMining': 'mteb/tasks/BitextMining/multilingual/NTREXBitextMining.py',
 'RomaTalesBitextMining': 'mteb/tasks/BitextMini

In [75]:
import huggingface_hub
from tqdm.auto import tqdm
import os

# Prefix used for every renamed dataset repository.
translated_prefix = 'translated_'

# Take a snapshot of the listing before renaming anything: the listing is
# otherwise consumed lazily while the repositories it enumerates are moved.
repos_to_rename = list(huggingface_hub.list_datasets(author="mteb-pt", full=False))

for dataset_info in tqdm(repos_to_rename):
    repo_name = os.path.basename(dataset_info.id)
    # Skip repositories that already carry the prefix so re-running the cell
    # does not produce "translated_translated_*" names.
    if repo_name.startswith(translated_prefix):
        continue
    huggingface_hub.move_repo(
        dataset_info.id,
        f'mteb-pt/{translated_prefix}{repo_name}',
        repo_type='dataset',
    )

0it [00:00, ?it/s]

In [78]:
from datasets import load_dataset
# Load the 'rrip' configuration explicitly so this cell does not depend on a
# `dataset` variable left in the kernel by some other cell.
dataset = load_dataset("eduagarcia/PortuLex_benchmark", name='rrip')
# Publish it under the same configuration name in the target repository.
dataset.push_to_hub('eduagarcia/portuguese_benchmark', config_name='rrip')

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/9 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/17.5k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/eduagarcia/portuguese_benchmark/commit/f28ff5f68ad7930e909ee82c212bcdc5427ccd91', commit_message='Upload dataset', commit_description='', oid='f28ff5f68ad7930e909ee82c212bcdc5427ccd91', pr_url=None, pr_revision=None, pr_num=None)

In [79]:
from portuguese_benchmark_disabled import PTBenchmark

In [92]:
# Group configuration names by their citation text so that a citation shared
# by several configurations is printed only once.
citations = {}
for name, config in PTBenchmark.builder_configs.items():
    citations.setdefault(config.citation, []).append(name)

# One block per distinct citation: a "# name1, name2" header line, the
# citation itself, then a blank separator line.
for citation_text, config_names in citations.items():
    print(f"# {', '.join(config_names)}")
    print(citation_text)
    print()

# LeNER-Br
@InProceedings{luz_etal_propor2018,
    author = {Pedro H. {Luz de Araujo} and Te'{o}filo E. {de Campos} and
            Renato R. R. {de Oliveira} and Matheus Stauffer and
            Samuel Couto and Paulo Bermejo},
    title = {{LeNER-Br}: a Dataset for Named Entity Recognition in {Brazilian} Legal Text},
    booktitle = {International Conference on the Computational Processing of Portuguese ({PROPOR})},
    publisher = {Springer},
    series = {Lecture Notes on Computer Science ({LNCS})},
    pages = {313--323},
    year = {2018},
    month = {September 24-26},
    address = {Canela, RS, Brazil},	  
    doi = {10.1007/978-3-319-99722-3_32},
    url = {https://teodecampos.github.io/LeNER-Br/},
}

# assin2-rte, assin2-sts
@inproceedings{real2020assin,
    title={The assin 2 shared task: a quick overview},
    author={Real, Livy and Fonseca, Erick and Oliveira, Hugo Goncalo},
    booktitle={International Conference on Computational Processing of the Portuguese Language},
  

In [102]:
# Markdown table of the benchmark configurations, one column per task family.

# Column order of the table; the header and separator rows are derived from it.
table_columns = ["NER", "Classification", "NLI", "STS"]

# Task family -> list of markdown links "[config name](config url)".
tasks_types = {}
# Raw `task_type` value of a configuration -> column it is listed under.
task_name_map = {
    'ner': 'NER',
    'classification': 'Classification',
    'sts': 'STS',
    'rte': 'NLI',
    'multilabel_classification': 'Classification'
}
for name, config in PTBenchmark.builder_configs.items():
    task_type = task_name_map[config.task_type]
    # NOTE(review): a configuration whose url is None is rendered as
    # "[name](None)"; fall back to the bare name if that ever occurs.
    name_str = f"[{name}]({config.url})"
    tasks_types.setdefault(task_type, []).append(name_str)

print('| ' + ' | '.join(table_columns) + ' |')
print('|' + '---|' * len(table_columns))

# A family with no configuration simply yields an empty column instead of a
# KeyError, and an empty benchmark prints only the header.
columns = [tasks_types.get(task_type, []) for task_type in table_columns]
max_len = max((len(column) for column in columns), default=0)
for i in range(max_len):
    # Shorter columns are padded with empty cells.
    row = [column[i] if i < len(column) else '' for column in columns]
    print('| ' + ' | '.join(row) + ' |')






| NER | Classification | NLI | STS |
|---|---|---|---|
| [LeNER-Br](https://teodecampos.github.io/LeNER-Br/) | [HateBR_offensive_binary](https://github.com/franciellevargas/HateBR) | [assin2-rte](https://sites.google.com/view/assin2) | [assin2-sts](https://sites.google.com/view/assin2) |
| [UlyssesNER-Br-PL-coarse](https://github.com/ulysses-camara/ulysses-ner-br) | [HateBR_offensive_level](https://github.com/franciellevargas/HateBR) |  |  |
| [UlyssesNER-Br-C-coarse](https://github.com/ulysses-camara/ulysses-ner-br) | [brazilian_court_decisions_judgment](https://github.com/lagefreitas/predicting-brazilian-court-decisions) |  |  |
| [UlyssesNER-Br-PL-fine](https://github.com/ulysses-camara/ulysses-ner-br) | [brazilian_court_decisions_unanimity](https://github.com/lagefreitas/predicting-brazilian-court-decisions) |  |  |
| [UlyssesNER-Br-C-fine](https://github.com/ulysses-camara/ulysses-ner-br) | [multi_eurlex_pt](https://github.com/nlpaueb/MultiEURLEX/) |  |  |
| [harem-default](https:

In [96]:
# Display the current contents of the `tasks_types` grouping.
tasks_types

{'ner': ['LeNER-Br',
  'UlyssesNER-Br-PL-coarse',
  'UlyssesNER-Br-C-coarse',
  'UlyssesNER-Br-PL-fine',
  'UlyssesNER-Br-C-fine',
  'harem-default',
  'harem-selective',
  'mapa_pt_coarse',
  'mapa_pt_fine'],
 'rte': ['assin2-rte'],
 'sts': ['assin2-sts'],
 'classification': ['HateBR_offensive_binary',
  'HateBR_offensive_level',
  'brazilian_court_decisions_judgment',
  'brazilian_court_decisions_unanimity',
  'Portuguese_Hate_Speech_binary'],
 'multilabel_classification': ['multi_eurlex_pt']}

In [89]:
# Print one markdown entry per configuration: a bold name, the upper-cased
# task type, a link to the source, then the description.
for name, config in PTBenchmark.builder_configs.items():
    task_label = config.task_type.replace('_', ' ').upper()
    # Backslashes are doubled so the output still contains the literal
    # "\[Link\]" without relying on invalid escape sequences in the string.
    print(f"**{name}** ({task_label}) [\\[Link\\]]({config.url})")
    print()
    print(config.description)
    print()

**LeNER-Br** (NER) [\[Link\]](https://teodecampos.github.io/LeNER-Br/)

LeNER-Br is a Portuguese language dataset for named entity recognition applied to legal documents. 
LeNER-Br consists entirely of manually annotated legislation and legal cases texts and contains tags 
for persons, locations, time entities, organizations, legislation and legal cases. To compose the dataset, 
66 legal documents from several Brazilian Courts were collected. Courts of superior and state levels were considered, 
such as Supremo Tribunal Federal, Superior Tribunal de Justiça, Tribunal de Justiça de Minas Gerais and Tribunal de Contas da União. 
In addition, four legislation documents were collected, such as "Lei Maria da Penha", giving a total of 70 documents.

**assin2-rte** (RTE) [\[Link\]](https://sites.google.com/view/assin2)

The ASSIN 2 corpus is composed of rather simple sentences. Following the procedures of SemEval 2014 Task 1.
The training and validation data are composed, respectively, of 6,5

In [6]:


from datasets import load_dataset

# Dataset / configuration to analyse and the columns that hold text.
analysis_dataset = "mteb-pt/pira2_retrieval"
config_name = "corpus"
text_columns = ["text"]

# Per-split results: number of rows and mean text length in characters.
n_samples = {}
avg_character_length = {}

dataset = load_dataset(analysis_dataset, name=config_name)

for split in dataset.keys():
    split_data = dataset[split]
    n_samples[split] = len(split_data)

    character_lengths = []
    for col in text_columns:
        # Read each text column once instead of materialising every row.
        for texts in split_data[col]:
            # A cell holds either a single string or a collection of strings.
            if isinstance(texts, str):
                texts = [texts]
            character_lengths.extend(len(text) for text in texts)

    # Guard against splits without any text, which would otherwise raise
    # ZeroDivisionError.
    if character_lengths:
        avg_character_length[split] = round(sum(character_lengths) / len(character_lengths), 1)
    else:
        avg_character_length[split] = 0.0

print(f"n_samples={n_samples},")
print(f"avg_character_length={avg_character_length}")


Downloading data:   0%|          | 0.00/718k [00:00<?, ?B/s]

Generating corpus split:   0%|          | 0/702 [00:00<?, ? examples/s]

n_samples={'corpus': 702},
avg_character_length={'corpus': 1745.8}
