In [2]:
from dotenv import dotenv_values
import yaml
import pandas as pd
import re
import pyarrow as pa

# Hugging Face access token, read from the HF_TOKEN entry of the local .env file.
# Subscripting (rather than .get) makes a missing entry fail immediately with KeyError.
hf_token = dotenv_values(".env")["HF_TOKEN"]

In [14]:
from huggingface_hub import HfApi, list_models, ModelCard, repo_exists, utils

# Notebook-level switch: set to a falsy value to keep huggingface_hub's progress bars.
HF_HUB_DISABLE_PROGRESS_BARS = 1
if HF_HUB_DISABLE_PROGRESS_BARS:
    # Binding a Python variable with the same name as the environment variable does not
    # reach the library, so the bars are switched off through its own helper instead.
    utils.disable_progress_bars()

# Configure a HfApi client
hf_api = HfApi(
    endpoint="https://huggingface.co",  # Can be a Private Hub endpoint.
    token=hf_token,  # Token is not persisted on the machine.
)


In [19]:
# Finds the phrase "fine-tuned version of [<name>]" in model card text and captures the
# bracketed <name> (non-greedy, up to the first closing bracket) as group 1.
BASE_MODEL_PATTERN = re.compile(r"fine-tuned version of \[(.*?)\]")

class BaseModelCache:
    """Memoises model cards so each repository is fetched at most once.

    Failed lookups (missing repository or missing README) are remembered as
    ``None`` so they are not retried either.
    """

    # Class-level store shared by every caller: model name -> loaded card, or None
    # when the card could not be loaded.
    cache = {}

    @staticmethod
    def load(model_name):
        """Return the cached card for ``model_name``, loading it on first use.

        Args:
            model_name: Repository id passed to ``ModelCard.load``.

        Returns:
            The loaded card, or ``None`` when the repository or its README
            was not found (a message is printed in that case).
        """
        store = BaseModelCache.cache
        if model_name in store:
            return store[model_name]

        card = None
        try:
            card = ModelCard.load(model_name, ignore_metadata_errors=True)
        except utils.RepositoryNotFoundError:
            print("Repository not found for: " + model_name)
        except utils.EntryNotFoundError:
            print("README not found for: " + model_name)

        # Reached for both successful loads and the two handled failures;
        # any other exception propagates before anything is cached.
        store[model_name] = card
        return card

def get_base_model_dataset(model_name):
    """Return the ``datasets`` entry from a base model's card metadata.

    Args:
        model_name: Repository id of the base model; may be empty or ``None``.

    Returns:
        Whatever the card's metadata stores under ``"datasets"``, or ``None``
        when no name was given, the card could not be loaded, or the key is absent.
    """
    if not model_name:
        return None
    card = BaseModelCache.load(model_name)
    return card.data.get("datasets") if card else None

def extract_base_model_datasets(name, card_data):
    """Find a model's base model and return that base model's datasets.

    The base model is taken from the ``base_model`` metadata field when present;
    otherwise the model's own card text is searched with ``BASE_MODEL_PATTERN``.

    Args:
        name: Repository id of the fine-tuned model (used for the card-text fallback).
        card_data: Mapping of the model's card metadata.

    Returns:
        The result of ``get_base_model_dataset`` for the resolved base model,
        or ``None`` when no base model could be determined.
    """
    base_model = card_data.get("base_model")
    if isinstance(base_model, (list, tuple)):
        # A list-valued base_model is unhashable and would crash the cache lookup;
        # use the first non-empty entry instead.
        base_model = next((entry for entry in base_model if entry), None)

    if not base_model:
        info = BaseModelCache.load(name)
        # load() yields None when the repository or README is missing, so there may
        # be no text to search at all.
        text = (info.text if info else None) or ""
        match = BASE_MODEL_PATTERN.search(text)
        base_model = match.group(1) if match else None

    return get_base_model_dataset(base_model)


def extract_accuracy(results):
    """Return the first usable accuracy value found in model-index results.

    Args:
        results: Iterable of result mappings, each optionally holding a
            ``"metrics"`` list of ``{"type": ..., "value": ...}`` mappings.
            ``None`` is treated as empty.

    Returns:
        The accuracy as a float. For a list-valued metric the first element is
        used. ``0.0`` is returned when no usable accuracy metric exists.
    """
    for result in results or []:
        if not isinstance(result, dict):
            continue
        # The "metrics" key can be present but null in card YAML; treat that as empty
        # instead of failing on iteration.
        for metric in result.get("metrics") or []:
            if not isinstance(metric, dict) or metric.get("type") != "accuracy":
                continue
            accuracy_value = metric.get("value")
            if isinstance(accuracy_value, list) and accuracy_value:
                try:
                    return float(accuracy_value[0])
                except (TypeError, ValueError):
                    # First element is not numeric; keep looking at later metrics.
                    continue
            if isinstance(accuracy_value, (float, int)):
                return float(accuracy_value)
    return 0.0

def extract_dataset(name, card_data, results):
    """Return the dataset names for a model, based on its first usable result.

    When the result's dataset is the generic ``imagefolder`` / ``image_folder``
    loader name, the base model's datasets are appended after it.

    Args:
        name: Repository id of the model.
        card_data: Mapping of the model's card metadata.
        results: Iterable of model-index result mappings.

    Returns:
        A list starting with the result's dataset name, or ``None`` when no
        result carries a dataset mapping.
    """
    for result in results or []:
        dataset = result.get("dataset") if isinstance(result, dict) else None
        if not isinstance(dataset, dict):
            # The key can exist with a null or malformed value; try the next result
            # rather than failing on .get().
            continue

        current_dataset = dataset.get("name")
        if current_dataset not in ("imagefolder", "image_folder"):
            return [current_dataset]

        datasets = extract_base_model_datasets(name, card_data) or []
        if isinstance(datasets, str):
            # A single dataset declared as a plain string must stay one entry,
            # not be unpacked character by character.
            datasets = [datasets]
        return [current_dataset, *datasets]
    return None


def is_valid_card_data(name, card_data):
    """Validate a card's ``model-index`` and extract accuracy and datasets from it.

    A card is valid when ``model-index`` is a non-empty list, every entry has a
    non-empty ``results`` list, and every result contains the keys ``task``,
    ``dataset`` and ``metrics``.

    Args:
        name: Repository id of the model.
        card_data: Parsed card metadata; anything that is not a mapping is invalid.

    Returns:
        ``(True, accuracy, datasets)`` for a valid card, where the two values come
        from ``extract_accuracy`` and ``extract_dataset`` applied to the results of
        all entries; ``(False, None, None)`` otherwise.
    """
    invalid = (False, None, None)
    necessary_keys = ("task", "dataset", "metrics")

    if not isinstance(card_data, dict):
        return invalid
    model_index = card_data.get("model-index")
    if not isinstance(model_index, list) or not model_index:
        return invalid

    # Collect results across every entry: reusing the loop variable after the loop
    # would silently consider only the last entry's results.
    all_results = []
    for entry in model_index:
        results = entry.get("results") if isinstance(entry, dict) else None
        if not isinstance(results, list) or not results:
            return invalid
        for result in results:
            if not isinstance(result, dict):
                return invalid
            if not all(key in result for key in necessary_keys):
                return invalid
        all_results.extend(results)

    return True, extract_accuracy(all_results), extract_dataset(name, card_data, all_results)


In [35]:
# Scan Hub models for the given task and keep those whose card metadata carries a
# complete model-index, recording the extracted accuracy and dataset names for each.
# NOTE(review): the task filter is speech recognition, but the captured outputs further
# down list image-classification datasets - confirm which task this run is meant for.
models = hf_api.list_models(task="automatic-speech-recognition", cardData=True,)

processed_models = []
accuracy_values = []
datasets = []
for model in models:
    if not model.card_data:
        continue

    # The card data is rendered to text and parsed back with the safe YAML loader so
    # that the helpers below work on plain dicts and lists.
    card_data = yaml.safe_load(str(model.card_data))
    if not isinstance(card_data, dict):
        # Empty or scalar YAML would crash the validation and abort the whole scan.
        continue

    is_valid, accuracy, dataset = is_valid_card_data(model.id, card_data)
    if not is_valid:
        continue

    model.card_data = card_data
    processed_models.append(model)
    accuracy_values.append(accuracy or 0)
    datasets.append(dataset or None)

# One row per accepted model, highest accuracy first.
df = pd.DataFrame({'model': processed_models, 'accuracy': accuracy_values, 'dataset': datasets})
df = df.sort_values(by='accuracy', ascending=False)

Invalid model-index. Not loading eval results into CardData.
Invalid model-index. Not loading eval results into CardData.
Invalid model-index. Not loading eval results into CardData.
Invalid model-index. Not loading eval results into CardData.
Invalid model-index. Not loading eval results into CardData.
Invalid model-index. Not loading eval results into CardData.
Invalid model-index. Not loading eval results into CardData.
Invalid model-index. Not loading eval results into CardData.
Invalid model-index. Not loading eval results into CardData.
Invalid model-index. Not loading eval results into CardData.
Invalid model-index. Not loading eval results into CardData.
Invalid model-index. Not loading eval results into CardData.
Invalid model-index. Not loading eval results into CardData.
Invalid model-index. Not loading eval results into CardData.
Invalid model-index. Not loading eval results into CardData.
Invalid model-index. Not loading eval results into CardData.
Invalid model-index. Not

In [36]:
# Persist the scan results in two formats.
# NOTE(review): the 'model' column holds model-info objects, so the CSV presumably stores
# only their text representation; the pickle is the copy that keeps the objects themselves.
# NOTE(review): to_csv is called without index=False, so the row index is written as an
# extra unnamed column - confirm that is intended.
df.to_csv("speech_models.csv")
df.to_pickle("speech_models.pkl")

In [28]:
# Expand the list-valued 'dataset' column into one row per dataset name; the other
# columns and the original index value are repeated on each of those rows.
df_explode = df.explode("dataset")
df_explode

Unnamed: 0,model,accuracy,dataset
2939,ModelInfo(id='ozzyonfire/bird-species-classifi...,96.8,Bird Species
1253,ModelInfo(id='chriamue/bird-species-classifier...,96.8,Bird Species
2639,"ModelInfo(id='alirzb/S1_M1_R1_beit_42507336', ...",1.0,imagefolder
2639,"ModelInfo(id='alirzb/S1_M1_R1_beit_42507336', ...",1.0,imagenet
2639,"ModelInfo(id='alirzb/S1_M1_R1_beit_42507336', ...",1.0,imagenet-21k
...,...,...,...
3009,ModelInfo(id='sbottazziunsam/10-classifier-fin...,0.0,imagenet-1k
3007,ModelInfo(id='sbottazziunsam/9-classifier-fine...,0.0,imagefolder
3007,ModelInfo(id='sbottazziunsam/9-classifier-fine...,0.0,chest X-rays
2742,ModelInfo(id='debajyotidasgupta/convnextv2-bas...,0.0,imagefolder


In [32]:
# One DataFrame per dataset name, with the now-redundant 'dataset' column removed.
# NOTE(review): groupby skips rows whose key is missing by default, so models without
# a dataset would not appear in any group - confirm that is acceptable.
grouped_dfs = {name: group.drop(columns='dataset') for name, group in df_explode.groupby('dataset')}

for dataset_name in grouped_dfs:
    print(f"Dataset: {dataset_name}")

# Number of distinct dataset names.
count = len(grouped_dfs)
count

Dataset: ./data/games-ad-0306
Dataset: ./mgr/dataset/HF_DS
Dataset: 1aurent/Kather-texture-2016
Dataset: 1aurent/LC25000
Dataset: Beans
Dataset: Bird Species
Dataset: CIFAR-10
Dataset: CIFAR-100
Dataset: CIFAR100
Dataset: Camelyon16[Meta]
Dataset: CelebA-faces
Dataset: Cifar10
Dataset: Cifar100
Dataset: Dataset_points_durs_v1
Dataset: Dog Food
Dataset: Falah/Alzheimer_MRI
Dataset: FastJobs/Visual_Emotional_Analysis
Dataset: HumanEval
Dataset: Human_Action_Recognition
Dataset: Indian-Food-Images
Dataset: JLB-JLB/seizure_eeg_greyscale_224x224_6secWindow
Dataset: KTH-TIPS2-b
Dataset: MNIST
Dataset: Matthijs/snacks
Dataset: New Plant Diseases Dataset
Dataset: RiniPL/Dementia_Dataset
Dataset: SVHN
Dataset: TCGA-BRCA
Dataset: action_class
Dataset: agent_action_class
Dataset: amazonian_fish_classifier_data
Dataset: arabic-handwritten-characters
Dataset: bazyl/GTSRB
Dataset: beans
Dataset: bird-data
Dataset: bird_species_dataset
Dataset: blurry images
Dataset: brain-tumor-collection
Dataset: b

111

In [18]:
# Scratch check: materialise every model listed for the "hf-internal-testing" author,
# with card data requested, so the result can be inspected as a plain list.
cheese = list(hf_api.list_models(author="hf-internal-testing", cardData=True))
cheese

[]

In [None]:
# Load one model card directly (bypassing BaseModelCache) and display it for inspection.
info = ModelCard.load("hf-internal-testing/tiny-random-vit")
info