In [1]:
from dotenv import dotenv_values
import yaml
import pandas as pd
import re
import pyarrow as pa

hf_token = dotenv_values(".env")['HF_TOKEN']

In [2]:
from huggingface_hub import HfApi, list_models, ModelCard, repo_exists, utils

HF_HUB_DISABLE_PROGRESS_BARS=1
# Configure a HfApi client
hf_api = HfApi(
    endpoint="https://huggingface.co", # Can be a Private Hub endpoint.
    token=hf_token, # Token is not persisted on the machine.
)


In [3]:
BASE_MODEL_PATTERN = re.compile(r"fine-tuned version of \[(.*?)\]")

class BaseModelCache:
    cache = {}

    @staticmethod
    def load(model_name):
        if model_name not in BaseModelCache.cache:
            try:
                BaseModelCache.cache[model_name] = ModelCard.load(model_name, ignore_metadata_errors=True)
            except utils.RepositoryNotFoundError:
                print("Repository not found for: " + model_name)
                BaseModelCache.cache[model_name] = None
            except utils.EntryNotFoundError:
                print("README not found for: " + model_name)
                BaseModelCache.cache[model_name] = None
        return BaseModelCache.cache[model_name]

def get_base_model_dataset(model_name):
    """Retrieve dataset information for a base model."""
    if model_name:
        info = BaseModelCache.load(model_name)
        if info:
            return info.data.get("datasets")
    return None

def extract_base_model(name, card_data):
    """Extract base model info"""
    base_model = card_data.get("base_model")
    if not base_model:
        info = BaseModelCache.load(name)
        match = BASE_MODEL_PATTERN.search(info.text)
        base_model = match.group(1) if match else None
    return base_model

def extract_base_model_datasets(name, card_data):
    """Extract datasets for the base model."""
    base_model = extract_base_model(name, card_data)
    datasets = get_base_model_dataset(base_model)
    print(f"Base model datasets for {name}: {datasets}")
    return datasets

def extract_accuracy(results):
    for result in results:
        for metric in result.get("metrics", []):
            if metric.get("type") == "accuracy":
                accuracy_value = metric.get("value")
                if isinstance(accuracy_value, list) and accuracy_value:
                    return float(accuracy_value[0])
                elif isinstance(accuracy_value, (float, int)):
                    return float(accuracy_value)
    return 0.0

def extract_dataset(name, card_data, results):
    for result in results:
        dataset = result.get("dataset")
        current_dataset = dataset.get("name")
        if current_dataset in ["imagefolder", "image_folder"]:
            datasets = extract_base_model_datasets(name, card_data) or []
            return [current_dataset, *datasets]
        else:
            return [current_dataset]
    return None


def is_valid_card_data(model, card_data):
    name = model.id
    necessary_keys = ["task", "dataset", "metrics"]
    model_index = card_data.get("model-index", [])
    if not model_index:
        return False, None, None, None

    for entry in model_index:
        results = entry.get("results", [])
        if not results:
            return False, None, None, None
        for result in results:
            if not all(key in result for key in necessary_keys):
                return False, None, None, None
    return True, extract_accuracy(results), extract_dataset(name, card_data, results), extract_base_model(name, card_data)


In [4]:
models = hf_api.list_models(task="object-detection", cardData=True,)

processed_models = []
accuracy_values = []
datasets = []
base_models = []
likes = []
downloads = []

for model in models:
    if model.card_data:
        card_data = yaml.safe_load(str(model.card_data))
        is_valid, accuracy, dataset, base_model = is_valid_card_data(model, card_data)
        if is_valid:
            model.card_data = card_data
            processed_models.append(model.id)
            accuracy_values.append(accuracy or 0)
            datasets.append(dataset or None)
            base_models.append(base_model)
            likes.append(model.likes)
            downloads.append(model.downloads)

df = pd.DataFrame({'model': processed_models, 'accuracy': accuracy_values, 'dataset': datasets, 'base_model': base_models, 'likes': likes, 'downloads': downloads})
df = df.sort_values(by='accuracy', ascending=False)
df

README.md:   0%|          | 0.00/1.92k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/1.91k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/2.08k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/2.08k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/2.08k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/2.04k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/2.04k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/2.04k [00:00<?, ?B/s]

Invalid model-index. Not loading eval results into CardData.


README.md:   0%|          | 0.00/1.83k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/1.83k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/1.83k [00:00<?, ?B/s]

Invalid model-index. Not loading eval results into CardData.
Invalid model-index. Not loading eval results into CardData.
Invalid model-index. Not loading eval results into CardData.
Invalid model-index. Not loading eval results into CardData.
Invalid model-index. Not loading eval results into CardData.


README.md:   0%|          | 0.00/1.86k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/1.86k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/1.86k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/1.83k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/1.83k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/1.83k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/1.80k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/1.80k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/1.80k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/1.85k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/1.85k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/1.85k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/1.77k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/1.77k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/1.77k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/1.81k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/1.81k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/1.98k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/1.98k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/1.98k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/1.88k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/1.88k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/1.81k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/1.88k [00:00<?, ?B/s]

Invalid model-index. Not loading eval results into CardData.
Invalid model-index. Not loading eval results into CardData.
Invalid model-index. Not loading eval results into CardData.
Invalid model-index. Not loading eval results into CardData.
Invalid model-index. Not loading eval results into CardData.
Invalid model-index. Not loading eval results into CardData.


README.md:   0%|          | 0.00/1.92k [00:00<?, ?B/s]

Invalid model-index. Not loading eval results into CardData.
Invalid model-index. Not loading eval results into CardData.


README.md:   0%|          | 0.00/1.45k [00:00<?, ?B/s]

Invalid model-index. Not loading eval results into CardData.
Invalid model-index. Not loading eval results into CardData.
Invalid model-index. Not loading eval results into CardData.
Invalid model-index. Not loading eval results into CardData.
Invalid model-index. Not loading eval results into CardData.
Invalid model-index. Not loading eval results into CardData.


README.md:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

Invalid model-index. Not loading eval results into CardData.
Invalid model-index. Not loading eval results into CardData.
Invalid model-index. Not loading eval results into CardData.
Invalid model-index. Not loading eval results into CardData.
Invalid model-index. Not loading eval results into CardData.
Invalid model-index. Not loading eval results into CardData.
Invalid model-index. Not loading eval results into CardData.
Invalid model-index. Not loading eval results into CardData.
Invalid model-index. Not loading eval results into CardData.
Invalid model-index. Not loading eval results into CardData.
Invalid model-index. Not loading eval results into CardData.
Invalid model-index. Not loading eval results into CardData.
Invalid model-index. Not loading eval results into CardData.


README.md:   0%|          | 0.00/2.08k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/2.08k [00:00<?, ?B/s]

Invalid model-index. Not loading eval results into CardData.
Invalid model-index. Not loading eval results into CardData.
Invalid model-index. Not loading eval results into CardData.
Invalid model-index. Not loading eval results into CardData.
Invalid model-index. Not loading eval results into CardData.
Invalid model-index. Not loading eval results into CardData.
Invalid model-index. Not loading eval results into CardData.
Invalid model-index. Not loading eval results into CardData.
Invalid model-index. Not loading eval results into CardData.
Invalid model-index. Not loading eval results into CardData.
Invalid model-index. Not loading eval results into CardData.
Invalid model-index. Not loading eval results into CardData.
Invalid model-index. Not loading eval results into CardData.
Invalid model-index. Not loading eval results into CardData.
Invalid model-index. Not loading eval results into CardData.


README.md:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/1.91k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/1.95k [00:00<?, ?B/s]

Invalid model-index. Not loading eval results into CardData.


README.md:   0%|          | 0.00/2.01k [00:00<?, ?B/s]

Invalid model-index. Not loading eval results into CardData.
Invalid model-index. Not loading eval results into CardData.


README.md:   0%|          | 0.00/2.01k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/2.01k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/2.03k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/2.04k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/1.71k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/2.05k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/1.63k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

Unnamed: 0,model,accuracy,dataset,base_model,likes,downloads
3,Narsil/layoutlmv3-finetuned-funsd,0.833027,[nielsr/funsd-layoutlmv3],microsoft/layoutlmv3-base,0,621
61,keremberke/yolov8s-protective-equipment-detection,0.000000,[protective-equipment-detection],,3,2920
60,keremberke/yolov8n-protective-equipment-detection,0.000000,[protective-equipment-detection],,0,2910
59,keremberke/yolov8m-hard-hat-detection,0.000000,[hard-hat-detection],,8,4740
58,keremberke/yolov8s-hard-hat-detection,0.000000,[hard-hat-detection],,1,3181
...,...,...,...,...,...,...
25,keremberke/yolov5n-license-plate,0.000000,[keremberke/license-plate-object-detection],,17,34979
24,keremberke/yolov5m-blood-cell,0.000000,[keremberke/blood-cell-object-detection],,2,1087
23,keremberke/yolov5s-blood-cell,0.000000,[keremberke/blood-cell-object-detection],,2,673
22,keremberke/yolov5n-blood-cell,0.000000,[keremberke/blood-cell-object-detection],,5,575


In [5]:
df.to_csv("obj_detection_models.csv")
df.to_pickle("obj_detection_models.pkl")

In [28]:
df_explode = df.explode("dataset")
df_explode

Unnamed: 0,model,accuracy,dataset
2939,ModelInfo(id='ozzyonfire/bird-species-classifi...,96.8,Bird Species
1253,ModelInfo(id='chriamue/bird-species-classifier...,96.8,Bird Species
2639,"ModelInfo(id='alirzb/S1_M1_R1_beit_42507336', ...",1.0,imagefolder
2639,"ModelInfo(id='alirzb/S1_M1_R1_beit_42507336', ...",1.0,imagenet
2639,"ModelInfo(id='alirzb/S1_M1_R1_beit_42507336', ...",1.0,imagenet-21k
...,...,...,...
3009,ModelInfo(id='sbottazziunsam/10-classifier-fin...,0.0,imagenet-1k
3007,ModelInfo(id='sbottazziunsam/9-classifier-fine...,0.0,imagefolder
3007,ModelInfo(id='sbottazziunsam/9-classifier-fine...,0.0,chest X-rays
2742,ModelInfo(id='debajyotidasgupta/convnextv2-bas...,0.0,imagefolder


In [4]:
grouped_dfs = {name: group.drop(columns='dataset') for name, group in df_explode.groupby('dataset')}

count = 0
for dataset_name, data in grouped_dfs.items():
    count +=1
    print(f"Dataset: {dataset_name}")

grouped_dfs["cats_vs_dogs"]["model"]

NameError: name 'df_explode' is not defined

In [18]:
cheese = hf_api.list_models(author="hf-internal-testing", cardData=True)
cheese = [model for model in cheese]
cheese

[]

In [None]:
info = ModelCard.load("hf-internal-testing/tiny-random-vit")
info