In [2]:
from dotenv import dotenv_values
import yaml
import pandas as pd
import re
import pyarrow as pa

# Hugging Face access token, read from the HF_TOKEN entry of the local ".env" file.
# The value is looked up by key, so the entry must be present in that file.
hf_token = dotenv_values(".env")['HF_TOKEN']

In [3]:
from huggingface_hub import HfApi, list_models, ModelCard, repo_exists, utils

# Silence huggingface_hub progress bars for this kernel.
# Assigning a Python variable named HF_HUB_DISABLE_PROGRESS_BARS has no effect:
# the library only honours the *environment* variable of that name, and only if
# it is set before huggingface_hub is imported. The helper below works after import.
utils.disable_progress_bars()

# Configure a HfApi client
hf_api = HfApi(
    endpoint="https://huggingface.co", # Can be a Private Hub endpoint.
    token=hf_token, # Token is not persisted on the machine.
)


In [17]:
# Captures the bracketed text that follows "fine-tuned version of "
# (non-greedy, i.e. everything up to the first closing "]").
BASE_MODEL_PATTERN = re.compile(
    r"fine-tuned version of \[(.*?)\]"
)

class BaseModelCache:
    """Memoises model cards by repository id so each card is fetched at most once."""

    # Shared store: repo id -> loaded ModelCard, or None when the repository
    # or its README could not be found.
    cache = {}

    @staticmethod
    def load(model_name):
        """Return the card for ``model_name``, fetching and caching it on first use.

        A missing repository or missing README is reported on stdout and cached
        as ``None``, so the lookup is not retried on later calls.
        """
        store = BaseModelCache.cache
        if model_name in store:
            return store[model_name]

        try:
            card = ModelCard.load(model_name, ignore_metadata_errors=True)
        except utils.RepositoryNotFoundError:
            print("Repository not found for: " + model_name)
            card = None
        except utils.EntryNotFoundError:
            print("README not found for: " + model_name)
            card = None

        store[model_name] = card
        return card

def get_base_model_dataset(model_name):
    """Return the ``datasets`` entry from the card metadata of ``model_name``.

    Yields ``None`` when no model name is given or when the cached card
    lookup produced nothing.
    """
    if not model_name:
        return None

    card = BaseModelCache.load(model_name)
    if not card:
        return None

    return card.data.get("datasets")

def extract_base_model(name, card_data):
    """Determine which base model ``name`` was fine-tuned from.

    Args:
        name: Repository id of the fine-tuned model.
        card_data: Mapping of the model's card metadata.

    Returns:
        The ``base_model`` metadata value when it is present and truthy.
        Otherwise the bracketed name captured by ``BASE_MODEL_PATTERN`` in the
        card text, or ``None`` when the card is unavailable or has no match.
    """
    base_model = card_data.get("base_model")
    if base_model:
        return base_model

    info = BaseModelCache.load(name)
    if not info:
        # BaseModelCache.load returns None for a missing repository/README;
        # there is no text to search in that case.
        return None

    match = BASE_MODEL_PATTERN.search(info.text or "")
    return match.group(1) if match else None

def extract_base_model_datasets(name, card_data):
    """Resolve the base model of ``name`` and return that base model's datasets.

    The result is also echoed to stdout before being returned.
    """
    datasets = get_base_model_dataset(extract_base_model(name, card_data))
    print(f"Base model datasets for {name}: {datasets}")
    return datasets

def extract_accuracy(results):
    """Return the first usable accuracy value found in ``results`` as a float.

    Each result's ``metrics`` list is scanned in order for entries whose
    ``type`` is ``"accuracy"``. A numeric value is returned directly; for a
    non-empty list its first element is used. Accuracy entries with any other
    kind of value are skipped. ``0.0`` is returned when nothing qualifies.
    """
    for entry in results:
        accuracy_metrics = (
            candidate
            for candidate in entry.get("metrics", [])
            if candidate.get("type") == "accuracy"
        )
        for candidate in accuracy_metrics:
            raw = candidate.get("value")
            if isinstance(raw, (float, int)):
                return float(raw)
            if isinstance(raw, list) and raw:
                return float(raw[0])
    return 0.0

def extract_dataset(name, card_data, results):
    """Return the dataset names associated with a model as a flat list.

    The first result carrying a ``dataset`` mapping decides the outcome:

    * If its ``name`` is the generic ``"imagefolder"`` / ``"image_folder"``
      placeholder, the list also includes the datasets declared by the base
      model (looked up via ``extract_base_model_datasets``).
    * Otherwise the list holds just that name.

    Args:
        name: Repository id of the model.
        card_data: Mapping of the model's card metadata.
        results: Iterable of model-index result mappings.

    Returns:
        A list of dataset names, or ``None`` when no result has a ``dataset``
        mapping.
    """
    for result in results:
        dataset = result.get("dataset")
        if not isinstance(dataset, dict):
            # e.g. ``dataset: null`` in the card; skip instead of failing on ``.get``.
            continue

        current_dataset = dataset.get("name")
        if current_dataset not in ("imagefolder", "image_folder"):
            return [current_dataset]

        inherited = extract_base_model_datasets(name, card_data) or []
        if isinstance(inherited, str):
            # A card may declare ``datasets`` as a single string; star-unpacking
            # it directly would split the name into individual characters.
            inherited = [inherited]
        return [current_dataset, *inherited]
    return None


def is_valid_card_data(model, card_data):
    """Validate a model's ``model-index`` metadata and extract summary fields.

    The card is accepted only if ``model-index`` is non-empty, every entry has
    a non-empty ``results`` list, and every result contains the keys
    ``task``, ``dataset`` and ``metrics``.

    Returns:
        ``(False, None, None, None)`` when validation fails, otherwise
        ``(True, accuracy, datasets, base_model)``. Note that accuracy and
        datasets are computed from the results of the *last* ``model-index``
        entry only.
    """
    name = model.id
    rejected = (False, None, None, None)
    required = ("task", "dataset", "metrics")

    model_index = card_data.get("model-index", [])
    if not model_index:
        return rejected

    results = None
    for entry in model_index:
        results = entry.get("results", [])
        if not results:
            return rejected
        if any(key not in result for result in results for key in required):
            return rejected

    # ``results`` still refers to the final entry's results at this point.
    return (
        True,
        extract_accuracy(results),
        extract_dataset(name, card_data, results),
        extract_base_model(name, card_data),
    )


In [18]:
# Ask the Hub for every model tagged as trained on "cats_vs_dogs", including card metadata.
models = hf_api.list_models(trained_dataset="cats_vs_dogs", cardData=True)

# Parallel column buffers; one position per accepted model.
processed_models, accuracy_values, datasets = [], [], []
base_models, likes, downloads = [], [], []

for model in models:
    if not model.card_data:
        continue

    # Round-trip the card metadata through its string form to get a plain mapping.
    card_data = yaml.safe_load(str(model.card_data))
    is_valid, accuracy, dataset, base_model = is_valid_card_data(model, card_data)
    if not is_valid:
        continue

    model.card_data = card_data
    processed_models.append(model.id)
    accuracy_values.append(accuracy or 0)
    datasets.append(dataset or None)
    base_models.append(base_model)
    likes.append(model.likes)
    downloads.append(model.downloads)

# Assemble the table and rank it, best accuracy first.
df = pd.DataFrame(
    {
        'model': processed_models,
        'accuracy': accuracy_values,
        'dataset': datasets,
        'base_model': base_models,
        'likes': likes,
        'downloads': downloads,
    }
).sort_values(by='accuracy', ascending=False)
df

Unnamed: 0,model,accuracy,dataset,base_model,likes,downloads
6,danieltur/my_awesome_catdog_model,1.0,[cats_vs_dogs],google/vit-base-patch16-224-in21k,0,12
4,ChasingMercer/beit-base,0.997651,[cats_vs_dogs],microsoft/beit-base-patch16-224-pt22k-ft22k,0,21
3,efederici/convnext-base-224-22k-1k-orig-cats-v...,0.997333,[cats_vs_dogs],facebook/convnext-base-224-22k-1k,0,56
8,Camilosan/Modelo-catsVSdogs,0.995,[cats_vs_dogs],google/vit-base-patch16-224-in21k,0,1
1,ismgar01/vit-base-cats-vs-dogs,0.993736,[cats_vs_dogs],google/vit-base-patch16-224-in21k,0,48
2,nateraw/vit-base-cats-vs-dogs,0.993451,[cats_vs_dogs],google/vit-base-patch16-224-in21k,1,21
12,cppgohan/resnet-50-finetuned-dog-vs-cat,0.991884,[cats_vs_dogs],microsoft/resnet-50,0,16
5,tangocrazyguy/resnet-50-finetuned-cats_vs_dogs,0.989321,[cats_vs_dogs],microsoft/resnet-50,0,2
0,akahana/vit-base-cats-vs-dogs,0.988326,[cats_vs_dogs],google/vit-base-patch16-224-in21k,0,325
11,Amadeus99/cat_vs_dog_classifier,0.985761,[cats_vs_dogs],google/vit-base-patch16-224-in21k,0,3


In [5]:
# Save `df` twice, as CSV and as a pandas pickle. Both paths are relative,
# so the files are written to the current working directory.
df.to_csv("cats_vs_dogs.csv")
df.to_pickle("cats_vs_dogs.pkl")

In [28]:
# Expand the list-valued "dataset" column so each dataset name gets its own row;
# the other columns (and the index label) are repeated for every expanded row.
df_explode = df.explode("dataset")
df_explode

Unnamed: 0,model,accuracy,dataset
2939,ModelInfo(id='ozzyonfire/bird-species-classifi...,96.8,Bird Species
1253,ModelInfo(id='chriamue/bird-species-classifier...,96.8,Bird Species
2639,"ModelInfo(id='alirzb/S1_M1_R1_beit_42507336', ...",1.0,imagefolder
2639,"ModelInfo(id='alirzb/S1_M1_R1_beit_42507336', ...",1.0,imagenet
2639,"ModelInfo(id='alirzb/S1_M1_R1_beit_42507336', ...",1.0,imagenet-21k
...,...,...,...
3009,ModelInfo(id='sbottazziunsam/10-classifier-fin...,0.0,imagenet-1k
3007,ModelInfo(id='sbottazziunsam/9-classifier-fine...,0.0,imagefolder
3007,ModelInfo(id='sbottazziunsam/9-classifier-fine...,0.0,chest X-rays
2742,ModelInfo(id='debajyotidasgupta/convnextv2-bas...,0.0,imagefolder


In [4]:
# One sub-table per dataset name, with the now-redundant "dataset" column removed.
# Requires `df_explode` from the explode cell to exist in the kernel.
grouped_dfs = {
    name: group.drop(columns='dataset')
    for name, group in df_explode.groupby('dataset')
}

# List every dataset found while counting them.
count = 0
for count, (dataset_name, data) in enumerate(grouped_dfs.items(), start=1):
    print(f"Dataset: {dataset_name}")

grouped_dfs["cats_vs_dogs"]["model"]

NameError: name 'df_explode' is not defined

In [18]:
# Materialise every model published by the "hf-internal-testing" author, with card metadata.
cheese = list(hf_api.list_models(author="hf-internal-testing", cardData=True))
cheese

[]

In [None]:
# Load the model card of one specific repository and display it as the cell output.
info = ModelCard.load("hf-internal-testing/tiny-random-vit")
info