# Getting all model JSONs

Here we scrape the Hugging Face Hub models API to recover the fine-tuning relationships between models, which appear as `base_model:*` tags in each model's JSON record.

The endpoint https://huggingface.co/api/models
returns a JSON array of model objects in the following format:
[{
    "_id": "687060f05721fba56ca177a8",
    "id": "moonshotai/Kimi-K2-Instruct",
    "likes": 472,
    "trendingScore": 472,
    "private": false,
    "downloads": 13356,
    "tags": [
      "transformers",
      "safetensors",
      "kimi_k2",
      "text-generation",
      "conversational",
      "custom_code",
      "doi:10.57967/hf/5976",
      "license:other",
      "autotrain_compatible",
      "endpoints_compatible",
      "fp8",
      "region:us"
    ],
    "pipeline_tag": "text-generation",
    "library_name": "transformers",
    "createdAt": "2025-07-11T00:55:12.000Z",
    "modelId": "moonshotai/Kimi-K2-Instruct"
  },
  {
    "_id": "685ffb0a9c4d599d2a98bc2c",
    "id": "THUDM/GLM-4.1V-9B-Thinking",
    "likes": 568,
    "trendingScore": 368,
    "private": false,
    "downloads": 33839,
    "tags": [
      "transformers",
      "safetensors",
      "glm4v",
      "image-text-to-text",
      "reasoning",
      "conversational",
      "en",
      "zh",
      "arxiv:2507.01006",
      "base_model:THUDM/GLM-4-9B-0414",
      "base_model:finetune:THUDM/GLM-4-9B-0414",
      "license:mit",
      "endpoints_compatible",
      "region:us"
    ],
    "pipeline_tag": "image-text-to-text",
    "library_name": "transformers",
    "createdAt": "2025-06-28T14:24:10.000Z",
    "modelId": "THUDM/GLM-4.1V-9B-Thinking"
  }]

We'd like to paginate through this endpoint to retrieve every model on the Hub, keeping for each model its "modelId" and the full JSON record associated with it.

In [None]:
import requests
import pandas as pd

def fetch_all_models(start_url=None, max_pages=1000, timeout=30, session=None):
    """Page through the Hugging Face models API and collect every model record.

    Starting at ``start_url``, each page is fetched, every model object on it
    is stored, and the ``rel="next"`` URL advertised by the response's ``Link``
    header is followed. The crawl stops when there is no next page, a page
    comes back empty, a request fails, or ``max_pages`` pages were requested.
    Whatever was collected up to that point is always returned.

    Args:
        start_url: URL of the first page to fetch. When ``None``, a hard-coded
            ``sort=trendingScore`` cursor URL is used (presumably a resume
            point from an earlier run -- confirm before relying on it).
        max_pages: Upper bound on the number of pages requested.
        timeout: Seconds to wait for each HTTP response before giving up.
        session: Optional object exposing ``get(url, timeout=...)``, such as a
            ``requests.Session``. Defaults to the ``requests`` module itself.

    Returns:
        A ``(models, urls)`` tuple. ``models`` is a list of
        ``{"modelId": ..., "fullJson": ...}`` dicts, one per model, in the
        order received. ``urls`` lists every next-page URL discovered, in
        order; if the crawl was cut short by an error or by ``max_pages``
        after at least one page, its last entry is the page to resume from.
    """
    if start_url is None:
        start_url = "https://huggingface.co/api/models?sort=trendingScore&cursor=eyIkb3IiOlt7InRyZW5kaW5nU2NvcmUiOjAsIl9pZCI6eyIkZ3QiOiI2NzAyYTc0MDIzZGY1YTdiZDgxZDcxYzIifX0seyJ0cmVuZGluZ1Njb3JlIjp7IiRsdCI6MH19LHsidHJlbmRpbmdTY29yZSI6bnVsbH1dfQ%3D%3D"
    http = requests if session is None else session

    url = start_url
    models = []
    urls = []
    for i in range(max_pages):
        print(i)
        try:
            # Without a timeout a single stalled connection blocks forever.
            response = http.get(url, timeout=timeout)
        except requests.RequestException as exc:
            # Keep the pages already gathered instead of losing them all.
            print(f"Request for {url} failed ({exc}); "
                  f"returning {len(models)} models collected so far.")
            break
        if response.status_code != 200:
            # Say why we stopped so a truncated crawl is not mistaken for a
            # complete one (e.g. HTTP 429 when rate limited).
            print(f"Stopping: HTTP {response.status_code} for {url}; "
                  f"returning {len(models)} models collected so far.")
            break
        data = response.json()
        if not data:
            break
        for model in data:
            models.append({"modelId": model["modelId"], "fullJson": model})

        # ``Response.links`` parses the Link header, so this stays correct
        # when several relations are present or the URL contains ';'.
        next_url = response.links.get("next", {}).get("url")
        if not next_url:
            break

        url = next_url
        urls.append(url)
    else:
        # The loop ran out of iterations while a next page was still pending.
        print(f"Stopped after {max_pages} pages; more pages remain.")
    return models, urls

def create_model_dataset():
    """Fetch all model records and wrap them in a pandas DataFrame.

    Returns:
        A ``(df, urls)`` tuple: ``df`` is a DataFrame built from the list of
        records returned by ``fetch_all_models``, and ``urls`` is the list of
        page URLs that function returned alongside them.
    """
    records, page_urls = fetch_all_models()
    return pd.DataFrame(records), page_urls

# Create the dataset. This runs the full crawl (one HTTP request per page),
# so executing this cell needs network access and may take a while.
# `urls_2` keeps the page URLs that were followed during the crawl.
model_dataset_2, urls_2 = create_model_dataset()

# Preview the first rows as the cell's output.
model_dataset_2.head()


In [None]:
# Persist the scraped records to disk.
# NOTE(review): the DataFrame index is written as an unnamed first column
# (pandas default), and the "fullJson" column holds parsed JSON objects, so it
# is presumably stored as their Python string form rather than as JSON --
# confirm that downstream readers expect this.
# NOTE(review): assumes the "data/" directory already exists.
model_dataset_2.to_csv("data/ai_ecosystem_jsons_dataset.csv")