In [1]:
import ast
import json
import re

import numpy as np
import pandas as pd


# Load Metadata

In [2]:
# JSON Lines files with one metadata record per repo; parsed below by get_json_records.
model_results_filename = "hf_model_metadata_20250909_153100.jsonl"
dataset_results_filename = "hf_dataset_metadata_20250909_152434.jsonl"

# Repo listing tables (not referenced again in the visible cells).
modeldf = pd.read_csv("hf_models_20250909_113720.csv")
dsdf = pd.read_csv("hf_datasets_20250909_114159.csv")

# Per-repo download totals, joined onto the metadata frames further down.
model_downloads = pd.read_csv("hf_model_downloads_20250909.csv")
dataset_downloads = pd.read_csv("hf_dataset_downloads_20250909.csv")

def get_json_records(filename):
    """Read a JSON Lines metadata dump into a list of flat row dicts.

    Every non-blank line must be a JSON object with an ``"id"`` key and a
    ``"metadata"`` key. ``"metadata"`` is either a dict of fields or the
    literal string ``"ERROR"``.

    Args:
        filename: Path of the UTF-8 encoded ``.jsonl`` file.

    Returns:
        One dict per record. Each dict has a ``"repo"`` key (the record id)
        plus either the unpacked metadata fields or ``{"ERROR": True}`` for
        records whose metadata is the error marker.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError: if a record lacks ``"id"`` or ``"metadata"``.
    """
    records = []
    with open(filename, "r", encoding="utf-8") as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file)
            # instead of letting json.loads fail on an empty string.
            if not line.strip():
                continue
            obj = json.loads(line)
            row = {"repo": obj["id"]}
            meta = obj["metadata"]
            if meta == "ERROR":
                # Failed record: keep the repo id and flag it.
                row["ERROR"] = True
            else:
                # Successful record: flatten the metadata dict into the row.
                row.update(meta)
            records.append(row)
    return records

def _attach_total_downloads(frame, downloads):
    """Add a 'total_downloads' column to ``frame`` by looking up each repo in ``downloads``."""
    lookup = dict(zip(downloads['model_id'], downloads['total_downloads']))
    frame['total_downloads'] = frame['repo'].map(lookup)

mdf = pd.DataFrame(get_json_records(model_results_filename))
ddf = pd.DataFrame(get_json_records(dataset_results_filename))

_attach_total_downloads(mdf, model_downloads)
_attach_total_downloads(ddf, dataset_downloads)

# Row counts, plus whether any record carried the error marker
# (the ERROR column only exists when at least one record failed).
for _frame in (mdf, ddf):
    print(len(_frame), 'ERROR' in _frame.columns)

# Number of repos whose download total is exactly 0.
# NOTE: a repo that is absent from the downloads table maps to NaN, not 0,
# so it is not counted by this check.
for _frame in (mdf, ddf):
    print(int((_frame['total_downloads'] == 0).sum()))

# Column names of the model and dataset metadata frames.
for _frame in (mdf, ddf):
    print(_frame.columns)


8810 False
1420 False
862
0
Index(['repo', 'id', 'author', 'sha', 'last_modified', 'created_at', 'private',
       'gated', 'disabled', 'downloads', 'downloads_all_time', 'likes',
       'library_name', 'gguf', 'inference', 'inference_provider_mapping',
       'tags', 'pipeline_tag', 'mask_token', 'trending_score', 'card_data',
       'widget_data', 'model_index', 'config', 'transformers_info', 'siblings',
       'spaces', 'safetensors', 'security_repo_status', 'xet_enabled',
       'lastModified', 'cardData', 'transformersInfo', '_id', 'modelId',
       'usedStorage', 'total_downloads'],
      dtype='object')
Index(['repo', 'id', 'author', 'sha', 'created_at', 'last_modified', 'private',
       'gated', 'disabled', 'downloads', 'downloads_all_time', 'likes',
       'paperswithcode_id', 'tags', 'trending_score', 'card_data', 'siblings',
       'xet_enabled', 'lastModified', 'cardData', '_id', 'usedStorage',
       'description', 'citation', 'total_downloads'],
      dtype='object')


# Filtering

In [3]:
# --- helpers ---------------------------------------------------------------
def _to_lower_list(x):
    if isinstance(x, (list, tuple, set)):
        seq = x
    elif isinstance(x, np.ndarray):
        seq = x.tolist()
    else:
        return []
    return [str(t).lower() for t in seq if pd.notna(t)]
    
def _safe_str(x):
    return str(x).lower() if pd.notna(x) else ""

# Pipelines we do NOT want.
# Compared by exact equality against the lower-cased `pipeline_tag` of a repo.
EXCLUDE_PIPELINES = {
    # encoders / classifiers / embeddings
    "text-embedding", "feature-extraction", "sentence-similarity",
    "fill-mask", "token-classification", "sequence-classification",
    # vision
    "image-classification", "object-detection", "image-segmentation",
    "image-to-image", "depth-estimation", "image-feature-extraction", # "image-to-text"
    # audio
    "audio-classification", "text-to-speech", # "automatic-speech-recognition",
    "speech-segmentation", "voice-activity-detection",
    # diffusion / generative imaging
    "text-to-image", "diffusers",
    # seq2seq (exclude T5-like)
    "text2text-generation",
}

# Name/tag tokens we do NOT want (broad but conservative).
# Matched as a SUBSTRING of the lower-cased repo name and as an EXACT member
# of the lower-cased tag set.
# NOTE(review): very short tokens ("e5", "t5", "sd-", "tts") can hit unrelated
# repo names through the substring match; confirm this breadth is intended.
EXCLUDE_NAME_TOKENS = {
    # encoders / embedding families
    "bert", "roberta", "mpnet", "minilm", "e5", "bge", "gte", "sbert", "sentence-transformers",
    # t5 & friends
    "t5",
    # speech/asr/tts
    "whisper", "wav2vec", "hubert", "tacotron", "fastspeech", "tts", "vits",
    # diffusion / imaging
    "stable-diffusion", "sdxl", "sd-", "latent-diffusion", "controlnet", "unet", "vae", "inpaint",
    "txt2img", "img2img", "diffusion",
}

# Architectures we do NOT want (from config/architectures).
# Matched as a substring of the lower-cased string form of the `config` and
# transformers-info fields, so a token can hit anywhere inside those fields.
EXCLUDE_ARCH_TOKENS = {
    "bert", "roberta", "t5", "whisper", "wav2vec", "hubert",
    # "clip", "vision", "unet", "vae",
}

# Positive signals for causal LMs and GGUF (known to be LLM format)
INCLUDE_PIPELINES = {"text-generation"}  # exact match on pipeline_tag
INCLUDE_TAGS = {"text-generation", "causal-lm"}  # tags that mark a causal LM
INCLUDE_NAME_TOKENS = {"gguf"}  # substring of repo name, or exact tag

def _first_present(row, *keys):
    """Return the first ``row[key]`` that is not missing (None/NaN) and not empty."""
    for key in keys:
        value = row.get(key)
        if value is None:
            continue
        # NaN is truthy, so a plain `a or b` chain never reaches the fallback
        # key when the first column is missing. Test for nulls explicitly.
        if pd.api.types.is_scalar(value) and pd.isna(value):
            continue
        if isinstance(value, (str, list, tuple, dict, set)) and len(value) == 0:
            continue
        return value
    return None


def is_decoder_only_llm(row) -> bool:
    """Heuristically decide whether a repo metadata row is a decoder-only LLM.

    The checks run in order and the first one that fires decides:

    1. pipeline tag in ``EXCLUDE_PIPELINES``                   -> False
    2. pipeline tag in ``INCLUDE_PIPELINES`` or any tag in
       ``INCLUDE_TAGS``                                        -> True
    3. an ``INCLUDE_NAME_TOKENS`` token is a substring of the
       repo name or is one of the tags                         -> True
    4. an ``EXCLUDE_NAME_TOKENS`` token is a substring of the
       repo name or is one of the tags                         -> False
    5. an ``EXCLUDE_ARCH_TOKENS`` token is a substring of the
       stringified config / transformers info                  -> False
    6. otherwise                                               -> True

    Args:
        row: A mapping-like record (``pd.Series`` or dict) supporting
            ``.get``. It is read for the keys ``repo``/``modelId``/``id``,
            ``pipeline_tag``, ``tags``, ``config`` and
            ``transformers_info``/``transformersInfo``.

    Returns:
        True if the row should be kept as a decoder-only LLM.
    """
    name = _safe_str(_first_present(row, "repo", "modelId", "id"))
    pipeline = _safe_str(row.get("pipeline_tag"))
    tags = set(_to_lower_list(row.get("tags")))
    cfg_str = _safe_str(row.get("config"))
    # Both spellings of the field occur as columns; use whichever is populated.
    arch_str = _safe_str(_first_present(row, "transformers_info", "transformersInfo"))

    # 1) Hard exclude by pipeline
    if pipeline in EXCLUDE_PIPELINES:
        return False

    # 2) Positive signals: pipeline or tag says it's a causal LM.
    #    The tag check was previously computed but never consulted.
    if pipeline in INCLUDE_PIPELINES or not tags.isdisjoint(INCLUDE_TAGS):
        return True

    # 3) Hard include by name tokens or tags ("strong" list that applies first)
    if any(tok in name for tok in INCLUDE_NAME_TOKENS) or not tags.isdisjoint(INCLUDE_NAME_TOKENS):
        return True

    # 4) Hard exclude by name tokens or tags
    if any(tok in name for tok in EXCLUDE_NAME_TOKENS) or not tags.isdisjoint(EXCLUDE_NAME_TOKENS):
        return False

    # 5) Hard exclude by architectures in config/transformers info
    arch_blob = f"{cfg_str} {arch_str}"
    if any(tok in arch_blob for tok in EXCLUDE_ARCH_TOKENS):
        return False

    # 6) Otherwise, include (change to False to be more conservative)
    return True

# --- apply filter -----------------------------------------------------------
# Row count before and after dropping repeated repos.
print(mdf.shape[0])
mdf = mdf.drop_duplicates(subset=["repo"]).reset_index(drop=True)
print(mdf.shape[0])

# Keep only the rows the heuristic classifies as decoder-only LLMs.
keep_mask = mdf.apply(is_decoder_only_llm, axis=1)
llmdf = mdf.loc[keep_mask].reset_index(drop=True)
print(llmdf.shape[0])

# # if need to examine the excluded repos
# excluded = mdf[~mdf['repo'].isin(llmdf['repo'])][['id', 'tags', 'pipeline_tag', 'config', 'transformers_info', 'transformersInfo']]


8810
8810
8608


# Extract Metadata Components

In [4]:
def _parse_if_str(value):
    """Evaluate a Python-literal string into its object; pass any other value through."""
    return ast.literal_eval(value) if isinstance(value, str) else value

# Columns that may hold stringified lists/dicts; turn those back into objects.
for _col in ("tags", "config", "transformers_info", "transformersInfo"):
    llmdf[_col] = llmdf[_col].apply(_parse_if_str)


def f(x):
    """Return the first architecture name listed in a config dict.

    Args:
        x: A config value; only dicts are inspected.

    Returns:
        The first element of ``x['architectures']`` when that entry is a
        non-empty list or tuple, the entry itself when it is a non-empty
        string, and None in every other case (non-dict input, missing key,
        null or empty entry).
    """
    if not isinstance(x, dict):
        return None
    architectures = x.get('architectures')
    # A bare string would otherwise be indexed down to its first character.
    if isinstance(architectures, str):
        return architectures or None
    # Null or empty entries used to raise TypeError / IndexError on `[0]`.
    if isinstance(architectures, (list, tuple)) and architectures:
        return architectures[0]
    return None
# Architecture name taken from each row's config via f (None when f finds none).
llmdf['architectures'] = llmdf['config'].apply(f)

def f(x):
    """Return ``x['model_type']`` when ``x`` is a dict that has the key, else None."""
    has_model_type = isinstance(x, dict) and 'model_type' in x
    return x['model_type'] if has_model_type else None
# model_type as declared in each row's config via f (None when not declared).
llmdf['model_type_extracted'] = llmdf['config'].apply(f)

def f(x):
    """Extract a parameter count from a safetensors metadata dict.

    Args:
        x: A safetensors metadata value; only dicts with a dict-valued
            ``'parameters'`` entry are inspected.

    Returns:
        ``x['parameters']['total']`` when that key exists, otherwise the
        largest value in ``x['parameters']``. None is returned for non-dict
        input, a missing or non-dict ``'parameters'`` entry, or an empty one.
    """
    if not isinstance(x, dict):
        return None
    params = x.get('parameters')
    # A null or otherwise non-dict entry used to raise TypeError on `in`.
    if not isinstance(params, dict):
        return None
    if 'total' in params:
        # Return the count itself; the whole dict used to be returned here,
        # which mixed dicts and numbers in the same column.
        return params['total']
    return max(params.values(), default=None)
# Value f derives from the 'parameters' entry of each row's safetensors metadata.
llmdf['parameters'] = llmdf['safetensors'].apply(f)

# Distinct extracted model types, most frequent first (value_counts orders by count).
model_type_list = llmdf['model_type_extracted'].value_counts().index

def f(x):
    """Resolve a model type for one row, falling back to name/tag matching.

    Args:
        x: A 3-item row ``(repo, tags, model_type_extracted)``.

    Returns:
        ``model_type_extracted`` when it is truthy. Otherwise the first entry
        of ``model_type_list`` that occurs (case-insensitively, as a
        substring) in the repo name or in any tag, or None when nothing
        matches.
    """
    repo, tags, model_type_extracted = x
    if model_type_extracted:
        return model_type_extracted

    # Lower-case the haystacks once per row rather than once per candidate.
    repo_lc = repo.lower()
    # A missing tags value (None/NaN) is treated as "no tags" instead of
    # raising TypeError when iterated.
    tags_lc = [str(tag).lower() for tag in tags] if isinstance(tags, (list, tuple, set)) else []

    # NOTE(review): this is a substring match, so short model types can hit
    # unrelated repo names; candidates are tried in frequency order.
    for model_type in model_type_list:
        needle = model_type.lower()
        if needle in repo_lc or any(needle in tag for tag in tags_lc):
            return model_type
    return None
llmdf['model_type'] = llmdf[['repo', 'tags', 'model_type_extracted']].apply(f, axis=1)

def f(x):
    """True when 'gguf' occurs (case-insensitively) in the repo name or in any tag.

    ``x`` is a 2-item row ``(repo, tags)``.
    """
    repo, tags = x
    needle = 'gguf'
    in_repo = needle in repo.lower()
    in_tags = False
    for tag in tags:
        if needle in tag.lower():
            in_tags = True
            break
    return in_repo or in_tags
# Boolean flag from f. NOTE: this replaces the 'gguf' column that was loaded
# with the repo metadata, so the original values are no longer available in llmdf.
llmdf['gguf'] = llmdf[['repo', 'tags']].apply(f, axis=1)

def f(x):
    """True when 'merge' occurs (case-insensitively) in the repo name or in any tag.

    ``x`` is a 2-item row ``(repo, tags)``.
    """
    repo, tags = x
    found_in_repo = repo.lower().find('merge') != -1
    found_in_tags = any(tag.lower().find('merge') != -1 for tag in tags)
    return found_in_repo or found_in_tags
# Boolean flag from f: does the repo name or any tag mention 'merge'?
llmdf['merge_model'] = llmdf[['repo', 'tags']].apply(f, axis=1)


## Extract Quantization Information

In [5]:
# QUANT_TOKENS = {
#     "gptq", "awq", "exl2", "marlin", "spqr", "imatrix",
#     "nf4", "fp4", "fp8", "fp16", "bf16", "fp32",
#     "int4", "int8", "4-bit", "8-bit"  # quick string hits
# }

# QUANT_REGEX = re.compile(
#     r"(?i)(?<![a-z0-9])("
#     r"q[234568](?:_[01])?"
#     r"|q[234568]_k(?:_[sml])?"
#     r"|iq[234568](?:_[a-z]{1,3})?"
#     r"|q[45]f16(?:_[01])?"
#     r"|int[2345678]"
#     r"|[2345678]\s*[-_ ]?\s*bits?"
#     r"|w[0-9]+a[0-9]+(?:g[0-9]+)?"
#     r"|nf4|fp4|fp8|fp16|bf16|fp32"
#     r"|gptq|awq|exl2|marlin|spqr|imatrix"
#     r"|e[0-9]+b"    # E4B etc.
#     r"|bpw[0-9]+(?:\.[0-9]+)?"   # bpw8, bpw4.0, bpw3.5
#     r"|[0-9]+(?:\.[0-9]+)?bpw"   # 5.0bpw, 4bpw
#     r")(?![a-z0-9])"
# )

# def has_quant_signal(text: str) -> bool:
#     if not text:
#         return False
#     s = text.lower()
#     # quick pass for common tokens
#     if any(tok in s for tok in QUANT_TOKENS):
#         return True
#     # structured patterns
#     return bool(QUANT_REGEX.search(s))

# def collect_quant_hits(*texts):
#     """Return sorted list of all quant tokens/regex matches found in given texts."""
#     hits = set()
#     for t in texts:
#         if not t:
#             continue
#         if isinstance(t, (list, tuple, set)):
#             # scan each element if it's a list like tags
#             for x in t:
#                 s = str(x).lower()
#                 hits.update(tok for tok in QUANT_TOKENS if tok in s)
#                 hits.update(m.group(0).lower() for m in QUANT_REGEX.finditer(s))
#         else:
#             s = str(t).lower()
#             hits.update(tok for tok in QUANT_TOKENS if tok in s)
#             hits.update(m.group(0).lower() for m in QUANT_REGEX.finditer(s))
#     return sorted(hits)

# METHOD_TOKENS = {"gptq", "awq", "exl2", "marlin", "spqr", "imatrix"}  # add "gguf" if you want it as a type

# def _token_bits(tok: str):
#     s = str(tok).lower().strip()

#     # weights-activations patterns like w8a8, w4a16, w8a8g128
#     if s.startswith("w") and "a" in s:
#         i = 1
#         w = ""
#         while i < len(s) and s[i].isdigit():
#             w += s[i]; i += 1
#         k = s.find("a", i)
#         if k != -1:
#             k += 1
#             a = ""
#             while k < len(s) and s[k].isdigit():
#                 a += s[k]; k += 1
#             bits = [int(x) for x in (w, a) if x]
#             if bits:
#                 return min(bits)  # choose the smaller of W/A as the quant level

#     # bits-per-weight
#     if s.startswith("bpw"):
#         num = "".join(ch for ch in s[3:] if (ch.isdigit() or ch == "."))
#         if num:
#             try:
#                 return int(float(num))  # bpw8.0 → 8, bpw3.5 → 3
#             except ValueError:
#                 pass

#     # bits-per-weight (suffix): 5.0bpw, 4bpw
#     if s.endswith("bpw"):
#         num = "".join(ch for ch in s[:-3] if (ch.isdigit() or ch == "."))
#         if num:
#             try:
#                 return int(float(num))  # 5.0bpw → 5
#             except ValueError:
#                 pass
    
#     # direct precision / bitwidth tokens
#     if s in {"fp32"}: return 32
#     if s in {"bf16", "fp16"}: return 16
#     if s in {"fp8", "int8", "8bit", "8-bit"}: return 8
#     if s in {"fp4", "nf4", "int4", "4bit", "4-bit"}: return 4
#     if s in {"int2", "2bit", "2-bit"}: return 2
        
#     # generic N bit(s)
#     if s.endswith("bit") or s.endswith("bits"):
#         n = "".join(ch for ch in s if ch.isdigit())
#         if n.isdigit():
#             return int(n)

#     # ExLlama-style: E4B, E5B, E8B
#     if re.fullmatch(r"e(\d+)b", s):
#         return int(s[1:-1])
            
#     # q*/iq* families (e.g., q4, q8_0, q4f16_1, q5_k_s, iq4_xs, iq3_m)
#     if s.startswith("iq"):
#         s = s[2:]
#     elif s.startswith("q"):
#         s = s[1:]
#     else:
#         return None

#     # take leading digits after q/iq
#     num = ""
#     for ch in s:
#         if ch.isdigit():
#             num += ch
#         else:
#             break
#     return int(num) if num else None

# def get_quant_level_from_hits(hits):
#     seq = hits if isinstance(hits, (list, tuple, set)) else ([hits] if hits else [])
#     levels = [b for b in (_token_bits(t) for t in seq) if b is not None]
#     return min(levels) if levels else 0  # choose the smallest level if multiple

# def get_quant_type_from_hits(hits):
#     seq = hits if isinstance(hits, (list, tuple, set)) else ([hits] if hits else [])
#     methods = sorted({str(t).lower() for t in seq if str(t).lower() in METHOD_TOKENS})
#     return "+".join(methods) if methods else "quant"

# llmdf['quant_signal'] = llmdf['repo'].apply(has_quant_signal)
# llmdf['quantization'] = llmdf['repo'].apply(collect_quant_hits)
# llmdf['quant_level'] = llmdf['quantization'].apply(get_quant_level_from_hits)
# llmdf['quant_type'] = llmdf['quantization'].apply(get_quant_type_from_hits)


In [6]:
# Substrings whose presence anywhere in a (lower-cased) text counts as a
# quantization signal. Every name in METHOD_TOKENS must also appear here (and
# in QUANT_REGEX), otherwise that method can never show up among the
# collected hits and therefore can never be reported as a quant type.
QUANT_TOKENS = {
    "gptq", "awq", "exl2", "marlin", "spqr", "imatrix", "gguf", "ggml",
    "hqq", "aqlm", "squeezellm",
    "nf4", "fp4", "fp8", "fp16", "bf16", "fp32",
    "int4", "int8", "4-bit", "8-bit",
    "q2_k", "q3_k", "q4_k", "q5_k", "q6_k", "q8_0",  # GGUF-specific
    "iq1", "iq2", "iq3", "iq4"  # imatrix GGUF variants
}

# Structured patterns. A match must not be directly preceded or followed by a
# letter or digit (underscores, dashes, dots, slashes etc. are fine).
QUANT_REGEX = re.compile(
    r"(?i)(?<![a-z0-9])("
    # GGUF/GGML patterns (most specific first)
    r"q[234568]_k(?:_[sml])?"  # Q4_K_M, Q5_K_S, etc.
    r"|iq[1234]_[a-z]{1,3}"     # IQ1_S, IQ2_XXS, IQ3_M, etc.
    r"|q[234568]_[01]"          # Q4_0, Q4_1, Q8_0, etc.
    r"|q[234568]f16(?:_[01])?"  # Q4f16_0, Q5f16_1
    # Standard quantization patterns
    r"|q[234568](?![_])"        # Plain Q2, Q3, Q4, etc. (but not Q4_K)
    r"|int[2345678]"
    r"|[2345678]\s*[-_ ]?\s*bits?"
    # Weight-activation patterns
    r"|w[0-9]+a[0-9]+(?:g[0-9]+)?"
    # Precision formats
    r"|nf4|fp4|fp8|fp16|bf16|fp32"
    # Method names (kept in sync with METHOD_TOKENS)
    r"|gptq|awq|exl2|marlin|spqr|imatrix|gguf|ggml|hqq|aqlm|squeezellm"
    # ExLlama patterns
    r"|e[0-9]+b"
    # Bits per weight
    r"|bpw[0-9]+(?:\.[0-9]+)?"
    r"|[0-9]+(?:\.[0-9]+)?bpw"
    r")(?![a-z0-9])"
)

# Hits that name a quantization method / container rather than a bit width.
METHOD_TOKENS = {"gptq", "awq", "exl2", "marlin", "spqr", "imatrix", "gguf", "ggml", "hqq", "aqlm", "squeezellm"}

def has_quant_signal(text: str) -> bool:
    """Report whether ``text`` contains any quantization marker.

    A marker is either one of ``QUANT_TOKENS`` occurring as a substring of
    the lower-cased text, or a ``QUANT_REGEX`` match. Empty or None input
    yields False.
    """
    if not text:
        return False
    lowered = text.lower()
    # Cheap substring scan first; only fall back to the regex if it misses.
    for token in QUANT_TOKENS:
        if token in lowered:
            return True
    return QUANT_REGEX.search(lowered) is not None

def collect_quant_hits(*texts):
    """Return the sorted, de-duplicated quantization markers found in ``texts``.

    Each argument may be a single value or a list/tuple/set of values (for
    example a tag list). Falsy arguments are skipped. Every value is
    stringified and lower-cased, then scanned for ``QUANT_TOKENS`` substrings
    and ``QUANT_REGEX`` matches.
    """
    found = set()
    for item in texts:
        if not item:
            continue
        # Treat a lone value as a one-element collection so both shapes
        # share the same scanning code.
        pieces = item if isinstance(item, (list, tuple, set)) else (item,)
        for piece in pieces:
            lowered = str(piece).lower()
            found |= {tok for tok in QUANT_TOKENS if tok in lowered}
            found |= {m.group(0).lower() for m in QUANT_REGEX.finditer(lowered)}
    return sorted(found)

def _token_bits(tok: str):
    s = str(tok).lower().strip()
    
    # GGUF/GGML quantization patterns (handle first)
    if s.startswith("iq"):
        # IQ patterns: iq1_s → 1, iq2_xxs → 2, iq3_m → 3, iq4_xs → 4
        if "1" in s: return 1
        if "2" in s: return 2
        if "3" in s: return 3
        if "4" in s: return 4
    
    # Q patterns with underscores (GGUF style)
    if re.match(r"q[234568]_", s):
        # Q4_K_M → 4, Q5_K_S → 5, Q8_0 → 8, Q4f16_0 → 4
        digits = re.search(r"q(\d+)", s)
        if digits:
            return int(digits.group(1))
    
    # Plain Q patterns (must check after Q_)
    if re.match(r"q[234568]$", s):
        return int(s[1])

    # weights-activations patterns
    if s.startswith("w") and "a" in s:
        i = 1
        w = ""
        while i < len(s) and s[i].isdigit():
            w += s[i]; i += 1
        k = s.find("a", i)
        if k != -1:
            k += 1
            a = ""
            while k < len(s) and s[k].isdigit():
                a += s[k]; k += 1
            bits = [int(x) for x in (w, a) if x]
            if bits:
                return min(bits)

    # bits-per-weight patterns
    bpw_match = re.search(r"(\d+(?:\.\d+)?)\s*bpw", s)
    if not bpw_match:
        bpw_match = re.search(r"bpw\s*(\d+(?:\.\d+)?)", s)
    if bpw_match:
        try:
            return int(float(bpw_match.group(1)))
        except ValueError:
            pass
    
    # direct precision tokens
    if s in {"fp32"}: return 32
    if s in {"bf16", "fp16"}: return 16
    if s in {"fp8", "int8", "8bit", "8-bit", "8 bit", "8 bits"}: return 8
    if s in {"fp4", "nf4", "int4", "4bit", "4-bit", "4 bit", "4 bits"}: return 4
    if s in {"int3", "3bit", "3-bit", "3 bit", "3 bits"}: return 3
    if s in {"int2", "2bit", "2-bit", "2 bit", "2 bits"}: return 2
        
    # N bit(s) patterns
    bit_match = re.match(r"(\d+)\s*[-_ ]?\s*bits?$", s)
    if bit_match:
        return int(bit_match.group(1))

    # ExLlama patterns: E4B → 4
    exl_match = re.match(r"e(\d+)b$", s)
    if exl_match:
        return int(exl_match.group(1))
    
    return None

def get_quant_level_from_hits(hits):
    """Return the smallest bit width among ``hits``, or 0 when none has one.

    ``hits`` may be a list/tuple/set of tokens or a single token; a falsy
    single value counts as "no tokens".
    """
    if isinstance(hits, (list, tuple, set)):
        tokens = hits
    elif hits:
        tokens = [hits]
    else:
        tokens = []

    smallest = None
    for token in tokens:
        bits = _token_bits(token)
        if bits is None:
            continue
        if smallest is None or bits < smallest:
            smallest = bits
    return 0 if smallest is None else smallest

def get_quant_type_from_hits(hits):
    """Classify a collection of quantization hits into one type label.

    Returns:
        ``"gguf"`` if any hit is ``gguf``/``ggml`` or is shaped like a GGUF
        quant name; otherwise the highest-priority method name present;
        otherwise ``"quant"`` when there are hits of any kind; ``"none"``
        when there are no hits at all.
    """
    if isinstance(hits, (list, tuple, set)):
        tokens = [str(h).lower() for h in hits]
    elif hits:
        tokens = [str(hits).lower()]
    else:
        tokens = []

    # GGUF wins over everything else, whether named explicitly or implied by
    # a q4_k / q8_0 / iq2_... style token.
    looks_like_gguf = any(re.match(r"(q[234568]_[k01]|iq[1234]_)", t) for t in tokens)
    if looks_like_gguf or "gguf" in tokens or "ggml" in tokens:
        return "gguf"

    known_methods = {t for t in tokens if t in METHOD_TOKENS}
    if known_methods:
        # With several methods present, report the most specific one.
        for candidate in ("exl2", "awq", "gptq", "hqq", "aqlm", "marlin", "spqr", "imatrix", "squeezellm"):
            if candidate in known_methods:
                return candidate
        # Fallback for a method name that is missing from the priority list.
        return sorted(known_methods)[0]

    # Quantization signals without a recognisable method name.
    return "quant" if tokens else "none"

# Apply to dataframe
def _has_mixed_levels(hits):
    """True when the hits resolve to more than one distinct non-zero bit width."""
    levels = {get_quant_level_from_hits([h]) for h in hits}
    levels.discard(0)  # 0 means "no bit width for this hit"
    return len(levels) > 1

# All quantization columns are derived from the repo name only.
_repo_names = llmdf['repo']
llmdf['quant_signal'] = _repo_names.apply(has_quant_signal)
llmdf['quantization'] = _repo_names.apply(collect_quant_hits)
llmdf['quant_level'] = llmdf['quantization'].apply(get_quant_level_from_hits)
llmdf['quant_type'] = llmdf['quantization'].apply(get_quant_type_from_hits)

# Optional: flag repos whose name mentions more than one bit width (mixed/hybrid).
llmdf['is_mixed_quant'] = llmdf['quantization'].apply(_has_mixed_levels)


In [7]:
# Persist the catalog twice: a pickle that keeps the Python objects stored in
# the frame, and a tab-separated text export of the same rows.
llmdf.to_pickle("llmdf.pkl")
llmdf.to_csv("repo_catalog.tsv", sep="\t")