In [None]:
!pwd

In [None]:
import pandas as pd
from Levenshtein import ratio
import re


# Regex for CVE identifiers: "CVE-", a four-digit year, then a 4-7 digit sequence number.
pattern = r"CVE-\d{4}-\d{4,7}"


# hf_disc.csv already stores its text in `full_content`; only rows repeating the
# same text are dropped. (The former rename of `full_content` to itself did
# nothing and has been removed.)
df_hf = pd.read_csv('./inference/distilbert/hf_disc.csv')
df_hf = df_hf.drop_duplicates(subset=["full_content"])
print(len(df_hf))

# gh_disc.csv keeps title and body apart: de-duplicate on the pair, then build
# `full_content` as "<title>\n<body>" directly instead of creating a temporary
# column and renaming it afterwards.
df_ghd = pd.read_csv('./inference/distilbert/gh_disc.csv')
df_ghd = df_ghd.drop_duplicates(subset=["discussion_title", "content"])
df_ghd['full_content'] = df_ghd['discussion_title'].astype(str) + '\n' + df_ghd['content'].astype(str)
print(len(df_ghd))

# The issues file stores its text as `full_comment`; expose it under the shared
# `full_content` name so all three frames can be scanned the same way.
df_ghi = pd.read_csv('./inference/distilbert/gh_issues_distilbert_filtered.csv')
df_ghi = df_ghi.drop_duplicates(subset=["full_comment"]).rename(columns={"full_comment": "full_content"})
print(len(df_ghi))

In [None]:
df_ghd

In [None]:
# Collect every CVE identifier mentioned in the three text corpora.
pattern = r"CVE-\d{4}-\d{4,7}"

all_matches = []
count_matches = 0

# NOTE: the loop variable is deliberately still called `df`; a later cell reads
# `df` after this loop has finished.
for df in [df_hf, df_ghd, df_ghi]:
    # A missing text makes `.str.findall` yield NaN, and taking `len()` of that
    # NaN raised a TypeError, so rows without text are skipped up front.
    matches = df['full_content'].dropna().astype(str).str.findall(pattern)
    matches = matches[matches.str.len() > 0]
    count_matches += matches.shape[0]
    all_matches.extend(matches.explode().tolist())

# Report totals instead of dumping the full list of identifiers.
print("Number of CVE mentions:", len(all_matches))
print("Number of rows containing CVE patterns:", count_matches)

df_cve = pd.DataFrame(all_matches, columns=['cve'])
df_cve



In [None]:
# Keep one row per distinct CVE id, printing the row count before and after.
print(len(df_cve))
df_cve = df_cve.drop_duplicates()
print(len(df_cve))
df_cve

In [None]:
import requests

def fetch_cve_record(cve_id):
    """Fetch the record for ``cve_id`` from the MITRE CVE Services API.

    Args:
        cve_id: CVE identifier, e.g. ``"CVE-2021-44228"``.

    Returns:
        The decoded JSON body when the API answers with HTTP 200, otherwise
        ``None`` (non-200 status, network failure, or undecodable body).
    """
    url = f"https://cveawg.mitre.org/api/cve/{cve_id}"
    try:
        resp = requests.get(url, timeout=30)
        if resp.status_code != 200:
            # Say why the record is missing instead of returning None silently.
            print(f"{cve_id}: HTTP {resp.status_code}")
            return None
        return resp.json()
    except Exception as exc:
        # Best effort: one failing id must not abort the whole `.apply` run,
        # but the reason is reported rather than swallowed.
        print(f"{cve_id}: request failed: {exc}")
        return None

# Fetch the MITRE record for every CVE id, then checkpoint the frame to disk.
df_cve['cve_record'] = df_cve['cve'].apply(fetch_cve_record)
df_cve.to_csv('./cve.csv', index=False)

In [None]:
def fetch_cve_record(cve_id):
    """Fetch the NVD (CVE API 2.0) response for ``cve_id``.

    This definition replaces the earlier MITRE-based function of the same name
    for every cell that runs after it.

    Prints the id before the request and the result after it. Returns the
    decoded JSON body on HTTP 200 and ``None`` on any other status or on any
    exception (the exception is printed).
    """
    print(cve_id)
    endpoint = f"https://services.nvd.nist.gov/rest/json/cves/2.0?cveId={cve_id}"
    payload = None
    try:
        reply = requests.get(endpoint, timeout=30)
        payload = reply.json() if reply.status_code == 200 else None
    except Exception as err:
        print(err)
        payload = None
    print(payload)
    return payload


In [None]:
# Attach the NVD response for every CVE id, then checkpoint the frame to disk.
df_cve['cve_record_nvd'] = df_cve['cve'].apply(fetch_cve_record)
df_cve.to_csv('./cve.csv', index=False)

In [None]:

# Re-run the NVD lookup only for the rows whose earlier fetch returned nothing.
mask = df_cve['cve_record_nvd'].isna()
# `len(mask)` is the total number of rows; the number still missing is the
# count of True values in the mask.
print(int(mask.sum()))
df_cve.loc[mask, 'cve_record_nvd'] = df_cve.loc[mask, 'cve'].apply(fetch_cve_record)
df_cve.to_csv('./cve.csv', index=False)

In [None]:
def get_cwe(nvd_cve_record):
    """Collect the CWE identifiers listed in an NVD CVE API response.

    Args:
        nvd_cve_record: Decoded NVD response (a dict), or a non-dict placeholder
            such as ``None``/NaN left behind by a failed fetch.

    Returns:
        Sorted list of the distinct ``value`` strings found under
        ``vulnerabilities[*].cve.weaknesses[*].description[*]``. Empty when the
        record is missing or lists no weaknesses.
    """
    # Failed fetches are stored as None; calling `.get` on them used to raise.
    if not isinstance(nvd_cve_record, dict):
        return []

    cwe_ids = set()
    for vul in nvd_cve_record.get('vulnerabilities') or []:
        cve = vul.get("cve") or {}
        for weakness in cve.get("weaknesses") or []:
            for description in weakness.get("description") or []:
                value = description.get("value")
                if value:
                    cwe_ids.add(value)
    # Sorted so the result does not depend on set iteration order.
    return sorted(cwe_ids)

def get_cve_status(nvd_cve_record):
    """Return the ``vulnStatus`` recorded in an NVD CVE API response.

    Args:
        nvd_cve_record: Decoded NVD response (a dict), or a non-dict placeholder
            such as ``None``/NaN left behind by a failed fetch.

    Returns:
        The ``vulnStatus`` of the last entry under ``vulnerabilities``, or
        ``None`` when the record is missing, has no entries, or the entry
        carries no status.
    """
    # Failed fetches are stored as None; calling `.get` on them used to raise.
    if not isinstance(nvd_cve_record, dict):
        return None

    status = None
    for vul in nvd_cve_record.get('vulnerabilities') or []:
        status = (vul.get("cve") or {}).get("vulnStatus")
    return status

# Derive the CWE id list and the NVD status from each stored NVD response.
df_cve['cwe'] = df_cve['cve_record_nvd'].apply(get_cwe)
df_cve['cve_status'] = df_cve['cve_record_nvd'].apply(get_cve_status)
df_cve


In [None]:
# Names resolved so far, keyed by numeric CWE id, so a weakness shared by many
# CVEs is requested only once. Failed lookups are not cached and can be retried.
_CWE_NAME_CACHE = {}


def fetch_cwe_record(cwe_id):
    """Look up the name of a CWE weakness through the MITRE CWE REST API.

    Args:
        cwe_id: Identifier such as ``"CWE-79"``.

    Returns:
        The ``Name`` of the first entry under ``Weaknesses`` in the API
        response, or ``None`` when the id is not numeric, the API does not
        answer with HTTP 200, the response lists no weakness, or the request
        fails.
    """
    # `weakness_id` rather than `id`, which shadowed the builtin.
    weakness_id = str(cwe_id).replace("CWE-", "")
    if not weakness_id.isdigit():
        # Values such as "NVD-CWE-noinfo" leave no numeric id to query, so the
        # request (which could only fail) is skipped.
        return None
    if weakness_id in _CWE_NAME_CACHE:
        return _CWE_NAME_CACHE[weakness_id]

    url = f"https://cwe-api.mitre.org/api/v1/cwe/weakness/{weakness_id}"
    result = None
    try:
        # NOTE(review): verify=False turns off TLS certificate validation. It is
        # kept from the original call; confirm whether it is still required and
        # drop it if the host's certificate validates.
        resp = requests.get(url, timeout=30, verify=False)
        if resp.status_code == 200:
            weaknesses = resp.json().get("Weaknesses") or []
            if weaknesses:
                result = weaknesses[0]["Name"]
        else:
            print(f"{cwe_id}: HTTP {resp.status_code}")
    except Exception as e:
        # Best effort: report the failure and carry on with the next id.
        print(f"{cwe_id}: lookup failed: {e}")
        result = None

    if result is not None:
        _CWE_NAME_CACHE[weakness_id] = result
    return result

def get_cwe_names(cwe_list):
    """Resolve each CWE id in ``cwe_list`` with ``fetch_cwe_record``.

    Returns the lookup results in input order, or an empty list when
    ``cwe_list`` is not a list.
    """
    if isinstance(cwe_list, list):
        return list(map(fetch_cwe_record, cwe_list))
    return []

# Resolve every CWE id list to the matching list of weakness names.
df_cve['cwe_record'] = df_cve['cwe'].apply(get_cwe_names)
df_cve

In [None]:
from textwrap import dedent


def format_ref(nvd_cve_record, cwe_id, cwe_record):
    """Render a short text reference for one CVE.

    Args:
        nvd_cve_record: Decoded NVD response (a dict), or a non-dict placeholder
            such as ``None``/NaN left behind by a failed fetch.
        cwe_id: List of CWE identifiers for the CVE.
        cwe_record: List of CWE names, parallel to ``cwe_id``; entries may be
            ``None`` when a name could not be resolved.

    Returns:
        A string with the CVE id, its description and the
        ``"<cwe id> <cwe name>"`` pairs joined by ``", "``. Missing pieces are
        rendered as empty text.
    """
    txt = dedent(
        """
        {cve_id}
        Description: {description}
        Weakness Enumeration: {cwe_id}
        """
    )

    cve_id = ""
    cve_description = ""
    # Failed fetches are stored as None; calling `.get` on them used to raise.
    vuls = (nvd_cve_record.get('vulnerabilities') or []) if isinstance(nvd_cve_record, dict) else []
    for vul in vuls:
        cve = vul.get("cve") or {}
        cve_id = cve.get("id", "")
        descriptions = cve.get("descriptions") or []
        # Prefer an entry tagged lang == "en"; otherwise fall back to the first
        # entry. An empty list used to raise IndexError.
        english = [entry for entry in descriptions if entry.get("lang") == "en"]
        candidates = english or descriptions
        cve_description = candidates[0].get("value", "") if candidates else ""

    ids = list(cwe_id) if isinstance(cwe_id, (list, tuple)) else []
    names = list(cwe_record) if isinstance(cwe_record, (list, tuple)) else [None] * len(ids)
    # Show the bare id when its name is unknown instead of the text "None".
    cwe_str = ", ".join(
        f"{cwe} {name}" if name else f"{cwe}"
        for cwe, name in zip(ids, names)
    )
    return txt.format(cve_id=cve_id, description=cve_description, cwe_id=cwe_str)

# Build the text reference for each row from its NVD record, CWE ids and CWE names.
df_cve['cve_nvd_ref'] = df_cve.apply(lambda row: format_ref(row['cve_record_nvd'], row['cwe'], row['cwe_record']), axis=1)
df_cve

In [None]:
# Persist the enriched CVE table.
df_cve.to_csv('./cve.csv', index=False)

In [None]:
df_cve

In [None]:
df_cve.loc[mask, 'cve_record_nvd']

In [None]:
# Load the two lookup tables used by the grouping cells and print their sizes.
mapping_model_links = pd.read_csv('./mapping_models_links.csv')
mapping_gh_links = pd.read_csv('./mapping_repositories_url.csv')
print(len(mapping_model_links), len(mapping_gh_links), sep='\n')

In [None]:
# Aggregate the filtered GitHub issues per repository.
# `df_ghi` is named explicitly: this cell used to read `df`, which held the
# issues frame only as a leftover of the `for df in [...]` loop in the CVE scan.
# One named aggregation replaces the dict that listed 'issue_number' twice (the
# first entry was silently discarded) and the later recomputation of both columns.
grouped = (
    df_ghi.groupby('repo_name')
    .agg(
        issue_count=('issue_number', 'count'),
        issues=('issue_number', list),
    )
    .reset_index()
)

# Attach each repository's GitHub URL from the repository mapping table.
repo_to_link = dict(zip(mapping_gh_links['repo_name'], mapping_gh_links['github_link']))
grouped['github_link'] = grouped['repo_name'].map(repo_to_link)


# Find the model_id whose github_links entry lists the given GitHub link.
def find_model_id(github_link):
    """Return the ``model_id`` of the first mapping row that lists ``github_link``.

    Reads the module-level ``mapping_model_links`` frame. Its ``github_links``
    column is parsed as a bracketed, comma-separated string; rows whose value
    is not a string are skipped.

    Args:
        github_link: Repository URL to look up.

    Returns:
        The matching ``model_id``, or ``None`` when no row lists the link.
    """
    # Iterate the two needed columns directly instead of materialising a Series
    # per row with `iterrows`.
    for model_id, raw_links in zip(mapping_model_links['model_id'], mapping_model_links['github_links']):
        if not isinstance(raw_links, str):
            continue
        # Surrounding quotes are stripped too, in case the column holds the repr
        # of a Python list ("['a', 'b']") — TODO confirm the stored format.
        links = [item.strip().strip('\'"') for item in raw_links.strip('[]').split(',')]
        if github_link in links:
            return model_id
    return None


# Resolve each repository's model through its GitHub link.
grouped['model_id'] = grouped['github_link'].apply(find_model_id)

# Project name is "<model_id>_<repo_name>".
grouped['project_name'] = grouped['model_id'].astype(str) + '_' + grouped['repo_name']

# Reorder columns for the export.
column_order = ['project_name', 'model_id', 'repo_name', 'github_link', 'issues', 'issue_count']
final_df = grouped[column_order]
final_df.head()


In [None]:
final_df

In [None]:
# Persist the per-repository issue mapping.
final_df.to_csv('./mapping_issues.csv', index=False)

In [None]:

# gh disc
df = pd.read_csv('./inference/distilbert/gh_disc.csv')

# One named aggregation replaces the dict that listed 'discussion_number' twice
# (the first entry was silently discarded) and the later recomputation.
grouped_disc = (
    df.groupby('repo_name')
    .agg(
        discussion_count=('discussion_number', 'count'),
        discussions=('discussion_number', list),
        # Same values as `discussion_count`. Both spellings were produced before
        # and are kept so the exported columns do not change — TODO confirm
        # which one downstream readers use and drop the other.
        discussions_count=('discussion_number', 'count'),
    )
    .reset_index()
)

repo_to_link = dict(zip(mapping_gh_links['repo_name'], mapping_gh_links['github_link']))
grouped_disc['github_link'] = grouped_disc['repo_name'].map(repo_to_link)

# Parse the bracketed, comma-separated `github_links` strings into lists.
# Surrounding quotes are stripped too, in case the column holds the repr of a
# Python list ("['a', 'b']") — TODO confirm the stored format.
mapping_model_links["links"] = mapping_model_links["github_links"].apply(
    lambda x: [link.strip().strip('\'"') for link in x.strip('[]').split(',')] if isinstance(x, str) else []
)

# Build the link -> model_id lookup once instead of scanning every mapping row
# for every repository. `setdefault` keeps the first model listing a link,
# which is what the previous row-by-row scan returned.
link_to_model = {}
for model_id, links in zip(mapping_model_links['model_id'], mapping_model_links['links']):
    for link in links:
        link_to_model.setdefault(link, model_id)

grouped_disc['model_id'] = grouped_disc['github_link'].apply(lambda link: link_to_model.get(link))

grouped_disc


In [None]:
# Persist the per-repository discussion mapping.
grouped_disc.to_csv('./mapping_gh.csv', index=False)

In [None]:
# hf disc
df = pd.read_csv('./inference/distilbert/hf_disc.csv')

# One named aggregation replaces the dict that listed 'num' twice (the first
# entry was silently discarded) and the later recomputation of both columns.
grouped_disc = (
    df.groupby('model_id')
    .agg(
        discussion_count=('num', 'count'),
        discussions=('num', list),
        # Same values as `discussion_count`. Both spellings were produced before
        # and are kept so the exported columns do not change — TODO confirm
        # which one downstream readers use and drop the other.
        discussions_count=('num', 'count'),
    )
    .reset_index()
)

grouped_disc.to_csv('./mapping_hf.csv', index=False)

In [None]:
import pandas as pd
import huggingface_hub
# Reload the per-model discussion mapping from ./mapping_hf.csv.
hf = pd.read_csv('./mapping_hf.csv')
hf

In [None]:
# Shared Hugging Face Hub client used by the lookup helpers below.
hf_api= huggingface_hub.HfApi()
def get_pipeline_from_hf(model_id):
    """Return the ``pipeline_tag`` that ``hf_api.model_info`` reports for ``model_id``.

    Any exception raised by the lookup is printed and ``None`` is returned.
    """
    try:
        return hf_api.model_info(model_id).pipeline_tag
    except Exception as exc:
        print(exc)
        return None

# One Hub lookup per model id; failed lookups leave None in `pipeline_tag`.
hf["pipeline_tag"] = hf['model_id'].apply(get_pipeline_from_hf)
hf

In [None]:
# NOTE(review): this replaces the `hf` frame built above (including the freshly
# fetched `pipeline_tag` column) with the contents of model_pipeline_tag.csv.
# No visible cell writes that file, so a fresh Restart & Run All depends on it
# already existing on disk — confirm where it is produced.
hf = pd.read_csv("./model_pipeline_tag.csv")
hf

In [None]:
# show the distribution
import matplotlib.pyplot as plt
import seaborn as sns

model_tags = hf_api.get_model_tags()
pipeline_tags = model_tags["pipeline_tag"]

# Bucket the pipeline-tag ids by their "subType" in a single pass. This replaces
# the two-pass version that seeded a dict with None values from a set and then
# swapped each None for a list on first use; keys now follow first-seen order
# rather than set order.
groups = {}
for tag in pipeline_tags:
    groups.setdefault(tag["subType"], []).append(tag["id"])

subtype = set(groups)
print("Subtypes:", subtype)
print(groups)

# Map a pipeline tag to its group using the module-level `groups` dictionary.
def map_tag_to_group(tag):
    """Return the first key of ``groups`` whose list contains the normalised tag.

    The tag is converted to ``str``, stripped, lower-cased and has its spaces
    replaced by hyphens before the lookup. ``"Not Provided"`` is returned when
    no group lists it.
    """
    normalized = str(tag).strip().lower().replace(" ", "-")
    return next(
        (group for group, members in groups.items() if normalized in members),
        "Not Provided",
    )

def _plot_model_counts(counts, x, title, xlabel):
    """Draw an annotated bar chart of the ``model_id`` counts per ``x`` category.

    Args:
        counts: Frame with a category column named ``x`` and the number of
            distinct models in column ``model_id``.
        x: Name of the category column, used for the x axis.
        title: Figure title.
        xlabel: Label of the x axis.

    Returns:
        The matplotlib Axes the chart was drawn on.
    """
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.barplot(
        data=counts.sort_values('model_id', ascending=False),
        x=x,
        y='model_id',
        ax=ax,
    )
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Number of Models')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    # Write each bar's value just above it.
    for p in ax.patches:
        height = p.get_height()
        ax.annotate(f'{int(height)}', (p.get_x() + p.get_width() / 2., height),
                    ha='center', va='bottom', fontsize=10)
    plt.show()
    return ax


hf['pipeline_group'] = hf['pipeline_tag'].apply(map_tag_to_group)

# Distinct models per task group. The same plotting code used to be pasted
# twice, each copy ending in a duplicated `plt.show()`.
pipeline_group_counts_filtered = hf.groupby('pipeline_group')['model_id'].nunique().reset_index()
ax = _plot_model_counts(
    pipeline_group_counts_filtered,
    x='pipeline_group',
    title='Number of Models by Task Type',
    xlabel='Task',
)

# Distinct models per individual pipeline tag.
pipeline_group_counts = hf.groupby('pipeline_tag')['model_id'].nunique().reset_index()
ax = _plot_model_counts(
    pipeline_group_counts,
    x='pipeline_tag',
    title='Number of Models by Pipeline Tags',
    xlabel='Pipeline Tags',
)



In [None]:
import numpy as np
# Distinct models per (pipeline_group, pipeline_tag), ordered by group and then
# by descending count, and indexed by that pair.
tag_counts = (
    hf.groupby(['pipeline_group', 'pipeline_tag'])['model_id']
    .nunique()
    .reset_index()
    .rename(columns={'model_id': 'model_count'})
    .sort_values(['pipeline_group', 'model_count'], ascending=[True, False])
    .set_index(["pipeline_group", "pipeline_tag"])
)
tag_counts

In [None]:
tag_counts.index.get_level_values(level=0)

In [None]:

fig, ax = plt.subplots(1, figsize=(20, 5))
tag_counts.plot(kind='bar', xlabel='', ax=ax)
ax.set_xticklabels(tag_counts.index.get_level_values(level=1).tolist(), rotation=90)

# Label every pipeline group once, centred under its run of bars. The previous
# loop labelled only the bars at positions 0 and 2, and passed data-unit x
# values together with an axes-fraction transform.
# Assumes the rows of `tag_counts` are ordered so that each group's bars are
# adjacent (the cell that builds it sorts by group).
group_of_bar = tag_counts.index.get_level_values(level=0)
bar_positions = pd.Series(range(len(group_of_bar)), index=group_of_bar)
group_centres = bar_positions.groupby(level=0).mean()
for group_name, centre in group_centres.items():
    # x is in data units (bar position), y is a fraction of the axes height.
    ax.text(centre, -0.5, group_name, ha='center', transform=ax.get_xaxis_transform())

plt.show()

In [None]:
# Flat table of distinct models per (pipeline_group, pipeline_tag).
tag_counts = (
    hf.groupby(['pipeline_group', 'pipeline_tag'])['model_id']
    .nunique()
    .reset_index()
    .rename(columns={'model_id': 'model_count'})
)
# A bare `tag_counts.set_index([...])` used to follow here. Its result was
# discarded, so the frame shown has always been the flat one and the dead
# statement is removed. Assign the result if the indexed form is wanted.
tag_counts

In [None]:
hf

In [None]:
# Hub client read by get_base_model below; an identical client is also created
# in an earlier cell, so this line only matters when that cell was not run.
hf_api= huggingface_hub.HfApi()

def get_base_model(row):
    """Classify one model as base, downstream or unknown from Hub metadata.

    Calls ``hf_api.model_info`` for ``row["model_id"]`` and reads the model
    card's ``base_model`` entry and the ``childrenModelCount`` mapping.

    Returns:
        Tuple ``(base_model_from_card, adapter, merge, quantized, finetune,
        base_or_downstream)``. The label is ``'downstream'`` when the card
        names a base model, ``'base'`` when any child count is positive, and
        ``'unknown'`` otherwise. If the lookup raises, the exception is printed
        and whatever values were set up to that point are returned.
    """
    # Values reported when the lookup fails before anything could be read.
    base_model_from_card = ""
    adapter = merge = quantized = finetune = 0
    base_or_downstream = "unknown"

    try:
        info = hf_api.model_info(
            row["model_id"],
            expand=[
                "baseModels",
                "childrenModelCount",
                "downloadsAllTime",
                "trendingScore",
                "cardData",
                "tags"
            ]
        )

        card = info.card_data.to_dict() if info.card_data else None
        if card:
            # Drop the bulky gating/widget entries; they are not needed here.
            bulky_markers = ["extra_gated", "widget"]
            card = {
                key: value for key, value in card.items()
                if not any(marker in key for marker in bulky_markers)
            }

        # The base model, if any, is taken from the model card.
        base_model_from_card = card.get("base_model") if card else None

        children = dict(info.childrenModelCount)
        adapter = children.get("adapter", 0)
        merge = children.get("merge", 0)
        quantized = children.get("quantized", 0)
        finetune = children.get("finetune", 0)

        if base_model_from_card:
            base_or_downstream = 'downstream'
        elif adapter > 0 or merge > 0 or quantized > 0 or finetune > 0:
            base_or_downstream = 'base'
        else:
            base_or_downstream = 'unknown'

    except Exception as e:
        print(e)

    outcome = (base_model_from_card, adapter, merge, quantized, finetune, base_or_downstream)
    print(*outcome)
    return outcome

# One Hub lookup per row; the six returned values are spread over new columns.
hf[["base_model_from_card_data", "adapter", "merge", "quantized", "finetune", "base_or_downstream"]] = hf.apply(get_base_model, axis=1, result_type="expand")
# NOTE(review): unlike the other exports in this notebook, no `index=False` is
# passed here, so the row index is written as an extra first column — confirm
# this is intended.
hf.to_csv("./model_base.csv")
hf

In [None]:
hf

In [None]:
hf["base_or_downstream"].value_counts()

In [None]:
# Total `discussion_count` per base/downstream/unknown label.
hf_sec_count = hf.groupby('base_or_downstream')["discussion_count"].sum()
hf_sec_count

In [None]:

# Distinct models per label, leaving out models whose label is 'unknown'.
known_role = hf['base_or_downstream'] != 'unknown'
base_model_pipeline_group_counts = (
    hf[known_role].groupby('base_or_downstream')['model_id'].nunique().reset_index()
)

plt.figure(figsize=(12, 6))
ax = sns.barplot(
    data=base_model_pipeline_group_counts.sort_values('base_or_downstream', ascending=False),
    x='base_or_downstream',
    y='model_id',
)
plt.title('Number of Models by Model Type')
plt.xlabel('Model Type')
plt.ylabel('Number of Models')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
# Write each bar's value just above it.
for p in ax.patches:
    height = p.get_height()
    ax.annotate(f'{int(height)}', (p.get_x() + p.get_width() / 2., height),
                ha='center', va='bottom', fontsize=10)
# A single show() is enough; the duplicated call has been dropped.
plt.show()
