In [1]:
import duckdb
import re
import csv
import sys
import ahocorasick
import seaborn as sns
import os

csv.field_size_limit(sys.maxsize)
import pandas as pd
from Levenshtein import ratio


def generate_corasick(kws: list[str]):
	automation = ahocorasick.Automaton()
	for idx, key in enumerate(kws):
		automation.add_word(key, (idx, key))
	automation.make_automaton()
	return automation


def extract_keywords(text: str, corasick_auto):
	kw = set()
	for end_index, (insert_order, original_value) in corasick_auto.iter(str(text).lower()):
		kw.add(original_value)
	return list(kw) if kw else None

In [None]:
con = duckdb.connect('../mining/result/models.db')

tables = [
	"models",
	"hf_discussions",
	"hf_discussion_events",
	"gh_repositories",
	"gh_discussions",
	"gh_comments",
	"gh_issues",
]

for table in tables:
	# df = con.execute(f"SELECT * from {table}").df()
	# df.to_csv(f"./db_dump_csv/{table}.csv", index=False)

	count = con.execute(f"SELECT COUNT(1) from {table}")
	print(f"{table=}, {count.fetchone()=}")

# con.execute(f"COPY {table} TO './db_dump_csv/{table}.csv' (HEADER, DELIMITER ',')")


In [None]:
df = pd.read_csv("./db_dump_csv/models.csv")

df["github_links_set"] = None
df["github_links_score"] = None
df["highest_score_link"] = None
df["highest_score"] = None

# file_path = "huggingface_models_likes_all.csv"
# df = pd.read_csv(file_path, )
# df.columns = [
#     "model_id", "downloads", "downloads_all_time", "likes", "trending_score", "pipeline_tags",
#     "tags", "card_data", "base_model_from_card_data", "scan_done", "files_with_issues",
#     "adapter_count", "merge_count", "quantized_count", "finetune_count"
# ]

print(df.info())
print(len(df))

df.drop_duplicates(subset=["model_id"], inplace=True)
print(len(df))

df.dropna(subset=["github_links"], inplace=True)
print(len(df))

to_remove = [
	")", "(", ",",
]

for index, row in df.iterrows():
	print(index, row["model_id"])
	link_str = row["github_links"]
	# df["github_links_set"] = df["github_links"].apply(extract_github_repo_link_set)
	# def extract_github_repo_link_set(link_str: str):
	if pd.isna(link_str) or link_str == "" or link_str == "[]":
		df.at[index, "github_links_set"] = None
		continue

	links = link_str.split(",")
	repo_link = None
	processed_links = set()
	# authors = set()
	# repos = set()

	for link in links:
		if link == "":
			continue
		link = link.strip()
		# remove special characters
		for char in to_remove:
			link = link.replace(char, "")

		# capture entire link
		if re.search(r"https?:\/\/?github\.com\/[\w-]+\/[\w-]+", link):
			repo_link = re.search(r"https?:\/\/?github\.com\/[\w-]+\/[\w-]+", link).group(0)
			processed_links.add(repo_link)
			author_repo = repo_link.replace("https://github.com/", "")

	# capture each group
	# author, repo = re.search(r"https?:\/\/?github\.com\/([\w-]+)\/([\w-]+)", link).groups()
	# authors.add(author)
	# repos.add(repo)

	if processed_links:
		processed_links = list(processed_links)
		df.at[index, "github_links_set"] = processed_links
		scores = [ratio(link, row["model_id"]) for link in processed_links]
		df.at[index, "github_links_score"] = scores
		df.at[index, "highest_score_link"] = processed_links[scores.index(max(scores))]
		df.at[index, "highest_score"] = max(scores)
# else:
# 	df.at[index, "github_links_set"] = None
# 	df.at[index, "github_links_score"] = None

# print(list(processed_links))
# row["github_author"] = authors if authors else None
# row["github_repos"] = repos if repos else None

df.to_csv("./filtered/hf_models_with_scored_link.csv", index=False)

df.dropna(subset=["highest_score"], inplace=True)
print(len(df))
print(df.highest_score_link.nunique())
df.to_csv("./filtered/hf_models.csv", index=False)

In [None]:
# print(len(df))
# df.drop_duplicates(subset=["model_id"], inplace=True)
# print(len(df))

In [None]:
import matplotlib.pyplot as plt

plt.figure(figsize=(10, 6))
plt.hist(df['highest_score'].dropna(), bins=10, color='skyblue', edgecolor='black')
plt.title('Distribution of Highest Scores')
plt.xlabel('Score')
plt.ylabel('Frequency')
plt.show()

In [None]:
# filter other files
files = [
	"gh_comments.csv",
	"gh_discussions.csv",
	"gh_issues.csv",
	"gh_repositories.csv",
	"hf_discussion_events.csv",
	"hf_discussions.csv",
]

bots = [
	"allcontributors[bot]",
	"allstar-app[bot]",
	"azure-pipelines[bot]",
	"codeant-ai[bot]",
	"coderabbitai[bot]",
	"copybara-service[bot]",
	"dagshub[bot]",
	"deepsource-autofix[bot]",
	"dependabot-preview[bot]",
	"dependabot[bot]",
	"devin-ai-integration[bot]",
	"ellipsis-dev[bot]",
	"github-actions[bot]",
	"google-allstar-prod[bot]",
	"greenkeeper[bot]",
	"imgbot[bot]",
	"learn-build-service-prod[bot]",
	"lgtm-com[bot]",
	"linear[bot]",
	"lumberbot-app[bot]",
	"mend-for-github-com[bot]",
	"mergify[bot]",
	"microsoft-github-operations[bot]",
	"microsoft-github-policy-service[bot]",
	"opensearch-trigger-bot[bot]",
	"pre-commit-ci[bot]",
	"pull[bot]",
	"pytorch-bot[bot]",
	"renovate[bot]",
	"restyled-io[bot]",
	"sentry-io[bot]",
	"sourcery-ai[bot]",
	"stainless-app[bot]",
	"sweep-ai[bot]",
	"sweep-nightly[bot]",
	"sync-by-unito[bot]",
	"vs-code-engineering[bot]",
]

for file in files:
	# needs to remove bot issues
	df = pd.read_csv(f"./db_dump_csv/{file}")
	print(f"{file} original count {len(df)}")

	if "author_login" in list(df):
		df = df[~df['author_login'].isin(bots)]
	elif "user_login" in list(df):
		df = df[~df['user_login'].isin(bots)]

	df.drop_duplicates()

	print(f"{file} filtered count {len(df)}")
	df.to_csv(f"./filtered/{file}", index=False)

In [None]:
kws1 = ["vulnerability", "vulnerabilities", "cwe", "CWE", "cve", "CVE", "security"]

corasick = generate_corasick(kws1)

# df = pd.read_csv("./result/hf/hf_models_commits.csv")
# df["keywords"] = None
# for index, row in df.iterrows():
#     df.at[index, 'keywords_title'] = extract_keywords(row['title'], corasick)
#     df.at[index, 'keywords_message'] = extract_keywords(row['message'], corasick)
# print(f"Record with KW founds: {len(df[df['keywords_message'].notna()])} in {len(df)}")
# df[(df['keywords_title'].notna()) | (df['keywords_message'].notna())].to_csv(f"./result/hf/hf_commits_kws1.csv",
#                                                                              index=False)

# Find kws in HF

df = pd.read_csv("./filtered/hf_discussions.csv")
df["keywords"] = None
for index, row in df.iterrows():
	df.at[index, 'keywords'] = extract_keywords(row['title'], corasick)
print(f"HF Discussions with KW founds: {len(df[df['keywords'].notna()])} in {len(df)}")
df[df['keywords'].notna()].to_csv(f"./filtered_with_kw/hf_discussions.csv", index=False)

df = pd.read_csv("./filtered/hf_discussion_events.csv")
df["keywords"] = None
for index, row in df.iterrows():
	df.at[index, 'keywords'] = extract_keywords(row['content'], corasick)
print(f"HF Discussions Events with KW founds: {len(df[df['keywords'].notna()])} in {len(df)}")
df[df['keywords'].notna()].to_csv(f"./filtered_with_kw/hf_discussion_events.csv", index=False)

In [None]:
# Find kws in GH

df = pd.read_csv("./filtered/gh_discussions.csv")
df["keywords"] = None
for index, row in df.iterrows():
	df.at[index, 'keywords'] = extract_keywords(
		str(row['discussion_title']) + "\n" + str(row['discussion_body']),
		corasick
	)
print(f"GH Discussions KW founds: {len(df[df['keywords'].notna()])} in {len(df)}")
df[df['keywords'].notna()].to_csv(f"./filtered_with_kw/gh_discussions.csv", index=False)

df = pd.read_csv("./filtered/gh_comments.csv")
df["keywords"] = None
for index, row in df.iterrows():
	df.at[index, 'keywords'] = extract_keywords(row['comment_body'], corasick)
print(f"GH Comments KW founds: {len(df[df['keywords'].notna()])} in {len(df)}")
df[df['keywords'].notna()].to_csv(f"./filtered_with_kw/gh_comments.csv", index=False)

# gh_issues.csv
df = pd.read_csv("./filtered/gh_issues.csv")
df["keywords"] = None
for index, row in df.iterrows():
	df.at[index, 'keywords'] = extract_keywords(
		str(row['issue_title']) + "\n" + str(row['issue_body']),
		corasick
	)
print(f"GH Issues KW founds: {len(df[df['keywords'].notna()])} in {len(df)}")
df[df['keywords'].notna()].to_csv(f"./filtered_with_kw/gh_issues.csv", index=False)

In [None]:
# merge from filtered HF_DISCUSSION, GH_DISCUSSION and GH_ISSUES
# keep all discussions even if they do not have any comments
hf_discussions = pd.read_csv("./filtered/hf_discussions.csv")
hf_discussion_events = pd.read_csv("./filtered/hf_discussion_events.csv")
merged_hf = pd.merge(hf_discussions, hf_discussion_events, on=["model_id", "num"], how="left")
print("HF merged ", len(merged_hf))
merged_hf.drop(columns=["keywords_x", "keywords_y"], inplace=True)
merged_hf.to_csv("./merged/merged_hf_discussions.csv", index=False)

gh_discussions = pd.read_csv("./filtered/gh_discussions.csv")
gh_comments = pd.read_csv("./filtered/gh_comments.csv")
merged_gh = pd.merge(gh_discussions, gh_comments, on=["repo_name", "discussion_number"], how="left")
print("GH merged", len(merged_gh))
merged_gh.drop(columns=["id_x", "id_y"], inplace=True)
merged_gh.to_csv("./merged/merged_gh_discussions.csv", index=False)


In [None]:
# merge from filtered with keywords HF_DISCUSSION, GH_DISCUSSION and GH_ISSUES
def combine_keywords(row):
	keywords = set()
	if not pd.isna(row["keywords_x"]):
		kws = eval(row["keywords_x"])
		for kw in kws:
			keywords.add(kw)
	if not pd.isna(row["keywords_y"]):
		kws = eval(row["keywords_x"])
		for kw in kws:
			keywords.add(kw)
	if keywords:  # Check if the list is not empty
		return list(keywords)
	else:
		return None


hf_discussions = pd.read_csv("./filtered_with_kw/hf_discussions.csv")
hf_discussion_events = pd.read_csv("./filtered_with_kw/hf_discussion_events.csv")
merged_hf = pd.merge(hf_discussions, hf_discussion_events, on=["model_id", "num"], how="left")
merged_hf.drop_duplicates(inplace=True)
merged_hf["keywords"] = merged_hf.apply(combine_keywords, axis=1)
merged_hf.drop(columns=["keywords_x", "keywords_y"], inplace=True)
print(len(merged_hf))
merged_hf.to_csv("./merged_with_kw/merged_hf_discussions.csv", index=False)

gh_discussions = pd.read_csv("./filtered_with_kw/gh_discussions.csv")
gh_comments = pd.read_csv("./filtered_with_kw/gh_comments.csv")
merged_gh = pd.merge(gh_discussions, gh_comments, on=["repo_name", "discussion_number"], how="left")
merged_gh.drop_duplicates(inplace=True)
merged_gh["keywords"] = merged_gh.apply(combine_keywords, axis=1)
merged_gh.drop(columns=["keywords_x", "keywords_y", "id_x", "id_y"], inplace=True)
print("GH merged", len(merged_gh))
merged_gh.to_csv("./merged_with_kw/merged_gh_discussions.csv", index=False)

# Copy from filtered_with_kw to manual for manual labelling
# Process:
- manual label hf_discussion
- manual label hf_discussion_events
- find all discussion_event where hf_discussion is_security = 1
- reverse find all discussion where hf_discussion_event is_security = 1
=> manual dataset for HF discussion



In [None]:
# manually removed
# hf discussion events
# -> comment not english

# 6240,taide/TAIDE-LX-7B-Chat,7,663c7c7e5e9f6a229b09fbc0,comment -> comment not english
# 18855,BELLE-2/Belle-whisper-large-v3-turbo-zh,3,comment -> comment not english
# 115921,taide/TAIDE-LX-7B-Chat-4bit,4,66262481571552066e022e3b,comment
# 116245,taide/TAIDE-LX-7B-Chat-4bit,3,66274569a6a017b27d981884,comment
# 201341,taide/Llama3-TAIDE-LX-8B-Chat-Alpha1,4,6638c793f31d59ae35fa9a71,comment
# 201909,taide/Llama3-TAIDE-LX-8B-Chat-Alpha1,2,663389530175b82b87e76c43,comment
# 253944,taide/Llama3-TAIDE-LX-8B-Chat-Alpha1-4bit,3,663b08a1ffc4bb91c1042526,comment
# 399720,IDEA-CCNL/Ziya-LLaMA-13B-v1.1,3,64895cb884f4f879933f9bf0,comment
# BAAI/bge-m3,25,66812cf32698e064710ab834,comment

#   -> gated model
# google/paligemma-3b-pt-224,2,6643d0633914b80624ebddcb,comment


# 25674,CyberPeace-Institute/SecureBERT-NER,4,654e762654d044f09ee10c8f,comment
# https://huggingface.co/CyberPeace-Institute/SecureBERT-NER/discussions/4
# potential LLM issues, does not differentiate from the domain of the model and the content of the discussions

# Phind/Phind-CodeLlama-34B-v2,15,6508cedb2feb9570c5f964d8 -> ethics related, does it count as security
# linking between HF and GH: https://huggingface.co/OuteAI/OuteTTS-0.1-350M/discussions/5 -> https://github.com/edwko/OuteTTS/issues/16#issuecomment-2465288308

# https://huggingface.co/cgato/TheSpice-7b-v0.1.1/discussions/1 -> safetensor suggestion


# gh
# keras-team/keras-hub,1393 -> contain pr to security policy -> https://github.com/keras-team/keras-hub/pull/1319
# https://github.com/ShishirPatil/gorilla/discussions/457 -> contain pr to vul fix -> https://github.com/ShishirPatil/gorilla/pull/415



In [None]:
# reconstruct the hf comments links
# and set up label
df = pd.read_csv("./filtered_with_kw/hf_discussions.csv")
df["is_security"] = -1
df["security_category"] = None
df["keyword_count"] = df["keywords"].apply(lambda x: len(eval(x)) if pd.notna(x) else 0)
counts = {n: len(df[df["keyword_count"] == n]) for n in range(0, 7)}
df.drop_duplicates(inplace=True)
print("HF Discussions with n keywords:", counts)
df.to_csv("./manual/hf_discussions_working.csv", index=False)

df = pd.read_csv("./filtered_with_kw/hf_discussion_events.csv")
df["url"] = "https://huggingface.co/" + df["model_id"] + "/discussions/" + df["num"].astype(str) + "#" + df["event_id"]
df["is_security"] = -1
df["security_category"] = None
df["keyword_count"] = df["keywords"].apply(lambda x: len(eval(x)) if pd.notna(x) else 0)
counts = {n: len(df[df["keyword_count"] == n]) for n in range(0, 7)}
df.drop_duplicates(inplace=True)
print("HF Discussions Event with n keywords:", counts)
df.to_csv("./manual/hf_discussion_events_working.csv", index=False)

df = pd.read_csv("./filtered_with_kw/gh_discussions.csv")
df["url"] = "https://github.com/" + df["repo_name"] + "/discussions/" + df["discussion_number"].astype(str)
df["is_security"] = -1
df["security_category"] = None
df["keyword_count"] = df["keywords"].apply(lambda x: len(eval(x)) if pd.notna(x) else 0)
counts = {n: len(df[df["keyword_count"] == n]) for n in range(0, 7)}
df = df.drop(columns=["id"]).drop_duplicates(
	subset=["repo_name", "discussion_number", "discussion_title", "author_login"])
print("GH Discussions with n keywords:", counts)
df.to_csv("./manual/gh_discussions_working.csv", index=False)

df = pd.read_csv("./filtered_with_kw/gh_comments.csv")
df["url"] = "https://github.com/" + df["repo_name"] + "/discussions/" + df["discussion_number"].astype(str)
df["is_security"] = -1
df["security_category"] = None
df["keyword_count"] = df["keywords"].apply(lambda x: len(eval(x)) if pd.notna(x) else 0)
counts = {n: len(df[df["keyword_count"] == n]) for n in range(0, 7)}
df = df.drop(columns=["id"]).drop_duplicates()
print("GH Discussions Comment with n keywords:", counts)
df.to_csv("./manual/gh_comments_working.csv", index=False)

df = pd.read_csv("./filtered_with_kw/gh_issues.csv")
df["url"] = "https://github.com/" + df["repo_name"] + "/issues/" + df["issue_number"].astype(str)
df["is_security"] = -1
df["security_category"] = None
df["keyword_count"] = df["keywords"].apply(lambda x: len(eval(x)) if pd.notna(x) else 0)
counts = {n: len(df[df["keyword_count"] == n]) for n in [0, 1, 2, 3, 4, 5, 6]}
df = df.drop(columns=["id"]).drop_duplicates()
print("GH issues with n keywords:", counts)
df[df["keyword_count"] >= 4].to_csv("./manual/gh_issues_subset_working.csv", index=False)
df.to_csv("./manual/gh_issues_working.csv", index=False)
# 1st pass without removing dups
# HF Discussions with n keywords: {0: 0, 1: 17, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0}
# HF Discussions Event with n keywords: {0: 0, 1: 276, 2: 10, 3: 5, 4: 0, 5: 0, 6: 0}
# GH Discussions with n keywords: {0: 0, 1: 142, 2: 9, 3: 1, 4: 1, 5: 0, 6: 0}
# GH Discussions Comment with n keywords: {0: 0, 1: 92, 2: 2, 3: 0, 4: 0, 5: 0, 6: 0}
# GH issues with n keywords: {0: 0, 1: 21790, 2: 1381, 3: 507, 4: 73, 5: 4, 6: 0}

In [None]:
# test get cve in gh issues
# df = pd.read_csv("./filtered_with_kw/gh_issues.csv")
# df["keyword_count"] = df["keywords"].apply(lambda x: len(eval(x)) if pd.notna(x) else 0)
# counts = {n: len(df[df["keyword_count"] == n]) for n in [0, 1, 2, 3, 4, 5, 6]}
# print("GH issues with n keywords:", counts)
# df_cve = df[df["keywords"].str.contains("cve|CVE", na=False)]
# print(len(df_cve))
# df_cve
# NOTES: cve contains a lot of false positive

df = pd.read_csv("./filtered_with_kw/gh_issues.csv")
df["url"] = "https://github.com/" + df["repo_name"] + "/issues/" + df["issue_number"].astype(str)
df["is_security"] = -1
df["security_category"] = None
df["keyword_count"] = df["keywords"].apply(lambda x: len(eval(x)) if pd.notna(x) else 0)
counts = {n: len(df[df["keyword_count"] == n]) for n in [0, 1, 2, 3, 4, 5, 6]}
df = df.drop(columns=["id"]).drop_duplicates()
print("GH issues with n keywords:", counts)
df[df["keyword_count"] >= 3].to_csv("./manual/gh_issues_subset_3_working.csv", index=False)

In [None]:
# HF label reverse look up
hf_discussion = pd.read_csv("./manual/hf_discussions_done.csv")
hf_discussion_event = pd.read_csv("./manual/hf_discussion_events_done.csv")
merged_hf = pd.read_csv("./merged/merged_hf_discussions.csv")
print(f"{len(merged_hf)=}")
merged_hf.drop_duplicates(inplace=True)
# merged_hf.dropna(subset=["content"], inplace=True)
merged_hf["is_security"] = 0

df_non_security_disc = hf_discussion[hf_discussion["is_security"] == 0][["model_id", "num"]]
df_non_security_event = hf_discussion_event[hf_discussion_event["is_security"] == 0][["model_id", "num"]]
non_security_union = pd.concat([df_non_security_disc, df_non_security_event]).drop_duplicates()
print(f"{len(non_security_union)=}")

df_security_disc = hf_discussion[hf_discussion["is_security"] == 1][["model_id", "num"]]
df_security_event = hf_discussion_event[hf_discussion_event["is_security"] == 1][["model_id", "num"]]
security_union = pd.concat([df_security_disc, df_security_event]).drop_duplicates()
print(f"{len(security_union)=}")

# keep only the rows we labelled in manual set, either 0 or 1
merged_hf = merged_hf.merge(
	pd.concat([non_security_union, security_union]).drop_duplicates(),
	on=["model_id", "num"],
	how="inner"
)
print(f"{len(merged_hf)=}")

merged_hf["is_security"] = merged_hf[["model_id", "num"]].apply(tuple, axis=1).isin(
	pd.concat([df_security_disc, df_security_event]).apply(tuple, axis=1)
).astype(int)

merged_hf.loc[
	merged_hf[["model_id", "num"]].apply(tuple, axis=1).isin(security_union.apply(tuple, axis=1)),
	"is_security"
] = 1

merged_hf = merged_hf[merged_hf["event_type"] == "comment"]

merged_hf.to_csv("./merged_after_manual/merged_hf_discussions.csv", index=False)

merged_hf_sec = merged_hf[merged_hf["is_security"] == 1]
print(f"{len(merged_hf_sec)=}")
merged_hf_sec.to_csv("./merged_after_manual/merged_hf_discussions_security.csv", index=False)

# count the distinct combination of model_id and num
distinct_discussions = merged_hf[['model_id', 'num']].drop_duplicates().shape[0]
print(f"Distinct HF discussion: {distinct_discussions}")

distinct_discussions = merged_hf_sec[['model_id', 'num']].drop_duplicates().shape[0]
print(f"Distinct Security HF discussion: {distinct_discussions}")
# old
# len(merged_hf)=666055
# len(non_security_union)=130
# len(security_union)=127
# len(merged_hf)=2861
# len(merged_hf_sec)=1634
# Distinct HF discussion: 255
# Distinct Security HF discussion: 127

In [None]:
# GH label reverse look up
gh_discussion = pd.read_csv("./manual/gh_discussions_done.csv")
gh_comments = pd.read_csv("./manual/gh_comments_done.csv")
merged_gh = pd.read_csv("./merged/merged_gh_discussions.csv")
print(f"{len(merged_gh)=}")
merged_gh.drop_duplicates(inplace=True)
merged_gh["is_security"] = 0

df_non_security_disc = gh_discussion[gh_discussion["is_security"] == 0][["repo_name", "discussion_number"]]
print(f"{len(df_non_security_disc)=}")
df_non_security_cmt = gh_comments[gh_comments["is_security"] == 0][["repo_name", "discussion_number"]]
print(f"{len(df_non_security_cmt)=}")
non_security_union = pd.concat([df_non_security_disc, df_non_security_cmt]).drop_duplicates()
print(f"{len(non_security_union)=}")

df_security_disc = gh_discussion[gh_discussion["is_security"] == 1][["repo_name", "discussion_number"]]
print(f"{len(df_security_disc)=}")
df_security_cmt = gh_comments[gh_comments["is_security"] == 1][["repo_name", "discussion_number"]]
print(f"{len(df_security_cmt)=}")
security_union = pd.concat([df_security_disc, df_security_cmt]).drop_duplicates()
# print(security_union)
print(f"{len(security_union)=}")

merged_gh = merged_gh.merge(
	pd.concat([non_security_union, security_union]).drop_duplicates(),
	on=["repo_name", "discussion_number"],
	how="inner"
)
print(f"{len(merged_gh)=}")

merged_gh.loc[
	merged_gh[["repo_name", "discussion_number"]].apply(tuple, axis=1).isin(security_union.apply(tuple, axis=1)),
	"is_security"
] = 1
print(f"{len(merged_gh)=}")
merged_gh.to_csv("./merged_after_manual/merged_gh_discussions.csv", index=False)

merged_gh_sec = merged_gh[merged_gh["is_security"] == 1]
print(f"{len(merged_gh_sec)=}")
merged_gh_sec.to_csv("./merged_after_manual/merged_gh_discussions_security.csv", index=False)

# count the distinct combination of model_id and num
distinct_discussions = merged_gh[['repo_name', 'discussion_number']].drop_duplicates().shape[0]
print(f"Distinct GH discussion: {distinct_discussions}")

distinct_discussions = merged_gh_sec[['repo_name', 'discussion_number']].drop_duplicates().shape[0]
print(f"Distinct Security HF discussion: {distinct_discussions}")

# More manual check here on the before the final dataset

In [None]:
# manual + data from external
merged_gh = pd.read_csv("./merged_after_manual/merged_gh_discussions.csv")
merged_hf = pd.read_csv("./merged_after_manual/merged_hf_discussions.csv")
issues = pd.read_csv("./manual/gh_issues_subset_3_done.csv")
issues_external_sec = pd.read_csv("./external_issues/github_sec_issues.csv", delimiter=";")
issues_external_non_sec = pd.read_csv("./external_issues/github_nonsec_issues.csv", delimiter=";")

# repo_name,discussion_number,discussion_title,discussion_body,author_login_x,author_login_y,comment_body,is_security
merged_columns = [
	"id_name", "id_num", "type", "content", "is_security"
]

# all_df = pd.DataFrame(columns=merged_columns)
merged_gh["id_name"] = merged_gh["repo_name"]
merged_gh["id_num"] = merged_gh["discussion_number"]
merged_gh["content"] = (
	merged_gh["discussion_title"].fillna("").str.strip() + " " +
	merged_gh["discussion_body"].fillna("").str.strip() + " " +
	merged_gh["comment_body"].fillna("").str.strip()
).str.strip()
merged_gh["type"] = "GH_DISCUSSIONS"

# model_id,num,title,git_ref,url,event_id,event_type,content,is_security
merged_hf["id_name"] = merged_hf["model_id"]
merged_hf["id_num"] = merged_hf["num"]
merged_hf["content"] = (
	merged_hf["title"].fillna("").str.strip() + " " +
	merged_hf["content"].fillna("").str.strip()
).str.strip()
merged_hf["type"] = "HF_DISCUSSIONS"

# repo_name,issue_url,pr_from_issue,user_login,issue_number,keywords,url,issue_title,issue_body,is_security,security_category,keyword_count
issues["id_name"] = issues["repo_name"]
issues["id_num"] = issues["issue_number"]
issues["content"] = (
	issues["issue_title"].fillna("").str.strip() + " " +
	issues["issue_body"].fillna("").str.strip()
)
issues["type"] = "GH_ISSUES"

all_df = pd.concat(
	[
		merged_gh[merged_columns],
		merged_hf[merged_columns],
		issues[merged_columns]
	]
)
print("All manual records", len(all_df))
# shuffle
all_df = all_df.sample(frac=1)
all_df.to_csv("./merged_after_manual/merged_all.csv", index=False)

issues_external_sec["id_name"] = issues_external_sec["repository"]
issues_external_sec["id_num"] = issues_external_sec.apply(lambda x: int(str(x["issue_api_url"]).strip().split("/")[-1]),
														  axis=1)
issues_external_sec["content"] = (
	issues_external_sec["issue_title"].fillna("").str.strip() + " " +
	issues_external_sec["description"].fillna("").str.strip()
)
issues_external_sec["type"] = "GH_ISSUES_EXTERNAL"
issues_external_sec["is_security"] = 1

issues_external_non_sec["id_name"] = issues_external_non_sec["repository"]
issues_external_non_sec["id_num"] = issues_external_non_sec.apply(
	lambda x: int(str(x["issue_api_url"]).strip().split("/")[-1]), axis=1)
issues_external_non_sec["content"] = (
	issues_external_non_sec["issue_title"].fillna("").str.strip() + " " +
	issues_external_non_sec["description"].fillna("").str.strip()
)
issues_external_non_sec["type"] = "GH_ISSUES_EXTERNAL"
issues_external_non_sec["is_security"] = 0

all_df = pd.concat(
	[
		all_df,
		issues_external_sec[merged_columns],
		issues_external_non_sec[merged_columns]
	]
)
print("Manual records + external issues", len(all_df))
# shuffle
all_df = all_df.sample(frac=1)
all_df.to_csv("./merged_after_manual/merged_all_with_external.csv", index=False)

In [None]:
all_df

In [None]:
from sklearn.model_selection import train_test_split, StratifiedKFold

# test stratify
train_df, temp_df = train_test_split(
	all_df,
	test_size=0.2,
	random_state=42,
	stratify=all_df[["type", "is_security"]]
)
print(len(train_df[train_df["type"] == "GH_DISCUSSIONS"]))
print(len(temp_df[temp_df["type"] == "GH_DISCUSSIONS"]))
print(len(train_df[train_df["type"] == "GH_ISSUES"]))
print(len(temp_df[temp_df["type"] == "GH_ISSUES"]))

print(len(train_df[train_df["is_security"] == 1]))
print(len(temp_df[temp_df["is_security"] == 1]))

skf = StratifiedKFold(n_splits=5)
for i, (train_index, test_index) in enumerate(skf.split(train_df, train_df["is_security"])):
	print(f"Fold {i}:")
	print(f"  Train: index={train_index}")
	print(f"  Test:  index={test_index}")


# Multiple HF model can link to the same GH repo
-> We consider all of these related to 1 AI project, for example:
- meta-llama/Llama-2-7b
- meta-llama/Llama-2-7b-chat-hf
- meta-llama/Llama-2-70b-chat-hf
- meta-llama/Llama-2-7b-hf
- meta-llama/Llama-2-13b-chat-hf
- meta-llama/Llama-2-70b-hf
- meta-llama/Llama-2-13b-hf
- meta-llama/Llama-2-7b-chat
- meta-llama/Llama-2-70b
- meta-llama/Llama-2-70b-chat
- meta-llama/Llama-2-13b
- meta-llama/Llama-2-13b-chat

- All of these models link to http://github.com/facebookresearch/llama: -> belong to 1 project -> facebookresearch/llama.

In [None]:
# run model in classifier


note: https://www.oscs1024.com/ -> murphysec

In [None]:
df = pd.read_csv("prediction/backup_2402/merged_gh_discussions/raw_predictions.csv")
print(df["is_security_prediction"].value_counts())
df[df["is_security_prediction"] == 1]

In [4]:
# New test set to verify the model performance
# %load_ext cudf.pandas

# all data
merged_gh = pd.read_csv("./merged/merged_gh_discussions.csv")
merged_hf = pd.read_csv("./merged/merged_hf_discussions.csv")
issues = pd.read_csv("./filtered/gh_issues.csv")
print(f"{len(merged_gh)=}, {len(merged_hf)=}, {len(issues)=}")
# repo_name,discussion_number,discussion_title,discussion_body,author_login_x,author_login_y,comment_body,is_security
merged_columns = [
	"id_name", "id_num", "type", "content"
]

# all_df = pd.DataFrame(columns=merged_columns)
merged_gh["id_name"] = merged_gh["repo_name"]
merged_gh["id_num"] = merged_gh["discussion_number"]
merged_gh["content"] = (
	merged_gh["discussion_title"].fillna("").str.strip() + " " +
	merged_gh["discussion_body"].fillna("").str.strip() + " " +
	merged_gh["comment_body"].fillna("").str.strip()
).str.strip()
merged_gh["type"] = "GH_DISCUSSIONS"

# model_id,num,title,git_ref,url,event_id,event_type,content,is_security
merged_hf["id_name"] = merged_hf["model_id"]
merged_hf["id_num"] = merged_hf["num"]
merged_hf["content"] = (
	merged_hf["title"].fillna("").str.strip() + " " +
	merged_hf["content"].fillna("").str.strip()
).str.strip()
merged_hf["type"] = "HF_DISCUSSIONS"

# repo_name,issue_url,pr_from_issue,user_login,issue_number,keywords,url,issue_title,issue_body,is_security,security_category,keyword_count
issues["id_name"] = issues["repo_name"]
issues["id_num"] = issues["issue_number"]
issues["content"] = (
	issues["issue_title"].fillna("").str.strip() + " " +
	issues["issue_body"].fillna("").str.strip()
)
issues["type"] = "GH_ISSUES"

all_df = pd.concat(
	[
		merged_gh[merged_columns],
		merged_hf[merged_columns],
		issues[merged_columns]
	]
)
print("All records", len(all_df))
# shuffle
all_df = all_df.sample(frac=1)
all_df.to_csv("./merged/merged_all.csv", index=False)

# filter for the following:
# no manual label (also no keywords)
col_ids = ["id_name", "id_num"]
col_ids_str = "_".join(col_ids)
all_df[col_ids_str] = all_df[col_ids[0]].astype(str) + "_" + all_df[col_ids[1]].astype(str)

manual_merge_all = pd.read_csv("./merged_after_manual/merged_all.csv")
records_to_exclude = manual_merge_all[col_ids].drop_duplicates()
records_to_exclude[col_ids_str] = (
	records_to_exclude[col_ids[0]].astype(str) + "_" +
	records_to_exclude[col_ids[1]].astype(str)
)

all_df_filtered = all_df[~all_df[col_ids_str].isin(records_to_exclude[col_ids_str])]
print("All records no manual label", len(all_df_filtered))
# no hf safetensor and bot commit
all_df_filtered = all_df_filtered[
	~all_df_filtered["content"].str.startswith(
		("Adding `safetensors` variant of this model", "Upload folder using huggingface_hub", "Adding `diffusers` weights of this model"), na=False
	)
]
print("All records no manual label and no safetensor", len(all_df_filtered))

# no license-related
all_df_filtered = all_df_filtered[
	~all_df_filtered["content"].str.contains(
		("license|licenses|License|Licenses"), na=False, regex=True
	)
]
print("All records no manual label, no safetensor, no license", len(all_df_filtered))

# no keywords in gh issues (21k)
issues_with_kws = pd.read_csv("./filtered_with_kw/gh_issues.csv")
print("GH Issues with keywords", len(issues_with_kws))
records_to_exclude = issues_with_kws[["repo_name", "issue_number"]].drop_duplicates()
records_to_exclude[col_ids_str] = (
	records_to_exclude["repo_name"].astype(str) + "_" +
	records_to_exclude["issue_number"].astype(str)
)
print("GH Issues with keywords distinct count ", len(records_to_exclude))

all_df_filtered = all_df_filtered[~all_df_filtered[col_ids_str].isin(records_to_exclude[col_ids_str])]
print("All records no manual label, no safetensor, no license, no keywords", len(all_df_filtered))
all_df_filtered["is_security"] = -1

len(merged_gh)=47902, len(merged_hf)=666055, len(issues)=1994770
All records 2708727
All records no manual label 2703895
All records no manual label and no safetensor 2416730
All records no manual label, no safetensor, no license 2392552
GH Issues with keywords 23755
GH Issues with keywords distinct count  23750
All records no manual label, no safetensor, no license, no keywords 2372791


In [5]:
# test sampling
excluded_ids = [
	"open-compass/opencompass",
	"alibaba-damo-academy/FunASR",
	"QwenLM/Qwen-7B",
	"BAAI/llm-embedder",
	"fnlp/moss-moon-003-sft-plugin-int4",
	"alimama-creative/FLUX.1-dev-Controlnet-Inpainting-Beta",
	"hiyouga/LLaMA-Factory",
	"OpenBMB/OmniLMM",
	"PaddlePaddle/PaddleOCR",
	"microsoft/vscode",
	"THUDM/ChatGLM-6B",
	"ymcui/Chinese-LLaMA-Alpaca-2",
	"netease-youdao/QAnything",
	"immersive-translate/immersive-translate",
	"ymcui/Chinese-LLaMA-Alpaca",
	"QwenLM/Qwen",
	"THUDM/GLM-4",
	"RVC-Boss/GPT-SoVITS",
	"viitor-ai/viitor-voice",
	"01-ai/Yi",
	"diffusers/controlnet-canny-sdxl-1.0",
	"shibing624/pycorrector",
	"baidu/Senta",
	"TuGraph-family/tugraph-db",
	"InternLM/InternLM"
]
filtered_df = all_df_filtered.loc[~all_df_filtered['id_name'].isin(excluded_ids)]
sampled_df = filtered_df.groupby("type").apply(lambda x: x.sample(n=33, random_state=42)).reset_index(drop=True)
sampled_df.to_csv("./test/test_99.csv", index=False)

  sampled_df = filtered_df.groupby("type").apply(lambda x: x.sample(n=33, random_state=42)).reset_index(drop=True)


In [6]:
test_99 = pd.read_csv("./test/test_99.csv")
test_99_done = pd.read_csv("./test/test_99_done.csv")
test_99.sort_values(by=["id_name", "id_num"], inplace=True)
test_99.to_csv("./test/test_99.csv", index=False)
test_99_done.sort_values(by=["id_name", "id_num"], inplace=True)
test_99_done.to_csv("./test/test_99_done.csv", index=False)

## Inference result

In [None]:
df = pd.read_csv("inference/all_gh_distilbert/raw_predictions.csv")
df_security = df[df["is_security_prediction"] == 1]
print(len(df_security))
df_security



In [None]:
def visualize_prob_sigmoid_distribution(file_path):
	try:
		df = pd.read_csv(file_path)

		if 'prob_sigmoid' not in df.columns:
			print(f"Error: 'prob_sigmoid' column not found in {file_path}")
			return

		if not all(0 <= x <= 1 for x in df['prob_sigmoid']):
			print("Warning: data points found out of 0-1 range. will try to filter...")
			df = df[(df['prob_sigmoid'] >= 0) & (df['prob_sigmoid'] <= 1)]
			print(f"Filtered {sum(not (0 <= x <= 1) for x in df['prob_sigmoid'])} out of range datapoints.")

		plt.figure(figsize=(10, 6))  # Adjust figure size as needed
		sns.histplot(df['prob_sigmoid'], kde=True, bins=30, color='skyblue')  #Histogram
		plt.title('Distribution of prob_sigmoid')
		plt.xlabel('prob_sigmoid')
		plt.ylabel('Frequency')
		plt.grid(axis='y', alpha=0.75)

		mean = df['prob_sigmoid'].mean()
		median = df['prob_sigmoid'].median()
		std = df['prob_sigmoid'].std()
		plt.axvline(mean, color='red', linestyle='dashed', linewidth=1, label=f'Mean: {mean:.2f}')
		plt.axvline(median, color='green', linestyle='dashed', linewidth=1, label=f'Median: {median:.2f}')
		plt.legend()

		plt.tight_layout()
		plt.show()

	except FileNotFoundError:
		print(f"Error: File not found at {file_path}")
	except pd.errors.EmptyDataError:
		print(f"Error: File {file_path} is empty.")
	except pd.errors.ParserError:
		print(f"Error: Could not parse {file_path}. Check file format.")
	except Exception as e:
		print(f"An unexpected error occurred: {e}")


file_path = 'inference/all_gh_bert_gh_last/raw_predictions.csv'
visualize_prob_sigmoid_distribution(file_path)

In [None]:
df = pd.read_csv("inference/all_hf_bert_hf_best/raw_predictions.csv")
df_security = df[df["is_security_prediction"] == 1]
print(len(df_security))

file_path = 'inference/all_hf_bert_hf_best/raw_predictions.csv'
visualize_prob_sigmoid_distribution(file_path)


In [2]:
import os


# prediction result collection
def collect_prediction(path: str, exclude: list[str]):
	models = ["bert_base", "distilbert", "securebert", "roberta_base", "secbert", "secroberta", ]
	all_metrics_data = []

	subfolders = [
		f.path
		for f in os.scandir(path)
		if f.is_dir() and f.name not in exclude
	]

	for subfolder in subfolders:
		metrics_file_path = os.path.join(subfolder, "metrics.csv")
		if not os.path.exists(metrics_file_path):
			continue
		df = pd.read_csv(metrics_file_path)
		# only get the test result
		df = df.tail(1)
		model_type = None
		data_type = None

		match = re.search(r"_({})".format("|".join(models)), os.path.basename(subfolder))
		if match:
			model_type = str(os.path.basename(subfolder)[match.start():]).replace("_", "", 1)
			data_type = os.path.basename(subfolder)[:match.start()]

		df["input"] = data_type
		df["model_type"] = model_type
		df["subfolder"] = os.path.basename(subfolder)
		df["folder"] = path
		all_metrics_data.append(df)

	return pd.concat(all_metrics_data, ignore_index=True)


path = "./prediction"
exclude = ["backup_2402"]
metrics_predictions = collect_prediction(path, exclude)
metrics_predictions

Unnamed: 0,fold,epoch,train_loss,eval_loss,test_loss,accuracy,precision,recall,f1,f1_macro,mcc,elapsed_time,input,model_type,subfolder,folder
0,,,,,0.36167,0.950911,0.960799,0.897099,0.927857,0.945328,0.891926,15753.684496,all_external,securebert,all_external_securebert,./prediction
1,,,,,0.352707,0.975,0.931818,0.911111,0.921348,0.953243,0.906569,518.330959,manual_gh,distilbert,manual_gh_distilbert,./prediction
2,,,,,0.180531,0.973031,0.930328,0.965957,0.947808,0.964813,0.929933,3380.589169,all,bert_base,all_bert_base,./prediction
3,,,,,0.494877,0.948718,0.947826,1.0,0.973214,0.686607,0.486782,223.580802,manual_gh_issue,distilbert,manual_gh_issue_distilbert,./prediction
4,,,,,0.382849,0.958688,0.948107,0.933702,0.940849,0.954554,0.909175,8007.931883,all_external,distilbert,all_external_distilbert,./prediction
5,,,,,0.166664,0.986792,0.939759,0.975,0.957055,0.974626,0.949479,1971.510443,manual_hf,roberta_base,manual_hf_roberta_base,./prediction
6,,,,,0.937315,0.888889,0.98,0.899083,0.937799,0.7089,0.464872,237.166367,manual_gh_issue,secroberta,manual_gh_issue_secroberta,./prediction
7,,,,,0.25439,0.967638,0.939914,0.931915,0.935897,0.957126,0.914268,3458.170598,all,securebert,all_securebert,./prediction
8,,,,,0.370002,0.948238,0.910299,0.946133,0.92787,0.943753,0.887932,16331.838002,all_external,bert_base,all_external_bert_base,./prediction
9,,,,,0.133817,0.978571,0.897959,0.977778,0.93617,0.961647,0.924531,1042.465266,manual_gh,roberta_base,manual_gh_roberta_base,./prediction


In [None]:
path = "./prediction_0403"
collect_prediction(path, exclude)

In [3]:
path = "./prediction_tuned"
collect_prediction(path, exclude)

Unnamed: 0,f1_macro,mcc,accuracy,precision,recall,f1,fold,epoch,train_loss,eval_loss,test_loss,elapsed_time,input,model_type,subfolder,folder
0,0.977271,0.954541,0.982759,0.966102,0.966102,0.966102,,,,,0.125889,70921.045327,all_9_train,bert_base,all_9_train_bert_base,./prediction_tuned
1,0.973111,0.946319,0.979504,0.95,0.970213,0.96,,,,,0.100091,25556.112959,all,bert_base,all_bert_base,./prediction_tuned
2,0.965798,0.931595,0.97411,0.948936,0.948936,0.948936,,,,,0.075606,17573.26517,all,securebert,all_securebert,./prediction_tuned
3,0.98314,0.966527,0.987069,0.959016,0.991525,0.975,,,,,0.047132,49292.643998,all_9_train,secbert,all_9_train_secbert,./prediction_tuned
4,0.971603,0.94338,0.978365,0.944444,0.971429,0.957746,10.0,3.0,0.012996,0.083561,,40273.36934,all_9_train,secroberta,all_9_train_secroberta,./prediction_tuned
5,0.966989,0.934078,0.975189,0.96087,0.940426,0.950538,,,,,0.097227,9693.202135,all,secroberta,all_secroberta,./prediction_tuned
6,0.428571,0.0,0.75,0.0,0.0,0.0,,,,,0.475672,61.06229,,,sample,./prediction_tuned
7,0.42716,0.0,0.74569,0.0,0.0,0.0,,,,,0.567494,44149.730483,all_9_train,securebert,all_9_train_securebert,./prediction_tuned
8,0.965906,0.931812,0.974138,0.949153,0.949153,0.949153,,,,,0.113657,75160.76586,all_9_train,roberta_base,all_9_train_roberta_base,./prediction_tuned
9,0.964119,0.928336,0.973031,0.956522,0.93617,0.946237,,,,,0.102124,13538.628879,all,secbert,all_secbert,./prediction_tuned


In [26]:
import json


def collect_inference(path: str, exclude: list[str]):
	models = ["distilbert", "securebert", "roberta", "secbert", "secroberta", "bert"]
	all_metrics_data = []

	subfolders = [
		f.path
		for f in os.scandir(path)
		if f.is_dir() and f.name not in exclude
	]

	for subfolder in subfolders:
		metrics_file_path = os.path.join(subfolder, "metrics.json")
		if not os.path.exists(metrics_file_path):
			continue
		with open(metrics_file_path, 'r') as f:
			data = json.load(f)
		df = pd.DataFrame([data])
		# only get the test result
		model_type = None
		data_type = None

		match = re.search(r"_({})".format("|".join(models)), os.path.basename(subfolder))
		if match:
			model_type = str(os.path.basename(subfolder)[match.start():match.end()]).replace("_", "", 1)
			data_type = os.path.basename(subfolder)[:match.start()]

		df["input"] = data_type
		df["model_type"] = model_type
		df["subfolder"] = os.path.basename(subfolder)
		df["path"] = path
		all_metrics_data.append(df)
	return pd.concat(all_metrics_data, ignore_index=True)


path = "./inference"
exclude = []
metrics_infer = collect_inference(path, exclude)
metrics_infer

Unnamed: 0,time_taken,security_comments,total_comments,ratio_comments,security_discussion_sigmoid,security_discussions_softmax,total_discussions,ratio_discussions_sigmoid,ratio_discussions_softmax,input,model_type,subfolder,path
0,29953.621459,606096,1994770,0.303843,605972,605972,1994441,0.30383,0.30383,all_gh_issue,distilbert,all_gh_issue_distilbert_tuned_all_9_train,./inference
1,58758.29836,809643,1994770,0.405883,809454,809454,1994441,0.405855,0.405855,all_gh_issue,bert,all_gh_issue_bert_tuned_all_9_train,./inference
2,472.382529,2977,47902,0.062148,1852,1852,28185,0.065709,0.065709,all_gh,roberta,all_gh_roberta_tuned_all,./inference
3,23529.002314,206502,666055,0.310037,86186,86186,205650,0.419091,0.419091,all_hf,secbert,all_hf_secbert_tuned_all,./inference
4,21785.914553,313939,666055,0.471341,102272,102272,205650,0.497311,0.497311,all_hf,secroberta,all_hf_secroberta_tuned_all,./inference
5,496.260377,5812,47902,0.121331,3654,3654,28185,0.129643,0.129643,all_gh,distilbert,all_gh_distilbert_tuned_all,./inference
6,20417.561865,194379,1994770,0.097444,194332,194332,1994441,0.097437,0.097437,all_gh_issue,roberta,all_gh_issue_roberta_tuned_all,./inference
7,6.764755,74,423,0.174941,29,29,98,0.295918,0.295918,test_full,securebert,test_full_securebert_tuned_all_9_train,./inference
8,14327.584995,352399,666055,0.529084,116014,116014,205650,0.564133,0.564133,all_hf,distilbert,all_hf_distilbert_hf_last,./inference
9,0.960328,12,99,0.121212,12,12,99,0.121212,0.121212,test_99,distilbert,test_99_distilbert_tuned_all,./inference


In [None]:
import json


def collect_inference(path: str, exclude: list[str]):
	models = ["llama", "deepseek", "phi4", "mistral"]
	all_metrics_data = []

	subfolders = [
		f.path
		for f in os.scandir(path)
		if f.is_dir() and f.name not in exclude
	]

	for subfolder in subfolders:
		metrics_file_path = os.path.join(subfolder, "metrics.json")
		if not os.path.exists(metrics_file_path):
			continue
		with open(metrics_file_path, 'r') as f:
			data = json.load(f)
		df = pd.DataFrame([data])
		# only get the test result
		df["subfolder"] = os.path.basename(subfolder)
		model = None
		data = None
		match = re.search(r"_(llama\d*|deepseekr\d*|phi4|mistral(_small)?)", os.path.basename(subfolder))
		if match:
			model = str(os.path.basename(subfolder)[match.start():]).replace("_", "", 1)
			data = os.path.basename(subfolder)[:match.start()]
		df["input_type"] = data
		df["model"] = model
		df["path"] = path
		all_metrics_data.append(df)
	return pd.concat(all_metrics_data, ignore_index=True)


path = "./llm"
exclude = []
metrics_infer = collect_inference(path, exclude)
metrics_infer

In [None]:
import pandas as pd
from scipy.stats import norm


def get_subset(df_security, df_manual, col_ids: list[str]):
	records_to_exclude = df_manual[col_ids].drop_duplicates()
	col_ids_str = "_".join(col_ids)
	records_to_exclude[col_ids_str] = records_to_exclude[col_ids[0]].astype(str) + "_" + records_to_exclude[
		col_ids[1]].astype(str)
	print(f"{len(records_to_exclude)=}")
	N = len(df_security)  # Population size
	Z = norm.ppf(0.975)  # Z-score for 95% confidence
	p = 0.5  # Worst-case scenario proportion
	E = 0.05  # Margin of error

	# Sample size formula
	numerator = (N * (Z ** 2) * p * (1 - p))
	denominator = ((E ** 2) * (N - 1)) + ((Z ** 2) * p * (1 - p))
	sample_size = int(numerator / denominator)
	print(f"Required sample size: {sample_size} out of {N}")

	sample_df = pd.DataFrame()
	while len(sample_df) < sample_size:
		print(f"Current {len(sample_df)=}, {sample_size-len(sample_df)=}")
		temp_sample = df_security.sample(n=(sample_size - len(sample_df)), random_state=42)
		temp_sample[col_ids_str] = temp_sample[col_ids[0]].astype(str) + "_" + temp_sample[col_ids[1]].astype(str)
		temp_sample = temp_sample[~temp_sample[col_ids_str].isin(records_to_exclude[col_ids_str])]
		sample_df = pd.concat([sample_df, temp_sample])

	print(f"Final sample {len(sample_df)=}")
	return sample_df

# expr = "all_gh_bert_gh_last"
# gh = pd.read_csv(f"./inference/{expr}/raw_predictions.csv")
# gh_security = gh[gh["is_security_prediction"] == 1]
# gh_manual = pd.read_csv(f"./merged_after_manual/merged_gh_discussions.csv")
# gh_security_sampled = get_subset(gh_security, gh_manual, ["repo_name", "discussion_number"])
# gh_security_sampled.to_csv(f"./sampled/{expr}_sampled.csv", index=False)

In [None]:
expr = "all_gh_issue_distilbert_tuned_all"
hf = pd.read_csv(f"./inference/{expr}/raw_predictions.csv")
hf_security = hf[
	(hf["is_security_prediction"] == 1) & ~(
		(hf["title"] == "Adding `safetensors` variant of this model") |
		(hf["title"] == "Upload folder using huggingface_hub")
	)
	]
hf_manual = pd.read_csv(f"./merged_after_manual/merged_hf_discussions.csv")
hf_manual = hf_manual[
	~(
		(hf_manual["title"] == "Adding `safetensors` variant of this model") |
		(hf_manual["title"] == "Upload folder using huggingface_hub")
	)
]
hf_security_sampled = get_subset(hf_security, hf_manual, ["model_id", "num"])
hf_security_sampled.to_csv(f"./sampled/{expr}_sampled.csv", index=False)

In [None]:
expr = "all_hf_roberta_hf_best"
hf = pd.read_csv(f"./inference/{expr}/raw_predictions.csv")
print(len(hf))
hf = hf[
	~(
		(hf["title"] == "Adding `safetensors` variant of this model") |
		(hf["title"] == "Upload folder using huggingface_hub")
	)
]
len(hf)

path = "./llm"
exclude = []
metrics_infer = collect_inference(path, exclude)
metrics_infer

In [None]:
import pandas as pd
from scipy.stats import norm


def get_subset(df_security, df_manual, col_ids: list[str]):
	records_to_exclude = df_manual[col_ids].drop_duplicates()
	col_ids_str = "_".join(col_ids)
	records_to_exclude[col_ids_str] = records_to_exclude[col_ids[0]].astype(str) + "_" + records_to_exclude[
		col_ids[1]].astype(str)
	print(f"{len(records_to_exclude)=}")
	N = len(df_security)  # Population size
	Z = norm.ppf(0.975)  # Z-score for 95% confidence
	p = 0.5  # Worst-case scenario proportion
	E = 0.05  # Margin of error

	# Sample size formula
	numerator = (N * (Z ** 2) * p * (1 - p))
	denominator = ((E ** 2) * (N - 1)) + ((Z ** 2) * p * (1 - p))
	sample_size = int(numerator / denominator)
	print(f"Required sample size: {sample_size} out of {N}")

	sample_df = pd.DataFrame()
	while len(sample_df) < sample_size:
		print(f"Current {len(sample_df)=}, {sample_size-len(sample_df)=}")
		temp_sample = df_security.sample(n=(sample_size - len(sample_df)), random_state=42)
		temp_sample[col_ids_str] = temp_sample[col_ids[0]].astype(str) + "_" + temp_sample[col_ids[1]].astype(str)
		temp_sample = temp_sample[~temp_sample[col_ids_str].isin(records_to_exclude[col_ids_str])]
		sample_df = pd.concat([sample_df, temp_sample])

	print(f"Final sample {len(sample_df)=}")
	return sample_df


expr = "all_gh_bert_gh_last"
gh = pd.read_csv(f"./inference/{expr}/raw_predictions.csv")
gh_security = gh[gh["is_security_prediction"] == 1]
gh_manual = pd.read_csv(f"./merged_after_manual/merged_gh_discussions.csv")
gh_security_sampled = get_subset(gh_security, gh_manual, ["repo_name", "discussion_number"])
gh_security_sampled.to_csv(f"./sampled/{expr}_sampled.csv", index=False)

In [None]:
expr = "all_hf_bert_tuned_all"
hf = pd.read_csv(f"./inference/{expr}/raw_predictions.csv")
hf_security = hf[
	(hf["is_security_prediction"] == 1) & ~(
		(hf["title"] == "Adding `safetensors` variant of this model") |
		(hf["title"] == "Upload folder using huggingface_hub")
	)
	]
hf_manual = pd.read_csv(f"./merged_after_manual/merged_hf_discussions.csv")
hf_manual = hf_manual[
	~(
		(hf_manual["title"] == "Adding `safetensors` variant of this model") |
		(hf_manual["title"] == "Upload folder using huggingface_hub")
	)
]
hf_security_sampled = get_subset(hf_security, hf_manual, ["model_id", "num"])
hf_security_sampled.to_csv(f"./sampled/{expr}_sampled.csv", index=False)

In [7]:
expr = "all_hf_secbert_tuned_all_9_train"
hf = pd.read_csv(f"./inference/{expr}/raw_predictions.csv")
print("Total ",len(hf))
print("Total class 1 ",len(hf[hf["is_security_prediction"] == 1]))
hf = hf[
	~(
		(hf["title"] == "Adding `safetensors` variant of this model") |
		(hf["title"] == "Upload folder using huggingface_hub")
	)
]
print("Filtered ", len(hf))
hf = hf[hf["is_security_prediction"] == 1]
print("Filtered class 1 ", len(hf))
# hf.to_csv("./sampled/bert_hf_test.csv", index=False)

Total  666055
Total class 1  220137
Filtered  382782
Filtered class 1  67383


# find overlapping results

In [None]:
# # gh
# # keep all the duplicates rows where is_security_prediction is 1, drop the rest
# gh_dfs = [
# 	pd.read_csv(f"./inference/{folder}/raw_predictions.csv")
# 	for folder in [
# 		"all_gh_roberta_tuned_all",
# 		"all_gh_bert_tuned_all",
# 		"all_gh_distilbert_tuned_all",
# 		"all_gh_securebert_tuned_all",
# 		"all_gh_secbert_tuned_all",
# 		"all_gh_secroberta_tuned_all",
# 	]
# ]
# 
# for df in gh_dfs:
# 	df = df[df["is_security_prediction"] == 1]
# 	print(len(df))
# 
# # duplicated = gh_df.duplicated(subset=["repo_name", "discussion_number", "is_security_prediction"], keep="first")
# # duplicated = gh_df.duplicated()
# 
# gh_df = (
# 	gh_dfs[0]
# 	.merge(gh_dfs[1], on=["repo_name", "discussion_number", "is_security_prediction"], how="inner")
# 	.merge(gh_dfs[2], on=["repo_name", "discussion_number", "is_security_prediction"], how="inner")
# 	.merge(gh_dfs[3], on=["repo_name", "discussion_number", "is_security_prediction"], how="inner")
# 	.merge(gh_dfs[4], on=["repo_name", "discussion_number", "is_security_prediction"], how="inner")
# 	.merge(gh_dfs[5], on=["repo_name", "discussion_number", "is_security_prediction"], how="inner")
# )
# 
# gh_df = gh_df.drop_duplicates(
# 	subset=["repo_name", "discussion_number", "is_security_prediction"],
# 	keep="first"
# )
# # gh_df = gh_df[duplicated].drop_duplicates()
# print(len(gh_df))
# gh_df = gh_df[gh_df["is_security_prediction"] == 1]
# print(len(gh_df))
# gh_df.to_csv("./overlap/gh.csv", index=False)
# gh_df.head()

In [5]:
def get_overlap(folders, columns, export_to):
	dfs = [
		pd.read_csv(f"./inference/{folder}/raw_predictions.csv")
		for folder in folders
	]
	df = pd.concat(dfs)
	print(f"{len(df)=}")
	df = df[df["is_security_prediction"] == 1]
	print(f"Security only {len(df)=}")
	df_counts = df.value_counts(columns, dropna=False)
	print(f"Unique value counts over columns {len(df_counts)=}")
	df_counts = df_counts[df_counts >= len(folders)].reset_index()
	print(f"Filtered value counts over columns {len(df_counts)=}")
	df_counts.drop_duplicates(subset=columns, inplace=True)
	print(f"Drop duplicated over columns {len(df_counts)=}")
	df_counts.to_csv(export_to, index=False)
	return df_counts

In [8]:
gh_overlap = get_overlap(
	folders=[
		"all_gh_roberta_tuned_all",
		"all_gh_bert_tuned_all",
		"all_gh_distilbert_tuned_all",
		"all_gh_securebert_tuned_all",
		"all_gh_secbert_tuned_all",
		"all_gh_secroberta_tuned_all",
	],
	columns=[
		"repo_name", "discussion_number", "discussion_title", "discussion_body", "author_login_x",
		"author_login_y", "comment_body", "full_comment", "is_security_prediction"
	],
	export_to="./overlap/gh_no_filter.csv",
)
print(len(gh_overlap))

gh_overlap = gh_overlap[
	~(
		(gh_overlap["repo_name"] == "prometheus/prometheus") |
		(gh_overlap["repo_name"] == "git-lfs/git-lfs")
	)
]

print(len(gh_overlap))
gh_overlap.sort_values(by=["repo_name", "discussion_number"], inplace=True)
gh_overlap.to_csv("./overlap/gh.csv", index=False)
gh_overlap.head(5)

len(df)=287412
Security only len(df)=35541
Unique value counts over columns len(df_counts)=17108
Filtered value counts over columns len(df_counts)=432
Drop duplicated over columns len(df_counts)=432
432
325


Unnamed: 0,repo_name,discussion_number,discussion_title,discussion_body,author_login_x,author_login_y,comment_body,full_comment,is_security_prediction,count
308,ACEsuit/mace,337,ML-MACE enabled LAMMPS install,"Dear users,\nI am running into C++ compiler co...",sumanbhasker89,wcwitt,"Hi, apologies, but we don't have tons of exper...","ML-MACE enabled LAMMPS install Dear users,\nI ...",1.0,6
431,Akegarasu/lora-scripts,323,Run lora-scripts on cpu only,Is there any way to install lora-scripts on a ...,brcisna,,,Run lora-scripts on cpu only Is there any way ...,1.0,6
385,AlUlkesh/stable-diffusion-webui-images-browser,44,Please rebuild cache if you use prompts with n...,"We just found a bug, where only the last line ...",AlUlkesh,,,Please rebuild cache if you use prompts with n...,1.0,6
5,Bing-su/adetailer,470,Unsafe Files???,What's up with Huggingface saying 5 Adetailer ...,gohan2091,Bing-su,https://huggingface.co/docs/hub/security-pickl...,Unsafe Files??? What's up with Huggingface say...,1.0,6
50,Bing-su/adetailer,470,Unsafe Files???,What's up with Huggingface saying 5 Adetailer ...,gohan2091,gohan2091,Does that answer my question though? Is it say...,Unsafe Files??? What's up with Huggingface say...,1.0,6


In [13]:
hf_overlap = get_overlap(
	folders=[
		"all_hf_securebert_tuned_all",
		"all_hf_distilbert_tuned_all",
		"all_hf_bert_tuned_all",
		"all_hf_roberta_tuned_all",
		"all_hf_secroberta_tuned_all",
		"all_hf_secbert_tuned_all",
	],
	columns=[
		"model_id", "num", "title", "git_ref", "url",
		# "event_id",
		"event_type", 
		"content", "full_comment", "is_security_prediction",
	],
	export_to="./overlap/hf_no_filter.csv",
)
# hf further removal of safetensor and bot commit
hf_overlap = hf_overlap[
	~(
		(hf_overlap["full_comment"].str.contains("Adding `safetensors` variant of this model")) |
		(hf_overlap["full_comment"].str.contains("Upload folder using huggingface_hub")) |
		(hf_overlap["full_comment"].str.contains(("license|licenses|License|Licenses"), na=False, regex=True)) |
		(hf_overlap["event_type"] != "comment")
	)
]
print(len(hf_overlap))
hf_overlap.sort_values(by=["model_id", "num"], inplace=True)
hf_overlap.to_csv("./overlap/hf.csv", index=False)

len(df)=3996330
Security only len(df)=1400222
Unique value counts over columns len(df_counts)=403918
Filtered value counts over columns len(df_counts)=155853
Drop duplicated over columns len(df_counts)=155853
2180


In [14]:
hf_overlap

Unnamed: 0,model_id,num,title,git_ref,url,event_type,content,full_comment,is_security_prediction,count
103847,0dAI/0dAI-7.5B-v2-4bpw,1,Files are missing,,https://huggingface.co/0dAI/0dAI-7.5B-v2-4bpw/...,comment,En el `model.safetensors.index.json` se indica...,Files are missing En el `model.safetensors.ind...,1.0,6
103802,0xJustin/Dungeons-and-Diffusion,8,Safetensor versions,refs/pr/8,https://huggingface.co/0xJustin/Dungeons-and-D...,comment,Could someone make safetensor versions of each...,Safetensor versions Could someone make safeten...,1.0,6
103801,0xJustin/Dungeons-and-Diffusion,16,Which .ckpt-files to use?,,https://huggingface.co/0xJustin/Dungeons-and-D...,comment,"By default, use D&Diffusion3.0_Protogen-fp32.s...","Which .ckpt-files to use? By default, use D&Di...",1.0,6
104547,152334H/miqu-1-70b-sf,13,Model load fail,,https://huggingface.co/152334H/miqu-1-70b-sf/d...,comment,pip install safetensors,Model load fail pip install safetensors,1.0,6
104755,1bitLLM/bitnet_b1_58-3B,3,lm_head is missing in *.safetensors,,https://huggingface.co/1bitLLM/bitnet_b1_58-3B...,comment,I did not notice the tie_word_embedding is True.,lm_head is missing in *.safetensors I did not ...,1.0,6
...,...,...,...,...,...,...,...,...,...,...
104197,zuzhe/Ancient-Chinese-head-portrait,1,Add safetensors variant,refs/pr/1,https://huggingface.co/zuzhe/Ancient-Chinese-h...,comment,,Add safetensors variant,1.0,6
104146,zuzhe/Chinese-wedding,2,Add safetensors variant,refs/pr/2,https://huggingface.co/zuzhe/Chinese-wedding/d...,comment,,Add safetensors variant,1.0,6
104215,zuzhe/Mecha-model,1,Add safetensors variant,refs/pr/1,https://huggingface.co/zuzhe/Mecha-model/discu...,comment,,Add safetensors variant,1.0,6
104042,zyh3826/llama2-13b-ft-openllm-leaderboard-v1,5,Upload model.safetensors.index.json,refs/pr/5,https://huggingface.co/zyh3826/llama2-13b-ft-o...,comment,,Upload model.safetensors.index.json,1.0,6


In [6]:
gh_issues_overlap = get_overlap(
	folders=[
		"all_gh_issue_secroberta_tuned_all",
		"all_gh_issue_distilbert_tuned_all",
		"all_gh_issue_bert_tuned_all",
		"all_gh_issue_secbert_tuned_all",
		"all_gh_issue_roberta_tuned_all",
		"all_gh_issue_securebert_tuned_all",
	],
	columns=[
		"repo_name", "issue_url", "issue_title", "issue_body", "pr_from_issue",
		"user_login", "issue_number", "full_comment", "is_security_prediction",
	],
	export_to="./overlap/gh_issues_no_filtered.csv",
)

gh_issues_overlap.head(5)
# further processing to remove unrelated repos
gh_issues_overlap = gh_issues_overlap[
	~(
		(gh_issues_overlap["repo_name"] == "microsoft/vscode") |
		(gh_issues_overlap["repo_name"] == "MicrosoftDocs/azure-docs")
	)
]
print(len(gh_issues_overlap))
gh_issues_overlap.sort_values(by=["repo_name", "issue_number"], inplace=True)
gh_issues_overlap.to_csv("./overlap/gh_issues.csv", index=False)

len(df)=11968620
Security only len(df)=3438554
Unique value counts over columns len(df_counts)=1327938
Filtered value counts over columns len(df_counts)=47721
Drop duplicated over columns len(df_counts)=47721
35782


In [15]:
with pd.ExcelWriter("./overlap/gh.xlsx", engine='xlsxwriter',
					engine_kwargs={"options": {"strings_to_urls": False}}) as writer:
	gh_overlap.to_excel(writer)

hf_overlap = hf_overlap.applymap(lambda x: x.encode('unicode_escape').decode('utf-8') if isinstance(x, str) else x)
with pd.ExcelWriter(
	"./overlap/hf.xlsx",
	engine='xlsxwriter',
	engine_kwargs={
		"options": {"strings_to_urls": False, "encoding": "utf-8"},
	},
) as writer:
	hf_overlap.to_excel(writer)

gh_issues_overlap = gh_issues_overlap.applymap(
	lambda x: x.encode('unicode_escape').decode('utf-8') if isinstance(x, str) else x)
with pd.ExcelWriter(
	"./overlap/gh_issues.xlsx",
	engine='xlsxwriter',
	engine_kwargs={"options": {"strings_to_urls": False, "encoding": "utf-8"}}
) as writer:
	gh_issues_overlap.to_excel(writer)

  hf_overlap = hf_overlap.applymap(lambda x: x.encode('unicode_escape').decode('utf-8') if isinstance(x, str) else x)
  gh_issues_overlap = gh_issues_overlap.applymap(


In [20]:
test_99_done = pd.read_csv("./test/test_99_done.csv")
test_99_done = test_99_done.sort_values(by=["id_name", "id_num"])
test_99_done.to_csv("./test/test_99_done.csv", index=False)

# TEST SET LOOKUP

In [2]:
merged_all = pd.read_csv("./merged/merged_all.csv")

In [21]:
test_99_done = pd.read_csv("./test/test_99_done.csv")

In [22]:
print(len(test_99_done))
print(len(merged_all))

99
2708727


In [23]:
test_full = merged_all[
    merged_all.set_index(["id_name", "id_num", "type"]).index.isin(
        test_99_done.set_index(["id_name", "id_num", "type"]).index
    )
]
test_full.sort_values(by=["id_name", "id_num"], inplace=True)
test_full

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_full.sort_values(by=["id_name", "id_num"], inplace=True)


Unnamed: 0,id_name,id_num,type,content
778779,AlUlkesh/stable-diffusion-webui-images-browser,245,GH_DISCUSSIONS,"Gradio 4 As you might be aware, a1111 is worki..."
796045,AlUlkesh/stable-diffusion-webui-images-browser,245,GH_DISCUSSIONS,"Gradio 4 As you might be aware, a1111 is worki..."
2505503,AlUlkesh/stable-diffusion-webui-images-browser,245,GH_DISCUSSIONS,"Gradio 4 As you might be aware, a1111 is worki..."
915716,AlexeyAB/darknet,5520,GH_ISSUES,buffer overflow detected
1053928,ArrowLuo/CLIP4Clip,91,GH_ISSUES,CVE-2007-4559 Patch # Patching CVE-2007-4559\n...
...,...,...,...,...
2077657,yl4579/StyleTTS2,110,GH_DISCUSSIONS,"Gradio demo HI, congrats on StyleTT2, would be..."
2211049,yl4579/StyleTTS2,110,GH_DISCUSSIONS,"Gradio demo HI, congrats on StyleTT2, would be..."
2294254,yl4579/StyleTTS2,110,GH_DISCUSSIONS,"Gradio demo HI, congrats on StyleTT2, would be..."
2345942,yl4579/StyleTTS2,110,GH_DISCUSSIONS,"Gradio demo HI, congrats on StyleTT2, would be..."


In [24]:
print(test_full["type"].value_counts())
test_full["is_security"] = -1
test_full.to_csv("./test/test_full.csv", index=False)

type
HF_DISCUSSIONS    231
GH_DISCUSSIONS    159
GH_ISSUES          35
Name: count, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_full["is_security"] = -1


In [25]:
test_full = pd.read_csv("./test/test_full.csv")
print(len(test_full[["id_num","id_name"]].drop_duplicates()))
print(len(test_full))

99
425


In [11]:
import pandas as pd 
test_full_done = pd.read_csv("./test/test_full_done.csv")
print(len(test_full_done[["id_num","id_name","is_security"]].drop_duplicates()))
print(len(test_full_done))

print(test_full_done[["id_num","id_name","is_security"]].drop_duplicates()["is_security"].value_counts())
print(test_full_done[["id_num","id_name","is_security","type"]].drop_duplicates().groupby("type")["is_security"].value_counts())

98
423
is_security
0    79
1    19
Name: count, dtype: int64
type            is_security
GH_DISCUSSIONS  0              27
                1               3
GH_ISSUES       0              26
                1               9
HF_DISCUSSIONS  0              26
                1               7
Name: count, dtype: int64


In [1]:
import pandas as pd 
gh_distilbert_disc = pd.read_csv("./target/chosen/gh_distilbert_disc.csv")

gh_distilbert_disc = gh_distilbert_disc.applymap(lambda x: x.encode('unicode_escape').decode('utf-8') if isinstance(x, str) else x)
with pd.ExcelWriter(
	"./target/convert/gh_distilbert_disc.xlsx",
	engine='xlsxwriter',
	engine_kwargs={
		"options": {"strings_to_urls": False, "encoding": "utf-8"},
	},
) as writer:
	gh_distilbert_disc.to_excel(writer)


  gh_distilbert_disc = gh_distilbert_disc.applymap(lambda x: x.encode('unicode_escape').decode('utf-8') if isinstance(x, str) else x)


In [None]:
gh_distilbert_disc.csv

In [3]:
import pandas as pd
gh_issues_distilbert = pd.read_csv("./inference/all_gh_issue_distilbert_tuned_all_9_train/raw_predictions.csv")
gh_issue_kw = pd.read_csv("./manual/gh_issues_subset_3_done.csv")
gh_issue_kw = gh_issue_kw[gh_issue_kw["is_security"] == 1]
gh_issue_kw["repo_name_discussion_number"] = gh_issue_kw["repo_name"] + "_" + gh_issue_kw["issue_number"].astype(str)
 
gh_issues_distilbert_disc = gh_issues_distilbert.groupby(["repo_name", "issue_number", "full_comment",], as_index=False)["is_security_prediction"].max()
gh_issues_distilbert_disc = gh_issues_distilbert_disc[gh_issues_distilbert_disc["is_security_prediction"] == 1]
gh_issues_distilbert_disc["repo_name_discussion_number"] = gh_issues_distilbert_disc["repo_name"] + "_" + gh_issues_distilbert_disc["issue_number"].astype(str)
gh_issues_distilbert_disc["is_kw"] = gh_issues_distilbert_disc["repo_name_discussion_number"].isin(gh_issue_kw["repo_name_discussion_number"]).astype(int)
gh_issues_distilbert_disc["url"] = "https://github.com/" + gh_issues_distilbert_disc["repo_name"] + "/issues/" + gh_issues_distilbert_disc["issue_number"].astype(str)

gh_issues_distilbert_disc.to_csv("./target/gh_issues_distilbert_disc.csv", index=False)
gh_issues_distilbert_disc[gh_issues_distilbert_disc["is_kw"] == 1].to_csv("./target/gh_issues_distilbert_disc_kw.csv", index=False)
print(len(gh_issues_distilbert_disc))
gh_issues_distilbert_disc

605972


Unnamed: 0,repo_name,issue_number,full_comment,is_security_prediction,repo_name_discussion_number,is_kw,url
3,01-ai/Yi,2,Update license,1.0,01-ai/Yi_2,0,https://github.com/01-ai/Yi/issues/2
4,01-ai/Yi,3,Setup github action to build & push docker ima...,1.0,01-ai/Yi_3,0,https://github.com/01-ai/Yi/issues/3
74,01-ai/Yi,78,Missing pypi package in `requirements.txt` Got...,1.0,01-ai/Yi_78,0,https://github.com/01-ai/Yi/issues/78
79,01-ai/Yi,83,Update README.md typo,1.0,01-ai/Yi_83,0,https://github.com/01-ai/Yi/issues/83
82,01-ai/Yi,86,fix sft loss promlem according to discussion i...,1.0,01-ai/Yi_86,0,https://github.com/01-ai/Yi/issues/86
...,...,...,...,...,...,...,...
1994408,zzzDavid/ICDAR-2019-SROIE,20,building a .so files for CTPN method of task1 ...,1.0,zzzDavid/ICDAR-2019-SROIE_20,0,https://github.com/zzzDavid/ICDAR-2019-SROIE/i...
1994414,zzzyuqing/DreamMat,4,No 'from_unet_inchannel' functions in ControlN...,1.0,zzzyuqing/DreamMat_4,0,https://github.com/zzzyuqing/DreamMat/issues/4
1994422,zzzyuqing/DreamMat,12,Error during installing dependencies # Env\r\n...,1.0,zzzyuqing/DreamMat_12,0,https://github.com/zzzyuqing/DreamMat/issues/12
1994424,zzzyuqing/DreamMat,14,CUDA not found in docker image I pulled the of...,1.0,zzzyuqing/DreamMat_14,0,https://github.com/zzzyuqing/DreamMat/issues/14
