In [None]:
import base64
import concurrent.futures
import os
import re
import time
import xml.etree.ElementTree as ET
from urllib.parse import quote_plus

import markdown
import pandas as pd
import requests
from bs4 import BeautifulSoup
from sqlalchemy import create_engine
from sqlalchemy.types import Text, String, DateTime


# --- Credentials -------------------------------------------------------------
# Secrets are read from the environment so they are never saved with the
# notebook. A missing variable raises KeyError here, at the top of the run,
# instead of failing later in the middle of a request.
#   KIPRIS_KEY    : KIPRIS Plus service key
#   GITHUB_TOKEN  : GitHub personal access token
#   RDS_PASSWORD  : database password
kipris_key = os.environ["KIPRIS_KEY"]
git_token = os.environ["GITHUB_TOKEN"]

# Default headers for authenticated GitHub REST calls.
headers = {"Authorization": f"Bearer {git_token}",
           "Accept": "application/vnd.github+json"}

# Search keywords shared by the GitHub and KIPRIS searches.
# keyword = ["robotics", "ROS", "robot arm", "robot", "amr"]
keyword = ["ROS"]

# Pause in seconds between GitHub requests (only referenced by a
# commented-out sleep in search_repos).
delay = 2

# --- Database connection -----------------------------------------------------
# Non-secret settings keep their previous values as defaults and can be
# overridden through the environment.
rds_user = os.environ.get("RDS_USER", "root")
rds_pwd = os.environ["RDS_PASSWORD"]
rds_host = os.environ.get("RDS_HOST", "database-1.ctai4u0ayj03.ap-northeast-2.rds.amazonaws.com")
rds_port = int(os.environ.get("RDS_PORT", "3306"))
rds_db = os.environ.get("RDS_DB", "amrbase")

repo_table_name = "github_repo"
kipris_table_name = "kipris_patent"

# Request to the authenticated-user endpoint; the response is kept in `res`
# but is not inspected anywhere in this notebook.
res = requests.get("https://api.github.com/user", headers=headers)

Kipris_serch_url = "http://plus.kipris.or.kr/kipo-api/kipi/patUtiModInfoSearchSevice/getAdvancedSearch"

# GitHub search pagination: per_page results per request, at most max_page pages.
per_page = 50
max_page = 3

# KIPRIS search sizing (same values as serch_kipris' parameter defaults).
Default_total_target = 1000
Max_row_per_page = 500

In [12]:
def search_repos(keyword, language=None):
    """Search GitHub repositories for ``keyword``, most-starred first.

    Reads the notebook-level ``per_page``, ``max_page`` and ``headers``.

    Args:
        keyword: Search term used as the ``q`` parameter.
        language: Optional language; appended to the query as
            ``language:<language>`` when truthy.

    Returns:
        list: The ``items`` entries of every page fetched, in order. Paging
        stops early on a non-200 response or on a page without items.
    """
    endpoint = "https://api.github.com/search/repositories"

    # The query text is the same for every page, so build it once.
    search_query = keyword
    if language:
        search_query = search_query + f" language:{language}"

    collected = []
    for page_number in range(1, max_page + 1):
        response = requests.get(
            endpoint,
            headers=headers,
            params={
                "q": search_query,
                "sort": "stars",
                "order": "desc",
                "per_page": per_page,
                "page": page_number,
            },
        )

        if response.status_code != 200:
            print(f"[Warning] 요청 실패 {response.status_code}")
            return collected

        page_items = response.json().get("items", [])
        if not page_items:
            return collected

        collected += page_items
        # time.sleep(delay)

    return collected

In [13]:
def serch_kipris(word_query,
                 service_key,
                 exclude_terms=None,
                 total_target=1000,
                 rows_per_page=500):
    """Search KIPRIS patents for ``word_query`` and return filtered records.

    Pages through the endpoint stored in the notebook-level
    ``Kipris_serch_url`` and parses each XML response for ``<item>`` elements.

    Args:
        word_query: Search phrase. Sent as a request parameter so that
            characters such as ``&``, ``#`` or spaces are URL-encoded.
        service_key: KIPRIS service key. Kept in the URL string, as before,
            so it is transmitted the same way as in the original request.
        exclude_terms: Optional iterable of strings. An item is dropped when
            any term occurs (case-insensitively) in its title or abstract.
            Empty terms are ignored because an empty string matches
            everything.
        total_target: Maximum number of records to return.
        rows_per_page: Number of rows requested per page.

    Returns:
        list[dict]: At most ``total_target`` dicts with the keys
        ``inventionTitle``, ``astrtCont``, ``registerDate`` and ``keyword``.

    Raises:
        requests.HTTPError: If a page responds with an error status.
        xml.etree.ElementTree.ParseError: If a response body is not valid XML.
    """
    # Nothing to fetch; also guards the page-count division below.
    if total_target <= 0 or rows_per_page <= 0:
        return []

    # Lower-case the terms once instead of once per item.
    lowered_terms = [term.lower() for term in (exclude_terms or []) if term]

    total_pages = (total_target + rows_per_page - 1) // rows_per_page
    K_result = []

    for page in range(1, total_pages + 1):
        response = requests.get(
            f"{Kipris_serch_url}?ServiceKey={service_key}",
            params={"word": word_query, "numOfRows": rows_per_page, "pageNo": page},
            timeout=30,  # do not hang the notebook on a stalled connection
        )
        response.raise_for_status()

        items = ET.fromstring(response.content).findall(".//item")

        for item in items:
            inventionTitle = item.findtext("inventionTitle", default="").strip()
            astrtCont = item.findtext("astrtCont", default="").strip()

            title_lower = inventionTitle.lower()
            abstract_lower = astrtCont.lower()
            if any(term in abstract_lower or term in title_lower for term in lowered_terms):
                continue

            K_result.append({
                "inventionTitle": inventionTitle,
                "astrtCont": astrtCont,
                "registerDate": item.findtext("registerDate", default=""),
                "keyword": word_query
            })

        # Log before deciding to stop so the final page is reported as well.
        print(f"[Info] {page}페이지 완료 ({len(items)}건, 누적 {len(K_result)}건)")

        # A short page means the result set is exhausted.
        if len(items) < rows_per_page:
            break

        # Pause between requests, but not after the last one.
        if page < total_pages:
            time.sleep(1.0)

    return K_result[:total_target]


In [14]:
def _count_from_last_page(url):
    """Estimate a collection size from a ``per_page=1`` GitHub listing.

    With one item per page, the page number of the ``rel="last"`` link in the
    ``Link`` response header equals the number of items.

    Returns:
        int | None: The estimated count, or ``None`` when the request did not
        return 200 or the header could not be interpreted.
    """
    response = requests.get(url, headers=headers)
    # Error bodies are JSON objects (or empty); measuring them with len()
    # would report a meaningless count, so treat them as "unknown".
    if response.status_code != 200:
        return None

    link_header = response.headers.get("Link")
    if link_header is None:
        # No pagination header: everything fit on this single page.
        payload = response.json()
        return len(payload) if isinstance(payload, list) else None

    for part in link_header.split(","):
        if 'rel="last"' in part:
            # "[?&]page=" cannot match the "per_page=" parameter.
            match = re.search(r"[?&]page=(\d+)", part)
            return int(match.group(1)) if match else None
    return None


def get_repo_stats(owner, repo_name):
    """Collect commit, contributor and open-issue figures for a repository.

    Args:
        owner: Repository owner login.
        repo_name: Repository name.

    Returns:
        dict: ``{"commits", "contributors", "open_issues"}``. All three are
        ``None`` when the repository request fails; an individual count is
        ``None`` when it cannot be determined.
    """
    base_url = f"https://api.github.com/repos/{owner}/{repo_name}"
    r = requests.get(base_url, headers=headers)
    if r.status_code != 200:
        return {"commits": None, "contributors": None, "open_issues": None}
    repo = r.json()

    # Estimated commit count.
    commits = _count_from_last_page(f"{base_url}/commits?per_page=1")

    # Estimated contributor count (anonymous contributors included).
    contributors = _count_from_last_page(f"{base_url}/contributors?per_page=1&anon=true")

    # Open issue count as reported on the repository object.
    open_issues = repo.get("open_issues_count", None)

    return {"commits": commits, "contributors": contributors, "open_issues": open_issues}

In [15]:
def get_readme(owner, repo, token=None):
    """Fetch a repository README from GitHub and return it as plain text.

    The base64 ``content`` field of the response is decoded, rendered from
    Markdown to HTML, and reduced to text with one line per text node.

    Args:
        owner: Repository owner login.
        repo: Repository name.
        token: Optional GitHub token; sent as ``Authorization: token <token>``
            when truthy.

    Returns:
        str | None: The stripped README text, or ``None`` when the response
        status is not 200 (the failure is printed).
    """
    request_headers = {"Accept": "application/vnd.github.v3+json"}
    if token:
        request_headers["Authorization"] = f"token {token}"

    response = requests.get(
        f"https://api.github.com/repos/{owner}/{repo}/readme",
        headers=request_headers,
    )

    # Guard clause: report the failure and fall back to None.
    if response.status_code != 200:
        print(f"Failed to fetch Readme({response.status_code}): {response.text}")
        return None

    encoded_body = response.json()["content"]
    raw_bytes = base64.b64decode(encoded_body.encode("utf-8"))
    # Undecodable bytes are dropped rather than raising.
    markdown_source = raw_bytes.decode("utf-8", errors="ignore")

    rendered_html = markdown.markdown(markdown_source)
    soup = BeautifulSoup(rendered_html, "html.parser")
    plain_text = soup.get_text(separator="\n")

    print(f"{owner}/{repo}'s readme check done!")
    return plain_text.strip()

In [16]:
def fetch_repo_stats(repo, token):
    """Build one flat record for a GitHub search result, including its README.

    Args:
        repo: A repository dict from the search response; must contain
            ``owner.login``, ``name``, ``full_name``, ``stargazers_count``,
            ``forks_count``, ``language`` and ``updated_at``.
        token: GitHub token passed through to ``get_readme``.

    Returns:
        dict: Keys ``full_name``, ``stars``, ``forks``, ``language``,
        ``updated_at`` and ``readme``.
    """
    owner_login = repo["owner"]["login"]
    repo_name = repo["name"]

    # Commit/contributor/issue statistics (get_repo_stats) and the
    # description / html_url columns are intentionally left out for now.
    readme_text = get_readme(owner_login, repo_name, token)

    # (output column, key in the search result)
    column_sources = (
        ("full_name", "full_name"),
        ("stars", "stargazers_count"),
        ("forks", "forks_count"),
        ("language", "language"),
        ("updated_at", "updated_at"),
    )
    record = {column: repo[source] for column, source in column_sources}
    record["readme"] = readme_text
    return record

In [17]:
# Percent-encode the credentials so characters such as "@", "/" or ":" in the
# user name or password cannot be misread as URL delimiters; SQLAlchemy
# decodes them again when it parses the URL.
engine_url = (
    f"mysql+pymysql://{quote_plus(rds_user)}:{quote_plus(rds_pwd)}"
    f"@{rds_host}:{rds_port}/{rds_db}"
)
engine = create_engine(engine_url)

In [18]:
# Gather the search hits of every configured keyword into one list.
all_repos = []
for kw in keyword:
    print(f"Searching keyword: {kw}")
    all_repos += search_repos(kw)

# De-duplicate by full_name; when a repository appears more than once the
# later entry replaces the earlier one.
unique_repos = {}
for repo in all_repos:
    unique_repos[repo["full_name"]] = repo

repos_list = list(unique_repos.values())
print(f"총 {len(repos_list)}개의 고유 레포 수집 완료")

Searching keyword: ROS
총 150개의 고유 레포 수집 완료


In [None]:
# Run fetch_repo_stats for every repository in parallel on 10 worker threads;
# executor.map returns the records in the same order as repos_list.
with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
    results = list(
        executor.map(lambda repo: fetch_repo_stats(repo, git_token), repos_list)
    )

# Copy the records into their own list before building the DataFrame.
repo_data = list(results)
df = pd.DataFrame(repo_data)

facontidavide/PlotJuggler's readme check done!
PX4/PX4-Autopilot's readme check done!
ClemensElflein/OpenMower's readme check done!
emilybache/GildedRose-Refactoring-Kata's readme check done!
autowarefoundation/autoware's readme check done!
ros2/ros2's readme check done!
carla-simulator/carla's readme check done!
rosedblabs/rosedb's readme check done!
dotnet/roslyn's readme check done!
ArduPilot/ardupilot's readme check done!
cyberbotics/webots's readme check done!
MichaelGrupp/evo's readme check done!
BehaviorTree/BehaviorTree.CPP's readme check done!
introlab/rtabmap's readme check done!
ros-navigation/navigation2's readme check done!
ros/ros's readme check done!
ai-winter/ros_motion_planning's readme check done!
soundcloud/roshi's readme check done!
dotnet/roslynator's readme check done!
Ly0n/awesome-robotic-tooling's readme check done!
mithi/robotics-coursework's readme check done!
roslynpad/roslynpad's readme check done!
ANYbotics/grid_map's readme check done!
IntelRealSense/reals

In [24]:
# Write the repository records to the database, replacing any existing table,
# in batches of 100 rows.
df.to_sql(
    repo_table_name,
    con=engine,
    if_exists="replace",
    index=False,
    chunksize=100,
)
print(f"{repo_table_name} 테이블에 데이터 저장 완료")

github_repo 테이블에 데이터 저장 완료


In [56]:
# Patents whose title or abstract contains any of these terms are dropped
# (chemistry / biology hits that merely share the search keyword).
# "미토콘드리아" was previously misspelled as "미토콘트리아" and never matched.
exclude_term = ["화학", "반응성", "화합", "제어능", "활성", "미토콘드리아", "단백질", "치료", "약물", "효소"]

all_patent = []
for kw in keyword:
    print(f"Searching keyword: {kw}")
    # Pass the configured sizes explicitly instead of relying on the
    # function defaults, so the config-cell constants take effect.
    result = serch_kipris(kw,
                          kipris_key,
                          exclude_term,
                          total_target=Default_total_target,
                          rows_per_page=Max_row_per_page)
    all_patent.extend(result)

# Fix the column order explicitly so the frame has the expected columns even
# when no patents were found.
kdf = pd.DataFrame(all_patent, columns=["inventionTitle", "astrtCont", "registerDate", "keyword"])

# Replace the patent table; explicit column types are supplied so the
# abstract is stored as TEXT and the other columns as bounded VARCHARs.
kdf.to_sql(kipris_table_name,
           con=engine,
           if_exists="replace",
           index=False,
           chunksize=100,
           dtype={
               "inventionTitle": String(500),
               "astrtCont": Text,
               "registerDate": String(20),
               "keyword": String(100)
           })
print(f"{kipris_table_name} 테이블에 데이터 저장 완료")

Searching keyword: ROS
[Info] 1페이지 완료 (500건, 누적 160건)
[Info] 2페이지 완료 (500건, 누적 233건)
kipris_patent 테이블에 데이터 저장 완료


In [None]:
# Preview only the first rows: the `readme` column holds full README text, so
# rendering the whole frame bloats the saved notebook.
df.head()

In [25]:
# Export the repository records as CSV without the index column.
# NOTE(review): the file name "devgithub_repo.csv" may be a typo for
# "dev/github_repo.csv" — confirm the intended location.
df.to_csv(
    "~/devgithub_repo.csv",
    index=False,
)

In [1]:
# Export the patent records as CSV without the index column.
# `kdf` is created by the KIPRIS search cell; on a fresh kernel that cell has
# to run first, otherwise this line raises NameError.
# NOTE(review): the file name "devkipris_patent.csv" may be a typo for
# "dev/kipris_patent.csv" — confirm the intended location.
kdf.to_csv(
    "~/devkipris_patent.csv",
    index=False,
)

NameError: name 'kdf' is not defined