In [31]:
import base64
import concurrent.futures
import os
import re
import time
import xml.etree.ElementTree as ET
from urllib.parse import quote

import markdown
import pandas as pd
import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from sqlalchemy import create_engine, text
from sqlalchemy.engine import URL
from sqlalchemy.types import Text, String, DateTime
from tqdm import tqdm

# Load variables from a local .env file into the process environment.
load_dotenv()

# Each setting is read from the environment variable paired with it below;
# a variable that is not set yields None.
CONFIG = {
    setting: os.getenv(env_name)
    for setting, env_name in (
        ("git_token", "GIT_TOKEN"),
        ("kipris_key", "KIPRIS_KEY"),
        ("rds_user", "DB_USER"),
        ("rds_pwd", "DB_PASSWORD"),
        ("rds_host", "DB_HOST"),
        ("rds_db", "DB_NAME"),
        ("rds_port", "DB_PORT"),
    )
}

# Module-level aliases used by the rest of the notebook.
git_token, kipris_key = CONFIG["git_token"], CONFIG["kipris_key"]
rds_user, rds_pwd = CONFIG["rds_user"], CONFIG["rds_pwd"]
rds_host, rds_db = CONFIG["rds_host"], CONFIG["rds_db"]
rds_port = CONFIG["rds_port"]

# Headers sent with authenticated GitHub REST API requests.
headers = {
    "Authorization": f"Bearer {git_token}",
    "Accept": "application/vnd.github+json",
}

# Search terms fed to both the GitHub and the KIPRIS searches.
keyword = ["robotics", "ROS", "robot arm", "robot", "amr"]

# NOTE(review): `delay` is not referenced by any code visible in this notebook.
delay = 2

# Destination table names in the database.
repo_table_name = "github_repo"
read_table_name = "github_readmes"
kipris_table_name = "kipris_patent"

Kipris_serch_url = "http://plus.kipris.or.kr/kipo-api/kipi/patUtiModInfoSearchSevice/getAdvancedSearch"

# GitHub search paging: results per page and number of pages walked by search_repos.
per_page = 50
max_page = 3

# NOTE(review): these two are not referenced by any code visible in this notebook.
Default_total_target = 1000
Max_row_per_page = 500

# Probe the GitHub API with the configured token. The timeout keeps a stalled
# connection from hanging the cell, and a non-200 answer is reported instead of
# being silently ignored.
res = requests.get("https://api.github.com/user", headers=headers, timeout=30)
if res.status_code != 200:
    print(f"[Warning] GitHub 토큰 확인 실패 ({res.status_code})")

In [44]:
def search_repos(keyword, language=None):
    """Search GitHub for highly starred repositories matching a keyword.

    Queries the repository search endpoint for repositories with at least
    1000 stars, sorted by stars in descending order. Up to ``max_page`` pages
    of ``per_page`` results are fetched (both are module-level settings), using
    the module-level ``headers`` for authentication.

    Repositories whose description contains characters in the CJK unified
    ideograph range (U+4E00-U+9FFF) are skipped.

    Args:
        keyword: Search term. It is also stored on every returned repository
            dict under the ``"keyword"`` key.
        language: Optional value for the ``language:`` search qualifier.

    Returns:
        A list of repository dicts as returned by the API, each with an added
        ``"keyword"`` entry. Paging stops early on a failed request, a non-200
        status, or an empty/short page.
    """
    # The query is identical for every page, so build it once.
    query = f"{keyword} language:{language}" if language else keyword
    url = "https://api.github.com/search/repositories"

    repos = []
    for page in range(1, max_page + 1):
        params = {"q": f"{query} stars:>=1000",
                  "sort": "stars",
                  "order": "desc",
                  "per_page": per_page,
                  "page": page}
        try:
            # A timeout prevents a stalled connection from blocking forever.
            r = requests.get(url, headers=headers, params=params, timeout=30)
        except requests.exceptions.RequestException as e:
            print(f"[Warning] 요청 실패 ({e})")
            break

        if r.status_code != 200:
            print(f"[Warning] 요청 실패 {r.status_code}")
            break

        batch = r.json().get("items", [])
        if not batch:
            break

        for repo in batch:
            desc = repo.get("description") or ""
            if not re.search(r"[\u4e00-\u9fff]", desc):
                repo["keyword"] = keyword
                repos.append(repo)

        # A short page is the last one; skip the request that would come back empty.
        if len(batch) < per_page:
            break
    return repos

In [33]:
def get_readme(owner, repo, token=None):
    """Fetch a repository README from the GitHub API and return it as plain text.

    The base64 payload from the response is decoded as UTF-8 (undecodable
    bytes are dropped), rendered from Markdown to HTML and then stripped of
    tags.

    Args:
        owner: Repository owner login.
        repo: Repository name.
        token: Optional GitHub token, sent as ``Authorization: token <token>``.

    Returns:
        The stripped plain-text README. If the Markdown-to-text conversion
        fails, the decoded Markdown is returned unchanged. If the request or
        the decoding fails, the literal string ``"None"`` is returned (kept for
        compatibility with existing callers).
    """
    url = f"https://api.github.com/repos/{owner}/{repo}/readme"
    request_headers = {"Accept": "application/vnd.github.v3+json"}
    if token:
        request_headers["Authorization"] = f"token {token}"

    try:
        # A timeout prevents one stalled request from blocking the whole run.
        response = requests.get(url, headers=request_headers, timeout=30)
        response.raise_for_status()
    except requests.exceptions.HTTPError:
        print(f"{owner}/{repo}: README 없음 ({response.status_code})")
        return "None"
    except requests.exceptions.RequestException as e:
        # Transport problems (timeout, DNS, connection reset) are not decode errors.
        print(f"{owner}/{repo}: README 요청 실패 ({e})")
        return "None"

    try:
        data = response.json()
        content = base64.b64decode(data["content"].encode("utf-8")).decode("utf-8", errors="ignore")
    except Exception as e:
        print(f"{owner}/{repo} README 디코드 실패: {e}")
        return "None"

    try:
        html = markdown.markdown(content)
        plain_text = BeautifulSoup(html, "html.parser").get_text(separator="\n")
        return plain_text.strip()
    except Exception as e:
        print(f"{owner}/{repo}: Markdown -> Text 변환 실패 ({e})")
        return content

In [50]:
def fetch_repo_stats(repo, token):
    """Build a flat record for one repository, including its README text.

    Args:
        repo: Repository dict carrying ``owner.login``, ``name``, ``full_name``,
            ``stargazers_count``, ``forks_count``, ``language``, ``updated_at``,
            ``created_at`` and ``keyword``.
        token: Token forwarded to ``get_readme``.

    Returns:
        A dict with the keys ``full_name``, ``stars``, ``forks``, ``language``,
        ``updated_at``, ``created_at``, ``readme`` and ``keyword``.
    """
    # The README is fetched first, before the remaining fields are read.
    readme_text = get_readme(repo["owner"]["login"], repo["name"], token)

    # (output key, key in the source dict)
    field_map = (
        ("full_name", "full_name"),
        ("stars", "stargazers_count"),
        ("forks", "forks_count"),
        ("language", "language"),
        ("updated_at", "updated_at"),
        ("created_at", "created_at"),
    )
    stats = {out_key: repo[src_key] for out_key, src_key in field_map}
    stats["readme"] = readme_text
    stats["keyword"] = repo["keyword"]
    return stats

In [22]:
def safe_request(url, retries=3, delay=2, timeout=30):
    """Send a GET request with a timeout, retrying on transient network failures.

    Timeouts and connection errors are retried, up to ``retries`` attempts in
    total, with ``delay`` seconds between attempts. Any other ``requests``
    exception (including an HTTP error status raised by ``raise_for_status``)
    aborts immediately without retrying.

    Args:
        url: Full URL to request.
        retries: Maximum number of attempts.
        delay: Seconds to wait between attempts.
        timeout: Per-request timeout in seconds.

    Returns:
        The response object on success, otherwise ``None``.
    """
    for attempt in range(1, retries + 1):
        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
            return response
        except requests.exceptions.Timeout:
            print(f"요청 타임아웃 (재시도 {attempt}/{retries})")
        except requests.exceptions.ConnectionError:
            print(f"연결 오류 (재시도 {attempt}/{retries})")
        except requests.exceptions.RequestException as e:
            print(f"요청 실패 ({type(e).__name__}): {e}")
            break
        # Pause only when another attempt will follow; waiting after the final
        # failure just delays returning None.
        if attempt < retries:
            time.sleep(delay)
    return None

In [None]:
def serch_kipris(word_query,
                 service_key,
                 exclude_terms=None,
                 total_target=1000,
                 rows_per_page=100):
    """Page through the KIPRIS search endpoint and collect matching records.

    Requests ``Kipris_serch_url`` page by page via ``safe_request``, parses the
    XML response and keeps every ``<item>`` whose ``inventionTitle`` and
    ``astrtCont`` contain none of the exclusion terms (case-insensitive
    substring match).

    Args:
        word_query: Search word; percent-encoded before being put in the URL.
        service_key: API key, inserted into the URL unchanged.
        exclude_terms: Optional iterable of terms; an item is dropped when any
            term occurs in its title or ``astrtCont`` text.
        total_target: Maximum number of records to return; also determines how
            many pages are requested.
        rows_per_page: Value sent as ``numOfRows``.

    Returns:
        A list of at most ``total_target`` dicts with the keys
        ``inventionTitle``, ``astrtCont``, ``applicationDate``,
        ``registerDate`` and ``keyword`` (the original ``word_query``).
        Pages that fail to download or parse are skipped.
    """
    K_result = []
    total_pages = (total_target + rows_per_page - 1) // rows_per_page

    # Lower-case the exclusion terms once instead of once per item.
    lowered_terms = [term.lower() for term in (exclude_terms or [])]

    # Percent-encode the search word so reserved characters (space, '&', '#',
    # '+') cannot change the structure of the query string.
    # NOTE(review): the service key is still inserted as-is, as before; confirm
    # whether keys containing '+' need encoding as well.
    encoded_word = quote(str(word_query), safe="")

    for page in range(1, total_pages + 1):
        kurl = (
            f"{Kipris_serch_url}"
            f"?word={encoded_word}"
            f"&numOfRows={rows_per_page}"
            f"&pageNo={page}"
            f"&ServiceKey={service_key}"
        )

        response = safe_request(kurl)
        if response is None:
            print(f"요청 실패 (keyword={word_query}, page={page})")
            continue

        try:
            root = ET.fromstring(response.content)
        except ET.ParseError:
            print(f"XML 파싱 실패 (keyword={word_query}, page={page})")
            continue

        items = root.findall(".//item")
        if not items:
            print(f"결과 없음 (keyword={word_query}, page={page})")
            break

        for item in items:
            inventionTitle = item.findtext("inventionTitle", default="").strip()
            astrtCont = item.findtext("astrtCont", default="").strip()

            # Drop the item when any exclusion term appears in either field.
            title_lc = inventionTitle.lower()
            abstract_lc = astrtCont.lower()
            if any(term in abstract_lc or term in title_lc for term in lowered_terms):
                continue

            K_result.append({
                "inventionTitle": inventionTitle,
                "astrtCont": astrtCont,
                "applicationDate": item.findtext("applicationDate", default=""),
                "registerDate": item.findtext("registerDate", default=""),
                "keyword": word_query
            })

        print(f"[Info] {word_query} - {page}페이지 완료 ({len(items)}건, 누적 {len(K_result)}건)")

        # Stop on a short (final) page or after the last planned page, and do
        # so before sleeping, since no further request will follow.
        if len(items) < rows_per_page or page == total_pages:
            break

        time.sleep(1)

    return K_result[:total_target]

In [37]:
def normalize_date(date_str):
    """Convert a compact ``YYYYMMDD`` string to ``YYYY-MM-DD``.

    Args:
        date_str: Date string, or a falsy value such as ``None`` or ``""``.

    Returns:
        ``None`` when the input is falsy or contains only whitespace; the
        hyphenated date when the stripped input is exactly eight ASCII digits;
        otherwise the stripped input unchanged.
    """
    if not date_str:
        return None

    cleaned = date_str.strip()
    # A whitespace-only value carries no date either.
    if not cleaned:
        return None

    # Only reformat genuine 8-digit values; slicing an arbitrary 8-character
    # string (e.g. "2024/1/5") would produce garbage.
    if re.fullmatch(r"[0-9]{8}", cleaned):
        return f"{cleaned[:4]}-{cleaned[4:6]}-{cleaned[6:]}"

    return cleaned

In [25]:
# Build the URL from its components rather than by string interpolation, so
# that special characters in the credentials (e.g. '@', '/', ':') are escaped
# instead of corrupting the connection string.
engine_url = URL.create(
    "mysql+pymysql",
    username=rds_user,
    password=rds_pwd,
    host=rds_host,
    port=int(rds_port) if rds_port else None,  # environment values are strings
    database=rds_db,
)
engine = create_engine(engine_url)

In [51]:
# Run the GitHub search once per keyword and pool the results.
all_repos = []
for kw in keyword:
    print(f"Searching keyword: {kw}")
    result = search_repos(kw)
    all_repos += result

# De-duplicate by full_name; when a repository was found under several
# keywords, the entry from the last keyword wins.
unique_repos = dict((repo["full_name"], repo) for repo in all_repos)
repos_list = [*unique_repos.values()]
print(f"총 {len(repos_list)}개의 고유 레포 수집 완료")

Searching keyword: robotics
Searching keyword: ROS
Searching keyword: robot arm
Searching keyword: robot
Searching keyword: amr
총 264개의 고유 레포 수집 완료


In [52]:
# Fetch the README and metadata for every unique repository. A failure on one
# repository is logged and skipped so that the remaining ones are still collected.
repo_data = []
total_repos = len(repos_list)

for repo in tqdm(repos_list, total=total_repos, desc="Readme 수집 진행 중"):
    try:
        result = fetch_repo_stats(repo, git_token)
        if result:
            repo_data.append(result)
    except Exception as e:
        tqdm.write(f"오류발생 : {e}")
        continue

print("모든 README 수집 완료")

# Build the DataFrame with explicit columns so that the selections below still
# work (yielding empty frames) when nothing was collected, instead of raising
# KeyError on missing columns.
meta_columns = ["full_name", "stars", "forks", "language", "updated_at", "created_at"]
readme_columns = ["full_name", "readme", "keyword"]
all_df = pd.DataFrame(
    repo_data,
    columns=["full_name", "stars", "forks", "language",
             "updated_at", "created_at", "readme", "keyword"],
)

# Copies, so later edits to either frame cannot trigger SettingWithCopyWarning
# or be mistaken for edits to all_df.
df_meta = all_df[meta_columns].copy()
df_read = all_df[readme_columns].copy()

Readme 수집 진행 중: 100%|██████████| 264/264 [01:52<00:00,  2.34it/s]

모든 README 수집 완료





In [53]:
# Both frames are written with the same options: replace any existing table,
# omit the DataFrame index, and insert 100 rows per batch.
to_sql_options = dict(con=engine, if_exists="replace", index=False, chunksize=100)

df_meta.to_sql(repo_table_name, **to_sql_options)
print(f"{repo_table_name} 테이블에 데이터 저장 완료")

df_read.to_sql(read_table_name, **to_sql_options)
print(f"{read_table_name} 테이블에 데이터 저장 완료")

github_repo 테이블에 데이터 저장 완료
github_readmes 테이블에 데이터 저장 완료


In [49]:
all_patent = []
# Terms that mark a record as off-topic; matched against title and astrtCont
# text by serch_kipris. ("미토콘드리아" was previously misspelled and so never matched.)
exclude_term = ["화학", "반응성", "화합", "제어능", "활성", "미토콘드리아", "단백질", "치료", "약물", "효소", "활성산소", "산화", "세포", "화합물", "의약", "반응", "검침", "통신망", "녹화"]
create_table = f"""
CREATE TABLE IF NOT EXISTS {kipris_table_name} (
    id INT AUTO_INCREMENT PRIMARY KEY,
    inventionTitle VARCHAR(500) NOT NULL,
    astrtCont TEXT,
    registerDate DATE,
    applicationDate DATE NOT NULL,
    keyword VARCHAR(1000),
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
    INDEX idx_inventionTitle (inventionTitle(255)),
    INDEX idx_registerDate (registerDate),
    INDEX idx_applicationDate (applicationDate)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;"""

for kw in tqdm(keyword, desc="Kipris 특허 검색 진행 중"):
    result = serch_kipris(kw, kipris_key, exclude_term)
    all_patent.extend(result)

kdf = pd.DataFrame(all_patent, columns=["inventionTitle", "astrtCont", "applicationDate", "registerDate", "keyword"])

kdf["applicationDate"] = kdf["applicationDate"].apply(normalize_date)
kdf["registerDate"] = kdf["registerDate"].apply(normalize_date)

# applicationDate is NOT NULL in the schema above; drop rows that would
# violate it rather than letting the whole insert fail.
missing_app_date = kdf["applicationDate"].isna()
if missing_app_date.any():
    print(f"[Warning] applicationDate 없는 {int(missing_app_date.sum())}건 제외")
    kdf = kdf[~missing_app_date].reset_index(drop=True)

# Recreate the table from the DDL, then append into it. The previous
# to_sql(if_exists="replace") dropped the table that had just been created and
# rebuilt it from the DataFrame, discarding the primary key, DATE columns,
# timestamps and indexes defined above.
with engine.begin() as conn:
    conn.execute(text(f"DROP TABLE IF EXISTS {kipris_table_name}"))
    conn.execute(text(create_table))

kdf.to_sql(kipris_table_name,
           con=engine,
           if_exists="append",
           index=False,
           chunksize=100)

print(f"{kipris_table_name} 테이블에 데이터 저장 완료")

Kipris 특허 검색 진행 중:   0%|          | 0/5 [00:00<?, ?it/s]

http://plus.kipris.or.kr/kipo-api/kipi/patUtiModInfoSearchSevice/getAdvancedSearch?word=robotics&numOfRows=100&pageNo=1&ServiceKey=[REDACTED]
[Info] robotics - 1페이지 완료 (100건, 누적 100건)
http://plus.kipris.or.kr/kipo-api/kipi/patUtiModInfoSearchSevice/getAdvancedSearch?word=robotics&numOfRows=100&pageNo=2&ServiceKey=[REDACTED]
[Info] robotics - 2페이지 완료 (100건, 누적 188건)
http://plus.kipris.or.kr/kipo-api/kipi/patUtiModInfoSearchSevice/getAdvancedSearch?word=robotics&numOfRows=100&pageNo=3&ServiceKey=[REDACTED]
[Info] robotics - 3페이지 완료 (100건, 누적 285건)
http://plus.kipris.or.kr/kipo-api/kipi/patUtiModInfoSearchSevice/getAdvancedSearch?word=robotics&numOfRows=100&pageNo=4&ServiceKey=[REDACTED]
[Info] robotics - 4페이지 완료 (100건, 누적 380건)
http://plus.kipris.or.kr/kipo-api/kipi/patUtiModInfoSearchSevice/getAdvancedSearch?word=robotics&numOfRows=100&pageNo=5&ServiceK

Kipris 특허 검색 진행 중:  20%|██        | 1/5 [00:17<01:10, 17.62s/it]

http://plus.kipris.or.kr/kipo-api/kipi/patUtiModInfoSearchSevice/getAdvancedSearch?word=ROS&numOfRows=100&pageNo=1&ServiceKey=[REDACTED]
[Info] ROS - 1페이지 완료 (100건, 누적 38건)
http://plus.kipris.or.kr/kipo-api/kipi/patUtiModInfoSearchSevice/getAdvancedSearch?word=ROS&numOfRows=100&pageNo=2&ServiceKey=[REDACTED]
[Info] ROS - 2페이지 완료 (100건, 누적 112건)
http://plus.kipris.or.kr/kipo-api/kipi/patUtiModInfoSearchSevice/getAdvancedSearch?word=ROS&numOfRows=100&pageNo=3&ServiceKey=[REDACTED]
[Info] ROS - 3페이지 완료 (100건, 누적 117건)
http://plus.kipris.or.kr/kipo-api/kipi/patUtiModInfoSearchSevice/getAdvancedSearch?word=ROS&numOfRows=100&pageNo=4&ServiceKey=[REDACTED]
[Info] ROS - 4페이지 완료 (100건, 누적 126건)
http://plus.kipris.or.kr/kipo-api/kipi/patUtiModInfoSearchSevice/getAdvancedSearch?word=ROS&numOfRows=100&pageNo=5&ServiceKey=[REDACTED]

Kipris 특허 검색 진행 중:  40%|████      | 2/5 [00:35<00:52, 17.58s/it]

http://plus.kipris.or.kr/kipo-api/kipi/patUtiModInfoSearchSevice/getAdvancedSearch?word=robot arm&numOfRows=100&pageNo=1&ServiceKey=[REDACTED]
[Info] robot arm - 1페이지 완료 (14건, 누적 14건)


Kipris 특허 검색 진행 중:  60%|██████    | 3/5 [00:36<00:20, 10.10s/it]

http://plus.kipris.or.kr/kipo-api/kipi/patUtiModInfoSearchSevice/getAdvancedSearch?word=robot&numOfRows=100&pageNo=1&ServiceKey=[REDACTED]
[Info] robot - 1페이지 완료 (100건, 누적 94건)
http://plus.kipris.or.kr/kipo-api/kipi/patUtiModInfoSearchSevice/getAdvancedSearch?word=robot&numOfRows=100&pageNo=2&ServiceKey=[REDACTED]
[Info] robot - 2페이지 완료 (100건, 누적 186건)
http://plus.kipris.or.kr/kipo-api/kipi/patUtiModInfoSearchSevice/getAdvancedSearch?word=robot&numOfRows=100&pageNo=3&ServiceKey=[REDACTED]
[Info] robot - 3페이지 완료 (100건, 누적 285건)
http://plus.kipris.or.kr/kipo-api/kipi/patUtiModInfoSearchSevice/getAdvancedSearch?word=robot&numOfRows=100&pageNo=4&ServiceKey=[REDACTED]
[Info] robot - 4페이지 완료 (100건, 누적 377건)
http://plus.kipris.or.kr/kipo-api/kipi/patUtiModInfoSearchSevice/getAdvancedSearch?word=robot&numOfRows=100&pageNo=5&ServiceKey=[REDACTED]

Kipris 특허 검색 진행 중:  80%|████████  | 4/5 [00:54<00:13, 13.09s/it]

http://plus.kipris.or.kr/kipo-api/kipi/patUtiModInfoSearchSevice/getAdvancedSearch?word=amr&numOfRows=100&pageNo=1&ServiceKey=[REDACTED]
[Info] amr - 1페이지 완료 (100건, 누적 69건)
http://plus.kipris.or.kr/kipo-api/kipi/patUtiModInfoSearchSevice/getAdvancedSearch?word=amr&numOfRows=100&pageNo=2&ServiceKey=[REDACTED]
[Info] amr - 2페이지 완료 (100건, 누적 151건)
http://plus.kipris.or.kr/kipo-api/kipi/patUtiModInfoSearchSevice/getAdvancedSearch?word=amr&numOfRows=100&pageNo=3&ServiceKey=[REDACTED]
[Info] amr - 3페이지 완료 (100건, 누적 223건)
http://plus.kipris.or.kr/kipo-api/kipi/patUtiModInfoSearchSevice/getAdvancedSearch?word=amr&numOfRows=100&pageNo=4&ServiceKey=[REDACTED]
[Info] amr - 4페이지 완료 (100건, 누적 304건)
http://plus.kipris.or.kr/kipo-api/kipi/patUtiModInfoSearchSevice/getAdvancedSearch?word=amr&numOfRows=100&pageNo=5&ServiceKey=[REDACTED]

Kipris 특허 검색 진행 중: 100%|██████████| 5/5 [01:11<00:00, 14.39s/it]


kipris_patent 테이블에 데이터 저장 완료


In [None]:
# Export each collected frame to a CSV file in the home directory, without the index.
csv_exports = (
    ("~/github_repo.csv", df_meta),
    ("~/github_readmes.csv", df_read),
    ("~/kipris_patent.csv", kdf),
)
for csv_path, frame in csv_exports:
    frame.to_csv(csv_path, index=False)