In [14]:
import base64
import concurrent.futures
import os
import re
import time
import xml.etree.ElementTree as ET
from urllib.parse import quote_plus

import markdown
import pandas as pd
import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from sqlalchemy import create_engine
from sqlalchemy.types import Text, String, DateTime

# Pull credentials and connection settings from a local .env file into os.environ.
load_dotenv()

CONFIG = {
    "git_token": os.getenv("GIT_TOKEN"),
    "kipris_key": os.getenv("KIPRIS_KEY"),
    "rds_user": os.getenv("DB_USER"),
    "rds_pwd": os.getenv("DB_PASSWORD"),
    "rds_host": os.getenv("DB_HOST"),
    # The original lookup was os.getenv("trendbot"), i.e. it treated what looks like
    # the database *name* as an environment-variable *name*, which normally yields None.
    # NOTE(review): "DB_NAME" is assumed by analogy with the other DB_* variables and
    # "trendbot" is assumed to be the schema name -- confirm against the .env file.
    # The old variable name is still honoured so an existing setup keeps working.
    "rds_db": os.getenv("DB_NAME") or os.getenv("trendbot") or "trendbot",
    "rds_port": os.getenv("DB_PORT"),
}

git_token = CONFIG["git_token"]
kipris_key = CONFIG["kipris_key"]
rds_user = CONFIG["rds_user"]
rds_pwd = CONFIG["rds_pwd"]
rds_host = CONFIG["rds_host"]
rds_db = CONFIG["rds_db"]
rds_port = CONFIG["rds_port"]

# Default headers for authenticated GitHub REST calls.
headers = {"Authorization": f"Bearer {git_token}",
           "Accept": "application/vnd.github+json"}

# Search terms shared by the GitHub and the KIPRIS collection cells.
# For a quick smoke run, narrow this to a single keyword, e.g. ["ROS"].
keyword = ["robotics", "ROS", "robot arm", "robot", "amr"]

# Pause (seconds) intended between GitHub search pages; the sleep that would use it
# is currently commented out in search_repos.
delay = 2

# Destination table names.
repo_table_name = "github_repo"
read_table_name = "github_readmes"
kipris_table_name = "kipris_patent"

# Token sanity check: fail fast-ish (bounded wait) and say so when the token is rejected,
# instead of discovering it later through empty search results.
res = requests.get("https://api.github.com/user", headers=headers, timeout=10)
if res.status_code != 200:
    print(f"[Warning] GitHub 토큰 확인 실패 {res.status_code}")

Kipris_serch_url = "http://plus.kipris.or.kr/kipo-api/kipi/patUtiModInfoSearchSevice/getAdvancedSearch"

# GitHub search paging: results per page and number of pages fetched per keyword.
per_page = 50
max_page = 3

# KIPRIS paging defaults; not referenced by the cells visible here (serch_kipris
# carries the same values as its own parameter defaults).
Default_total_target = 1000
Max_row_per_page = 500

In [8]:
def search_repos(keyword, language=None):
    """Search GitHub for repositories with at least 100 stars matching ``keyword``.

    Pages through the repository search API (``max_page`` pages of ``per_page``
    results, both module-level settings), sorted by stars in descending order.
    Repositories whose description contains CJK unified ideographs
    (U+4E00-U+9FFF) are dropped.

    Args:
        keyword: Free-text search term.
        language: Optional GitHub ``language:`` qualifier appended to the query.

    Returns:
        A list of repository dicts exactly as returned by the API. Paging stops
        early on a non-200 response, so the list may be partial or empty.

    Raises:
        requests.exceptions.RequestException: on connection failure or timeout.
    """
    url = "https://api.github.com/search/repositories"
    # The query does not depend on the page, so build it once.
    query = f"{keyword} language:{language}" if language else keyword
    han_chars = re.compile(r"[\u4e00-\u9fff]")

    repos = []
    for page in range(1, max_page + 1):
        params = {"q": f"{query} stars:>=100",
                  "sort": "stars",
                  "order": "desc",
                  "per_page": per_page,
                  "page": page}
        # A timeout keeps a stalled connection from hanging the whole notebook.
        r = requests.get(url, headers=headers, params=params, timeout=10)

        if r.status_code != 200:
            print(f"[Warning] 요청 실패 {r.status_code}")
            break

        batch = r.json().get("items", [])
        if not batch:
            break

        repos.extend(
            repo for repo in batch
            if not han_chars.search(repo.get("description") or "")
        )

        # A short page is the last one; skipping the follow-up request saves
        # search-API rate limit without changing the result.
        if len(batch) < per_page:
            break
    return repos

In [9]:
def get_readme(owner, repo, token=None):
    """Fetch a repository's README from the GitHub API and return it as plain text.

    The base64 payload is decoded as UTF-8 (undecodable bytes are ignored),
    rendered from Markdown to HTML, and flattened to text.

    Args:
        owner: Repository owner login.
        repo: Repository name.
        token: Optional GitHub token, sent as ``Authorization: token <token>``.

    Returns:
        The stripped plain text of the README. If the Markdown-to-text step fails,
        the raw decoded README is returned instead. If the request or the decoding
        fails, the literal string ``"None"`` (not the ``None`` object) is returned.
    """
    url = f"https://api.github.com/repos/{owner}/{repo}/readme"
    # Named differently from the module-level ``headers`` to avoid shadowing it.
    request_headers = {"Accept": "application/vnd.github.v3+json"}
    if token:
        request_headers["Authorization"] = f"token {token}"

    try:
        # A timeout keeps one stalled request from blocking the whole collection loop.
        response = requests.get(url, headers=request_headers, timeout=10)
        response.raise_for_status()
    except requests.exceptions.HTTPError:
        # ``response`` is bound here: HTTPError is raised by raise_for_status().
        print(f"{owner}/{repo}: README 없음 ({response.status_code})")
        return "None"
    except requests.exceptions.RequestException as e:
        # Connection errors / timeouts are reported as what they are rather than
        # as a decoding problem.
        print(f"{owner}/{repo}: README 요청 실패: {e}")
        return "None"

    try:
        data = response.json()
        content = base64.b64decode(data["content"]).decode("utf-8", errors="ignore")
    except Exception as e:
        print(f"{owner}/{repo} README 디코드 실패: {e}")
        return "None"

    try:
        html = markdown.markdown(content)
        text = BeautifulSoup(html, "html.parser").get_text(separator="\n")
        return text.strip()
    except Exception as e:
        # Best effort: keep the raw Markdown rather than losing the README.
        print(f"{owner}/{repo}: Markdown -> Text 변환 실패 ({e})")
        return content

In [None]:
def fetch_repo_stats(repo, token):
    """Flatten one GitHub search hit into a record and attach its README text.

    Args:
        repo: Repository dict from the search API; must provide ``owner.login``,
            ``name``, ``full_name``, ``stargazers_count``, ``forks_count``,
            ``language`` and ``updated_at``.
        token: GitHub token forwarded to ``get_readme``.

    Returns:
        A dict with the keys ``full_name``, ``stars``, ``forks``, ``language``,
        ``updated_at`` and ``readme``, in that order.
    """
    login = repo["owner"]["login"]
    repo_name = repo["name"]
    # Fetch the README first, before the remaining fields are read.
    readme_text = get_readme(login, repo_name, token)

    field_map = (
        ("full_name", "full_name"),
        ("stars", "stargazers_count"),
        ("forks", "forks_count"),
        ("language", "language"),
        ("updated_at", "updated_at"),
    )
    record = {column: repo[source] for column, source in field_map}
    record["readme"] = readme_text
    return record

In [11]:
# Build the MySQL (PyMySQL driver) connection URL from the settings loaded above.
# User and password are percent-encoded because characters such as "@", "/" or ":"
# would otherwise be parsed as URL delimiters and break the connection string.
# NOTE(review): if the .env values are already percent-encoded, drop quote_plus here.
engine_url = (
    f"mysql+pymysql://{quote_plus(str(rds_user))}:{quote_plus(str(rds_pwd))}"
    f"@{rds_host}:{rds_port}/{rds_db}"
)
engine = create_engine(engine_url)

In [None]:
# Run the GitHub search once per keyword and pool every hit; the same repository
# can show up under several keywords.
all_repos = []
for kw in keyword:
    print(f"Searching keyword: {kw}")
    result = search_repos(kw)
    all_repos += result

# De-duplicate on full_name: a repeated repository keeps its first position and
# the most recently seen payload.
unique_repos = dict((repo["full_name"], repo) for repo in all_repos)
repos_list = [*unique_repos.values()]
print(f"총 {len(repos_list)}개의 고유 레포 수집 완료")

In [None]:
# Fetch stats + README for every unique repository. A failure on one repository is
# logged and skipped so that it does not abort the whole run.
repo_data = []
total_repos = len(repos_list)

for i, repo in enumerate(repos_list, start=1):
    try:
        result = fetch_repo_stats(repo, git_token)
    except Exception as e:
        print(f"[{i}/{total_repos}] {repo.get('full_name', '?')} 오류 발생: {e}")
    else:
        repo_data.append(result)
        print(f"[{i}/{total_repos}] {result['full_name']} 완료")

print("모든 README 수집 완료")

# Build the DataFrame with an explicit column list so the column selections below
# still work (yielding empty frames) when no repository could be collected; without
# it an empty repo_data produces a column-less frame and the selections raise KeyError.
meta_columns = ["full_name", "stars", "forks", "language", "updated_at"]
all_df = pd.DataFrame(repo_data, columns=meta_columns + ["readme"])

# Metadata and README text are stored in separate tables, joined by full_name.
df_meta = all_df[meta_columns]
df_read = all_df[["full_name", "readme"]]

In [None]:
# Persist repository metadata; the table is dropped and recreated on every run.
df_meta.to_sql(repo_table_name, con=engine, if_exists="replace", index=False, chunksize=100)
print(f"{repo_table_name} 테이블에 데이터 저장 완료")

# README bodies can be longer than a plain MySQL TEXT column holds (64 KiB), which
# makes the insert fail or truncate depending on the server's SQL mode. Giving Text
# an explicit length lets MySQL pick a TEXT variant large enough for that length.
# NOTE(review): assumes pandas' default mapping yields a plain TEXT column -- confirm.
df_read.to_sql(read_table_name, con=engine, if_exists="replace", index=False, chunksize=100,
               dtype={"readme": Text(length=16_777_215)})
print(f"{read_table_name} 테이블에 데이터 저장 완료")

In [None]:
def serch_kipris(word_query,
                 service_key,
                 exclude_terms=None,
                 total_target=1000,
                 rows_per_page=500):
    """Page through the KIPRIS advanced-search endpoint and collect patent records.

    Args:
        word_query: Search word; also stored on every record under ``keyword``.
        service_key: KIPRIS service key.
        exclude_terms: Optional substrings; an item is skipped when any of them
            occurs (case-insensitively) in its title or abstract.
        total_target: Maximum number of records to return.
        rows_per_page: Rows requested per page.

    Returns:
        A list of at most ``total_target`` dicts with the keys ``inventionTitle``,
        ``astrtCont``, ``applicationDate``, ``registerDate`` and ``keyword``.
        Title and abstract are whitespace-stripped.

    Raises:
        requests.exceptions.RequestException: on connection failure, timeout, or a
            non-success HTTP status. A page whose XML cannot be parsed is logged
            and skipped instead.
    """
    total_pages = (total_target + rows_per_page - 1) // rows_per_page
    if total_pages < 1:
        return []

    # Lower-case the exclusion terms once instead of once per item.
    lowered_terms = [term.lower() for term in (exclude_terms or [])]

    # The service key stays in the URL exactly as it was passed in; everything else
    # goes through ``params`` so that spaces, "&" and non-ASCII characters in the
    # search word are encoded properly.
    base_url = f"{Kipris_serch_url}?ServiceKey={service_key}"

    K_result = []
    for page in range(1, total_pages + 1):
        response = requests.get(
            base_url,
            params={"word": word_query, "numOfRows": rows_per_page, "pageNo": page},
            timeout=30,
        )
        response.raise_for_status()

        try:
            root = ET.fromstring(response.content)
        except ET.ParseError:
            print(f"XML 파싱 실패: Page={page}, keyword={word_query}")
            continue

        items = root.findall(".//item")

        for item in items:
            title = item.findtext("inventionTitle", default="").strip()
            abstract = item.findtext("astrtCont", default="").strip()

            title_lower = title.lower()
            abstract_lower = abstract.lower()
            if any(term in abstract_lower or term in title_lower for term in lowered_terms):
                continue

            K_result.append({
                "inventionTitle": title,
                "astrtCont": abstract,
                "applicationDate": item.findtext("applicationDate", default=""),
                "registerDate": item.findtext("registerDate", default=""),
                "keyword": word_query
            })

        # Report before deciding whether to stop, so the final page is logged too.
        print(f"[Info] {page}페이지 완료 ({len(items)}건, 누적 {len(K_result)}건)")

        # A short page means there is nothing further to fetch.
        if len(items) < rows_per_page:
            break

        # Pause between pages only when another request actually follows.
        if page < total_pages:
            time.sleep(1.0)

    return K_result[:total_target]


In [None]:
all_patent = []
# Title/abstract substrings used to drop chemistry / bio-medical patents:
# chemistry, reactivity, compound, control ability, activation, mitochondria,
# protein, treatment, drug, enzyme.
# "미토콘드리아" (mitochondria) was previously misspelled as "미토콘트리아" and so
# could never match.
exclude_term = ["화학", "반응성", "화합", "제어능", "활성", "미토콘드리아", "단백질", "치료", "약물", "효소"]

for kw in keyword:
    print(f"Searching keyword: {kw}")
    result = serch_kipris(kw, kipris_key, exclude_term)
    all_patent.extend(result)
    print(f"{kw} 완료 (누적 {len(all_patent)}건)")

# Each column is listed exactly once: a repeated name produces two identically
# named DataFrame columns, which cannot be written as a SQL table.
kdf = pd.DataFrame(all_patent, columns=["inventionTitle", "astrtCont", "applicationDate", "registerDate", "keyword"])

# Replace the patent table on every run, with explicit column types.
kdf.to_sql(kipris_table_name,
           con=engine,
           if_exists="replace",
           index=False,
           chunksize=100,
           dtype={
               "inventionTitle": String(500),
               "astrtCont": Text,
               "applicationDate": String(20),
               "registerDate": String(20),
               "keyword": String(100)
           })
print(f"{kipris_table_name} 테이블에 데이터 저장 완료")

In [None]:
# Write a CSV copy of each collected table into the user's home directory.
csv_exports = (
    (df_meta, "~/github_repo.csv"),
    (df_read, "~/github_readmes.csv"),
    (kdf, "~/kipris_patent.csv"),
)
for frame, csv_path in csv_exports:
    frame.to_csv(csv_path, index=False)