In [84]:
import requests
from bs4 import BeautifulSoup
import csv
from pathlib import Path
import pandas as pd
from typing import TypedDict, List, Set, Optional, Dict, Any



In [85]:
# 크롤링할 URL
url: str = "https://share.google/kcf0Fvu9AAxOrsieb"



# 웹페이지 요청
response: requests.Response = requests.get(url)
response.raise_for_status()  # HTTP 에러 체크

print(f"상태 코드: {response.status_code}")
print(f"페이지 크기: {len(response.text)} bytes")


상태 코드: 200
페이지 크기: 806883 bytes


In [None]:
# 데이터 구조 정의
class LibraryData(TypedDict):
    name: str
    url: str
    description: str
    category: str

# BeautifulSoup으로 HTML 파싱
soup: BeautifulSoup = BeautifulSoup(response.text, 'html.parser')

# 데이터를 저장할 리스트
crawled_data: List[LibraryData] = []
seen_urls: Set[str] = set()  

# 기사 제목 추출
title: Optional[Any] = soup.find('h1')
if title:
    title_text: str = title.get_text(strip=True)
    print(f"기사 제목: {title_text}")

# 모든 헤딩을 찾아서 섹션별로 처리
headings: List[Any] = soup.find_all(['h2', 'h3', 'h4'])

current_category: Optional[str] = None
current_subcategory: Optional[str] = None

for heading in headings:
    heading_text: str = heading.get_text(strip=True)
    
    # 메인 카테고리 추출
    if 'Top 10' in heading_text:
        if 'General use' in heading_text:
            current_category = 'Top 10 - General use'
            current_subcategory = None
        elif 'AI/ML/Data' in heading_text or 'AI' in heading_text:
            current_category = 'Top 10 - AI/ML/Data'
            current_subcategory = None
        else:
            current_category = heading_text
        print(f"\n카테고리 발견: {current_category}")
    
    elif 'Runners-up' in heading_text:
        if 'General use' in heading_text:
            current_category = 'Runners-up - General use'
        elif 'AI/ML/Data' in heading_text or 'AI' in heading_text:
            current_category = 'Runners-up - AI/ML/Data'
        else:
            current_category = heading_text
        current_subcategory = None
        print(f"\n카테고리 발견: {current_category}")
    
    elif 'Long tail' in heading_text:
        current_category = 'Long tail'
        current_subcategory = None
        print(f"\n카테고리 발견: {current_category}")
    
    # 헤딩 다음의 내용에서 라이브러리 정보 추출
    # 헤딩 다음부터 다음 헤딩 전까지의 모든 요소 수집
    current: Optional[Any] = heading.next_sibling
    section_elements: List[Any] = []
    
    while current:
        if current.name and current.name in ['h2', 'h3', 'h4']:
            break  # 다음 섹션 시작
        if hasattr(current, 'name') and current.name:
            section_elements.append(current)
        current = current.next_sibling
    
    # 섹션 내에서 라이브러리 정보 추출
    for element in section_elements:
        if not hasattr(element, 'find_all'):
            continue
            
        # 모든 단락과 리스트 아이템에서 정보 추출
        paragraphs: List[Any] = element.find_all(['p', 'li', 'div'], recursive=False)
        
        for para in paragraphs:
            # GitHub 링크 찾기
            links: List[Any] = para.find_all('a', href=True)
            
            for link in links:
                href: str = link.get('href', '')
                link_text: str = link.get_text(strip=True)
                
                # GitHub 링크인지 확인
                if 'github.com' in href or 'github.io' in href:
                    if href in seen_urls:
                        continue
                    seen_urls.add(href)
                    
                    # 설명 추출
                    # 링크가 포함된 전체 단락 텍스트 가져오기
                    para_text: str = para.get_text(separator=' ', strip=True)
                    
                    # 링크 텍스트를 제거하고 설명 추출
                    description: str = para_text.replace(link_text, '').strip()
                    
                    # 불필요한 공백 정리
                    description = ' '.join(description.split())
                    
                    # 순위 정보 (Top 10인 경우에만)
                    
                    crawled_data.append(LibraryData(
                        name=link_text,
                        url=href,
                        description=description,
                        category=current_category if current_category else '',
                    ))

print(f"\n추출된 데이터 개수: {len(crawled_data)}")
print(f"\n첫 5개 데이터 미리보기:")
for i, item in enumerate(crawled_data[:5], 1):
    print(f"\n{i}. 이름: {item['name']}")
    print(f"   URL: {item['url']}")
    print(f"   카테고리: {item['category']}")
    desc_preview: str = item['description'][:150] + '...' if len(item['description']) > 150 else item['description']
    print(f"   설명: {desc_preview}")

df: pd.DataFrame = pd.DataFrame(crawled_data)
df


카테고리 발견: Top 10 - General use

카테고리 발견: Top 10 - AI/ML/Data

카테고리 발견: Runners-up - General use

카테고리 발견: Runners-up - AI/ML/Data

카테고리 발견: Long tail

추출된 데이터 개수: 168

첫 5개 데이터 미리보기:

1. 이름: AuthTuna
   URL: https://github.com/shashstormer/AuthTuna
   카테고리: Runners-up - General use
   설명: – Security framework designed for modern async Python applications with first-class FastAPI support but framework-agnostic core capabilities. Features...

2. 이름: FastRTC
   URL: https://github.com/gradio-app/fastrtc
   카테고리: Runners-up - General use
   설명: – Real-time communication library that transforms Python functions into audio and video streams over WebRTC or WebSockets. Features automatic voice de...

3. 이름: hexora
   URL: https://github.com/rushter/hexora
   카테고리: Runners-up - General use
   설명: – Static analysis tool specifically designed to identify malicious and harmful patterns in Python code for security auditing purposes. Features over 3...

4. 이름: opentemplate
   URL: https://github.com

Unnamed: 0,name,url,description,category
0,AuthTuna,https://github.com/shashstormer/AuthTuna,– Security framework designed for modern async...,Runners-up - General use
1,FastRTC,https://github.com/gradio-app/fastrtc,– Real-time communication library that transfo...,Runners-up - General use
2,hexora,https://github.com/rushter/hexora,– Static analysis tool specifically designed t...,Runners-up - General use
3,opentemplate,https://github.com/open-nudge/opentemplate,– All-in-one Python project template that prov...,Runners-up - General use
4,PyByntic,https://github.com/sijokun/PyByntic,– Extension to Pydantic that enables binary se...,Runners-up - General use
...,...,...,...,...
163,Lazy Ninja,https://github.com/AghastyGD/lazy-ninja,Category Library GitHub Stars Description AI A...,Long tail
164,panel-material-ui,https://github.com/panel-extensions/panel-mate...,Category Library GitHub Stars Description AI A...,Long tail
165,pyeasydeploy,https://github.com/offerrall/pyeasydeploy,Category Library GitHub Stars Description AI A...,Long tail
166,Python Hiccup,https://github.com/DavidVujic/python-hiccup,Category Library GitHub Stars Description AI A...,Long tail


In [87]:
# CSV 파일로 저장
output_dir: Path = Path('./')
output_dir.mkdir(parents=True, exist_ok=True)
output_file: Path = output_dir / 'crawled_python_libraries_2025.csv'

# CSV 파일 작성
fieldnames: List[str] = ['name', 'url', 'description', 'category' ]
with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
    writer: csv.DictWriter = csv.DictWriter(csvfile, fieldnames=fieldnames)
    
    writer.writeheader()
    for item in crawled_data:
        writer.writerow(item)

print(f"\nCSV 파일이 저장되었습니다: {output_file}")
print(f"총 {len(crawled_data)}개의 데이터가 저장되었습니다.")
print(f"\nCSV 컬럼: {', '.join(fieldnames)}")



CSV 파일이 저장되었습니다: crawled_python_libraries_2025.csv
총 168개의 데이터가 저장되었습니다.

CSV 컬럼: name, url, description, category
