In [2]:
%%capture
!pip install langchain-community

In [3]:
import bs4
from langchain_community.document_loaders import WebBaseLoader

# 여러 개의 url 지정 가능
url1 = "https://blog.langchain.dev/customers-replit/"
url2 = "https://blog.langchain.dev/langgraph-v0-2/"

loader = WebBaseLoader(
    web_paths=(url1, url2),
    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer(
            class_=("article-header", "article-content")
        )
    ),
)
docs = loader.load()
len(docs)




2

- 브라우저처럼 보이도록 User-Agent 값을 설정.
- 일부 사이트는 자동화된 스크래핑을 차단하지만, 브라우저 요청처럼 보이면 정상적으로 데이터를 가져올 수 있음.
- 웹 서버가 "이 요청은 사람이 보낸 것"이라고 인식하도록 함.


In [10]:
# 여러 개의 URL 지정 가능
url1 = "https://blog.langchain.dev/customers-replit/"
url2 = "https://blog.langchain.dev/langgraph-v0-2/"


# 사용자 User-Agent 설정
user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36"

# WebBaseLoader에 User-Agent 추가
loader = WebBaseLoader(
    web_paths=(url1, url2),
    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer(
            class_=("article-header",         # article-header : (기사 제목 부분)
                    "article-content")        # article-content" (본문 내용 부분)
        )
    ),
    requests_kwargs={
        "headers": {
            "User-Agent": user_agent
        }
    }
)

docs = loader.load()
print(f"로드된 문서 개수: {len(docs)}")


로드된 문서 개수: 2


In [11]:
len(docs)

2

In [12]:
docs[0]

Document(metadata={'source': 'https://blog.langchain.dev/customers-replit/'}, page_content='\nReplit is at the forefront of AI innovation with its platform that simplifies writing, running, and collaborating on code for over 30+ million developers. They recently released Replit Agent, which immediately went viral due to the incredible applications people could easily create with this tool.Behind the scenes, Replit Agent has a complex workflow built on LangGraph, which enables a highly custom agentic workflow with a high-degree of control and parallel execution. A major benefit of using LangGraph was the seamless integration with LangSmith, which gave Replit deep visibility into their agent interactions to debug tricky issues.\xa0The level of complexity required for Replit Agent also pushed the boundaries of LangSmith. The LangChain and Replit teams worked closely together to add functionality to LangSmith that would satisfy their LLM observability needs. Specifically, there were three 

In [13]:
docs[1]

Document(metadata={'source': 'https://blog.langchain.dev/langgraph-v0-2/'}, page_content="\nToday, we’re excited to announce the stable release of LangGraph v0.2, which introduces a new ecosystem of LangGraph checkpointer libraries. These simplify the creation and customization of checkpointers, which allows users to build more resilient LLM applications with smooth session memory, robust error recovery, and human-in-the-loop features.Why we built LangGraph v0.2One of the key pillars of LangGraph is its built-in persistence layer, implemented through checkpointers. When you use a checkpointer with a graph, you can interact with and manage the graph's state. The checkpointer saves a checkpoint of the graph state at each step, enabling several powerful capabilities, including:Session memory: Store history (checkpoints) of user interactions and resume from a saved checkpoint in follow up interactionsError recovery: Recover from failures at any given step in the graph execution by continui

In [8]:
f'https://startcoding.pythonanywhere.com/basic?page=1'

'https://startcoding.pythonanywhere.com/basic?page=1'

In [9]:
[f'https://startcoding.pythonanywhere.com/basic?page={i}' for i in range(1,5)]

['https://startcoding.pythonanywhere.com/basic?page=1',
 'https://startcoding.pythonanywhere.com/basic?page=2',
 'https://startcoding.pythonanywhere.com/basic?page=3',
 'https://startcoding.pythonanywhere.com/basic?page=4']

### 구글에서 검색어 검색하고 기사를 스크래핑

In [16]:
from google.colab import userdata
YOUR_SERPAPI_KEY = userdata.get('serpapi')
print(YOUR_SERPAPI_KEY[:10])

4eee03a51b


In [18]:
%%capture
!pip install google-search-results

In [22]:
from serpapi import GoogleSearch
from langchain_community.document_loaders import WebBaseLoader
import bs4

# SerpAPI 키 설정
SERPAPI_API_KEY = YOUR_SERPAPI_KEY  # 여기에 네 키 입력해줘

# 검색어 목록
queries = ["LangChain", "ChatGPT plugins", "AI agents", "RAG pipeline", "LLM orchestration"]

# 기사 URL 저장 리스트
article_urls = []

# 검색어마다 뉴스 기사 URL 1개씩 추출
for query in queries:
    params = {
        "engine": "google",
        "q": query,
        "tbm": "nws",  # 뉴스 검색
        "api_key": SERPAPI_API_KEY,
        "num": 2
    }
    search = GoogleSearch(params)

    results = search.get_dict()
    print(f"SerpAPI results: {results}")  # Print the raw response
    news_results = results.get("news_results", [])

    for result in news_results[:1]:  # 검색어당 기사 1개씩만 가져오기
        link = result.get("link")
        if link:
            article_urls.append(link)

# WebBaseLoader로 기사 로드
loader = WebBaseLoader(
    web_paths=article_urls,
    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer(name=["article", "main", "div"], class_=True)
    ),
    requests_kwargs={
        "headers": {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36"
        }
    }
)

docs = loader.load()
print(f"가져온 기사 수: {len(docs)}")
for i, doc in enumerate(docs, 1):
    print(f"\n--- [Article {i}] ---")
    print(doc.page_content[:300])  # 본문 일부 미리보기


SerpAPI results: {'search_metadata': {'id': '67dcc7ab8d7b2a72e0e32642', 'status': 'Success', 'json_endpoint': 'https://serpapi.com/searches/cf55c49dcd3f1d59/67dcc7ab8d7b2a72e0e32642.json', 'created_at': '2025-03-21 01:58:03 UTC', 'processed_at': '2025-03-21 01:58:03 UTC', 'google_url': 'https://www.google.com/search?q=LangChain&oq=LangChain&num=2&tbm=nws&sourceid=chrome&ie=UTF-8', 'raw_html_file': 'https://serpapi.com/searches/cf55c49dcd3f1d59/67dcc7ab8d7b2a72e0e32642.html', 'total_time_taken': 3.03}, 'search_parameters': {'engine': 'google', 'q': 'LangChain', 'google_domain': 'google.com', 'num': '2', 'device': 'desktop', 'tbm': 'nws'}, 'search_information': {'query_displayed': 'LangChain', 'total_results': 588, 'time_taken_displayed': 0.2, 'news_results_state': 'Results for exact spelling'}, 'news_results': [{'position': 1, 'link': 'https://www.prnewswire.com/news-releases/qualtrics-and-langchain-announce-partnership-to-develop-highly-specialized-experience-agents-302405449.html', 't