### 8.2 크롤링 예제
8.2.1 명언 크롤링하기

In [1]:
import requests as rq

# Page to download.
url = 'https://quotes.toscrape.com/'
# Without a timeout, requests waits indefinitely on an unresponsive server.
quote = rq.get(url, timeout=10)

# Show the Response object; its repr includes the HTTP status code.
print(quote)

<Response [200]>


In [2]:
# Peek at the first 1000 bytes of the raw (undecoded) response body.
quote.content[:1000]

b'<!DOCTYPE html>\n<html lang="en">\n<head>\n\t<meta charset="UTF-8">\n\t<title>Quotes to Scrape</title>\n    <link rel="stylesheet" href="/static/bootstrap.min.css">\n    <link rel="stylesheet" href="/static/main.css">\n</head>\n<body>\n    <div class="container">\n        <div class="row header-box">\n            <div class="col-md-8">\n                <h1>\n                    <a href="/" style="text-decoration: none">Quotes to Scrape</a>\n                </h1>\n            </div>\n            <div class="col-md-4">\n                <p>\n                \n                    <a href="/login">Login</a>\n                \n                </p>\n            </div>\n        </div>\n    \n\n<div class="row">\n    <div class="col-md-8">\n\n    <div class="quote" itemscope itemtype="http://schema.org/CreativeWork">\n        <span class="text" itemprop="text">\xe2\x80\x9cThe world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.\xe2\x80\

In [3]:
from bs4 import BeautifulSoup

# Parse the downloaded bytes with Python's built-in HTML parser.
quote_html = BeautifulSoup(quote.content, features='html.parser')
# Calling a tag is shorthand for find_all(); spelled out here to list the tags inside <head>.
quote_html.head.find_all()

[<meta charset="utf-8"/>,
 <title>Quotes to Scrape</title>,
 <link href="/static/bootstrap.min.css" rel="stylesheet"/>,
 <link href="/static/main.css" rel="stylesheet"/>]

###### 8.2.1.1 find_all() 함수를 이용한 크롤링

In [4]:
# Collect every <div class="quote"> element on the page.
quote_div = quote_html.find_all('div', attrs={'class': 'quote'})

# Inspect the first block to see its inner structure.
quote_div[0]

<div class="quote" itemscope="" itemtype="http://schema.org/CreativeWork">
<span class="text" itemprop="text">“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”</span>
<span>by <small class="author" itemprop="author">Albert Einstein</small>
<a href="/author/Albert-Einstein">(about)</a>
</span>
<div class="tags">
            Tags:
            <meta class="keywords" content="change,deep-thoughts,thinking,world" itemprop="keywords"/>
<a class="tag" href="/tag/change/page/1/">change</a>
<a class="tag" href="/tag/deep-thoughts/page/1/">deep-thoughts</a>
<a class="tag" href="/tag/thinking/page/1/">thinking</a>
<a class="tag" href="/tag/world/page/1/">world</a>
</div>
</div>

In [5]:
# Within the first quote block, collect the <span class="text"> elements.
quote_span = quote_div[0].find_all('span', attrs={'class': 'text'})

quote_span

[<span class="text" itemprop="text">“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”</span>]

In [6]:
# Drop the surrounding markup and keep only the text of the first match.
quote_span[0].get_text()

'“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”'

In [7]:
# Select all quote blocks again.
quote_div = quote_html.find_all('div', attrs={'class': 'quote'})

# For each block, take the first <span class="text"> and keep only its text.
[block.find_all('span', attrs={'class': 'text'})[0].get_text() for block in quote_div]

['“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”',
 '“It is our choices, Harry, that show what we truly are, far more than our abilities.”',
 '“There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.”',
 '“The person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.”',
 "“Imperfection is beauty, madness is genius and it's better to be absolutely ridiculous than absolutely boring.”",
 '“Try not to become a man of success. Rather become a man of value.”',
 '“It is better to be hated for what you are than to be loved for what you are not.”',
 "“I have not failed. I've just found 10,000 ways that won't work.”",
 "“A woman is like a tea bag; you never know how strong it is until it's in hot water.”",
 '“A day without sunshine is like, you know, night.”']

###### 8.2.1.2 select() 함수를 이용한 크롤링

In [8]:
# CSS selector: <span class="text"> elements that are direct children (">")
# of a <div class="quote">.
quote_text = quote_html.select('div.quote > span.text')

quote_text

[<span class="text" itemprop="text">“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”</span>,
 <span class="text" itemprop="text">“It is our choices, Harry, that show what we truly are, far more than our abilities.”</span>,
 <span class="text" itemprop="text">“There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.”</span>,
 <span class="text" itemprop="text">“The person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.”</span>,
 <span class="text" itemprop="text">“Imperfection is beauty, madness is genius and it's better to be absolutely ridiculous than absolutely boring.”</span>,
 <span class="text" itemprop="text">“Try not to become a man of success. Rather become a man of value.”</span>,
 <span class="text" itemprop="text">“It is better to be hated for what you are than to be loved for what you are not.

In [9]:
# Strip the markup, keeping only the text of each selected <span>.
quote_text_list = [span.get_text() for span in quote_text]

quote_text_list

['“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”',
 '“It is our choices, Harry, that show what we truly are, far more than our abilities.”',
 '“There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.”',
 '“The person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.”',
 "“Imperfection is beauty, madness is genius and it's better to be absolutely ridiculous than absolutely boring.”",
 '“Try not to become a man of success. Rather become a man of value.”',
 '“It is better to be hated for what you are than to be loved for what you are not.”',
 "“I have not failed. I've just found 10,000 ways that won't work.”",
 "“A woman is like a tea bag; you never know how strong it is until it's in hot water.”",
 '“A day without sunshine is like, you know, night.”']

In [10]:
# Select <small class="author"> inside a <span> that sits directly under each quote block.
quote_author = quote_html.select('div.quote > span > small.author')
# Keep only the text of each match.
quote_author_list = [small.get_text() for small in quote_author]

quote_author_list

['Albert Einstein',
 'J.K. Rowling',
 'Albert Einstein',
 'Jane Austen',
 'Marilyn Monroe',
 'Albert Einstein',
 'André Gide',
 'Thomas A. Edison',
 'Eleanor Roosevelt',
 'Steve Martin']

In [11]:
# Select the <a> element inside a <span> that is a direct child of each quote block.
quote_link = quote_html.select('div.quote > span > a')

quote_link

[<a href="/author/Albert-Einstein">(about)</a>,
 <a href="/author/J-K-Rowling">(about)</a>,
 <a href="/author/Albert-Einstein">(about)</a>,
 <a href="/author/Jane-Austen">(about)</a>,
 <a href="/author/Marilyn-Monroe">(about)</a>,
 <a href="/author/Albert-Einstein">(about)</a>,
 <a href="/author/Andre-Gide">(about)</a>,
 <a href="/author/Thomas-A-Edison">(about)</a>,
 <a href="/author/Eleanor-Roosevelt">(about)</a>,
 <a href="/author/Steve-Martin">(about)</a>]

In [12]:
# Tag attributes are read with dictionary-style indexing; here, the link target of the first match.
quote_link[0]['href']

'/author/Albert-Einstein'

In [13]:
# The hrefs are site-relative paths, so prefix the host to get absolute URLs.
[f"http://quotes.toscrape.com{link['href']}" for link in quote_link]

['http://quotes.toscrape.com/author/Albert-Einstein',
 'http://quotes.toscrape.com/author/J-K-Rowling',
 'http://quotes.toscrape.com/author/Albert-Einstein',
 'http://quotes.toscrape.com/author/Jane-Austen',
 'http://quotes.toscrape.com/author/Marilyn-Monroe',
 'http://quotes.toscrape.com/author/Albert-Einstein',
 'http://quotes.toscrape.com/author/Andre-Gide',
 'http://quotes.toscrape.com/author/Thomas-A-Edison',
 'http://quotes.toscrape.com/author/Eleanor-Roosevelt',
 'http://quotes.toscrape.com/author/Steve-Martin']

###### 8.2.1.3 모든 페이지 데이터 크롤링하기

In [14]:
# requests (rq) and BeautifulSoup were already imported in earlier cells:
#import requests as rq
#from bs4 import BeautifulSoup
import time  # NOTE(review): not used in the visible part; presumably paces requests in the omitted loop - confirm

# Accumulators; a later cell puts them into the DataFrame columns
# 'text', 'author' and 'infor'.
text_list = []
author_list = []
infor_list = []

# ... (remainder of this cell omitted) ...

In [15]:
import pandas as pd

# Combine the three collected lists into one table, one column per list.
pd.DataFrame(
    data={
        'text': text_list,
        'author': author_list,
        'infor': infor_list,
    }
)

Unnamed: 0,text,author,infor


8.2.2 금융 속보 크롤링

In [16]:
# requests (rq) and BeautifulSoup are already imported in earlier cells.

url = 'https://finance.naver.com/news/news_list.nhn?mode=LSS2D&section_id=101&section_id2=258'
# A timeout keeps the cell from hanging forever if the server never answers.
data = rq.get(url, timeout=10)
html = BeautifulSoup(data.content, 'html.parser')
# Select <a> directly inside <dd class="articleSubject"> directly inside a <dl>.
html_select = html.select('dl > dd.articleSubject > a')

# Preview the first three matches.
html_select[0:3]

[<a href="/news/news_read.naver?article_id=0011989716&amp;office_id=003&amp;mode=LSS2D&amp;type=0§ion_id=101§ion_id2=258§ion_id3=&amp;date=20230722&amp;page=1" title="동전주 벗어난 리플…어디까지 오를까[이지영의 코인세상]">동전주 벗어난 리플…어디까지 오를까[이지영의 코인세상]</a>,
 <a href="/news/news_read.naver?article_id=0000018105&amp;office_id=648&amp;mode=LSS2D&amp;type=0§ion_id=101§ion_id2=258§ion_id3=&amp;date=20230722&amp;page=1" title="'구조요청' 보낸 전경련, 4대 그룹 선택은">'구조요청' 보낸 전경련, 4대 그룹 선택은</a>,
 <a href="/news/news_read.naver?article_id=0006944774&amp;office_id=421&amp;mode=LSS2D&amp;type=0§ion_id=101§ion_id2=258§ion_id3=&amp;date=20230722&amp;page=1" title="가상자산 거래소, 거래량 줄자 '신규상장' 늘렸다…&quot;닥사 심사 고도화해야&quot;">가상자산 거래소, 거래량 줄자 '신규상장' 늘렸다…"닥사 심사 고도화해야"</a>]

In [17]:
# Read the title attribute of the first selected link.
html_select[0]['title']

'동전주 벗어난 리플…어디까지 오를까[이지영의 코인세상]'

In [18]:
# Pull the title attribute out of every selected link.
[anchor['title'] for anchor in html_select]

['동전주 벗어난 리플…어디까지 오를까[이지영의 코인세상]',
 "'구조요청' 보낸 전경련, 4대 그룹 선택은",
 '가상자산 거래소, 거래량 줄자 \'신규상장\' 늘렸다…"닥사 심사 고도화해야"',
 '아기 입맛 사로잡았다...인플레 직격탄에도 잘 팔리는 이유 [추동훈의 흥부전]',
 '이복현은 왜 증권사 10곳 긴급 소집했나[최훈길의뒷담화]',
 "주주 간 차등이 인정되는 경우가 있다?…'평등과 차등 사이'",
 '커지는 ESG 공시 시장…인증은 누가 맡을까?',
 '개장할 땐 2만원이였는데 마감 땐 1만원…단타대회 된 이녀석들',
 "'대구의 갓기업'에 무슨 일이…개미들 2700억 폭풍 매수 [박의명의 불개미 구조대]",
 '[fn마켓워치]호서대, 50억 OCIO로',
 "'5조 대어' HMM 매각 개시…인수전 초반 후보군 살펴보니[시그널]",
 '[fn마켓워치]대광건영, 27홀 대중제 큐로CC 인수',
 '"코스피로 이사 간대" 소문 돌더니…개미들 우르르 몰려든 곳 [진영기의 찐개미 찐투자]',
 '하나금융, KDB생보 인수…‘인수자금’에 달린 신용도',
 "[fn마켓워치]삼프로TV 우회상장..'동학개미' 환호할까",
 "당장 성과는 부진해도..멀리 보면 ESG채권 '투자대안'",
 '"나만 없어 에코프로" 벼락거지 한숨… 공매도 이긴 투자자 환호 \'희비\'',
 '美 합참의장 "北핵·미사일 현실적 위협…한반도 전쟁 가능성"',
 '[다음주 증시 전망] 2분기 실적 옥석 가리기…반도체 업황 의구심 해소할까']

8.2.3 표 크롤링하기

In [19]:
import pandas as pd

url = 'https://en.wikipedia.org/wiki/List_of_countries_by_stock_market_capitalization'
# read_html returns a list of DataFrames, one per table it finds in the page.
tbl = pd.read_html(io=url)

# First table, first five rows.
tbl[0].head(n=5)

Unnamed: 0,Country,Total market cap (in mil. US$)[2],Total market cap (% of GDP)[3],Number of domestic companies listed[4],Year
0,United States,44719661,194.5,4266,2020
1,China,13214311,83.0,4154,2020
2,Japan,6718220,122.2,3754,2020
3,Hong Kong,6130420,1768.8,2353,2020
4,India,3750000,103.0,5270,2023


8.2.4 기업 공시 채널에서 오늘의 공시 불러오기

In [20]:
# requests (rq), BeautifulSoup and pandas (pd) are already imported in earlier cells.

url = 'https://kind.krx.co.kr/disclosure/todaydisclosure.do'
# Form fields sent in the POST body.
# NOTE(review): field meanings are not documented here; 'selDate' is presumably
# the disclosure date to query - confirm against the site's own request.
payload = {
    'method': 'searchTodayDisclosureSub',
    'currentPageSize': '15',
    'pageIndex': '1',
    'orderMode': '0',
    'orderStat': 'D',
    'forward': 'todaydisclosure_sub',
    'chose': 'S',
    'todayFlag': 'N',
    'selDate': '2023-07-21'
}

# A timeout keeps the cell from hanging forever if the server never answers.
data = rq.post(url, data=payload, timeout=10)
html = BeautifulSoup(data.content, 'html.parser')

# Dump the parsed response to inspect the table markup.
print(html)


<section class="scrarea type-00">
<table class="list type-00 mt10" summary="시간, 회사명, 공시제목, 제출인, 차트/주가">
<caption>목록</caption>
<colgroup>
<col width="9%"/>
<col width="22%"/>
<col width="*"/>
<col width="16%"/>
<col width="9%"/>
</colgroup>
<thead>
<tr class="first active" id="title-contents">
</tr>
</thead>
<tbody>
<tr class="first" id="parkman">
<td class="first txc">18:45</td>
<td><img alt="코스닥" class="vmiddle legend" src="/images/common/icn_t_ko.gif"/> <a href="#companysum" id="companysum" onclick="companysummary_open('22633'); return false;" title="신테카바이오"> 신테카바이오</a> </td>
<td><a href="#viewer" onclick="openDisclsViewer('20230721000674','')" title="전환사채권발행결정(제2회차)">전환사채권발행결정(제2회차)</a></td>
<td>신테카바이오</td>
<td class="txc">
<a class="btn ico chart-00" href="#" onclick="openDisclsChart('22633');return false;" title="공시차트"><span>공시차트</span></a>
<a class="btn ico chart-01" href="#" onclick="fnPopStockPrices('22633');return false;" title="주가차트"><span>주가차트</span></a>
</td>
</tr>
<tr id=

In [21]:
from io import StringIO

# Serialize the parsed document once and reuse the string for parsing,
# rather than calling prettify() a second time.
html_unicode = html.prettify()
# pandas 2.1+ deprecates passing literal HTML text to read_html; a file-like
# StringIO wrapper is the supported form and also works on earlier versions.
tbl = pd.read_html(StringIO(html_unicode))

# First table, first five rows.
tbl[0].head()

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,18:45,신테카바이오,전환사채권발행결정(제2회차),신테카바이오,공시차트 주가차트
1,18:39,버킷스튜디오,횡령ㆍ배임 혐의 진행사항,버킷스튜디오,공시차트 주가차트
2,18:37,엑서지21,조회공시요구(현저한시황변동)에대한답변(미확정),엑서지21,공시차트 주가차트
3,18:34,리더스 기술투자,[정정] 신주인수권행사가액의조정,리더스 기술투자,공시차트 주가차트
4,18:29,유니테크노,불성실공시법인지정예고(공시불이행),코스닥시장본부,공시차트 주가차트
