In [4]:
from bs4 import BeautifulSoup

In [62]:
# html.parser
# lxml ***
# lxml-xml, xml
# html5lib

In [71]:
import requests
# 200 => OK
# 400 => My Fault
# 500 => You Fault => Traffic 과도하게 증가
header={"User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"}

def download(url, params ={}, retries = 3) :
    resp = None
    try :
        resp = requests.get(url, params = params, headers = header)
        resp.raise_for_status()
    except requests.exceptions.HTTPError as e :
        if 500 <= e.response.status_code < 600 and retries > 0:
            print(retries)
            resp = download(url, params, retries - 1)
        else :
            print(e.response.status_code)
            print(e.response.reason)
            print(e.response.headers)
    
    return resp

In [27]:
html = """
<html>
<head></head>
<body>
    <div id=test></div>
    <div id="result">
        <p class="row">
            <a class="red">go to page1</a>
            <a class="blue">go to page2</a>
    </div>

</body>
</html>
"""

In [28]:
from bs4 import BeautifulSoup
dom = BeautifulSoup(html,"lxml")

In [29]:
dom.div["id"], dom.a["class"], type(dom.div), type(dom.span)

('test', ['red'], bs4.element.Tag, NoneType)

In [30]:
try :
    dom.span.attr
except AttributeError as e :
    print("Not Found")

Not Found


In [9]:
dom.html.body.p, dom.p, [_ for _ in dom.p.children]

(<p class="row">
 <a class="red">go to page1</a>
 <a class="blue">go to page2</a>
 </p>, <p class="row">
 <a class="red">go to page1</a>
 <a class="blue">go to page2</a>
 </p>, ['\n',
  <a class="red">go to page1</a>,
  '\n',
  <a class="blue">go to page2</a>,
  '\n'])

In [10]:
dom.prettify()

'<html>\n <head>\n </head>\n <body>\n  <div>\n  </div>\n  <div id="result">\n   <p class="row">\n    <a class="red">\n     go to page1\n    </a>\n    <a class="blue">\n     go to page2\n    </a>\n   </p>\n  </div>\n </body>\n</html>\n'

In [16]:
dom.h1, dom.h2, dom.h3, dom.h4
try:
    print(dom.h4.text)
except AttributeError as e :
    print(dom.h3.text)

AttributeError: 'NoneType' object has no attribute 'text'

In [17]:
dom.find("a")

<a class="red">go to page1</a>

In [38]:
dom.find_all("a"),\
dom.find_all("p")

([<a class="red">go to page1</a>, <a class="blue">go to page2</a>],
 [<p class="row">
  <a class="red">go to page1</a>
  <a class="blue">go to page2</a>
  </p>])

In [55]:
dom.find_all({"div","p"}), \
dom.find_all("div",{"id":"result"}), \
dom.find_all("",attrs={"class":"red"}), \
dom.find_all("a", recursive=False), \
dom.find_all(text="go to page1"),\
dom.find_all("a", limit=2), \
dom.find_all("a", limit=1)

([<div id="test"></div>, <div id="result">
  <p class="row">
  <a class="red">go to page1</a>
  <a class="blue">go to page2</a>
  </p></div>, <p class="row">
  <a class="red">go to page1</a>
  <a class="blue">go to page2</a>
  </p>], [<div id="result">
  <p class="row">
  <a class="red">go to page1</a>
  <a class="blue">go to page2</a>
  </p></div>], [<a class="red">go to page1</a>], [], ['go to page1'], [<a class="red">go to page1</a>,
  <a class="blue">go to page2</a>], [<a class="red">go to page1</a>])

In [58]:
dom.find("a",{"red":"blue"})

In [183]:
html = download("http://pythonscraping.com/pages/page3.html")
exercise = BeautifulSoup(html.text,"lxml")

In [89]:
exercise

<html>
<head>
<style>
img{
	width:75px;
}
table{
	width:50%;
}
td{
	margin:10px;
	padding:10px;
}
.wrapper{
	width:800px;
}
.excitingNote{
	font-style:italic;
	font-weight:bold;
}
</style>
</head>
<body>
<div id="wrapper">
<img src="../img/gifts/logo.jpg" style="float:left;"/>
<h1>Totally Normal Gifts</h1>
<div id="content">Here is a collection of totally normal, totally reasonable gifts that your friends are sure to love! Our collection is
hand-curated by well-paid, free-range Tibetan monks.<p>
We haven't figured out how to make online shopping carts yet, but you can send us a check to:<br/>
123 Main St.<br/>
Abuja, Nigeria
We will then send your totally amazing gift, pronto! Please include an extra $5.00 for gift wrapping.</p></div>
<table id="giftList">
<tr><th>
Item Title
</th><th>
Description
</th><th>
Cost
</th><th>
Image
</th></tr>
<tr class="gift" id="gift1"><td>
Vegetable Basket
</td><td>
This vegetable basket is the perfect gift for your health conscious (or overweight) frien

In [106]:
footer = exercise.find("", {"id":"footer"})
footer

<div id="footer">
© Totally Normal Gifts, Inc. <br/>
+234 (617) 863-0736
</div>

In [111]:
footer.name, footer["id"], \
footer.find_parent().name, footer.find_parent()["id"], \
footer.find_parent().find_parent().name #(.name=태그이름)(.attr)

('div', 'footer', 'div', 'wrapper', 'body')

In [112]:
footer.find_parent()["id"]

'wrapper'

In [113]:
parent = footer.find_parent()
parent.name , parent.attrs

('div', {'id': 'wrapper'})

In [114]:
[ _.name for _ in parent.find_all()]

['img',
 'h1',
 'div',
 'p',
 'br',
 'br',
 'table',
 'tr',
 'th',
 'th',
 'th',
 'th',
 'tr',
 'td',
 'td',
 'span',
 'td',
 'td',
 'img',
 'tr',
 'td',
 'td',
 'span',
 'td',
 'td',
 'img',
 'tr',
 'td',
 'td',
 'span',
 'td',
 'td',
 'img',
 'tr',
 'td',
 'td',
 'span',
 'td',
 'td',
 'img',
 'tr',
 'td',
 'td',
 'span',
 'td',
 'td',
 'img',
 'div',
 'br']

In [115]:
[_.name for _ in footer.find_previous_siblings()]

['table', 'div', 'h1', 'img']

In [116]:
parent.find_all(recursive=False)[3]

<table id="giftList">
<tr><th>
Item Title
</th><th>
Description
</th><th>
Cost
</th><th>
Image
</th></tr>
<tr class="gift" id="gift1"><td>
Vegetable Basket
</td><td>
This vegetable basket is the perfect gift for your health conscious (or overweight) friends!
<span class="excitingNote">Now with super-colorful bell peppers!</span>
</td><td>
$15.00
</td><td>
<img src="../img/gifts/img1.jpg"/>
</td></tr>
<tr class="gift" id="gift2"><td>
Russian Nesting Dolls
</td><td>
Hand-painted by trained monkeys, these exquisite dolls are priceless! And by "priceless," we mean "extremely expensive"! <span class="excitingNote">8 entire dolls per set! Octuple the presents!</span>
</td><td>
$10,000.52
</td><td>
<img src="../img/gifts/img2.jpg"/>
</td></tr>
<tr class="gift" id="gift3"><td>
Fish Painting
</td><td>
If something seems fishy about this painting, it's because it's a fish! <span class="excitingNote">Also hand-painted by trained monkeys!</span>
</td><td>
$10,005.00
</td><td>
<img src="../img/gift

In [121]:
exercise.table.name, exercise.find("table").name

('table', 'table')

In [126]:
[_.text.strip() for _ in parent.find_all(recursive=False)][3]

'Item Title\n\nDescription\n\nCost\n\nImage\n\n\nVegetable Basket\n\nThis vegetable basket is the perfect gift for your health conscious (or overweight) friends!\nNow with super-colorful bell peppers!\n\n$15.00\n\n\n\n\nRussian Nesting Dolls\n\nHand-painted by trained monkeys, these exquisite dolls are priceless! And by "priceless," we mean "extremely expensive"! 8 entire dolls per set! Octuple the presents!\n\n$10,000.52\n\n\n\n\nFish Painting\n\nIf something seems fishy about this painting, it\'s because it\'s a fish! Also hand-painted by trained monkeys!\n\n$10,005.00\n\n\n\n\nDead Parrot\n\nThis is an ex-parrot! Or maybe he\'s only resting?\n\n$0.50\n\n\n\n\nMystery Box\n\nIf you love suprises, this mystery box is for you! Do not place on light-colored surfaces. May cause oil staining. Keep your friends guessing!\n\n$1.50'

In [143]:
table = footer.find_previous_sibling()
tr = table.find_all("tr", {"class":"gift"})
for _ in tr :
    print(_.find_all(recursive=False)[2].text.strip())

$15.00
$10,000.52
$10,005.00
$0.50
$1.50


In [139]:
exercise.find_all()

[<html>
 <head>
 <style>
 img{
 	width:75px;
 }
 table{
 	width:50%;
 }
 td{
 	margin:10px;
 	padding:10px;
 }
 .wrapper{
 	width:800px;
 }
 .excitingNote{
 	font-style:italic;
 	font-weight:bold;
 }
 </style>
 </head>
 <body>
 <div id="wrapper">
 <img src="../img/gifts/logo.jpg" style="float:left;"/>
 <h1>Totally Normal Gifts</h1>
 <div id="content">Here is a collection of totally normal, totally reasonable gifts that your friends are sure to love! Our collection is
 hand-curated by well-paid, free-range Tibetan monks.<p>
 We haven't figured out how to make online shopping carts yet, but you can send us a check to:<br/>
 123 Main St.<br/>
 Abuja, Nigeria
 We will then send your totally amazing gift, pronto! Please include an extra $5.00 for gift wrapping.</p></div>
 <table id="giftList">
 <tr><th>
 Item Title
 </th><th>
 Description
 </th><th>
 Cost
 </th><th>
 Image
 </th></tr>
 <tr class="gift" id="gift1"><td>
 Vegetable Basket
 </td><td>
 This vegetable basket is the perfect gift f

In [145]:
for _ in exercise.find_all("tr",{"class":"gift"}):
    print(_.find_all(recursive=False)[2].text.strip())

$15.00
$10,000.52
$10,005.00
$0.50
$1.50


In [151]:
import re
[_.text.strip() for _ in exercise.find_all("td", text=re.compile("[0-9]+.\d+"))]

['$15.00', '$10,000.52', '$10,005.00', '$0.50', '$1.50']

In [176]:
[ _["src"].text.strip() for _ in exercise.find_all("img",{"src":re.compile("../img/gifts/img/d+.jpg")})]

[]

In [177]:
[ _ for _ in exercise.find_all("img",{"src":re.compile("../img/gifts/img\d+.jpg")})]

[<img src="../img/gifts/img1.jpg"/>,
 <img src="../img/gifts/img2.jpg"/>,
 <img src="../img/gifts/img3.jpg"/>,
 <img src="../img/gifts/img4.jpg"/>,
 <img src="../img/gifts/img6.jpg"/>]

In [181]:
requests.compat.urljoin(html.request.url,"page")

'http://pythonscraping.com/pages/page'

In [182]:
"구글검색결과 , 네이버검색결과"

'구글검색결과 , 네이버검색결과'

In [187]:
html = download("http://www.google.com.com/search",{"q":"박보영"})
google = BeautifulSoup(html.text,"lxml")
#len(google.find_all("h3",{"class":"LC201b"}))

In [None]:
result = list()
for _ in google.find_all("h3",{"class":"LC201b"}):
    result.append((_.text.strip(),_.find_parent()["href"]))
    #print(_.text.strip())
    #print(_.find_parent()["href"])

In [None]:
[ _ for h3 in google.find_all("h3",{"class":"LC201b"})]    

In [188]:
html = download("https://search.naver.com/search.naver",{"query":"박보영","oquery":"박보영"})
naver = BeautifulSoup(html.text,"lxml")

In [191]:
naver.text

'          박보영 : 네이버 통합검색     naver = window.naver || {}; naver.search = naver.search || {}; var g_D = 0 ; if (!String.prototype.trim) { String.prototype.trim = function () { return this.replace(/^[\\s\\uFEFF\\xA0]+|[\\s\\uFEFF\\xA0]+$/g, \'\'); }; } function urlencode (q) { return escape(q).replace(/\\+/g, "%2B") ; } function urlexpand (url) { var href = document.location.href ; if (url == "") return href ; if (url.match(/^[-.A-Za-z]+:/)) return url ; if (url.charAt(0) == \'#\') return href.split("#")[0] + url ; if (url.charAt(0) == \'?\') return href.split("?")[0] + url ; if (url.charAt(0) == \'/\') return href.replace(/([^:\\/])\\/.*$/, "$1") + url ; return href.substring(0, href.lastIndexOf("/")+1) + url ; } naver.search.error = (function () { var errorList = Array() ; return { add : function (s) { errorList.push(s) ; }, clear : function () { delete errorList ; }, get : function (s) { return errorList ; }, getString : function (d) { if (typeof d === \'undefined\') d = \'|\' ; retur

In [194]:
import re
[ _ for _ in naver.find_all("li",{"id":re.compile("sp_nws_all\d")})]

[<li id="sp_nws_all1"> <div class="thumb"><a class="sp_thmb thmb80" href="http://news1.kr/articles/?3613110" onclick="return goOtherCR(this, 'a=nws_all*a.img&amp;r=1&amp;i=08138263_000000000000000003972619&amp;g=421.0003972619&amp;u='+urlencode(this.href));" target="_blank"><img alt='[N현장] "목표는 20%"' class="" height="80" onerror="this.parentNode.style.display='none';" src="https://search.pstatic.net/common/?src=https%3A%2F%2Fimgnews.pstatic.net%2Fimage%2Forigin%2F421%2F2019%2F05%2F03%2F3972619.jpg&amp;type=ofullfill80_80_q75_re2" width="80"/><span class="thmb_v"></span></a></div> <dl> <dt><a class="_sp_each_title" href="http://news1.kr/articles/?3613110" onclick="return goOtherCR(this, 'a=nws_all*a.tit&amp;r=1&amp;i=08138263_000000000000000003972619&amp;g=421.0003972619&amp;u='+urlencode(this.href));" target="_blank" title="[N현장] &quot;목표는 20%&quot; 박보영X유제원 '어비스', '오나귀' 신드롬 재현할까(종합)">[N현장] "목표는 20%" <strong class="hl">박보영</strong>X유제원 '어비스', '오나귀' 신드롬 재현할까(종...</a></dt> <dd class="txt_

In [239]:
result = list()
for _ in naver.find_all("dt"):
    #print("-".join([_.name for _ in _.find_parents(limit=4)]), "dl-li-ul-div", "-".join([_.name for _ in _.find_parents(limit=4)]) == "dl-li-ul-div")
    if "-".join([_.name for _ in _.find_parents(limit=4)]) == "dl-li-ul-div" :
        a = _.find("a")
        if a:
            print(a.text.strip())
            print(a["href"])
    #result.append((_.text.strip(),_.find_parent()["href"]))
    #print(_.find_parent()["href"])

영화 너의결혼식 ...
https://music.naver.com/album/index.nhn?albumId=2500698
오 나의 귀신님 ...
https://music.naver.com/album/index.nhn?albumId=565464
고세연
?where=nexearch&sm=tab_etc&query=%EC%96%B4%EB%B9%84%EC%8A%A4%EA%B3%A0%EC%84%B8%EC%97%B0
도봉순
?where=nexearch&sm=tab_etc&query=%ED%9E%98%EC%8E%88%EC%97%AC%EC%9E%90%EB%8F%84%EB%B4%89%EC%88%9C%EB%8F%84%EB%B4%89%EC%88%9C
[N현장] "목표는 20%" 박보영X유제원 '어비스', '오나귀' 신드롬 재현할까(종...
http://news1.kr/articles/?3613110
박보영X안효섭 '어비스', 이시언의 20% 소망 이뤄줄까[종합]
http://star.mt.co.kr/stview.php?no=2019050313501862541
[HD영상] '어비스' 안효섭 "'세젤흔녀' 박보영? 얼굴 보면 몰입 안 된다"
http://digitalchosun.dizzo.com/site/data/html_dir/2019/05/03/2019050380185.html
“흔녀된 뽀블리”…박보영X유제원 감독, ‘오나귀’ 잇는 신드롬 일으킬까(종...
http://star.mbn.co.kr/view.php?year=2019&no=289721&refer=portal
'어비스' 안효섭, 박보영이 흔녀? '몰입하기...
https://tv.naver.com/v/8245633
'어비스' 이시언, 박보영 스킨십 장면 있는거...
https://tv.naver.com/v/8245448
'어비스' 박보영, '세젤흔녀' 연기.. 김사랑과...
https://tv.naver.com/v/8245318
'어비스' 박보영, 뽀블리가 마블리를...
https://tv.naver.com/v/8245

In [234]:
[ _.text.strip() for _ in naver.find_all("a",{"class":["_sp_each_title","sh_cafe_title","sh_blog_title","tit","title_link"]})]

['[N현장] "목표는 20%" 박보영X유제원 \'어비스\', \'오나귀\' 신드롬 재현할까(종...',
 "박보영X안효섭 '어비스', 이시언의 20% 소망 이뤄줄까[종합]",
 '[HD영상] \'어비스\' 안효섭 "\'세젤흔녀\' 박보영? 얼굴 보면 몰입 안 된다"',
 '“흔녀된 뽀블리”…박보영X유제원 감독, ‘오나귀’ 잇는 신드롬 일으킬까(종...',
 '박보영 이런 모습도 있네요?',
 '2019.04.29 박보영 쏘피 바디피트 팬사인회 현장',
 '박보영은 언제쯤 결혼을 할까요?',
 '박보영 나오는 드라마 하나봐요',
 '럭키슈에뜨 박보영 가디건',
 '박보영 : 오 나의 귀신님 나봉선',
 '[티저] 초면에 사랑합니다 vs 어비스 (김영광vs박보영)',
 '월화드라마 어비스로 돌아올 박보영 스타일에 대해 알아보자',
 '박보영 김영광 열애 나이와키 군대 미우새 출연',
 '박보영 김영광 열애 키차이 아버지 군대 ?',
 '박보영',
 '연지해',
 'Park Bo-young',
 '박보영 - 나무위키',
 '박보영 갤러리',
 '박보영']

In [240]:
html = download("https://search.daum.net/search",{"q":"박보영"})
daum = BeautifulSoup(html.text,"lxml")

In [241]:
daum

<!DOCTYPE html>
<html class="win ie11" lang="ko" xmlns="http://www.w3.org/1999/xhtml">
<head profile="http://a9.com/-/spec/opensearch/1.1/">
<meta content="text/html;charset=utf-8" http-equiv="content-Type"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<meta content="off" name="autocomplete"/>
<meta content="always" name="referrer"/>
<meta content="telephone=no" name="format-detection"/>
<meta content="박보영 – Daum 검색" property="og:title"/>
<meta content="https://search.daum.net/search?w=tot&amp;q=%EB%B0%95%EB%B3%B4%EC%98%81" property="og:url"/>
<meta content="Daum 검색에서 박보영에 대한 최신정보를 찾아보세요." property="og:description"/>
<meta content="https://search1.daumcdn.net/search/statics/common/img/og_search.png" property="og:image"/>
<meta content="다음검색" property="og:site_name"/>
<title>박보영 – Daum 검색</title>
<link href="//search.daum.net/OpenSearch.xml" rel="search" title="Daum" type="application/opensearchdescription+xml"/>
<link charset="utf-8" href="//search1.daumcdn.net/search/static

In [242]:
for _ in daum.find_all("div",{"class":"wrap_tit"}):
    a = _.find("a")
    if a:
        print(a.text.strip())
        print(a["href"])


"세젤흔녀로 변신한 박보영"..유제원 감독X박보영 '어비스' (종합)
http://v.media.daum.net/v/20190503153440722?f=o
'어비스' 이성재 "다시 태어난다면? 예쁜 박보영으로"..박보영 '폭소'
http://v.media.daum.net/v/20190503145429254?f=o
박보영이 가장 흔한 여자, 설득력 있을까 '어비스'
http://v.media.daum.net/v/20190503172414326?f=o
[현장]'어비스' 박보영 "김사랑과 차이? 커졌다 작아졌다.."
http://v.media.daum.net/v/20190503165134297?f=o
김영광 박보영 열애 터진 이유
http://adam24eve.tistory.com/858
박보영 실제 키는 도대체 몇일까?
http://papa0717.tistory.com/223
박보영 나이 키 몸매 대박
http://k3k2y.tistory.com/35
박보영 키 나이 인스타그램 드라마 어비스
http://listup.tistory.com/248
드라마 어비스 인물 소개, 예고편(박보영, 안효섭 주연)
http://cafe.daum.net/subdued20club/ReHf/2282606?q=%EB%B0%95%EB%B3%B4%EC%98%81
박보영과 역대급 케미뽐낸 상대배우 고르기
http://cafe.daum.net/subdued20club/ReHf/2280152?q=%EB%B0%95%EB%B3%B4%EC%98%81
런닝맨 나올 때마다 케미 보여준 송지효X박보영.jpgif
http://cafe.daum.net/ok1221/9Zdf/1524913?q=%EB%B0%95%EB%B3%B4%EC%98%81
박보영이 왜 못 오를 나무냐는 박수홍.jpg
http://cafe.daum.net/ASMONACOFC/gAVU/1243818?q=%EB%B0%95%EB%B3%B4%EC%98%81
박보영
https://ko.wikipedia.org/wiki/%EB%B0%95%EB%B3%B4