In [1]:
from bs4 import BeautifulSoup #bs4 라이브러리 내의 BeautifulSoup 가져옴

In [138]:
html_doc = """<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

In [134]:
# Parse the sample document with Python's built-in html.parser.
soup = BeautifulSoup(html_doc, 'html.parser')
print(soup.prettify()) # show the parsed HTML structure, one tag/string per line, indented by depth

<html>
 <head>
  <title>
   The Dormouse's story
  </title>
 </head>
 <body>
  <p class="title">
   <b>
    The Dormouse's story
   </b>
  </p>
  <p class="story">
   Once upon a time there were three little sisters; and their names were
   <a class="sister" href="http://example.com/elsie" id="link1">
    Elsie
   </a>
   ,
   <a class="sister" href="http://example.com/lacie" id="link2">
    Lacie
   </a>
   and
   <a class="sister" href="http://example.com/tillie" id="link3">
    Tillie
   </a>
   ;
and they lived at the bottom of a well.
  </p>
  <p class="story">
   ...
  </p>
 </body>
</html>



In [129]:
print(soup.title) # the <title> tag, markup included
print(soup.title.name) # name of that tag
print(soup.p.name) # name of the first <p> tag
print(soup.title.parent.name) # name of the tag that encloses <title>
print(soup.p) # the first <p> tag, markup included
print(soup.p['class']) # class attribute of the first <p>; it comes back as a list: ['title']

<title>The Dormouse's story</title>
title
p
head
<p class="title"><b>The Dormouse's story</b></p>
['title']


In [5]:
print(soup.find_all('a')) # every <a> tag in the document, returned as a list


print(soup.find(id="link3")) # the tag whose id attribute is "link3"

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>


In [6]:
# For each <a> tag, print its href value and then the whole tag.
for link in soup.find_all('a'):
  print(link.get("href"))
  print(link)

http://example.com/elsie
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
http://example.com/lacie
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
http://example.com/tillie
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>


In [7]:
print(soup.get_text())

The Dormouse's story

The Dormouse's story
Once upon a time there were three little sisters; and their names were
Elsie,
Lacie and
Tillie;
and they lived at the bottom of a well.
...



In [8]:
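# Available parsers
# Python's html.parser ("html.parser") - pros: decent speed // cons: slower than lxml, less lenient than html5lib
# lxml's HTML parser ("lxml") - pros: very fast // cons: external C dependency
# lxml's XML parser ("xml" or "lxml-xml") - pros: very fast, the only currently supported XML parser // cons: external C dependency
# html5lib ("html5lib") - pros: extremely lenient, parses pages the same way a web browser does, creates valid HTML5 // cons: very slow, external Python dependency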

In [9]:
print(BeautifulSoup("<a><b/></a>", "html.parser")) # html.parser: <b/> becomes <b></b>; nothing else is added
print(BeautifulSoup("<a><b/></a>", "xml")) # xml: adds an XML declaration and keeps <b/> self-closing
print(BeautifulSoup("<a><b/></a>", "lxml")) # lxml: wraps the fragment in <html><body>
print(BeautifulSoup("<a></p>", "lxml")) # lxml: the dangling </p> is ignored
print(BeautifulSoup("<a><b/></a>", "html5lib")) # html5lib: wraps in <html><body> and also adds an empty <head>

<a><b></b></a>
<?xml version="1.0" encoding="utf-8"?>
<a><b/></a>
<html><body><a><b></b></a></body></html>
<html><body><a></a></body></html>
<html><head></head><body><a><b></b></a></body></html>


In [10]:
# Beautiful Soup converts an HTML document into a tree of Python objects.
# That tree is navigated through tags and their names and attributes.

In [11]:
tag = soup.b
type(tag)

In [12]:
tag.name

'b'

In [13]:
tag.name = 'p' # renaming a tag rewrites its markup: the former <b> tag is now rendered as <p>
tag

<p>The Dormouse's story</p>

In [137]:
soup_1 = soup.a
soup_1['id'] ='change' # assigning to tag[...] changes the attribute on the tag inside soup
soup_2 = soup.a
soup_2['id'] # a fresh soup.a lookup sees the modified id, so the change was made in place

'change'

In [139]:
tag = BeautifulSoup('<b id="boldest">bold</b>', 'html.parser').b
tag['id']

'boldest'

In [16]:
tag.attrs

{'id': 'boldest'}

In [17]:
tag.attrs.keys()

dict_keys(['id'])

In [18]:
css_soup = BeautifulSoup('<p class="body strikeout"></p>', 'html.parser')
css_soup.p['class']

['body', 'strikeout']

In [19]:
css_soup.p['class'] = ['test', 'change']
print(css_soup.p)

<p class="test change"></p>


In [20]:
soup.a['id']

'link1'

In [141]:
soup.a.get_attribute_list('id')

['change']

In [142]:
xml_soup = BeautifulSoup('<p class="body strikeout"></p>', 'xml') # with the xml parser, class is not treated as multi-valued: it stays one plain string
xml_soup.p['class']

In [23]:
class_is_multi= { '*' : 'class'} # declare class as multi-valued for every tag ('*'), so the xml parser returns a list like html.parser does
xml_soup = BeautifulSoup('<p class="body strikeout"></p>', 'xml', multi_valued_attributes=class_is_multi)
xml_soup.p['class']

['body', 'strikeout']

In [24]:
test = soup.a
test.string

'Elsie'

In [25]:
type(test.string)

bs4.element.NavigableString

In [26]:
test_unicode = str(test.string)
test_unicode

'Elsie'

In [27]:
type(test_unicode)

str

In [28]:
test.string.replace_with("Test Change Text")
test

<a class="sister" href="http://example.com/elsie" id="link1">Test Change Text</a>

In [29]:
test_comment = "<b><!--Hey, buddy. Want to buy a used parser?--></b>"
soup = BeautifulSoup(test_comment, 'html.parser')
comment = soup.b.string
comment

'Hey, buddy. Want to buy a used parser?'

In [30]:
type(comment)

bs4.element.Comment

In [31]:
print(soup.b.prettify())

<b>
 <!--Hey, buddy. Want to buy a used parser?-->
</b>



# **트리탐색 처럼 이동**

In [32]:
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

soup = BeautifulSoup(html_doc, 'html.parser')

In [33]:
soup.find("head")

<head><title>The Dormouse's story</title></head>

In [143]:
soup.head

<title>The Dormouse's story</title>

In [35]:
soup.title

<title>The Dormouse's story</title>

In [36]:
soup.a

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

In [37]:
head_tag = soup.head
head_tag

<head><title>The Dormouse's story</title></head>

In [38]:
head_tag.contents

[<title>The Dormouse's story</title>]

In [39]:
title_tag = head_tag.contents[0]
title_tag

<title>The Dormouse's story</title>

In [40]:
title_tag.contents

["The Dormouse's story"]

In [41]:
len(soup.contents)

2

In [42]:
soup.contents[0].name

In [43]:
soup.contents[1].name

'html'

In [44]:
for child in title_tag.children:
    print(child)

The Dormouse's story


In [45]:
for child in head_tag.descendants:
    print(child)

<title>The Dormouse's story</title>
The Dormouse's story


In [46]:
len(list(soup.children))

2

In [47]:
len(list(soup.descendants))

27

In [48]:
title_tag.string

"The Dormouse's story"

In [49]:
print(soup.html.string) # <html> has more than one child, so .string cannot tell which string is meant and is None

None


In [145]:
for string in soup.strings:
    print(repr(string))

"The Dormouse's story"
'\n'
'\n'
"The Dormouse's story"
'\n'
'Once upon a time there were three little sisters; and their names were\n'
'Elsie'
',\n'
'Lacie'
' and\n'
'Tillie'
';\nand they lived at the bottom of a well.'
'\n'
'...'
'\n'


In [51]:
for string in soup.stripped_strings: # like .strings, but surrounding whitespace is stripped and whitespace-only strings are skipped
    print(repr(string)) # repr() shows each string as a quoted literal, so an embedded \n stays visible
  # (a plain print(string) would emit the raw text, turning an embedded \n into an actual line break)

"The Dormouse's story"
"The Dormouse's story"
'Once upon a time there were three little sisters; and their names were'
'Elsie'
','
'Lacie'
'and'
'Tillie'
';\nand they lived at the bottom of a well.'
'...'


In [52]:
TM = "THIS IS TEST MESSAGE"
str(TM) # str(): the readable form of the value, meant for end users

'THIS IS TEST MESSAGE'

In [53]:
repr(TM) # repr(): the developer-facing form; for a str it is the quoted literal (note the extra quotes in the output)

"'THIS IS TEST MESSAGE'"

In [54]:
title_tag

<title>The Dormouse's story</title>

In [55]:
title_tag.parent

<head><title>The Dormouse's story</title></head>

In [56]:
sibling_soup = BeautifulSoup("<a><b>text1</b><c>text2</c></a>", 'html.parser')
print(sibling_soup.prettify()) # <b> and <c> are both direct children of <a>, i.e. sibling tags

<a>
 <b>
  text1
 </b>
 <c>
  text2
 </c>
</a>



In [57]:
sibling_soup.b.next_sibling # the sibling that follows <b>

<c>text2</c>

In [58]:
sibling_soup.c.previous_sibling # the sibling that precedes <c>

<b>text1</b>

In [59]:
print(sibling_soup.b.previous_sibling)

None


In [60]:
print(sibling_soup.c.next_sibling)

None


In [61]:
link = soup.a

In [62]:
link

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

In [63]:
link.next_sibling

',\n'

In [64]:
link.next_sibling.next_sibling.next_sibling.next_sibling.next_sibling

';\nand they lived at the bottom of a well.'

In [65]:
for sibling in soup.a.next_siblings:
    print(repr(sibling))

',\n'
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
' and\n'
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
';\nand they lived at the bottom of a well.'


In [66]:
for sibling in soup.find(id="link3").previous_siblings:
    print(repr(sibling))

' and\n'
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
',\n'
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
'Once upon a time there were three little sisters; and their names were\n'


In [67]:
last_a_tag = soup.find("a", id="link3")
last_a_tag

<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>

In [68]:
last_a_tag.next_sibling

';\nand they lived at the bottom of a well.'

In [69]:
last_a_tag.next_element # next_element is whatever was parsed right after the <a> start tag: its own string 'Tillie', not the sibling text that follows </a>

'Tillie'

In [70]:
for element in last_a_tag.next_elements:
    print(repr(element))

'Tillie'
';\nand they lived at the bottom of a well.'
'\n'
<p class="story">...</p>
'...'
'\n'


In [71]:
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

soup = BeautifulSoup(html_doc, 'html.parser')

In [72]:
import re # compiled regular expressions can be passed to find_all to match tag names
for tag in soup.find_all(re.compile("^b")): # tags whose name starts with "b"
    print(tag.name)

body
b


In [73]:
for tag in soup.find_all(re.compile("t")): # tags whose name contains "t"
    print(tag.name)

html
title


In [74]:
for tag in soup.find_all(True): # True matches every tag, but never the text strings
    print(tag.name)

html
head
title
body
p
b
p
a
a
a
p


In [75]:
def has_class_but_no_id(tag):
    """Filter for find_all: accept tags that define ``class`` but do not define ``id``."""
    if not tag.has_attr('class'):
        return False
    return not tag.has_attr('id')

In [76]:
soup.find_all(has_class_but_no_id)

[<p class="title"><b>The Dormouse's story</b></p>,
 <p class="story">Once upon a time there were three little sisters; and their names were
 <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
 and they lived at the bottom of a well.</p>,
 <p class="story">...</p>]

In [77]:
from bs4 import NavigableString
def surrounded_by_strings(tag):
    """Filter for find_all: accept tags whose next and previous parsed elements are both NavigableString objects."""
    if not isinstance(tag.next_element, NavigableString):
        return False
    return isinstance(tag.previous_element, NavigableString)

for tag in soup.find_all(surrounded_by_strings):
    print(tag.name)

body
p
a
a
a
p


In [78]:
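#find_all( name , attrs , recursive , string , limit , **kwargs )
#recursive : when False, only the direct children of the tag are searched (by default all descendants are searched)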

In [79]:
soup.find_all("title")

[<title>The Dormouse's story</title>]

In [80]:
soup.find_all(id='link2')

[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]

In [81]:
soup.find_all(href=re.compile("elsie"))

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]

In [82]:
def not_lacie(href):
    """Attribute filter: truthy for a non-empty href that does not contain "lacie".

    A missing or empty href is handed back unchanged (a falsy value), so such
    tags are rejected.
    """
    if not href:
        return href
    return re.search("lacie", href) is None

soup.find_all(href=not_lacie)

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [83]:
soup.find_all(class_=re.compile("itl"))

[<p class="title"><b>The Dormouse's story</b></p>]

In [84]:
soup.find_all(href=re.compile("elsie"), id='link1')

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]

In [85]:
data_soup = BeautifulSoup('<div data-foo="value">foo!</div>', 'html.parser')

In [86]:
# A hyphen is not allowed in a Python keyword-argument name, so the call
# data_soup.find_all(data-foo="value") cannot even be parsed. The invalid source is
# compiled from a string so the SyntaxError can be caught and displayed instead of
# aborting "Restart & Run All" at this cell.
invalid_call = 'data_soup.find_all(data-foo="value")'
try:
    compile(invalid_call, "<data-foo demo>", "eval")
except SyntaxError as exc:
    data_foo_error = f"SyntaxError: {exc.msg}"
else:
    data_foo_error = None
print(data_foo_error)

SyntaxError: expression cannot contain assignment, perhaps you meant "=="? (<ipython-input-86-34fc3cde6712>, line 1)

In [87]:
data_soup.find_all(attrs={"data-foo": "value"}) # HTML data-* attribute names are not valid Python keyword names, so they are passed through the attrs dict instead

[<div data-foo="value">foo!</div>]

In [88]:
name_soup = BeautifulSoup('<input name="email"/>', 'html.parser')
name_soup.find_all(name="email") # the name attribute cannot be searched this way either: find_all's own `name` argument is the tag name, so nothing is found

[]

In [89]:
name_soup.find_all(attrs={"name": "email"})

[<input name="email"/>]

In [90]:
soup.find_all("a", class_="sister") # class is a reserved word in Python, so the class attribute is searched with class_

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [91]:
def has_six_characters(css_class):
    """Attribute filter: True when a class value is present and exactly six characters long."""
    # `is` compares object identity while `==` compares values, so None is tested with `is`.
    if css_class is None:
        return False
    return len(css_class) == 6

soup.find_all(class_=has_six_characters)

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [92]:
import numpy as np
np.array([0.724, 0.724, 0.724]) == None

array([False, False, False])

In [93]:
np.array([0.724, 0.724, 0.724]) is None

False

In [94]:
soup.find_all(string=re.compile("Dormouse")) # search the text strings instead of the tags

["The Dormouse's story", "The Dormouse's story"]

In [95]:
def is_the_only_string_within_a_tag(s):
    """String filter: True when ``s`` equals the ``.string`` of its parent tag."""
    sole_string = s.parent.string
    return s == sole_string

soup.find_all(string=is_the_only_string_within_a_tag)

["The Dormouse's story",
 "The Dormouse's story",
 'Elsie',
 'Lacie',
 'Tillie',
 '...']

In [96]:
soup.find_all("a", limit=2) # stop after the first 2 matches

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]

In [97]:
soup.html.find_all("title", recursive=False) # only the direct children of <html> are searched, so the nested <title> is not found

[]

In [98]:
soup.html.find_all("title")

[<title>The Dormouse's story</title>]

In [99]:
soup.find_all("a")
soup("a")

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [100]:
soup.title.find_all(string=True)
soup.title(string=True)

["The Dormouse's story"]

In [101]:
# find_all returns a list of matches; find returns only the first match itself (not wrapped in a list)

In [102]:
soup.find_all('title', limit=1)

[<title>The Dormouse's story</title>]

In [103]:
soup.find('title')

<title>The Dormouse's story</title>

In [104]:
soup.css.select("title")

[<title>The Dormouse's story</title>]

In [105]:
soup.css.select("p:nth-of-type(3)") # E:nth-of-type(n): the n-th element of type E among its siblings

[<p class="story">...</p>]

In [106]:
soup.css.select("#link1")

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]

In [107]:
soup.css.select("a#link2")

[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]

In [108]:
soup.css.select("body a")

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [109]:
soup.css.select("head > title") # ">" is the child combinator: <title> directly under <head>

[<title>The Dormouse's story</title>]

In [110]:
soup.css.select("html head title") # space-separated selectors: <title> anywhere beneath <head>, itself beneath <html>

[<title>The Dormouse's story</title>]

In [111]:
soup.css.select("#link1 ~ .sister") # "~": every later sibling of #link1 that matches .sister

[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [112]:
soup.css.select("#link1 + .sister")# "+": only the sibling immediately after #link1, if it matches .sister

[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]

In [113]:
soup.css.select(".sister")

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [114]:
soup.css.select("[class~=sister]")

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [115]:
soup.css.select("#link1,#link2")

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]

In [116]:
soup.css.select('a[href]')

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [117]:
soup.select('a[href$="tillie"]') # soup.select(...) is a convenience shortcut for soup.css.select(...); [href$=...] matches an href ending with the given text

[<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [118]:
select_Test = [tag['id'] for tag in soup.css.iselect(".sister")]

In [119]:
type(select_Test)

list

In [120]:
select_Test = [tag['id'] for tag in soup.css.select(".sister")]

In [121]:
type(select_Test)

list

In [122]:
soup.css.escape("1-strange-identifier") # escapes characters that are not valid in a CSS identifier, so the string can be used inside a selector

'\\31 -strange-identifier'

In [123]:
xml = """<tag xmlns:ns1="http://namespace1/" xmlns:ns2="http://namespace2/">
 <ns1:child>I'm in namespace 1</ns1:child>
 <ns2:child>I'm in namespace 2</ns2:child>
</tag> """
namespace_soup = BeautifulSoup(xml, "xml")

namespace_soup.css.select("child")

[<ns1:child>I'm in namespace 1</ns1:child>,
 <ns2:child>I'm in namespace 2</ns2:child>]

In [124]:
print(namespace_soup.prettify())

<?xml version="1.0" encoding="utf-8"?>
<tag xmlns:ns1="http://namespace1/" xmlns:ns2="http://namespace2/">
 <ns1:child>
  I'm in namespace 1
 </ns1:child>
 <ns2:child>
  I'm in namespace 2
 </ns2:child>
</tag>



In [125]:
namespaces = dict(first="http://namespace1/", second="http://namespace2/")
namespace_soup.css.select("second|child", namespaces=namespaces)

[<ns2:child>I'm in namespace 2</ns2:child>]