# Regex

In [0]:
import re

```
[amk] -> a,m, or k
[^amk] -> not a, m, or k
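{n} -> match exactly n repetitions of the preceding element
{n,m} -> greedy match of n to m repetitions (no space after the comma, otherwise it is treated as literal text)
? -> match 0 or 1 repetition; placed after another quantifier it makes that quantifier non-greedy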
```
```
.* (greedy: matches as much as possible) vs .*? (non-greedy: matches as little as possible)
```
```
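re.I -> ignore case
re.L -> locale-aware matching (bytes patterns only)
re.M -> multi-line: ^ and $ also match at the start and end of every line
re.S -> make . match every character, including newline
re.U -> Unicode matching (already the default for str patterns in Python 3)
re.X -> verbose: whitespace and # comments inside the pattern are ignored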
```

``` python
re.match() # matches only at the beginning of the string; returns None if there is no match; use () to capture groups
```

In [0]:
# re.match anchors at the start of the string; every (...) in the pattern is a capture group.
string = 'Hello world'
result = re.match('[Hlh](e)l(l)o', string)
print(result)
# group(0) is the whole match; group(1) and group(2) are the two captured groups
for group_index in range(3):
    print(result.group(group_index))
# span() gives the (start, end) offsets of the whole match
print(result.span())

<_sre.SRE_Match object; span=(0, 5), match='Hello'>
Hello
e
l
(0, 5)


In [0]:
# Patterns are raw strings so that `\d` reaches the regex engine untouched
# (in a normal string literal `\d` is an invalid escape sequence).
string = 'Hello 1234567 world'
# Non-greedy `.*?` stops as early as it can, so (\d+) captures the whole number.
result1 = re.match(r'^H.*?(\d+).*d', string)
print(result1.group(1))
# Greedy `.*` consumes as much as it can, leaving only the last digit for (\d+).
result2 = re.match(r'^H.*(\d+).*d', string)
print(result2.group(1))

1234567
7


```python
re.search() # scans the whole string (no need to match from the beginning) but returns only the first match
```

In [0]:
string = 'Hello world'
# match() only tries position 0, search() scans the whole string for the first hit
result1, result2 = re.match('world', string), re.search('world', string)
print(result1, result2, sep='\n')

None
<_sre.SRE_Match object; span=(6, 11), match='world'>


```python
re.findall() # returns a list of all non-overlapping matches
```

In [0]:
string = 'hello Hello'
greeting = re.compile('[hH]ello')
# search() yields a single match object for the first occurrence ...
result1 = greeting.search(string)
print(result1)
# ... while findall() collects the text of every occurrence in a list
result2 = greeting.findall(string)
print(result2)

<_sre.SRE_Match object; span=(0, 5), match='hello'>
['hello', 'Hello']


```python
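re.sub() # replace every match of the pattern with the given replacement string
re.compile() # compile a pattern string into a reusable regex object; flags such as re.S can be passed as the second argument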
```

In [0]:
# re.sub accepts either a compiled pattern object or a plain pattern string.
# Both patterns are raw strings so that `\d` is not parsed as a string escape.
s1 = '12:00'
s2 = '01:00'
s3 = '11:00'

# 1) compiled pattern object
pattern = re.compile(r'\d{2}:\d{2}')
for timestamp in (s1, s2, s3):
    print(re.sub(pattern, 'time', timestamp))

# 2) plain pattern string
pattern = r'\d{2}:\d{2}'
for timestamp in (s1, s2, s3):
    print(re.sub(pattern, 'time', timestamp))

time
time
time


In [0]:
"""couldn't pass re.S in to compile"""
s1 = '''12:
00'''

pattern = re.compile(r'\d{2}:\d{2}',re.S)
print(re.search(pattern,s1))
pattern = '\d{2}:\d{2}'
print(re.findall(pattern,s1))

None
[]


# XPath

In [0]:
from lxml import etree

read data
```python
html = etree.HTML(text)
result = etree.tostring(html)
```
or
```python
html = etree.parse('./test.html', etree.HTMLParser())
```

In [0]:
# The last <li> is deliberately left unclosed to show how the parser handles it.
text = '''
<div>
    <ul>
         <li class="item-0"><a href="link1.html">first item</a></li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-inactive"><a href="link3.html">third item</a></li>
         <li class="item-1"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a>
     </ul>
 </div>
'''
# Parse the fragment into an element tree
html = etree.HTML(text)
# tostring() serialises the tree to bytes ...
result = etree.tostring(html)
# ... so turn it into str before printing
print(str(result, 'utf-8'))

<html><body><div>
    <ul>
         <li class="item-0"><a href="link1.html">first item</a></li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-inactive"><a href="link3.html">third item</a></li>
         <li class="item-1"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a>
     </li></ul>
 </div>
</body></html>


```
/ -> direct children
// -> all descendants (children, grandchildren, ...)
. -> current path
.. -> parent path
@ -> select attribute
```

In [0]:
# '//' searches the whole document, a leading '/' starts at the root only
result = html.xpath('//*')
print(result)
result = html.xpath('/*')
print(result)
result = html.xpath('//li')
print(result[0])
# '/a' takes direct children of each li, '//a' any descendant of ul
result = html.xpath('//li/a')
print(result)
result = html.xpath('//ul//a')
print(result)
# no <a> is a direct child of <ul>, so this list is empty
result = html.xpath('//ul/a')
print(result)
print("--------parent---------")
# '..' and 'parent::*' are two spellings of the same step
result = html.xpath('//a[@href="link4.html"]/../@class')
print(result)
result = html.xpath('//a[@href="link4.html"]/parent::*/@class')
print(result)
print("--------text---------")
# '/text()' returns only the li's own text nodes, '//text()' also descends into <a>
result = html.xpath('//li[@class="item-0"]/text()')
print(result)
result = html.xpath('//li[@class="item-0"]//text()')
print(result)

[<Element html at 0x7f49be582208>, <Element body at 0x7f49be518288>, <Element div at 0x7f49be5181c8>, <Element ul at 0x7f49be518dc8>, <Element li at 0x7f49be518e48>, <Element a at 0x7f49be518b88>, <Element li at 0x7f49be518ec8>, <Element a at 0x7f49be518f08>, <Element li at 0x7f49be518f48>, <Element a at 0x7f49be518e88>, <Element li at 0x7f49be518f88>, <Element a at 0x7f49be518fc8>, <Element li at 0x7f49be525048>, <Element a at 0x7f49be525088>]
[<Element html at 0x7f49be582208>]
<Element li at 0x7f49be518e48>
[<Element a at 0x7f49be518b88>, <Element a at 0x7f49be518f08>, <Element a at 0x7f49be518e88>, <Element a at 0x7f49be518fc8>, <Element a at 0x7f49be525088>]
[<Element a at 0x7f49be518b88>, <Element a at 0x7f49be518f08>, <Element a at 0x7f49be518e88>, <Element a at 0x7f49be518fc8>, <Element a at 0x7f49be525088>]
[]
--------parent---------
['item-1']
['item-1']
--------text---------
['\n     ']
['first item', 'fifth item', '\n     ']


**get attributes**

In [0]:
# The class attribute holds two values, so an exact @class="li" test would not match;
# contains() checks for a substring instead.
text = '''  
<li class="li li-first" name="item"><a href="link.html">first item</a></li>  
'''
html = etree.HTML(text)
result = html.xpath('//a[@href="link.html"]/../@class')
print(result)
result = html.xpath('//li[contains(@class, "li")]/a/text()')
print(result)
# several conditions can be combined with `and`
result = html.xpath('//li[contains(@class, "li") and @name="item"]/a/text()')
print(result)
result = html.xpath('//li[contains(@class, "li") and contains(@name, "item")]/a/text()')
print(result)

['li li-first']
['first item']
['first item']
['first item']


**position starts from 1 (not 0)!!!**
```
[number]
position()
last()
last()-2
```

In [0]:
text = '''
<div>
    <ul>
         <li class="item-0"><a href="link1.html">first item</a></li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-inactive"><a href="link3.html">third item</a></li>
         <li class="item-1"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a>
     </ul>
 </div>
'''
html = etree.HTML(text)
# XPath positions are 1-based: [1] is the first <li>
result = html.xpath('//li[1]/a/text()')
print(result)
# last() is the final position
result = html.xpath('//li[last()]/a/text()')
print(result)
# position() can be compared: the first two items
result = html.xpath('//li[position()<3]/a/text()')
print(result)
# arithmetic on last(): third from the end
result = html.xpath('//li[last()-2]/a/text()')
print(result)

['first item']
['fifth item']
['first item', 'second item']
['third item']


In [0]:
text = '''
<div>
    <ul>
         <li class="item-0"><a href="link1.html"><span>first item</span></a></li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-inactive"><a href="link3.html">third item</a></li>
         <li class="item-1"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a>
     </ul>
 </div>
'''
html = etree.HTML(text)

# ancestor:: walks up the tree, optionally filtered by tag name
result = html.xpath('//li[1]/ancestor::*')
print(result)
result = html.xpath('//li[1]/ancestor::div')
print(result)

print("------get all attributes of current node------")
result = html.xpath('//li[1]/attribute::*')
print(result)

# child:: looks at direct children, descendant:: at every nested level
result = html.xpath('//li[1]/child::a[@href="link1.html"]')
print(result)
result = html.xpath('//li[1]/descendant::span')
print(result)
print("------get the second one among all following nodes-------")
result = html.xpath('//li[1]/following::*[2]')
print(result)
print("------get the following siblings-------")
result = html.xpath('//li[1]/following-sibling::*')
print(result)

[<Element html at 0x7f49be417e88>, <Element body at 0x7f49bf79d188>, <Element div at 0x7f49be417d48>, <Element ul at 0x7f49be408e08>]
[<Element div at 0x7f49be417d48>]
------get all attributes of current node------
['item-0']
[<Element a at 0x7f49be417d48>]
[<Element span at 0x7f49be408e08>]
------get the second one among all following nodes-------
[<Element a at 0x7f49be417d48>]
------get the following siblings-------
[<Element li at 0x7f49bf79d188>, <Element li at 0x7f49be501c08>, <Element li at 0x7f49be501f08>, <Element li at 0x7f49be4375c8>]


# Beautiful Soup
**LXML Parser**

In [0]:
from bs4 import BeautifulSoup

In [0]:
# Sample document; note that the closing </body> and </html> tags are missing.
html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
soup = BeautifulSoup(html, 'lxml')
# Show the re-indented document first, then the text of its <title>
print(soup.prettify(), soup.title.string, sep='\n')

<html>
 <head>
  <title>
   The Dormouse's story
  </title>
 </head>
 <body>
  <p class="title" name="dromouse">
   <b>
    The Dormouse's story
   </b>
  </p>
  <p class="story">
   Once upon a time there were three little sisters; and their names were
   <a class="sister" href="http://example.com/elsie" id="link1">
    <!-- Elsie -->
   </a>
   ,
   <a class="sister" href="http://example.com/lacie" id="link2">
    Lacie
   </a>
   and
   <a class="sister" href="http://example.com/tillie" id="link3">
    Tillie
   </a>
   ;
and they lived at the bottom of a well.
  </p>
  <p class="story">
   ...
  </p>
 </body>
</html>
The Dormouse's story


In [0]:
# Accessing a tag name as an attribute of the soup returns a bs4 Tag object
title_tag = soup.title
print(title_tag)
print(type(title_tag))
print(title_tag.string)
print(soup.head)
# with several <p> tags in the document, only the first one is returned
print(soup.p)

<title>The Dormouse's story</title>
<class 'bs4.element.Tag'>
The Dormouse's story
<head><title>The Dormouse's story</title></head>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>


**a tag's attributes (`attrs`) are stored in a dictionary**

In [0]:
print(soup.title.name)
first_p = soup.p
print(first_p.attrs)
print(first_p.attrs['name'])
# indexing the tag directly is a shortcut for going through .attrs
print(first_p['name'])
# class comes back as a list
print(first_p['class'])

title
{'class': ['title'], 'name': 'dromouse'}
dromouse
dromouse
['title']


In [0]:
first_p = soup.p
# .string is the text of the tag, .contents is the list of its direct children
print(first_p.string, "-----vs-----", first_p.contents, sep='\n')

The Dormouse's story
-----vs-----
[<b>The Dormouse's story</b>]


In [0]:
# Tag access can be chained: the result of soup.head is itself a Tag
nested_title = soup.head.title
print(nested_title)
print(type(nested_title))
print("-----type-----")
print(nested_title.string)

<title>The Dormouse's story</title>
<class 'bs4.element.Tag'>
-----type-----
The Dormouse's story


In [0]:
first_p = soup.p
# .children iterates over direct children only
for i, child in enumerate(first_p.children):
    print(i, child)
print("-----child vs descendant-----")
# .descendants is a generator that also yields the nodes nested inside the children
all_nodes = first_p.descendants
print(all_nodes)
for i, child in enumerate(all_nodes):
    print(i, child)

0 <b>The Dormouse's story</b>
-----child vs descendant-----
<generator object descendants at 0x7f34cb146728>
0 <b>The Dormouse's story</b>
1 The Dormouse's story


In [0]:
first_link = soup.a
print("-----parent-----")
# .parent is the single enclosing tag
print(first_link.parent)
print("------type------")
# .parents is a generator over every enclosing tag
print(type(first_link.parents))
print("-----parents-----")
print(list(enumerate(first_link.parents)))

-----parent-----
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1"><!-- Elsie --></a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
------type------
<class 'generator'>
-----parents-----
[(0, <p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1"><!-- Elsie --></a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>), (1, <body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://ex

In [0]:
first_link = soup.a
print(first_link)
print('Next Sibling', first_link.next_sibling)
# <a> sits inside <p>, so the text of <p> just before it counts as its previous sibling
previous_node = first_link.previous_sibling
print('Prev Sibling', previous_node)
print('Prev Sibling', previous_node.string)
print('Next Siblings', list(enumerate(first_link.next_siblings)))
print('Prev Siblings', list(enumerate(first_link.previous_siblings)))

<a class="sister" href="http://example.com/elsie" id="link1"><!-- Elsie --></a>
Next Sibling ,

Prev Sibling Once upon a time there were three little sisters; and their names were

Prev Sibling Once upon a time there were three little sisters; and their names were

Next Siblings [(0, ',\n'), (1, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>), (2, ' and\n'), (3, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>), (4, ';\nand they lived at the bottom of a well.')]
Prev Siblings [(0, 'Once upon a time there were three little sisters; and their names were\n')]


In [0]:
print('Parent:')
print(type(soup.a.parents))
# materialise the generator so it can be indexed; element 0 is the nearest parent
ancestors = list(soup.a.parents)
print(ancestors[0])
print(ancestors[0].attrs['class'])

Parent:
<class 'generator'>
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1"><!-- Elsie --></a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
['story']


```python
find_all(name, attrs, recursive, string, limit, **kwargs)  # `string` was called `text` in older bs4 versions
```

In [0]:
# find_all returns a list; each element is a bs4 Tag
links = soup.find_all(name='a')
print(links)
print(type(links[0]))

[<a class="sister" href="http://example.com/elsie" id="link1"><!-- Elsie --></a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
<class 'bs4.element.Tag'>


In [0]:
# find_all can be called again on each result to search only inside that tag
for p in soup.find_all(name='p'):
    anchors = p.find_all(name='a')
    print(anchors)
    for a in anchors:
        print(a.string)
print("-----use attribute dictionary-----")
story_filter = {'class': 'story'}
sister_filter = {'class': 'sister'}
for p in soup.find_all(attrs=story_filter):
    sisters = p.find_all(attrs=sister_filter)
    print(sisters)
    for a in sisters:
        print(a.string)

[]
[<a class="sister" href="http://example.com/elsie" id="link1"><!-- Elsie --></a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
 Elsie 
Lacie
Tillie
[]
-----use attribute dictionary-----
[<a class="sister" href="http://example.com/elsie" id="link1"><!-- Elsie --></a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
 Elsie 
Lacie
Tillie
[]


**for some frequently used attributes like id and class**

In [0]:
# id and class have keyword shortcuts; the keyword is spelled `class_`
# because `class` is reserved in Python
by_id = soup.find_all(id='link1')
by_class = soup.find_all(class_='sister')
print(by_id)
print(by_class)

[<a class="sister" href="http://example.com/elsie" id="link1"><!-- Elsie --></a>]
[<a class="sister" href="http://example.com/elsie" id="link1"><!-- Elsie --></a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]


**find() returns only the first match, find_all() returns all matches**

In [0]:
import re  # also imported at the top of the notebook; repeated so this cell runs on its own
# `string=` filters on text nodes; it is the current name of the legacy `text=` argument.
# A compiled regex matches any text node that contains the pattern.
print(soup.find_all(string=re.compile('sisters')))

['Once upon a time there were three little sisters; and their names were\n']


# CSS selector along with bs4

In [0]:
from bs4 import BeautifulSoup

# Fixture for the CSS-selector cells: a panel holding two <ul> lists.
html = '''
<div class="panel">
    <div class="panel-heading">
        <h4>Hello</h4>
    </div>
    <div class="panel-body">
        <ul class="list" id="list-1">
            <li class="element">Foo</li>
            <li class="element">Bar</li>
            <li class="element">Jay</li>
        </ul>
        <ul class="list list-small" id="list-2">
            <li class="element">Foo</li>
            <li class="element">Bar</li>
        </ul>
    </div>
</div>
'''
soup = BeautifulSoup(html, 'lxml')

```
. -> class
# -> id
or only tag name
```

In [0]:
# (heading, CSS selector) pairs; each one is printed and followed by a blank line
demos = (
    ("-----class-----", '.panel .panel-heading'),
    ("------tag-------", 'ul li'),
    ("----id and class----", '#list-2 .element'),
)
for heading, selector in demos:
    print(heading)
    print(soup.select(selector))
    print()
print("----type of element----")
# select() returns a list of ordinary bs4 Tag objects
print(type(soup.select('ul')[0]))

-----class-----
[<div class="panel-heading">
<h4>Hello</h4>
</div>]

------tag-------
[<li class="element">Foo</li>, <li class="element">Bar</li>, <li class="element">Jay</li>, <li class="element">Foo</li>, <li class="element">Bar</li>]

----id and class----
[<li class="element">Foo</li>, <li class="element">Bar</li>]

----type of element----
<class 'bs4.element.Tag'>


In [0]:
all_items = soup.select('li')
print(all_items)
print("-----loop-----")
# select() also works on a single tag and then searches only inside it
for ul in soup.select('ul'):
    nested_items = ul.select('li')
    print(nested_items)

[<li class="element">Foo</li>, <li class="element">Bar</li>, <li class="element">Jay</li>, <li class="element">Foo</li>, <li class="element">Bar</li>]
-----loop-----
[<li class="element">Foo</li>, <li class="element">Bar</li>, <li class="element">Jay</li>]
[<li class="element">Foo</li>, <li class="element">Bar</li>]


In [0]:
print("attributes of a tag")
for ul in soup.select('ul'):
    # direct indexing and .attrs give the same value
    print(ul['id'], ul.attrs['id'], sep='\n')

attributes of a tag
list-1
list-1
list-2
list-2


In [0]:
print("Two ways to get text")
for li in soup.select('li'):
    via_method, via_attribute = li.get_text(), li.string
    print('Get Text:', via_method)
    print('String:', via_attribute)

Two ways to get text
Get Text: Foo
String: Foo
Get Text: Bar
String: Bar
Get Text: Jay
String: Jay
Get Text: Foo
String: Foo
Get Text: Bar
String: Bar


# PyQuery

In [0]:
# !pip install pyquery
from pyquery import PyQuery as pq                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       

**obtain a pq object**

In [8]:
# Ways to build a PyQuery document: from a string, from a URL,
# from text fetched with requests, or from a file.
print("-----text-----")
html = '''
<div>
    <ul>
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>
 </div>
'''
doc = pq(html)
print(doc('li'))

print("-----url-----")
# needs network access
doc = pq(url='http://cuiqingcai.com')
print(doc('title'))

print("-----requests-----")
import requests
# Always pass a timeout: without one requests waits indefinitely on an unresponsive server
doc = pq(requests.get('http://cuiqingcai.com', timeout=10).text)
print(doc('title'))

print("-----filename-----")
# doc = pq(filename='demo.html')


-----text-----
<li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     
-----url-----
<title>静觅丨崔庆才的个人博客</title>&#13;

-----requests-----
<title>静觅丨崔庆才的个人博客</title>&#13;

-----filename-----


In [0]:
# Fixture for the PyQuery cells that follow: a #container div wrapping a .list of five <li> items.
html = '''
<div id="container">
    <ul class="list">
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>
 </div>
'''
# Parse the markup once; `doc` is then queried with CSS selectors
doc = pq(html)

**Based on CSS selector**

In [10]:
# Calling the document with a CSS selector returns another PyQuery object
list_items = doc('#container .list li')
print(list_items)
print(type(list_items))

<li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     
<class 'pyquery.pyquery.PyQuery'>


In [11]:
items = doc('.list')
# find() looks for matching elements inside the current selection
lis = items.find('li')
for selection in (items, lis):
    print(type(selection))
    print(selection)

<class 'pyquery.pyquery.PyQuery'>
<ul class="list">
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>
 
<class 'pyquery.pyquery.PyQuery'>
<li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     


In [12]:
# children() without arguments returns every child of the selection
lis = items.children()
print(lis)
print("-----specific children-----")
# a CSS selector narrows the children down
lis = items.children('.active')
print(lis)

<li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     
-----specific children-----
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         


In [13]:
items = doc('.list')
# parent() and parents() both return PyQuery objects
container = items.parent()
print(type(container))
print(container)
print("-----paren† vs parents-----")
parents = items.parents()
print(type(parents))
print(parents)

<class 'pyquery.pyquery.PyQuery'>
<div id="container">
    <ul class="list">
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>
 </div>
-----paren† vs parents-----
<class 'pyquery.pyquery.PyQuery'>
<div id="container">
    <ul class="list">
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>
 </div>


In [14]:
# '.item-0.active' (no space) selects an element that carries both classes
li = doc('.list .item-0.active')
print(li.siblings())
print("-----specific sibling-----")
# siblings() accepts a CSS selector as a filter
print(li.siblings('.active'))

<li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0">first item</li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     
-----specific sibling-----
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
         


In [15]:
# items() returns a generator; each element it yields is itself a PyQuery object
lis = doc('li').items()
print(type(lis))
for li in lis:
    li_type = type(li)
    print(li, li_type)

<class 'generator'>
<li class="item-0">first item</li>
          <class 'pyquery.pyquery.PyQuery'>
<li class="item-1"><a href="link2.html">second item</a></li>
          <class 'pyquery.pyquery.PyQuery'>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
          <class 'pyquery.pyquery.PyQuery'>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
          <class 'pyquery.pyquery.PyQuery'>
<li class="item-0"><a href="link5.html">fifth item</a></li>
      <class 'pyquery.pyquery.PyQuery'>


In [16]:
a = doc('.item-0.active a')
print(a, type(a))
# attr can be called with the attribute name or used with attribute-style access
href_by_call, href_by_attribute = a.attr('href'), a.attr.href
print(href_by_call)
print(href_by_attribute)


<a href="link3.html"><span class="bold">third item</span></a> <class 'pyquery.pyquery.PyQuery'>
link3.html
link3.html


In [17]:
a = doc('a')
print(a, type(a))
# with several matched elements, attr only reports the value of the first one
href_by_call, href_by_attribute = a.attr('href'), a.attr.href
print(href_by_call)
print(href_by_attribute)

<a href="link2.html">second item</a><a href="link3.html"><span class="bold">third item</span></a><a href="link4.html">fourth item</a><a href="link5.html">fifth item</a> <class 'pyquery.pyquery.PyQuery'>
link2.html
link2.html


In [18]:
a = doc('a')
# iterate with items() to read the attribute of every matched element
for item in a.items():
    href = item.attr('href')
    print(href)

link2.html
link3.html
link4.html
link5.html


In [19]:
a = doc('.item-0.active a')
print(a)
# text() strips the markup and returns a plain str
link_text = a.text()
print(link_text)
print(type(link_text))

<a href="link3.html"><span class="bold">third item</span></a>
third item
<class 'str'>


In [20]:
li = doc('.item-0.active')
# printing the selection shows the element itself; html() returns only its inner markup
print(li, li.html(), sep='\n')

<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         
<a href="link3.html"><span class="bold">third item</span></a>


In [21]:
# removeClass/addClass modify the selected element in place:
# the printed markup changes even though `li` is never reassigned.
li = doc('.item-0.active')
print(li)
li.removeClass('active')  # drop the 'active' class
print(li)
li.addClass('active')  # put it back
print(li)

<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         
<li class="item-0"><a href="link3.html"><span class="bold">third item</span></a></li>
         
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         


In [22]:
# Re-parse the fixture, then change the selected <li> step by step and print it after each change.
doc = pq(html)
li = doc('.item-0.active')
print(li)
li.attr('name', 'link')  # two arguments: set the attribute
print(li)
li.text('changed item')  # with an argument: replace the content with plain text
print(li)
li.html('<span>changed item</span>')  # with an argument: replace the inner markup
print(li)

<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         
<li class="item-0 active" name="link"><a href="link3.html"><span class="bold">third item</span></a></li>
         
<li class="item-0 active" name="link">changed item</li>
         
<li class="item-0 active" name="link"><span>changed item</span></li>
         


**remove**

In [0]:
# Print the text of .wrap before and after removing its <p> child.
html = '''
<div class="wrap">
    Hello, World
    <p>This is a paragraph.</p>
 </div>
'''
doc = pq(html)
wrap = doc('.wrap')
print(wrap.text())
# find the nested <p> and remove it from the document
wrap.find('p').remove()
print(wrap.text())

**CSS: other function**

In [0]:
html = '''
<div class="wrap">
    <div id="container">
        <ul class="list">
             <li class="item-0">first item</li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
         </ul>
     </div>
 </div>
'''
doc = pq(html)
# Pseudo-class selectors, tried one after another against the same document
pseudo_selectors = (
    'li:first-child',
    'li:last-child',
    'li:nth-child(2)',
    'li:gt(2)',
    'li:nth-child(2n)',
    'li:contains(second)',
)
for selector in pseudo_selectors:
    li = doc(selector)
    print(li)