================================================================================
## Searching for tags by attributes, working with lists of tags, and navigating parse trees
================================================================================

In [1]:
import requests

url = 'http://www.pythonscraping.com/pages/warandpeace.html'

# Download the sample page. Each failure category is reported on its own line so the
# cause is visible in the cell output. Note that when the request itself fails,
# `r` is not (re)assigned by this cell.
try:
    # requests does not time out by default; without an explicit timeout an
    # unresponsive server would block this cell indefinitely.
    r = requests.get(url, timeout=10)
    r.raise_for_status()  # turn 4xx/5xx responses into an HTTPError
    #print(r.text) #to make it readable
except requests.exceptions.HTTPError as errh:
    print("Http Error:", errh)
except requests.exceptions.ConnectionError as errc:
    print("Error Connecting:", errc)
except requests.exceptions.Timeout as errt:
    print("Timeout Error:", errt)
except requests.exceptions.RequestException as err:
    # Base class of all requests exceptions: anything not matched above.
    print("OOps: Something Else", err)


In [2]:
from bs4 import BeautifulSoup

# Keep the raw markup in `html`, then build the parse tree from it using
# Python's built-in HTML parser.
html = r.text
soup = BeautifulSoup(markup=html, features='html.parser')
#print(soup.find_all('span', {'class':'green'}))

================================================================================
### Use the <span style="color:red;font-family:courier">find_all</span> function to extract a Python list of proper nouns by selecting only the text within `<span class="green"></span>` tags

```
soup.find_all(tagName, tagAttributes)
```
================================================================================

In [3]:
# Collect every <span> element carrying the class 'green', then print only the
# text of each one (get_text() drops the surrounding markup).
nameList = soup.find_all(name='span', attrs={'class': 'green'})
for tag in nameList:
    print(tag.get_text())

Anna
Pavlovna Scherer
Empress Marya
Fedorovna
Prince Vasili Kuragin
Anna Pavlovna
St. Petersburg
the prince
Anna Pavlovna
Anna Pavlovna
the prince
the prince
the prince
Prince Vasili
Anna Pavlovna
Anna Pavlovna
the prince
Wintzingerode
King of Prussia
le Vicomte de Mortemart
Montmorencys
Rohans
Abbe Morio
the Emperor
the prince
Prince Vasili
Dowager Empress Marya Fedorovna
the baron
Anna Pavlovna
the Empress
the Empress
Anna Pavlovna's
Her Majesty
Baron
Funke
The prince
Anna
Pavlovna
the Empress
The prince
Anatole
the prince
The prince
Anna
Pavlovna
Anna Pavlovna


================================================================================
<h3><span style="font-family:courier"> Another way to get the same results: use the <span style="color:red;font-family:courier">.text</span> attribute, which is shorthand for the <span style="color:red;font-family:courier">get_text()</span> method </span></h3>

```
nameList = soup.findAll('span', {'class':'green'})
for name in nameList:
    print(name.text)
```

<h4><span style="font-family:courier">When to get_text() and When to Preserve Tags: .get_text() strips all tags from the document you are working with and returns a Unicode string containing the text only. For example, if you are working with a large block of text that contains many hyperlinks, paragraphs, and other tags, all those will be stripped away, and you’ll be left with a tagless block of text.
Keep in mind that it’s much easier to find what you’re looking for in a BeautifulSoup object than in a block of text. Calling .get_text() should always be the last thing you do, immediately before you print, store, or manipulate your final data. In general, you should try to preserve the tag structure of a document as long as possible.</span></h4>

================================================================================

In [4]:
# find_all() accepts a list of tag names, e.g. every header level at once:
#     .find_all(['h1','h2','h3','h4','h5','h6'])
# An attribute value may likewise be a collection of alternatives:
#     .find_all('span', {'class':{'green', 'red'}})
#
# Attributes can also be given as keyword arguments; `class_` carries a trailing
# underscore because `class` is a reserved word in Python:
#     title = soup.find_all(id='title', class_='text')
# The following two calls are identical:
#     soup.find_all(id='text')
#     soup.find_all('', {'id':'text'})


# Build the names 'h1' .. 'h6' and fetch all matching header tags in one call.
all_header = soup.find_all([f'h{level}' for level in range(1, 7)])
print(all_header)

[<h1>War and Peace</h1>, <h2>Chapter 1</h2>]


In [5]:
# Search by text content instead of by tag name.
# `string=` is the current name of this filter; the older `text=` spelling is
# deprecated in Beautiful Soup.
# find() returns only the first match (or None); the find_all() variant would
# return every match:
#spec_text = soup.find_all(string='the prince')
spec_text = soup.find(string='the prince')
print(spec_text)

the prince


In [9]:
# Dealing with children and other descendants

url = 'http://www.pythonscraping.com/pages/page3.html'

# requests does not time out by default, so set one explicitly; raise_for_status()
# stops the cell on a 4xx/5xx response instead of silently parsing an error page.
r = requests.get(url, timeout=10)
r.raise_for_status()

html = r.text

bs = BeautifulSoup(html, 'html.parser')

# find() returns the first matching element as a Tag object (None if nothing matches):
# here, the <table> whose id is 'giftList'.
table_tag = bs.find("table", {"id": "giftList"})


# find_all() called on a tag searches inside that tag and returns a list-like result.
# If recursive is True (the default), it looks into children, and children's children,
# for tags that match the parameters.
# If it is False, it looks only at the direct children of the tag.
# (find_all is the current name of the legacy findChildren alias.)
children = table_tag.find_all('tr', recursive=True)  # every <tr> inside the table

for child in children:
    print(child)       # the row with its markup
    print(child.text)  # the row's text only


<tr><th>
Item Title
</th><th>
Description
</th><th>
Cost
</th><th>
Image
</th></tr>

Item Title

Description

Cost

Image

<tr class="gift" id="gift1"><td>
Vegetable Basket
</td><td>
This vegetable basket is the perfect gift for your health conscious (or overweight) friends!
<span class="excitingNote">Now with super-colorful bell peppers!</span>
</td><td>
$15.00
</td><td>
<img src="../img/gifts/img1.jpg"/>
</td></tr>

Vegetable Basket

This vegetable basket is the perfect gift for your health conscious (or overweight) friends!
Now with super-colorful bell peppers!

$15.00



<tr class="gift" id="gift2"><td>
Russian Nesting Dolls
</td><td>
Hand-painted by trained monkeys, these exquisite dolls are priceless! And by "priceless," we mean "extremely expensive"! <span class="excitingNote">8 entire dolls per set! Octuple the presents!</span>
</td><td>
$10,000.52
</td><td>
<img src="../img/gifts/img2.jpg"/>
</td></tr>

Russian Nesting Dolls

Hand-painted by trained monkeys, these exquisite do

In [7]:
# Dealing with children and other descendants

url = 'http://www.pythonscraping.com/pages/page3.html'

# requests does not time out by default, so set one explicitly; raise_for_status()
# stops the cell on a 4xx/5xx response instead of silently parsing an error page.
r = requests.get(url, timeout=10)
r.raise_for_status()

html = r.text

bs = BeautifulSoup(html, 'html.parser')

# find_all() returns a list-like result of every matching element, so `table_tag`
# here is a collection of <table> tags with id 'giftList', not a single tag.
table_tag = bs.find_all("table", {"id": "giftList"})

# Take the first matching table and collect every <tr> inside it.
# (find_all is the current name of the legacy findChildren alias.)
children = table_tag[0].find_all('tr', recursive=True)
#print(children)

for child in children:
    print(child)       # the row with its markup
    print(child.text)  # the row's text only

<tr><th>
Item Title
</th><th>
Description
</th><th>
Cost
</th><th>
Image
</th></tr>

Item Title

Description

Cost

Image

<tr class="gift" id="gift1"><td>
Vegetable Basket
</td><td>
This vegetable basket is the perfect gift for your health conscious (or overweight) friends!
<span class="excitingNote">Now with super-colorful bell peppers!</span>
</td><td>
$15.00
</td><td>
<img src="../img/gifts/img1.jpg"/>
</td></tr>

Vegetable Basket

This vegetable basket is the perfect gift for your health conscious (or overweight) friends!
Now with super-colorful bell peppers!

$15.00



<tr class="gift" id="gift2"><td>
Russian Nesting Dolls
</td><td>
Hand-painted by trained monkeys, these exquisite dolls are priceless! And by "priceless," we mean "extremely expensive"! <span class="excitingNote">8 entire dolls per set! Octuple the presents!</span>
</td><td>
$10,000.52
</td><td>
<img src="../img/gifts/img2.jpg"/>
</td></tr>

Russian Nesting Dolls

Hand-painted by trained monkeys, these exquisite do

In [46]:
# Dealing with siblings


url = 'http://www.pythonscraping.com/pages/page3.html'

# requests does not time out by default, so set one explicitly; raise_for_status()
# stops the cell on a 4xx/5xx response instead of silently parsing an error page.
r = requests.get(url, timeout=10)
r.raise_for_status()

html = r.text

bs = BeautifulSoup(html, 'html.parser')

# find() returns the first <table> whose id is 'giftList' as a single Tag object.
table_tag = bs.find("table", {"id": "giftList"})

# table_tag.tr is the first <tr> in the table (the header row of this page).
# next_siblings iterates over everything that follows it at the same level of the
# tree, so the header row itself is skipped. The text nodes between the rows are
# siblings too, which is why blank lines appear in the output.
# NOTE: despite its name, `children` holds the siblings of the first row.
children = table_tag.tr.next_siblings

for child in children:
    print(child)




<tr class="gift" id="gift1"><td>
Vegetable Basket
</td><td>
This vegetable basket is the perfect gift for your health conscious (or overweight) friends!
<span class="excitingNote">Now with super-colorful bell peppers!</span>
</td><td>
$15.00
</td><td>
<img src="../img/gifts/img1.jpg"/>
</td></tr>


<tr class="gift" id="gift2"><td>
Russian Nesting Dolls
</td><td>
Hand-painted by trained monkeys, these exquisite dolls are priceless! And by "priceless," we mean "extremely expensive"! <span class="excitingNote">8 entire dolls per set! Octuple the presents!</span>
</td><td>
$10,000.52
</td><td>
<img src="../img/gifts/img2.jpg"/>
</td></tr>


<tr class="gift" id="gift3"><td>
Fish Painting
</td><td>
If something seems fishy about this painting, it's because it's a fish! <span class="excitingNote">Also hand-painted by trained monkeys!</span>
</td><td>
$10,005.00
</td><td>
<img src="../img/gifts/img3.jpg"/>
</td></tr>


<tr class="gift" id="gift4"><td>
Dead Parrot
</td><td>
This is an ex-parr