In [1]:
import requests
from bs4 import BeautifulSoup

In [2]:
#!pip install requests
#!pip install bs4

In [4]:
url = 'http://books.toscrape.com/'

request = requests.get(url)

In [5]:
request

<Response [200]>

In [6]:
request.status_code

200

In [7]:
request.content



In [8]:
tree = BeautifulSoup(request.content)

In [17]:
tree.head.title.text.strip()

'All products | Books to Scrape - Sandbox'

In [19]:
tree.p

<p class="star-rating Three">
<i class="icon-star"></i>
<i class="icon-star"></i>
<i class="icon-star"></i>
<i class="icon-star"></i>
<i class="icon-star"></i>
</p>

In [50]:
dirty_prices = tree.find_all('p', {'class': 'price_color'})

clean_prices = []
for price in dirty_prices:
    clean_price = float(price.text.replace('£', ''))
    clean_prices.append(clean_price)

In [53]:
sum(clean_prices) / len(clean_prices)

38.048500000000004

## list comprehension

In [60]:
clean_prices = [float(price.text[1:]) for price in dirty_prices]

[51.77,
 53.74,
 50.1,
 47.82,
 54.23,
 22.65,
 33.34,
 17.93,
 22.6,
 52.15,
 13.99,
 20.66,
 17.46,
 52.29,
 35.02,
 57.25,
 23.88,
 37.59,
 51.33,
 45.17]

In [64]:
tree.h3.a.text

'A Light in the ...'

In [67]:
tree.h3.a.get('title')

'A Light in the Attic'

In [95]:
titles = [title.a.get('title') for title in tree.find_all('h3')]

In [98]:
clean_prices

[51.77,
 53.74,
 50.1,
 47.82,
 54.23,
 22.65,
 33.34,
 17.93,
 22.6,
 52.15,
 13.99,
 20.66,
 17.46,
 52.29,
 35.02,
 57.25,
 23.88,
 37.59,
 51.33,
 45.17]

In [99]:
titles

['A Light in the Attic',
 'Tipping the Velvet',
 'Soumission',
 'Sharp Objects',
 'Sapiens: A Brief History of Humankind',
 'The Requiem Red',
 'The Dirty Little Secrets of Getting Your Dream Job',
 'The Coming Woman: A Novel Based on the Life of the Infamous Feminist, Victoria Woodhull',
 'The Boys in the Boat: Nine Americans and Their Epic Quest for Gold at the 1936 Berlin Olympics',
 'The Black Maria',
 'Starving Hearts (Triangular Trade Trilogy, #1)',
 "Shakespeare's Sonnets",
 'Set Me Free',
 "Scott Pilgrim's Precious Little Life (Scott Pilgrim #1)",
 'Rip it Up and Start Again',
 'Our Band Could Be Your Life: Scenes from the American Indie Underground, 1981-1991',
 'Olio',
 'Mesaerion: The Best Science Fiction Stories 1800-1849',
 'Libertarianism for Beginners',
 "It's Only the Himalayas"]

In [101]:
import pandas as pd

In [102]:
d = {'price': clean_prices, 'title': titles}

pd.DataFrame(d)

Unnamed: 0,price,title
0,51.77,A Light in the Attic
1,53.74,Tipping the Velvet
2,50.1,Soumission
3,47.82,Sharp Objects
4,54.23,Sapiens: A Brief History of Humankind
5,22.65,The Requiem Red
6,33.34,The Dirty Little Secrets of Getting Your Dream...
7,17.93,The Coming Woman: A Novel Based on the Life of...
8,22.6,The Boys in the Boat: Nine Americans and Their...
9,52.15,The Black Maria


In [96]:
tree.find_all('h3')

[<h3><a href="catalogue/a-light-in-the-attic_1000/index.html" title="A Light in the Attic">A Light in the ...</a></h3>,
 <h3><a href="catalogue/tipping-the-velvet_999/index.html" title="Tipping the Velvet">Tipping the Velvet</a></h3>,
 <h3><a href="catalogue/soumission_998/index.html" title="Soumission">Soumission</a></h3>,
 <h3><a href="catalogue/sharp-objects_997/index.html" title="Sharp Objects">Sharp Objects</a></h3>,
 <h3><a href="catalogue/sapiens-a-brief-history-of-humankind_996/index.html" title="Sapiens: A Brief History of Humankind">Sapiens: A Brief History ...</a></h3>,
 <h3><a href="catalogue/the-requiem-red_995/index.html" title="The Requiem Red">The Requiem Red</a></h3>,
 <h3><a href="catalogue/the-dirty-little-secrets-of-getting-your-dream-job_994/index.html" title="The Dirty Little Secrets of Getting Your Dream Job">The Dirty Little Secrets ...</a></h3>,
 <h3><a href="catalogue/the-coming-woman-a-novel-based-on-the-life-of-the-infamous-feminist-victoria-woodhull_993/ind

## Все, что было сверху запихиваем в одну простую функцию

In [107]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [110]:
def parse(url):
    # забрали инфу с сайта в виде html дерева
    request = requests.get(url)
    # превратили это html дерево во что-то удобное
    tree = BeautifulSoup(request.content)
    
    # собрали цены из удобного html-дерева
    dirty_prices = tree.find_all('p', {'class': 'price_color'})
    clean_prices = [float(price.text[1:]) for price in dirty_prices]
    # собрали тайтлы
    titles = [title.a.get('title') for title in tree.find_all('h3')]
    # собрали ссылки на книги
    hrefs = [title.a.get('href') for title in tree.find_all('h3')]
    
    # создал словарик с инфой и превратил его в датафрейм
    d = {'price': clean_prices, 'title': titles, 'href': hrefs}

    return pd.DataFrame(d)

In [111]:
url = 'http://books.toscrape.com/'
df = parse(url)

In [112]:
df

Unnamed: 0,price,title,href
0,51.77,A Light in the Attic,catalogue/a-light-in-the-attic_1000/index.html
1,53.74,Tipping the Velvet,catalogue/tipping-the-velvet_999/index.html
2,50.1,Soumission,catalogue/soumission_998/index.html
3,47.82,Sharp Objects,catalogue/sharp-objects_997/index.html
4,54.23,Sapiens: A Brief History of Humankind,catalogue/sapiens-a-brief-history-of-humankind...
5,22.65,The Requiem Red,catalogue/the-requiem-red_995/index.html
6,33.34,The Dirty Little Secrets of Getting Your Dream...,catalogue/the-dirty-little-secrets-of-getting-...
7,17.93,The Coming Woman: A Novel Based on the Life of...,catalogue/the-coming-woman-a-novel-based-on-th...
8,22.6,The Boys in the Boat: Nine Americans and Their...,catalogue/the-boys-in-the-boat-nine-americans-...
9,52.15,The Black Maria,catalogue/the-black-maria_991/index.html


In [135]:
def parse_stocks(href):
    """
    это функция, которая парсит доступность конкретной книги в стоке 
    """
    request = requests.get(href)
    tree = BeautifulSoup(request.content)

    in_stock = tree.find('p', {'class': 'instock availability'}).text.strip()
    
    return in_stock

In [133]:
stocks = []
for href in df.href.values:
    itog_href = 'http://books.toscrape.com/' + href
    
    stock = parse_stocks(itog_href)
    
    stocks.append(stock)

In [138]:
df['in_stock'] = stocks

In [148]:
df

Unnamed: 0,price,title,href,in_stock
0,51.77,A Light in the Attic,catalogue/a-light-in-the-attic_1000/index.html,In stock (22 available)
1,53.74,Tipping the Velvet,catalogue/tipping-the-velvet_999/index.html,In stock (20 available)
2,50.1,Soumission,catalogue/soumission_998/index.html,In stock (20 available)
3,47.82,Sharp Objects,catalogue/sharp-objects_997/index.html,In stock (20 available)
4,54.23,Sapiens: A Brief History of Humankind,catalogue/sapiens-a-brief-history-of-humankind...,In stock (20 available)
5,22.65,The Requiem Red,catalogue/the-requiem-red_995/index.html,In stock (19 available)
6,33.34,The Dirty Little Secrets of Getting Your Dream...,catalogue/the-dirty-little-secrets-of-getting-...,In stock (19 available)
7,17.93,The Coming Woman: A Novel Based on the Life of...,catalogue/the-coming-woman-a-novel-based-on-th...,In stock (19 available)
8,22.6,The Boys in the Boat: Nine Americans and Their...,catalogue/the-boys-in-the-boat-nine-americans-...,In stock (19 available)
9,52.15,The Black Maria,catalogue/the-black-maria_991/index.html,In stock (19 available)


In [None]:
http://books.toscrape.com/catalogue/page-2.html

## Итоговая функция!

In [149]:
def parse(url):
    request = requests.get(url)
    tree = BeautifulSoup(request.content)
    return tree 

In [155]:
import time


def collect_books_info(url):
    
    book_tree = parse(url)
    
    # собрали цены из удобного html-дерева
    dirty_prices = book_tree.find_all('p', {'class': 'price_color'})
    clean_prices = [float(price.text[1:]) for price in dirty_prices]
    
    # собрали тайтлы
    titles = [title.a.get('title') for title in book_tree.find_all('h3')]
    
    # собрали ссылки на книги
    hrefs = [title.a.get('href') for title in book_tree.find_all('h3')]
    
    # собираем доступность книг, переходя на каждую книгу
    stocks = []
    for href in hrefs:
        print(href)
        itog_href = 'http://books.toscrape.com/' + href
        tree = parse(itog_href)
        stock = tree.find('p', {'class': 'instock availability'}).text.strip()
        stocks.append(stock)
        
        time.sleep(2)
        
    # создал словарик с инфой и превратил его в датафрейм
    d = {
        'price': clean_prices, 
        'title': titles, 
        'href': hrefs,
        'stock': stocks
    }

    return pd.DataFrame(d)

In [156]:
url = 'http://books.toscrape.com/'

df = collect_books_info(url)

catalogue/a-light-in-the-attic_1000/index.html
catalogue/tipping-the-velvet_999/index.html
catalogue/soumission_998/index.html
catalogue/sharp-objects_997/index.html
catalogue/sapiens-a-brief-history-of-humankind_996/index.html
catalogue/the-requiem-red_995/index.html
catalogue/the-dirty-little-secrets-of-getting-your-dream-job_994/index.html
catalogue/the-coming-woman-a-novel-based-on-the-life-of-the-infamous-feminist-victoria-woodhull_993/index.html
catalogue/the-boys-in-the-boat-nine-americans-and-their-epic-quest-for-gold-at-the-1936-berlin-olympics_992/index.html
catalogue/the-black-maria_991/index.html
catalogue/starving-hearts-triangular-trade-trilogy-1_990/index.html
catalogue/shakespeares-sonnets_989/index.html
catalogue/set-me-free_988/index.html
catalogue/scott-pilgrims-precious-little-life-scott-pilgrim-1_987/index.html
catalogue/rip-it-up-and-start-again_986/index.html
catalogue/our-band-could-be-your-life-scenes-from-the-american-indie-underground-1981-1991_985/index.html

In [157]:
for p in range(2, 3):
    url = f'https://books.toscrape.com/catalogue/page-{p}.html'
    df_new = collect_books_info(url)

in-her-wake_980/index.html


AttributeError: 'NoneType' object has no attribute 'text'

In [154]:
'http://books.toscrape.com/in-her-wake_980/index.html

'https://books.toscrape.com/catalogue/page-2.html'

<a href="catalogue/a-light-in-the-attic_1000/index.html" title="A Light in the Attic">A Light in the ...</a>

In [68]:
d = {'max': 6}

In [69]:
d['max']

6

In [70]:
d.get('max')

6

In [71]:
d['ann']

KeyError: 'ann'

In [74]:
d.get('ann')