# Challenge


> Extract the following elements from the 1st page of books.toscrape.com:
* full book title
* price as float
* rating as int
> Data should be stored as a Python list of dictionaries, where each book is a dictionary
> e.g.
{
'title': 'Mesaerion: The Best Science Fiction Stories 1800-1849',
'price': 37.59,
'rating': 1
}

In [119]:
import requests
from bs4 import BeautifulSoup

In [120]:
# Page to scrape: the landing page of books.toscrape.com named in the challenge statement.
url = "https://books.toscrape.com/"

In [121]:
%pip install lxml

Note: you may need to restart the kernel to use updated packages.


In [122]:
# Fetch the page. The timeout keeps a stalled connection from hanging the
# kernel, and raise_for_status() stops the notebook on an HTTP error instead
# of silently parsing an error page.
resp = requests.get(url, timeout=10)
resp.raise_for_status()

# Hand BeautifulSoup the raw bytes so it detects the page encoding itself, and
# name the parser explicitly (lxml is installed by the cell above) so the
# parse result does not depend on which parsers happen to be available.
soup = BeautifulSoup(resp.content, "lxml")

In [123]:
# Accumulator for the scraped books; each entry will be one dictionary.
result = []

In [124]:
def text_to_number(text):
    """Translate an English number word ("one" through "five") into an int.

    The lookup is case-insensitive. Any word outside the supported range
    yields the string "Invalid input" instead of raising.
    """
    # Pair each supported word with its value: 'one' -> 1, ..., 'five' -> 5
    values_by_word = dict(zip(('one', 'two', 'three', 'four', 'five'), range(1, 6)))

    key = text.lower()
    if key in values_by_word:
        return values_by_word[key]
    return "Invalid input"

In [125]:
# Every book on the page is rendered as an <article class="product_pod"> card.
products = soup.find_all("article", attrs={"class": "product_pod"})

# Start from an empty list inside this cell so that re-running it does not
# append a second copy of every book to an already-populated `result`.
result = []
for product in products:
    # The link text is truncated on the page; the full title is in the
    # link's `title` attribute.
    title = product.find("h3").find("a")['title']

    # Drop the leading currency symbol before converting to float.
    price_text = product.find("p", attrs={"class": "price_color"}).get_text()
    price = float(price_text[1:])

    # The rating is the last CSS class on the star-rating element
    # (an English number word), converted to an int by text_to_number.
    rating_word = product.find("p", attrs={"class": "star-rating"})['class'][-1]
    rating = text_to_number(rating_word)

    result.append({
        'title': title,
        'price': price,
        'rating': rating
    })

In [126]:
# Spot-check a single scraped record (index 10, i.e. the 11th entry).
result[10]

{'title': 'Starving Hearts (Triangular Trade Trilogy, #1)',
 'price': 13.99,
 'rating': 2}

In [127]:
# Grab only the first product card and read the full title from the
# `title` attribute of the link inside its <h3>.
product = soup.find("article", class_="product_pod")
product.h3.a["title"]

'A Light in the Attic'

In [128]:
import re

def clean_price(price):
    """Return the numeric part of a price string as a float.

    Every character other than the digits 0-9 and the dot is removed
    before conversion, so currency symbols and separators are ignored.
    """
    numeric_text = re.sub("[^0-9.]", "", price)
    return float(numeric_text)

# clean_price already discards every non-numeric character, so pass the raw
# text. Slicing off the first character beforehand would eat a real digit
# whenever the price has no leading currency symbol.
clean_price(product.find("p", attrs={"class": "price_color"}).get_text())

51.77

# An extra - Pandas

In [129]:
import random

In [130]:
# Shallow copy of the scraped records to work on in this section.
book_data = list(result)

In [131]:
# Print one randomly chosen book; no seed is set, so the pick can differ between runs.
print(random.choice(book_data))

{'title': 'Sharp Objects', 'price': 47.82, 'rating': 4}


In [132]:
# Sum of the prices of all books
sum(book['price'] for book in book_data)

760.97

In [133]:
# Average price: total of all prices divided by the number of books
sum(book['price'] for book in book_data) / len(book_data)

38.048500000000004

In [134]:
# Titles of the books priced below 20
[
    book['title']
    for book in book_data
    if book['price'] < 20
]

['The Coming Woman: A Novel Based on the Life of the Infamous Feminist, Victoria Woodhull',
 'Starving Hearts (Triangular Trade Trilogy, #1)',
 'Set Me Free']

In [136]:
%pip install pandas

Collecting pandas
  Downloading pandas-2.2.2-cp312-cp312-macosx_10_9_x86_64.whl.metadata (19 kB)
Collecting numpy>=1.26.0 (from pandas)
  Downloading numpy-2.1.1-cp312-cp312-macosx_14_0_x86_64.whl.metadata (60 kB)
Collecting pytz>=2020.1 (from pandas)
  Downloading pytz-2024.1-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2024.1-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pandas-2.2.2-cp312-cp312-macosx_10_9_x86_64.whl (12.5 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.5/12.5 MB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m31m11.6 MB/s[0m eta [36m0:00:01[0m
[?25hDownloading numpy-2.1.1-cp312-cp312-macosx_14_0_x86_64.whl (6.6 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.6/6.6 MB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m2 MB/s[0m eta [36m0:00:01[0m:01[0m
[?25hDownloading pytz-2024.1-py2.py3-none-any.whl (505 kB)
Downloading tzdata-2024.1-py2.py3-non

In [149]:
import pandas as pd

In [150]:
# Build a DataFrame from the list of book dictionaries; the dictionary keys
# (title, price, rating) become the columns.
df = pd.DataFrame(book_data)

In [151]:
# Display the DataFrame as the cell's output.
df

Unnamed: 0,title,price,rating
0,A Light in the Attic,51.77,3
1,Tipping the Velvet,53.74,1
2,Soumission,50.1,1
3,Sharp Objects,47.82,4
4,Sapiens: A Brief History of Humankind,54.23,5
5,The Requiem Red,22.65,1
6,The Dirty Little Secrets of Getting Your Dream...,33.34,4
7,The Coming Woman: A Novel Based on the Life of...,17.93,3
8,The Boys in the Boat: Nine Americans and Their...,22.6,4
9,The Black Maria,52.15,1


In [152]:
# Mean of the price column
df["price"].mean()

np.float64(38.048500000000004)

In [153]:
# Total of the price column
df["price"].sum()

np.float64(760.97)

In [154]:
# Rows whose price is below 20
df.loc[df["price"] < 20]

Unnamed: 0,title,price,rating
7,The Coming Woman: A Novel Based on the Life of...,17.93,3
10,"Starving Hearts (Triangular Trade Trilogy, #1)",13.99,2
12,Set Me Free,17.46,5


In [156]:
# Write the table to book_data.csv; index=False leaves the row index out of the file.
df.to_csv("book_data.csv", index=False)

In [161]:
# Write the table to book_data.json; orient="records" emits a list with one object per row.
df.to_json("book_data.json", orient="records")