In [25]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import re

from ftfy import fix_text
import unicodedata

In [None]:


BASE_URL = "http://books.toscrape.com/catalogue/page-{}.html"
BASE_SITE = "http://books.toscrape.com/catalogue/"

books = []

for page in range(1, 51):  # Loop through all 50 pages
    print(f"Scraping page {page}...")
    url = BASE_URL.format(page)
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    for book in soup.find_all("article", class_="product_pod"):
        # Basic info from listing page
        title = book.h3.a["title"]
        price = book.find("p", class_="price_color").text.strip()
        rating = book.p["class"][1]  
        
        # Book detail page link
        book_url = BASE_SITE + book.h3.a["href"].replace('../../../', '')
        book_resp = requests.get(book_url)
        book_soup = BeautifulSoup(book_resp.text, 'html.parser')

        # Category
        category = book_soup.find("ul", class_="breadcrumb").find_all("li")[2].text.strip()

        # Product description
        desc_tag = book_soup.find("div", id="product_description")
        description = desc_tag.find_next_sibling("p").text.strip() if desc_tag else ""

        # Table info (UPC, stock)
        table_rows = book_soup.find("table", class_="table table-striped").find_all("tr")
        upc = table_rows[0].find("td").text.strip()
        stock_text = table_rows[5].find("td").text.strip()
        
        # Extract stock number (e.g., "In stock (22 available)")
        stock_count = ''.join([c for c in stock_text if c.isdigit()])

        # Availability (from list page, cleaned)
        availability = book.find("p", class_="instock availability").text.strip()

        books.append({
            "Title": title,
            "Price": price,
            "Availability": availability,
            "Stock Count": stock_count,
            "Rating": rating,
            "Category": category,
            "UPC": upc,
            "Description": description
        })

        time.sleep(0.2)  # short delay for detail page requests

    time.sleep(1)  # delay for page requests

# Save to CSV
df = pd.DataFrame(books)
df.to_csv("../scraped_data/books_detailed_dataset.csv", index=False, encoding="utf-8")
print("Scraping complete! Data saved to books_detailed_dataset.csv")


Scraping page 1...
Scraping page 2...
Scraping page 3...
Scraping page 4...
Scraping page 5...
Scraping page 6...
Scraping page 7...
Scraping page 8...
Scraping page 9...
Scraping page 10...
Scraping page 11...
Scraping page 12...
Scraping page 13...
Scraping page 14...
Scraping page 15...
Scraping page 16...
Scraping page 17...
Scraping page 18...
Scraping page 19...
Scraping page 20...
Scraping page 21...
Scraping page 22...
Scraping page 23...
Scraping page 24...
Scraping page 25...
Scraping page 26...
Scraping page 27...
Scraping page 28...
Scraping page 29...
Scraping page 30...
Scraping page 31...
Scraping page 32...
Scraping page 33...
Scraping page 34...
Scraping page 35...
Scraping page 36...
Scraping page 37...
Scraping page 38...
Scraping page 39...
Scraping page 40...
Scraping page 41...
Scraping page 42...
Scraping page 43...
Scraping page 44...
Scraping page 45...
Scraping page 46...
Scraping page 47...
Scraping page 48...
Scraping page 49...
Scraping page 50...
Scraping 

In [14]:
df.head()

Unnamed: 0,Title,Price,Availability,Stock Count,Rating,Category,UPC,Description
0,A Light in the Attic,Â£51.77,In stock,22,Three,Poetry,a897fe39b1053632,It's hard to imagine a world without A Light i...
1,Tipping the Velvet,Â£53.74,In stock,20,One,Historical Fiction,90fa61229261140a,"""Erotic and absorbing...Written with starling ..."
2,Soumission,Â£50.10,In stock,20,One,Fiction,6957f44c3847a760,"Dans une France assez proche de la nÃ´tre, un ..."
3,Sharp Objects,Â£47.82,In stock,20,Four,Mystery,e00eb4fd7b871a48,"WICKED above her hipbone, GIRL across her hear..."
4,Sapiens: A Brief History of Humankind,Â£54.23,In stock,20,Five,History,4165285e1663650f,From a renowned historian comes a groundbreaki...


In [None]:
df.Price.head() 
# issue with the format of the price

0    Â£51.77
1    Â£53.74
2    Â£50.10
3    Â£47.82
4    Â£54.23
Name: Price, dtype: object

In [18]:
# fixing the price format
# Fix mojibake -> strip non-digits -> to float
df["PriceGBP"] = (
    df["Price"]
      .astype(str)
      .str.replace("\xa0", " ", regex=False)                  # kill NBSP
      .str.encode("latin1", "ignore").str.decode("utf-8", "ignore")  # Â£ -> £
      .str.replace(",", "", regex=False)                      # remove thousand sep if any
      .str.replace(r"[^\d.\-]", "", regex=True)               # drop £ and any other symbols
      .replace("", pd.NA)
      .astype(float)
)

In [19]:
df.PriceGBP.head()

0    51.77
1    53.74
2    50.10
3    47.82
4    54.23
Name: PriceGBP, dtype: float64

In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Title         1000 non-null   object 
 1   Price         1000 non-null   object 
 2   Availability  1000 non-null   object 
 3   Stock Count   1000 non-null   object 
 4   Rating        1000 non-null   object 
 5   Category      1000 non-null   object 
 6   UPC           1000 non-null   object 
 7   Description   1000 non-null   object 
 8   PriceGBP      1000 non-null   float64
dtypes: float64(1), object(8)
memory usage: 70.4+ KB


In [22]:
df.Description.head(20)

0     It's hard to imagine a world without A Light i...
1     "Erotic and absorbing...Written with starling ...
2     Dans une France assez proche de la nÃ´tre, un ...
3     WICKED above her hipbone, GIRL across her hear...
4     From a renowned historian comes a groundbreaki...
5     Patient Twenty-nine.A monster roams the halls ...
6     Drawing on his extensive experience evaluating...
7     "If you have a heart, if you have a soul, Kare...
8     For readers of Laura Hillenbrand's Seabiscuit ...
9     Praise for Aracelis Girmay:"[Girmay's] every l...
10    Since her assault, Miss Annette Chetwynd has b...
11    This book is an important and complete collect...
12    Aaron Ledbetterâs future had been planned ou...
13    Scott Pilgrim's life is totally sweet. He's 23...
14    Punk's raw power rejuvenated rock, but by the ...
15    This is the never-before-told story of the mus...
16    Part fact, part fiction, Tyehimba Jess's much ...
17    Andrew Barger, award-winning author and en

In [26]:
def basic_clean(s):
    if not isinstance(s, str):
        return ""
    s = fix_text(s)                         # fix encoding issues
    s = unicodedata.normalize("NFKC", s)    # normalize unicode (quotes, spaces, etc.)
    s = s.replace("\xa0", " ")              # convert non-breaking spaces to normal spaces
    s = re.sub(r"\s+", " ", s).strip()      # collapse multiple spaces
    return s

df["Description_clean"] = df["Description"].apply(basic_clean)
print(df["Description_clean"].head(20).to_list())



["It's hard to imagine a world without A Light in the Attic. This now-classic collection of poetry and drawings from Shel Silverstein celebrates its 20th anniversary with this special edition. Silverstein's humorous and creative verse can amuse the dowdiest of readers. Lemon-faced adults and fidgety kids sit still and read these rhythmic words and laugh and smile and love th It's hard to imagine a world without A Light in the Attic. This now-classic collection of poetry and drawings from Shel Silverstein celebrates its 20th anniversary with this special edition. Silverstein's humorous and creative verse can amuse the dowdiest of readers. Lemon-faced adults and fidgety kids sit still and read these rhythmic words and laugh and smile and love that Silverstein. Need proof of his genius? RockabyeRockabye baby, in the treetopDon't you know a treetopIs no safe place to rock?And who put you up there,And your cradle, too?Baby, I think someone down here'sGot it in for you. Shel, you never sound

In [28]:
df.Description_clean.head(5)

0    It's hard to imagine a world without A Light i...
1    "Erotic and absorbing...Written with starling ...
2    Dans une France assez proche de la nôtre, un h...
3    WICKED above her hipbone, GIRL across her hear...
4    From a renowned historian comes a groundbreaki...
Name: Description_clean, dtype: object

In [29]:
# Drop the columns
df.drop(columns=["Price", "Description"], inplace=True)

# Save to CSV without the index column
df.to_csv("../preprocessed_data/preprocessed.csv", index=False)


In [31]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Title              1000 non-null   object 
 1   Availability       1000 non-null   object 
 2   Stock Count        1000 non-null   object 
 3   Rating             1000 non-null   object 
 4   Category           1000 non-null   object 
 5   UPC                1000 non-null   object 
 6   PriceGBP           1000 non-null   float64
 7   Description_clean  1000 non-null   object 
dtypes: float64(1), object(7)
memory usage: 62.6+ KB
