# Open Library Web Scraping for 500 Books

### Imports

In [1]:
import re
import time
import random
from urllib.parse import urljoin

import requests
import pandas as pd
from bs4 import BeautifulSoup

### Configuration

In [2]:
# Site root; relative book links and cover paths are resolved against it.
BASE = "https://openlibrary.org"
# Search listing for the "awards" subject key, sorted newest first.
# The scraper appends "&page=N" to this URL for each results page.
BASE_URL = "https://openlibrary.org/search?q=subject_key%3A%22awards%22&sort=new"

# Headers applied to every request made through the shared session below.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0",
    "Accept-Language": "en-US,en;q=0.9",
}

# (min, max) seconds of random delay slept before each request, by page type.
SLEEP_SEARCH_PAGES = (1.0, 2.0)
SLEEP_BOOK_PAGES = (2.0, 4.0)

# Retry policy for HTTP 429 responses: number of attempts per URL, and the
# base (in seconds) of the exponential backoff used when the response has no
# numeric Retry-After header.
MAX_RETRIES = 8
BASE_BACKOFF = 2.0

# Single shared session so HEADERS are sent with every request.
session = requests.Session()
session.headers.update(HEADERS)

### Helper functions

In [3]:
def clean_text(text):
    """
    Normalize whitespace in a value's string form.

    Every run of whitespace becomes a single space and the ends are trimmed.
    Returns None when the input is None or nothing is left after trimming.
    """
    if text is None:
        return None
    collapsed = re.sub(r"\s+", " ", str(text))
    trimmed = collapsed.strip()
    return trimmed or None


def fix_cover(src):
    """
    Turn a cover image source into an absolute URL.

    Protocol-relative sources ("//covers...") are given an "https:" scheme.
    Anything else, such as "/covers..." or
    "/images/icons/avatar_book-sm.png", is resolved against BASE.
    Returns None when src is empty or None.
    """
    if src:
        protocol_relative = src.startswith("//")
        return "https:" + src if protocol_relative else urljoin(BASE, src)
    return None

### Safe request function (handles 429)

In [4]:
def get_soup(url, sleep_range):
    """
    Fetch a URL through the shared session and return it parsed as HTML.

    A random delay drawn from sleep_range is slept before every attempt.
    On HTTP 429 the function waits (the numeric Retry-After value if present,
    otherwise an exponential backoff plus jitter) and tries again, up to
    MAX_RETRIES attempts. Other error statuses are raised via
    raise_for_status(). Raises RuntimeError when all attempts are used up.
    """
    for attempt in range(MAX_RETRIES):
        time.sleep(random.uniform(*sleep_range))
        response = session.get(url, timeout=30)
        status = response.status_code

        if status == 200:
            return BeautifulSoup(response.text, "html.parser")

        if status != 429:
            # Raises for 4xx/5xx; any other status falls through to the next attempt.
            response.raise_for_status()
            continue

        header_value = response.headers.get("Retry-After")
        if header_value and header_value.isdigit():
            wait = int(header_value)
        else:
            jitter = random.uniform(0.5, 1.5)
            wait = BASE_BACKOFF * (2 ** attempt) + jitter

        print(f"429 Too Many Requests. Waiting {wait:.1f}s then retrying...")
        time.sleep(wait)

    raise RuntimeError(f"Failed after {MAX_RETRIES} retries: {url}")

### Search page: robust edition count extractor

In [5]:
def extract_edition_count(item_soup):
    """
    Extract the edition count from a search result card.

    The card text looks like 'First published in 2022 — 2 editions'. A count
    written with thousands separators ('1,234 editions') is read in full
    instead of being truncated to the digits after the last comma.

    Args:
        item_soup: Element for one search result; only its ``get_text``
            method is used.

    Returns:
        The count as an int, or None when no '<n> edition(s)' phrase is found.
    """
    txt = item_soup.get_text(" ", strip=True)
    # First alternative: comma-grouped number (1,234); second: plain digits.
    m = re.search(
        r"\b(\d{1,3}(?:,\d{3})+|\d+)\s+editions?\b",
        txt,
        flags=re.I,
    )
    if m is None:
        return None
    return int(m.group(1).replace(",", ""))


### Book page scraper

In [7]:
def get_publisher_from_omniline(soup):
    """
    Read the publisher name from the edition "omniline" block of a book page.

    The publisher value may be a link (<a>Publisher Name</a>) or plain text in
    a <span>. The item whose first <div> reads "publisher" (case-insensitive)
    is located and its value is taken from, in order of preference: the first
    link, the first span, or all remaining text in the item.

    Args:
        soup: Parsed book page.

    Returns:
        The cleaned publisher string, or None when the omniline block, the
        publisher item, or its value is missing.
    """
    omniline = soup.select_one("div.edition-omniline")
    if not omniline:
        return None

    for item in omniline.select("div.edition-omniline-item"):
        label = item.select_one("div")
        if not label:
            continue

        # clean_text returns None for a blank label; fall back to "" so that
        # .lower() cannot raise AttributeError on an empty label div.
        label_text = clean_text(label.get_text(strip=True)) or ""
        if label_text.lower() != "publisher":
            continue

        # Prefer a link, then any span.
        for selector in ("a", "span"):
            value = item.select_one(selector)
            if value:
                return clean_text(value.get_text(" ", strip=True))

        # Last fallback: all text inside the item except the label itself.
        parts = [clean_text(x) for x in item.stripped_strings]
        parts = [p for p in parts if p and p.lower() != "publisher"]
        return clean_text(" ".join(parts)) if parts else None

    return None


def scrape_book_page(book_url):
    """
    Fetch one book page and pull out its edition details.

    Returns a dict with the keys "publish_date", "publisher", "language",
    "pages" and "subjects". Each value is a cleaned string or None when the
    page does not provide it; subjects are de-duplicated in page order and
    joined with ", " into a single string.
    """
    page = get_soup(book_url, sleep_range=SLEEP_BOOK_PAGES)

    def span_text(selector):
        # Cleaned text of the first matching element, or None if there is none.
        node = page.select_one(selector)
        return clean_text(node and node.get_text(strip=True))

    publish_date = span_text('span[itemprop="datePublished"]')
    language = span_text('span[itemprop="inLanguage"]')
    pages = span_text('span[itemprop="numberOfPages"]')
    publisher = get_publisher_from_omniline(page)

    # Subject links -> cleaned names, blanks dropped, first occurrence kept.
    subject_names = (
        clean_text(link.get_text(" ", strip=True))
        for link in page.select('a[href^="/subjects/"]')
    )
    unique_subjects = list(dict.fromkeys(name for name in subject_names if name))

    details = {}
    details["publish_date"] = publish_date
    details["publisher"] = publisher
    details["language"] = language
    details["pages"] = pages
    details["subjects"] = ", ".join(unique_subjects) if unique_subjects else None
    return details

### Scrape one search results page

In [8]:
def scrape_search_page(page_num):
    """
    Scrape one page of search results, visiting each result's book page.

    For every result card that has a title link, a row dict is built from the
    card (title, author, edition_count, cover_url, book_url) and extended with
    the fields returned by scrape_book_page. If the book page fails for any
    reason, the failure is printed and those extra fields are set to None.

    Returns the list of row dicts (empty when the page has no result cards).
    """
    listing = get_soup(f"{BASE_URL}&page={page_num}", sleep_range=SLEEP_SEARCH_PAGES)

    # Used in place of the book-page details when that page cannot be scraped.
    missing_details = dict.fromkeys(
        ("publish_date", "publisher", "language", "pages", "subjects")
    )

    results = []
    for card in listing.select(".searchResultItem"):
        link = card.select_one("div.resultTitle h3.booktitle a")
        if not link:
            continue

        book_url = urljoin(BASE, link.get("href", ""))

        names = [
            clean_text(tag.get_text(strip=True))
            for tag in card.select("span.bookauthor a")
        ]
        if names:
            author = ", ".join(name for name in names if name)
        else:
            author = None

        image = card.select_one("span.bookcover img")
        cover_src = image.get("src") if image else None

        row = {
            "title": clean_text(link.get_text(strip=True)),
            "author": author,
            "edition_count": extract_edition_count(card),
            "cover_url": fix_cover(cover_src),
            "book_url": book_url,
        }

        try:
            details = scrape_book_page(book_url)
        except Exception as e:
            print(f"Book page failed: {book_url}\nError: {e}")
            details = missing_details

        row.update(details)
        results.append(row)

    return results

### Batch runner (5 pages at a time)

In [9]:
def run_batch(start_page, pages_per_run=5, out_file="batch.csv"):
    """
    Scrape a run of consecutive search pages and save the rows to a CSV file.

    Args:
        start_page: First search page number to scrape.
        pages_per_run: How many consecutive pages to attempt.
        out_file: Path of the CSV file to write.

    Returns:
        DataFrame of the collected rows, de-duplicated on "book_url".
        Scraping stops early at the first page that yields no rows.
    """
    end_page = start_page + pages_per_run - 1
    print(f"Scraping pages {start_page} to {end_page}")

    collected = []
    for page in range(start_page, end_page + 1):
        print(f"  Search page {page}")
        rows = scrape_search_page(page)
        if rows:
            collected += rows
        else:
            print("No rows found; stopping early.")
            break

    frame = pd.DataFrame(collected)
    frame = frame.drop_duplicates(subset="book_url")
    frame = frame.reset_index(drop=True)
    frame.to_csv(out_file, index=False)
    print(f"Saved {len(frame)} rows to {out_file}")
    return frame

### Batch 1 (pages 1–5)

In [10]:
# Batch 1: scrape search pages 1-5, save them, and preview the first rows.
df_01_05 = run_batch(1, 5, "batch_01_05.csv")
df_01_05.head()

Scraping pages 1 to 5
  Search page 1
  Search page 2
  Search page 3
  Search page 4
  Search page 5
Saved 100 rows to batch_01_05.csv


Unnamed: 0,title,author,edition_count,cover_url,book_url,publish_date,publisher,language,pages,subjects
0,Politics of Literary Prestige: Prizes and Span...,Sarah E. L. Bowskill,2,https://openlibrary.org/images/icons/avatar_bo...,https://openlibrary.org/works/OL25809312W/Poli...,2022,Bloomsbury Publishing Plc,English,240,"Spanish American literature, Awards, Political..."
1,Symposium Städtebau revisited: Preise - Praxis...,Christina Simon-Philipp,1,https://openlibrary.org/images/icons/avatar_bo...,https://openlibrary.org/works/OL32038765W/Symp...,2022,Wasmuth & Zohlen,German,135,"City planning, Congresses, Awards, German Arch..."
2,NTAA 2022 - New Technological Art Award 2022: ...,Haseeb Ahmed,1,https://openlibrary.org/images/icons/avatar_bo...,https://openlibrary.org/works/OL32026243W/NTAA...,2022,Artha - art & heritage books,English,127,"Art and technology, Exhibitions, Art, Awards, ..."
3,"Deutsche Bank ""Artists of the Year"" 2021: Maxw...",Maxwell Alexandre,1,https://openlibrary.org/images/icons/avatar_bo...,https://openlibrary.org/works/OL31912905W/Deut...,2022,Kerber Verlag,English,227,"Modern Art, Exhibitions, Art, Awards, Maxwell ..."
4,"MAXXI Bulgari Prize 2022: Alessandra Ferrini, ...",Giulia Ferracci,1,https://openlibrary.org/images/icons/avatar_bo...,https://openlibrary.org/works/OL31884256W/MAXX...,2022,Corraini edizioni,Italian,163,"Maxxi Bulgari Prize, Exhibitions, Art, Awards,..."


### Adding a longer “cooldown” between search pages

In [13]:
def run_batch(start_page, pages_per_run=5, out_file="batch.csv", page_cooldown=(20, 40)):
    """
    Scrape a run of consecutive search pages, pausing after each one, and save to CSV.

    Args:
        start_page: First search page number to scrape.
        pages_per_run: How many consecutive pages to attempt.
        out_file: Path of the CSV file to write.
        page_cooldown: (min, max) seconds; a random pause from this range
            follows every page that returned rows, including the last one.

    Returns:
        DataFrame of the collected rows, de-duplicated on "book_url".
        Scraping stops early at the first page that yields no rows.
    """
    end_page = start_page + pages_per_run - 1
    print(f"Scraping pages {start_page} to {end_page}")

    collected = []
    for page in range(start_page, end_page + 1):
        print(f"  Search page {page}")
        rows = scrape_search_page(page)
        if rows:
            collected += rows
            # Pause before the next search page to stay under the rate limit.
            pause = random.uniform(*page_cooldown)
            print(f"  Cooling down for {pause:.1f} seconds...")
            time.sleep(pause)
        else:
            print("No rows found; stopping early.")
            break

    frame = pd.DataFrame(collected)
    frame = frame.drop_duplicates(subset="book_url")
    frame = frame.reset_index(drop=True)
    frame.to_csv(out_file, index=False)
    print(f"Saved {len(frame)} rows to {out_file}")
    return frame


### Batch 2 (pages 6–10)

In [14]:
# Batch 2: scrape search pages 6-10 with a 25-45 s pause after each page.
df_06_10 = run_batch(
    start_page=6,
    pages_per_run=5,
    out_file="batch_06_10.csv",
    page_cooldown=(25, 45),
)
df_06_10.head()

Scraping pages 6 to 10
  Search page 6
  Cooling down for 39.9 seconds...
  Search page 7
  Cooling down for 30.7 seconds...
  Search page 8
  Cooling down for 36.7 seconds...
  Search page 9
  Cooling down for 43.7 seconds...
  Search page 10
  Cooling down for 38.6 seconds...
Saved 100 rows to batch_06_10.csv


Unnamed: 0,title,author,edition_count,cover_url,book_url,publish_date,publisher,language,pages,subjects
0,CyberArts 2019,"Hannes Leopoldseder, Christine Schöpf, Gerfrie...",1,https://covers.openlibrary.org/b/id/13794701-M...,https://openlibrary.org/works/OL25222793W/Cybe...,2019,Hatje Cantz Verlag GmbH & Co KG,English,256.0,"Computer art, Awards, Computer animation, Tech..."
1,The Coretta Scott King Awards: 50th Anniversary,"Carole J. McCollough, Adelaide Poniatowski Phelps",1,https://covers.openlibrary.org/b/id/10837117-M...,https://openlibrary.org/works/OL24318976W/The_...,"Jun 03, 2019",ALA Editions,,248.0,"American literature, african american authors,..."
2,Complex Management Systems and the Shingo Mode...,Gerhard J. Plenert,1,https://openlibrary.org/images/icons/avatar_bo...,https://openlibrary.org/works/OL25746999W/Comp...,2019,Taylor & Francis Group,English,,"Production management, Lean manufacturing, Man..."
3,Not Just Your Face Honey,"Stefanie Moshammer, Andreas Prinzing",1,https://covers.openlibrary.org/b/id/13909182-M...,https://openlibrary.org/works/OL25634324W/Not_...,2019,"Dreen, Markus, Anne König u. Jan Wenzel. Spect...",English,144.0,"Exhibitions, Artistic Photography, Pictorial w..."
4,Of African objects in Western museums: award o...,Achille Mbembe,1,https://openlibrary.org/images/icons/avatar_bo...,https://openlibrary.org/works/OL32570289W/Of_A...,2019,Rhema,German,114.0,"African Art objects, Museums, Collection manag..."


### Batch 3 (pages 11–15)

In [None]:
def run_batch(start_page, pages_per_run=5, out_file="batch.csv", page_cooldown=(20, 40),
              batch_cooldown=None):
    """
    Scrape a run of consecutive search pages, pausing after each one, and save to CSV.

    Args:
        start_page: First search page number to scrape.
        pages_per_run: How many consecutive pages to attempt.
        out_file: Path of the CSV file to write.
        page_cooldown: (min, max) seconds; a random pause from this range
            follows every page that returned rows.
        batch_cooldown: Optional (min, max) seconds. When given, a random
            pause from this range is slept once before the first request.
            Defaults to None (no pre-batch pause), so existing callers are
            unaffected; callers that pass batch_cooldown=... no longer fail
            with TypeError.

    Returns:
        DataFrame of the collected rows, de-duplicated on "book_url".
        Scraping stops early at the first page that yields no rows.
    """
    if batch_cooldown is not None:
        # Let the rate limiter recover from the previous batch before starting.
        pre_sleep = random.uniform(*batch_cooldown)
        print(f"Pre-batch cooldown: sleeping {pre_sleep:.1f} seconds...")
        time.sleep(pre_sleep)

    end_page = start_page + pages_per_run - 1
    print(f"Scraping pages {start_page} to {end_page}")

    all_rows = []
    for page in range(start_page, end_page + 1):
        print(f"  Search page {page}")
        page_rows = scrape_search_page(page)
        if not page_rows:
            print("No rows found; stopping early.")
            break

        all_rows.extend(page_rows)

        # Cooldown after each search page (very important)
        sleep_s = random.uniform(*page_cooldown)
        print(f"  Cooling down for {sleep_s:.1f} seconds...")
        time.sleep(sleep_s)

    df = pd.DataFrame(all_rows).drop_duplicates(subset="book_url").reset_index(drop=True)
    df.to_csv(out_file, index=False)
    print(f"Saved {len(df)} rows to {out_file}")
    return df


In [19]:
# Batch 3: scrape search pages 11-15 with longer per-page pauses and a
# batch_cooldown range passed through to run_batch.
df_11_15 = run_batch(
    start_page=11,
    pages_per_run=5,
    out_file="batch_11_15.csv",
    page_cooldown=(30, 60),
    batch_cooldown=(120, 180),
)


Scraping pages 11 to 15
  Search page 11
429 Too Many Requests. Waiting 3.5s then retrying...
429 Too Many Requests. Waiting 3.4s then retrying...
429 Too Many Requests. Waiting 2.8s then retrying...
429 Too Many Requests. Waiting 2.6s then retrying...
  Cooling down for 36.0 seconds...
  Search page 12
429 Too Many Requests. Waiting 3.5s then retrying...
429 Too Many Requests. Waiting 3.4s then retrying...
429 Too Many Requests. Waiting 4.9s then retrying...
  Cooling down for 36.8 seconds...
  Search page 13
429 Too Many Requests. Waiting 3.0s then retrying...
429 Too Many Requests. Waiting 2.6s then retrying...
429 Too Many Requests. Waiting 5.4s then retrying...
  Cooling down for 38.0 seconds...
  Search page 14
429 Too Many Requests. Waiting 3.2s then retrying...
429 Too Many Requests. Waiting 5.4s then retrying...
429 Too Many Requests. Waiting 2.6s then retrying...
429 Too Many Requests. Waiting 4.6s then retrying...
429 Too Many Requests. Waiting 2.8s then retrying...
429 Too 

Unnamed: 0,title,author,edition_count,cover_url,book_url,publish_date,publisher,language,pages,subjects
0,BMS '19: Bakat Muda Sezaman = Young contempora...,Balai Seni Lukis Negara (Malaysia),1,https://openlibrary.org/images/icons/avatar_bo...,https://openlibrary.org/works/OL31849350W/BMS_...,2018,Balai Seni Negara,English,334.0,"Malaysian Art, Exhibitions, Modern Art, Art, C..."
1,Premio Alberto J. Trabucco 2018: dibujo,Argentina) Premio Alberto J. Trabucco. D...,1,https://openlibrary.org/images/icons/avatar_bo...,https://openlibrary.org/works/OL31787312W/Prem...,2018,Fundación Alberto J. Trabucco,Spanish,40.0,"Exhibitions, Drawing, Awards, Argentine Drawin..."
2,Young Talent Architecture Award 2016,"Ivan Blasi, Anna Sala Giralt",1,https://openlibrary.org/images/icons/avatar_bo...,https://openlibrary.org/works/OL32614098W/Youn...,2018,Fundació Mies van der Rohe,English,171.0,"Architecture, History, Designs and plans, Dome..."
3,Young Talent Architectture Award 2018,"Ivan Blasi, Anna Sala Giralt",1,https://openlibrary.org/images/icons/avatar_bo...,https://openlibrary.org/works/OL32614099W/Youn...,2018,Fundació Mies van der Rohe,English,203.0,"Architecture, History, Designs and plans, Dome..."
4,Prix Littéraires du Gouverneur Général du Cana...,"Andrew David Irvine, Edmond Rivère, Stephanie ...",3,https://openlibrary.org/images/icons/avatar_bo...,https://openlibrary.org/works/OL26498389W/Prix...,2018,University of Ottawa Press/Les Presses de l'Un...,French,,"Canadian literature, bibliography, Awards"


### Batch 4 (pages 16–20)

In [22]:
# Slower request pacing: (min, max) seconds of random delay before each
# request, by page type.
SLEEP_SEARCH_PAGES = (2.0, 4.0)
SLEEP_BOOK_PAGES = (6.0, 10.0)

# More attempts per URL and a larger exponential-backoff base (seconds) for
# HTTP 429 responses.
MAX_RETRIES = 12
BASE_BACKOFF = 15.0

In [23]:
def get_soup(url, sleep_range, max_backoff=600.0):
    """
    Fetch a URL through the shared session and return it parsed as HTML.

    A random delay drawn from sleep_range is slept before every attempt.
    On HTTP 429 the function waits and retries, up to MAX_RETRIES attempts.

    Args:
        url: Page to fetch.
        sleep_range: (min, max) seconds for the random pre-request delay.
        max_backoff: Upper bound, in seconds, on the computed exponential
            backoff. Without a bound, BASE_BACKOFF * 2 ** attempt grows to
            hours on the later attempts. A numeric Retry-After sent by the
            server is honored as-is and is not capped.

    Returns:
        BeautifulSoup document for a 200 response.

    Raises:
        requests.HTTPError: via raise_for_status() for non-429 error statuses.
        RuntimeError: when every attempt was used without a 200 response.
    """
    last_attempt = MAX_RETRIES - 1
    for attempt in range(MAX_RETRIES):
        time.sleep(random.uniform(*sleep_range))
        r = session.get(url, timeout=30)

        if r.status_code == 200:
            return BeautifulSoup(r.text, "html.parser")

        if r.status_code != 429:
            # Raises for 4xx/5xx; any other status falls through to the next attempt.
            r.raise_for_status()
            continue

        print(f"429 Too Many Requests. URL: {url}")
        if attempt == last_attempt:
            # No retry follows, so waiting here would only delay the error.
            break

        retry_after = r.headers.get("Retry-After")
        if retry_after and retry_after.isdigit():
            wait = int(retry_after) + random.uniform(2, 6)
        else:
            # Strong exponential backoff, bounded by max_backoff. A non-numeric
            # Retry-After (e.g. an HTTP date) also ends up here.
            wait = min(BASE_BACKOFF * (2 ** attempt), max_backoff) + random.uniform(3, 10)

        print(f"Waiting {wait:.1f} seconds, then retrying (attempt {attempt+1}/{MAX_RETRIES})...")
        time.sleep(wait)

    raise RuntimeError(f"Failed after {MAX_RETRIES} retries: {url}")


In [24]:
def run_batch(start_page, pages_per_run=5, out_file="batch.csv",
              page_cooldown=(25, 45), batch_cooldown=(60, 120)):
    """
    Scrape a run of consecutive search pages with pre-batch and per-page pauses.

    Args:
        start_page: First search page number to scrape.
        pages_per_run: How many consecutive pages to attempt.
        out_file: Path of the CSV file to write.
        page_cooldown: (min, max) seconds; a random pause from this range
            follows every page that returned rows.
        batch_cooldown: (min, max) seconds; a random pause from this range is
            slept once before the first request.

    Returns:
        DataFrame of the collected rows, de-duplicated on "book_url".
        Scraping stops early at the first page that yields no rows.
    """
    # Cooldown before starting the batch.
    initial_pause = random.uniform(*batch_cooldown)
    print(f"Pre-batch cooldown: sleeping {initial_pause:.1f} seconds...")
    time.sleep(initial_pause)

    end_page = start_page + pages_per_run - 1
    print(f"Scraping pages {start_page} to {end_page}")

    collected = []
    for page in range(start_page, end_page + 1):
        print(f"  Search page {page}")
        rows = scrape_search_page(page)
        if rows:
            collected += rows
            pause = random.uniform(*page_cooldown)
            print(f"  Cooling down for {pause:.1f} seconds...")
            time.sleep(pause)
        else:
            print("No rows found; stopping early.")
            break

    frame = pd.DataFrame(collected)
    frame = frame.drop_duplicates(subset="book_url")
    frame = frame.reset_index(drop=True)
    frame.to_csv(out_file, index=False)
    print(f"Saved {len(frame)} rows to {out_file}")
    return frame

In [25]:
def run_batch(start_page, pages_per_run=5, out_file="batch.csv", page_cooldown=(20, 40)):
    """
    Scrape a run of consecutive search pages, pausing after each one, and save to CSV.

    Args:
        start_page: First search page number to scrape.
        pages_per_run: How many consecutive pages to attempt.
        out_file: Path of the CSV file to write.
        page_cooldown: (min, max) seconds; a random pause from this range
            follows every page that returned rows, including the last one.

    Returns:
        DataFrame of the collected rows, de-duplicated on "book_url".
        Scraping stops early at the first page that yields no rows.
    """
    end_page = start_page + pages_per_run - 1
    print(f"Scraping pages {start_page} to {end_page}")

    collected = []
    for page in range(start_page, end_page + 1):
        print(f"  Search page {page}")
        rows = scrape_search_page(page)
        if rows:
            collected += rows
            # Pause before the next search page to stay under the rate limit.
            pause = random.uniform(*page_cooldown)
            print(f"  Cooling down for {pause:.1f} seconds...")
            time.sleep(pause)
        else:
            print("No rows found; stopping early.")
            break

    frame = pd.DataFrame(collected)
    frame = frame.drop_duplicates(subset="book_url")
    frame = frame.reset_index(drop=True)
    frame.to_csv(out_file, index=False)
    print(f"Saved {len(frame)} rows to {out_file}")
    return frame


In [26]:
# Batch 4: scrape search pages 16-20 with a 25-45 s pause after each page.
df_16_20 = run_batch(
    start_page=16,
    pages_per_run=5,
    out_file="batch_16_20.csv",
    page_cooldown=(25, 45),
)
df_16_20.head()

Scraping pages 16 to 20
  Search page 16
429 Too Many Requests. URL: https://openlibrary.org/search?q=subject_key%3A%22awards%22&sort=new&page=16
Waiting 21.9 seconds, then retrying (attempt 1/12)...
429 Too Many Requests. URL: https://openlibrary.org/works/OL32474810W/Gibore_Yiśraʼel_meḳable_tsiyun_le-shevaḥ_shel_ha-aluf?edition=key%3A/books/OL44260506M
Waiting 21.8 seconds, then retrying (attempt 1/12)...
429 Too Many Requests. URL: https://openlibrary.org/works/OL21615507W/Die_Deutschen_Friedensnobelpreiskandidaten_Im_Kaiserreich_1901-1918?edition=key%3A/books/OL29365344M
Waiting 19.4 seconds, then retrying (attempt 1/12)...
429 Too Many Requests. URL: https://openlibrary.org/works/OL26750281W/Interviews_with_the_Abel_Prize_Laureates_2003-2016?edition=key%3A/books/OL36226573M
Waiting 18.3 seconds, then retrying (attempt 1/12)...
429 Too Many Requests. URL: https://openlibrary.org/works/OL26750281W/Interviews_with_the_Abel_Prize_Laureates_2003-2016?edition=key%3A/books/OL36226573M
Wa

Unnamed: 0,title,author,edition_count,cover_url,book_url,publish_date,publisher,language,pages,subjects
0,"עיר של מתנדבים: זוכי הפרס על שם זאב גלר, על עש...",Moshe Gromb,1,https://openlibrary.org/images/icons/avatar_bo...,https://openlibrary.org/works/OL43495275W/עיר_...,2017,נדב,Hebrew,190,"Volunteers, Awards, Zeʼev Geler, Israel, Kefar..."
1,Nicolas-Born-Preis 2017,"Franzobel, Julia Wolf, Alexander Kos enina, Lo...",1,https://openlibrary.org/images/icons/avatar_bo...,https://openlibrary.org/works/OL43770152W/Nico...,2017,Wehrhahn Verlag,German,47,"Awards, Literary prizes, Franzobel (1967-), Ju..."
2,Gibore Yiśraʼel meḳable tsiyun le-shevaḥ shel ...,Offer Drori,1,https://openlibrary.org/images/icons/avatar_bo...,https://openlibrary.org/works/OL32474810W/Gibo...,2017,Deror,Hebrew,420,"Medals, badges, decorations, Biography, Armed ..."
3,Die Deutschen Friedensnobelpreiskandidaten Im ...,Thomas Sirges,4,https://covers.openlibrary.org/b/id/13603453-M...,https://openlibrary.org/works/OL21615507W/Die_...,2017,"Lang GmbH, Internationaler Verlag der Wissensc...",German,356,"Nobel prizes, Germany, biography, Nobel Prize ..."
4,Young Architects 17: Authenticity,"Architectural League of New York, Meredith, Mi...",1,https://openlibrary.org/images/icons/avatar_bo...,https://openlibrary.org/works/OL29038649W/Youn...,2017,"Monfried Editions, Andrea",English,176,"Architecture, united states, Architects, Exhib..."


### Batch 5 (pages 21–25)

In [28]:
def run_batch(start_page, pages_per_run=5, out_file="batch.csv", page_cooldown=(20, 40)):
    """
    Scrape a run of consecutive search pages, pausing after each one, and save to CSV.

    Args:
        start_page: First search page number to scrape.
        pages_per_run: How many consecutive pages to attempt.
        out_file: Path of the CSV file to write.
        page_cooldown: (min, max) seconds; a random pause from this range
            follows every page that returned rows, including the last one.

    Returns:
        DataFrame of the collected rows, de-duplicated on "book_url".
        Scraping stops early at the first page that yields no rows.
    """
    end_page = start_page + pages_per_run - 1
    print(f"Scraping pages {start_page} to {end_page}")

    collected = []
    for page in range(start_page, end_page + 1):
        print(f"  Search page {page}")
        rows = scrape_search_page(page)
        if rows:
            collected += rows
            # Pause before the next search page to stay under the rate limit.
            pause = random.uniform(*page_cooldown)
            print(f"  Cooling down for {pause:.1f} seconds...")
            time.sleep(pause)
        else:
            print("No rows found; stopping early.")
            break

    frame = pd.DataFrame(collected)
    frame = frame.drop_duplicates(subset="book_url")
    frame = frame.reset_index(drop=True)
    frame.to_csv(out_file, index=False)
    print(f"Saved {len(frame)} rows to {out_file}")
    return frame


In [29]:
# Batch 5: scrape search pages 21-25 with a 25-45 s pause after each page.
df_21_25 = run_batch(
    start_page=21,
    pages_per_run=5,
    out_file="batch_21_25.csv",
    page_cooldown=(25, 45),
)
df_21_25.head()

Scraping pages 21 to 25
  Search page 21
  Cooling down for 44.5 seconds...
  Search page 22
  Cooling down for 40.9 seconds...
  Search page 23
  Cooling down for 31.9 seconds...
  Search page 24
  Cooling down for 34.3 seconds...
  Search page 25
  Cooling down for 30.4 seconds...
Saved 100 rows to batch_21_25.csv


Unnamed: 0,title,author,edition_count,cover_url,book_url,publish_date,publisher,language,pages,subjects
0,Gute Gestaltung 15 / Good Design 15,Deutscher Designer Deutscher Designer Cl...,1,https://openlibrary.org/images/icons/avatar_bo...,https://openlibrary.org/works/OL25694520W/Gute...,2015,"de Gruyter GmbH, Walter",German,520,"Design, Graphic arts, Awards, Arts, germany"
1,Key images of American life: Pulitzer Prize wi...,Heinz Dietrich Fischer,1,https://openlibrary.org/images/icons/avatar_bo...,https://openlibrary.org/works/OL22953077W/Key_...,2015,Lit,English,211,"Pulitzer Prizes, Awards, Photojournalism, Pict..."
2,Di er jie CAFAM wei lai zhan: chuang ke chuang...,"Xu, Bing, Huangsheng Wang, Chunchen Wang",1,https://covers.openlibrary.org/b/id/13192383-M...,https://openlibrary.org/works/OL22928325W/Di_e...,2015,Zhongguo qing nian chu ban she,Chinese,593,"Art, Awards, Exhibitions, Modern Art, Chinese ..."
3,"You, me & mobile: stories from South Asian cou...",Osama Manzar,1,https://openlibrary.org/images/icons/avatar_bo...,https://openlibrary.org/works/OL22897954W/You_...,2015,Inomy Media,English,110,"Information technology, Awards, Case studies, ..."
4,Der Wettbewerb zur Wiener Ringstrasse: Entsteh...,Harald R. Stühlinger,1,https://openlibrary.org/images/icons/avatar_bo...,https://openlibrary.org/works/OL22958419W/Der_...,2015,"Birkhäuser, ein Unternehmen der Walter de Gruy...",German,395,"Competitions, City planning, Buildings, struct..."


### Combine all batch CSVs into one DataFrame

In [30]:
# Per-batch CSV files, in the order the batches were scraped.
files = [
    "batch_01_05.csv",
    "batch_06_10.csv",
    "batch_11_15.csv",
    "batch_16_20.csv",
    "batch_21_25.csv",
]

# Load every batch, stack them, and keep the first row seen for each book URL.
dfs = list(map(pd.read_csv, files))
df_final = pd.concat(dfs, ignore_index=True)
df_final = df_final.drop_duplicates(subset="book_url")
df_final = df_final.reset_index(drop=True)

print("Final rows:", len(df_final))
df_final.head()

Final rows: 500


Unnamed: 0,title,author,edition_count,cover_url,book_url,publish_date,publisher,language,pages,subjects
0,Politics of Literary Prestige: Prizes and Span...,Sarah E. L. Bowskill,2,https://openlibrary.org/images/icons/avatar_bo...,https://openlibrary.org/works/OL25809312W/Poli...,2022,Bloomsbury Publishing Plc,English,240.0,"Spanish American literature, Awards, Political..."
1,Symposium Städtebau revisited: Preise - Praxis...,Christina Simon-Philipp,1,https://openlibrary.org/images/icons/avatar_bo...,https://openlibrary.org/works/OL32038765W/Symp...,2022,Wasmuth & Zohlen,German,135.0,"City planning, Congresses, Awards, German Arch..."
2,NTAA 2022 - New Technological Art Award 2022: ...,Haseeb Ahmed,1,https://openlibrary.org/images/icons/avatar_bo...,https://openlibrary.org/works/OL32026243W/NTAA...,2022,Artha - art & heritage books,English,127.0,"Art and technology, Exhibitions, Art, Awards, ..."
3,"Deutsche Bank ""Artists of the Year"" 2021: Maxw...",Maxwell Alexandre,1,https://openlibrary.org/images/icons/avatar_bo...,https://openlibrary.org/works/OL31912905W/Deut...,2022,Kerber Verlag,English,227.0,"Modern Art, Exhibitions, Art, Awards, Maxwell ..."
4,"MAXXI Bulgari Prize 2022: Alessandra Ferrini, ...",Giulia Ferracci,1,https://openlibrary.org/images/icons/avatar_bo...,https://openlibrary.org/works/OL31884256W/MAXX...,2022,Corraini edizioni,Italian,163.0,"Maxxi Bulgari Prize, Exhibitions, Art, Awards,..."


### Save final dataset

In [31]:
# Write the combined, de-duplicated dataset to a single CSV without the index column.
df_final.to_csv("BRS_Web-Scrapping.csv", index=False)