In [1]:
import json
import os
import re
import time
from concurrent.futures import ThreadPoolExecutor
from urllib.parse import urlencode

import pandas as pd
import requests

In [2]:
class SearchContent:
    """Gather descriptive metadata for a single book from public web APIs.

    Open Library is queried first; Google Books is then used to fill in
    whatever is still missing (by ISBN, then by title and author). A field
    that already holds a value is never overwritten by a later source.

    Attributes set by the constructor: ``title``, ``author``, ``isbn``.
    Attributes filled by :meth:`manage_search`: ``description``,
    ``subjects``, ``num_pages``, ``genres`` and ``averageRating``.
    """

    # Seconds allowed per HTTP request. Without a timeout a stalled
    # connection would keep its worker thread blocked.
    REQUEST_TIMEOUT = 10

    # Extracts the bare identifier from values such as "/works/OL1W" or "OL1M".
    _KEY_ID_PATTERN = re.compile(r'(?:/works/|/books/)?([a-zA-Z0-9]+)')

    def __init__(self, title, author, isbn):
        """Store the search criteria and initialise every result field as empty."""
        self.title = title
        self.author = author
        self.isbn = isbn

        self.description = None
        self.subjects = []
        self.num_pages = None
        self.genres = []
        self.averageRating = None

    def manage_search(self):
        """Run the lookups in order, stopping once the record is complete.

        Note that ``averageRating`` is not part of the completeness check, so
        it stays ``None`` when Open Library alone supplies every other field.
        """
        self._openlib_search()

        url_builders = []
        # A missing ISBN (None, NaN, empty string) would only produce a
        # useless "isbn:nan"-style query, so that lookup is skipped.
        if not self._is_blank(self.isbn):
            url_builders.append(self._google_url_by_isbn)
        url_builders.append(self._google_url_by_title_author)

        for build_url in url_builders:
            if self._is_complete():
                break
            self._google_search(build_url())

    @staticmethod
    def _is_blank(value):
        """Return True for None, a NaN float, or an empty/whitespace-only string."""
        if value is None:
            return True
        if isinstance(value, float) and value != value:  # NaN is the only float != itself
            return True
        return isinstance(value, str) and not value.strip()

    def _openlib_search(self):
        """Search Open Library by title and author and harvest the matching records.

        Errors are reported and swallowed, like in the other lookup methods,
        so that a failed request leaves the fields empty instead of raising.
        """
        try:
            # ``params`` lets requests URL-encode the values.
            response = requests.get(
                "https://openlibrary.org/search.json",
                params={"title": self.title, "author": self.author},
                timeout=self.REQUEST_TIMEOUT,
            )
            if response.status_code != 200:
                return
            docs = response.json().get('docs', [])
        except Exception as e:
            print(f"Error en _openlib_search: {e}")
            return

        # Search-result field -> URL prefix of the record that field points to.
        keys = {'key': "/works/", 'lending_edition_s': '/books/'}

        for doc in docs:
            for field, prefix in keys.items():
                # Nothing left to fill: avoid issuing further requests.
                if self._is_complete():
                    return

                work_key = doc.get(field)
                if not work_key or not isinstance(work_key, str):
                    continue

                match = self._KEY_ID_PATTERN.search(work_key)
                if not match:
                    continue

                key_id = match.group(1)
                work_url = f"https://openlibrary.org{prefix}{key_id}.json"
                self._get_work_data(work_url)
                # NOTE(review): for the '/books/' prefix key_id comes from
                # 'lending_edition_s', yet it is still sent to the
                # /works/<id>/editions.json endpoint - confirm this is intended.
                self._get_entries_data(key_id)

    def _get_work_data(self, url):
        """Fill ``description`` and ``subjects`` from the JSON record at ``url``."""
        try:
            response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
            if response.status_code != 200:
                return
            data = response.json()

            if self.description is None:
                desc = data.get('description')
                # The description arrives either as a plain string or as a
                # dict carrying the text under "value".
                if isinstance(desc, dict):
                    self.description = desc.get("value")
                elif isinstance(desc, str):
                    self.description = desc

            if not self.subjects:
                self.subjects = data.get('subjects') or []
        except Exception as e:
            print(f"Error en _get_work_data: {e}")

    def _get_entries_data(self, work_key):
        """Fill ``num_pages`` and ``genres`` from the first edition listed for ``work_key``."""
        try:
            editions_url = f"https://openlibrary.org/works/{work_key}/editions.json?limit=1"
            response = requests.get(editions_url, timeout=self.REQUEST_TIMEOUT)
            if response.status_code != 200:
                return

            entries = response.json().get('entries', [])
            if not entries:
                return

            edition = entries[0]

            if self.num_pages is None:
                self.num_pages = edition.get("number_of_pages")

            if not self.genres:
                self.genres = edition.get("genres") or []
        except Exception as e:
            print(f"Error en _get_entries_data: {e}")

    def _google_url_by_isbn(self):
        """Return the Google Books volumes URL that searches by ISBN."""
        query = urlencode({"q": f"isbn:{self.isbn}"}, safe=":")
        return f"https://www.googleapis.com/books/v1/volumes?{query}"

    def _google_url_by_title_author(self):
        """Return the Google Books volumes URL that searches by title and author.

        The query is URL-encoded so that spaces, ``&``, ``#`` and similar
        characters in a title or author name cannot corrupt the query string.
        """
        query = urlencode(
            {"q": f"intitle:{self.title} inauthor:{self.author}"},
            safe=":",
        )
        return f"https://www.googleapis.com/books/v1/volumes?{query}"

    def _google_search(self, url):
        """Fetch ``url`` and fill missing fields from every returned volume."""
        try:
            response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
            if response.status_code != 200:
                return

            items = response.json().get("items", [])
            # Every item is visited: one volume may lack a field that a
            # later one provides.
            for item in items:
                self._fill_missing_fields_from_google(item.get("volumeInfo", {}))
        except Exception as e:
            print(f"Error en _google_search: {e}")

    def _fill_missing_fields_from_google(self, info):
        """Copy values from a Google Books ``volumeInfo`` dict into still-empty fields.

        ``categories`` feeds both ``subjects`` and ``genres``; each receives
        its own list so that mutating one does not affect the other.
        """
        if self.description is None:
            self.description = info.get("description")
        if not self.subjects:
            self.subjects = list(info.get("categories") or [])
        if self.num_pages is None:
            self.num_pages = info.get("pageCount")
        if not self.genres:
            self.genres = list(info.get("categories") or [])
        if self.averageRating is None:
            self.averageRating = info.get("averageRating")

    def _is_complete(self):
        """Return True when description, subjects, num_pages and genres are all set.

        ``averageRating`` is deliberately not part of the check.
        """
        return bool(
            self.description is not None and
            self.subjects and
            self.num_pages is not None and
            self.genres
        )

    def as_dict(self):
        """Return the search criteria and the collected fields as a plain dict."""
        return {
            "title": self.title,
            "author": self.author,
            "isbn": self.isbn,
            "description": self.description,
            "subjects": self.subjects,
            "num_pages": self.num_pages,
            "genres": self.genres,
            "avgRating" : self.averageRating
        }

    def _data_status(self):
        """Return a one-line summary of which fields have been filled (debug aid)."""
        return (
            f"Status of {self.title}: "
            f"Desc -> {self.description is not None} | "
            f"NumP -> {self.num_pages is not None} | "
            f"Subj -> {self.subjects != []} | "
            f"Genr -> {self.genres != []} | "
            f"Rate -> {self.averageRating is not None}"
        )

# Smoke test: look up one known book and print everything that was collected.
searcher = SearchContent(
    title="The Lost World: Jurassic Park",
    author="Don McGregor",
    isbn="1852868856",
)
searcher.manage_search()
data = searcher.as_dict()
print(data)


{'title': 'The Lost World: Jurassic Park', 'author': 'Don McGregor', 'isbn': '1852868856', 'description': 'Classic Jurassic Park Volume 6 reprints the comics adaptation of the sequel to Jurassic Park, The Lost World.', 'subjects': ['Dinosaurs'], 'num_pages': 96, 'genres': ['Dinosaurs'], 'avgRating': None}


In [3]:
# Example usage: load the list of books whose metadata will be looked up.
# Column names are supplied explicitly, so pandas treats the first line of
# the file as data - NOTE(review): confirm the CSV really has no header row.
column_names = ["title", "author", "year", "rating", "isbn", "url"]
df = pd.read_csv(
    "library_for_scrapping.csv",
    sep=",",
    on_bad_lines="skip",
    encoding='latin-1',
    names=column_names,
)

# Small slice for quick experiments.
df_test = df.head(100)

# Preview of the data; it has to be the cell's last expression, otherwise
# the notebook discards the result without rendering it.
df.head()

In [4]:
def worker(row):
    """Look up the metadata for one book and return it as a dict.

    Args:
        row: Mapping (e.g. a DataFrame row) with 'title', 'author' and
            'isbn' entries.

    Returns:
        The dict produced by ``SearchContent.as_dict()``. If the lookup
        raises, the error is reported and the fields gathered so far are
        still returned.
    """
    searcher = SearchContent(row['title'], row['author'], row['isbn'])
    try:
        searcher.manage_search()
    except Exception as e:
        # executor.map re-raises a worker's exception in the caller, which
        # would discard the results of every other row in the same batch.
        print(f"Error en worker ({row['title']}): {e}")
    return searcher.as_dict()

# Directory that receives one JSON file per processed batch.
temp_dir = os.path.join(os.getcwd(),  "temp")
os.makedirs(temp_dir, exist_ok=True)  # open() below fails if the directory is missing

batch_size = 100     # rows per batch / per output file
max_workers = 50     # concurrent lookups within a batch
max_batches = 300    # upper bound on the batch index handled by this run
total_rows = len(df)
num_batches = (total_rows + batch_size - 1) // batch_size  # ceiling division
print(num_batches)

# Index of the first batch to process.
curr_batch = 0
# Never iterate past the data: beyond num_batches the slices would be empty
# and would only produce empty output files.
last_batch = min(max_batches, num_batches)
for i in range(curr_batch, last_batch):
    timeinit = time.time()

    start = i * batch_size
    end = min(start + batch_size, total_rows)
    batch_df = df.iloc[start:end]

    # executor.map preserves input order, so results line up with the rows.
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        results = list(executor.map(worker, [row for _, row in batch_df.iterrows()]))

    output_filename = os.path.join(temp_dir, f"output_{i+1}.json")
    with open(output_filename, "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)

    timend = time.time()
    print(f"Batch {i+1}/{num_batches}, filas {start} a {end-1} | Elapsed: {timend - timeinit:.2f} s")


1470
Batch 1/1470, filas 0 a 99 | Elapsed: 110.16 s
Batch 2/1470, filas 100 a 199 | Elapsed: 93.98 s
Batch 3/1470, filas 200 a 299 | Elapsed: 68.13 s
Batch 4/1470, filas 300 a 399 | Elapsed: 36.86 s
Batch 5/1470, filas 400 a 499 | Elapsed: 31.16 s
Batch 6/1470, filas 500 a 599 | Elapsed: 38.86 s
Batch 7/1470, filas 600 a 699 | Elapsed: 9.28 s
Batch 8/1470, filas 700 a 799 | Elapsed: 21.41 s
Batch 9/1470, filas 800 a 899 | Elapsed: 108.74 s
Batch 10/1470, filas 900 a 999 | Elapsed: 79.07 s
Batch 11/1470, filas 1000 a 1099 | Elapsed: 89.51 s
Batch 12/1470, filas 1100 a 1199 | Elapsed: 288.16 s
Batch 13/1470, filas 1200 a 1299 | Elapsed: 17.67 s
Batch 14/1470, filas 1300 a 1399 | Elapsed: 30.43 s
Batch 15/1470, filas 1400 a 1499 | Elapsed: 118.45 s
Batch 16/1470, filas 1500 a 1599 | Elapsed: 28.20 s
Batch 17/1470, filas 1600 a 1699 | Elapsed: 23.64 s
Batch 18/1470, filas 1700 a 1799 | Elapsed: 29.54 s
Batch 19/1470, filas 1800 a 1899 | Elapsed: 27.91 s
Batch 20/1470, filas 1900 a 1999 | E