In [30]:
!pip install requests beautifulsoup4

You should consider upgrading via the '/Users/pengyuanlong/.pyenv/versions/3.10.4/bin/python3.10 -m pip install --upgrade pip' command.

In [31]:
import requests
from bs4 import BeautifulSoup
import os
import json

In [41]:
def fetch_data(url, timeout=10):
    """Download the page at ``url`` and return its body as text.

    Args:
        url: Address of the page to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a silent host.

    Returns:
        The response body as a string, or ``None`` if the request failed
        (connection problem, timeout, or HTTP error status). Failures are
        reported with ``print`` rather than raised.
    """
    # A browser-like User-Agent is sent with every request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        # Turn 4xx/5xx responses into exceptions so they take the error path.
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error fetching data from {url}: {e}")
        return None
    return response.text

def _text_or_default(node, default):
    """Return the stripped text of ``node``, or ``default`` when ``node`` is None."""
    if node is None:
        return default
    return node.get_text(strip=True)


def parse_data(html):
    """Extract rated reviews from the HTML of one book page.

    Args:
        html: Markup of the page as a string.

    Returns:
        A list with one ``[book_title, author, review_text, rating, label]``
        entry per ``div.post_con`` element that carries a numeric rating.
        Reviews without a usable rating are skipped. ``book_title`` and
        ``author`` fall back to ``"Unknown"`` and ``review_text`` to
        ``"No review available."`` when the corresponding element is missing.
    """
    soup = BeautifulSoup(html, 'html.parser')
    review_data = []

    book_title = _text_or_default(soup.find('h1', itemprop='name'), "Unknown")
    # The author is read from the first <span itemprop="name"> on the page.
    author = _text_or_default(soup.find('span', itemprop='name'), "Unknown")

    for review in soup.find_all('div', class_='post_con'):
        review_text = _text_or_default(
            review.find('div', class_='cri_corps_critique'),
            "No review available.",
        )

        try:
            rating = float(review.find('div', class_='rateit')['data-rateit-value'])
        except (TypeError, ValueError, AttributeError, KeyError):
            # TypeError: no rating <div> was found (find() returned None).
            # KeyError: the <div> exists but lacks the data-rateit-value attribute.
            # ValueError: the attribute value is not a number.
            continue

        label = assign_label(rating)
        review_data.append([book_title, author, review_text, rating, label])

    return review_data

def assign_label(rating):
    """Map a numeric rating to a sentiment label.

    Returns 1 for ratings of 4.0 or more, -1 for ratings of 2.5 or less,
    and 0 for everything in between or when ``rating`` is None.
    """
    if rating is None:
        return 0
    if rating >= 4.0:
        return 1
    if rating <= 2.5:
        return -1
    return 0

In [42]:
def save_data(data, filename, folder):
    """Write ``data`` as indented UTF-8 JSON to ``folder/filename``.

    Args:
        data: JSON-serializable object to store.
        filename: Name of the output file.
        folder: Directory that receives the file. It is created, including
            parents, when missing. An empty string means the current
            working directory.

    The destination path is printed once the file has been written.
    """
    if folder:
        # exist_ok avoids the check-then-create race; the guard skips
        # os.makedirs(''), which raises FileNotFoundError.
        os.makedirs(folder, exist_ok=True)

    path = os.path.join(folder, filename)
    with open(path, 'w', encoding='utf-8') as file:
        # ensure_ascii=False keeps non-ASCII characters readable in the file.
        json.dump(data, file, indent=4, ensure_ascii=False)

    print(f"Data saved to {path}")

In [44]:
def main():
    """Scrape every listed book page and store the combined reviews as JSON.

    Pages that cannot be fetched are skipped; the reviews parsed from the
    remaining pages are written to ``data/raw/reviews_with_ratings.json``.
    """
    # Book pages to scrape (the first five started out as example URLs).
    book_pages = (
        'https://www.babelio.com/livres/Rousselet-La-belle-histoire-des-maths/121795',
        'https://www.babelio.com/livres/Saint-Exupery-Le-Petit-Prince/36712',
        'https://www.babelio.com/livres/Hugo-Les-Miserables/1329123',
        'https://www.babelio.com/livres/Duras-Lamant/5772',
        'https://www.babelio.com/livres/Camus-Letranger/3874',
        'https://www.babelio.com/livres/Hugo-Les-Travailleurs-de-la-mer/7119',
        'https://www.babelio.com/livres/Camus-La-Peste/313209',
        'https://www.babelio.com/livres/Camus-La-Chute/3631',
        'https://www.babelio.com/livres/Proust-A-la-recherche-du-temps-perdu-tome-1--Du-cote-de-/822463',
        'https://www.babelio.com/livres/Stendhal-Oeuvres-romanesques-completes-tome-1/99891',
        'https://www.babelio.com/livres/Balzac-La-comedie-humaine-La-Pleiade-tome-1/711990',
        'https://www.babelio.com/livres/Hugo-Quatrevingt-Treize/7115',
        'https://www.babelio.com/livres/Hugo-LHomme-qui-rit/3295',
        'https://www.babelio.com/livres/Stendhal-Le-Rouge-et-le-Noir/2908',
        'https://www.babelio.com/livres/Dumas-Les-Trois-Mousquetaires/1397512',
        'https://www.babelio.com/livres/Balzac-Eugenie-Grandet/78277',
        'https://www.babelio.com/livres/Maupassant-Mademoiselle-Fifi-et-autres-nouvelles/731832',
        'https://www.babelio.com/livres/Verne-Le-tour-du-monde-en-80-jours/1361536',
        'https://www.babelio.com/livres/Verne-LIle-mysterieuse/9547',
        'https://www.babelio.com/livres/Verne-Vingt-mille-lieues-sous-les-mers/5760',
    )

    collected = []
    for page_url in book_pages:
        page_html = fetch_data(page_url)
        if not page_html:
            # Fetch failed or returned nothing; move on to the next book.
            continue
        collected.extend(parse_data(page_html))

    save_data(collected, 'reviews_with_ratings.json', 'data/raw')


if __name__ == "__main__":
    main()

Data saved to data/raw/reviews_with_ratings.json
