<a href="https://colab.research.google.com/github/ancestor9/2025_Spring_Data-Management/blob/main/week_07/solution_webcrwal_text_mining_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### **실습과제 해답**

In [1]:
import requests
from bs4 import BeautifulSoup
from collections import Counter
from urllib.parse import urljoin
import re
import nltk
from nltk.corpus import stopwords
import pandas as pd
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [2]:

BASE_URL = "https://books.toscrape.com/"
CATEGORY_NAMES = [
    'Travel', 'Mystery', 'Historical Fiction', 'Sequential Art', 'Classics',
    'Philosophy', 'Romance', 'Womens Fiction', 'Fiction', 'Childrens', 'Religion'
]

stop_words = set(stopwords.words('english'))


In [3]:

def get_category_urls():
    res = requests.get(BASE_URL)
    soup = BeautifulSoup(res.text, 'html.parser')
    categories = soup.select('.side_categories ul li ul li a')
    category_dict = {}
    for cat in categories:
        name = cat.text.strip()
        if name in CATEGORY_NAMES:
            href = cat['href']
            category_dict[name] = urljoin(BASE_URL, href)
    return category_dict

def get_book_urls(category_url):
    book_urls = []
    while category_url:
        res = requests.get(category_url)
        soup = BeautifulSoup(res.text, 'html.parser')
        books = soup.select('h3 a')
        for book in books:
            href = book['href'].replace('../../../', '')
            book_urls.append(urljoin(BASE_URL, 'catalogue/' + href))
        next_page = soup.select_one('li.next a')
        if next_page:
            category_url = urljoin(category_url, next_page['href'])
        else:
            break
    return book_urls

def get_book_text(book_url):
    res = requests.get(book_url)
    soup = BeautifulSoup(res.text, 'html.parser')
    title = soup.select_one('h1').text
    description_tag = soup.select_one('#product_description ~ p')
    description = description_tag.text if description_tag else ''
    return title + ' ' + description

def clean_text(text):
    words = re.findall(r'\b[a-z]{3,}\b', text.lower())  # 3자 이상 알파벳 단어
    return [word for word in words if word not in stop_words]

def analyze_category(category_name, category_url):
    print(f"Processing {category_name}...")
    book_urls = get_book_urls(category_url)
    word_counter = Counter()
    for book_url in book_urls:
        raw_text = get_book_text(book_url)
        tokens = clean_text(raw_text)
        word_counter.update(tokens)
    return word_counter.most_common(10)



In [4]:
# 전체 실행
category_urls = get_category_urls()
results = {}

for name, url in category_urls.items():
    results[name] = analyze_category(name, url)

# DataFrame 출력
df = pd.DataFrame({
    genre: [f"{word} ({count})" for word, count in words]
    for genre, words in results.items()
})
df

Processing Travel...
Processing Mystery...
Processing Historical Fiction...
Processing Sequential Art...
Processing Classics...
Processing Philosophy...
Processing Romance...
Processing Womens Fiction...
Processing Fiction...
Processing Childrens...
Processing Religion...


Unnamed: 0,Travel,Mystery,Historical Fiction,Sequential Art,Classics,Philosophy,Romance,Womens Fiction,Fiction,Childrens,Religion
0,travel (24),one (35),love (30),new (83),one (19),one (17),new (60),love (17),one (69),book (29),book (22)
1,world (15),new (35),life (28),one (59),story (19),also (14),one (49),new (16),new (66),new (28),god (16)
2,trip (11),detective (32),world (28),vol (46),love (17),life (14),love (44),one (16),life (63),world (22),spiritual (13)
3,life (9),dark (29),new (25),life (35),man (13),translation (13),life (32),must (14),love (59),time (22),smith (9)
4,long (9),murder (28),two (18),school (30),little (10),christian (12),two (21),something (14),novel (54),little (20),one (9)
5,one (9),strike (27),one (18),story (30),world (10),thought (10),years (20),life (13),family (42),one (18),mormon (9)
6,book (9),mystery (23),book (16),city (30),work (9),human (10),family (20),bridget (13),world (37),bear (18),nones (8)
7,national (8),girl (22),novel (16),series (30),young (8),make (10),york (18),like (12),young (36),story (16),jerk (7)
8,new (8),man (21),century (16),dream (30),english (8),good (10),heart (17),world (12),old (32),red (16),true (7)
9,bill (8),case (20),story (15),death (29),catherine (8),world (9),wants (16),rachel (12),years (32),day (16),written (7)
