# 인터파크 일일 베스트셀러 책 순위 크롤링

In [2]:
import requests
from urllib.parse import quote
import pandas as pd
from bs4 import BeautifulSoup

### 1. 데이터 가져오기

In [3]:
# Base URL of the site; relative links taken from the page are appended to it later.
site = 'http://book.interpark.com'
# Listing page that holds the ranked book entries parsed below.
url = f'{site}/display/collectlist.do?_method=bestsellerHourNew&bookblockname=b_gnb&booklinkname=%BA%A3%BD%BA%C6%AE%C1%B8&bid1=w_bgnb&bid2=LiveRanking&bid3=main&bid4=001#'
# requests has no default timeout, so an unresponsive server would hang the
# cell forever; raise_for_status() stops on an HTTP error response instead of
# letting an error page be parsed as if it were the listing.
res = requests.get(url, timeout=10)
res.raise_for_status()

In [4]:
# Parse the body of the listing response into a searchable tree,
# using the 'html.parser' backend.
soup = BeautifulSoup(res.text, 'html.parser')

### 2. 찾으려는 데이터 찾기

In [5]:
# Select every <li> that is a direct child of an <ol> directly under an
# element with class "rankBestContentList"; each one is handled below as
# one ranked book entry.
lis = soup.select('.rankBestContentList > ol > li')
# Show how many entries were matched.
len(lis)

15

### 3. 하나의 데이터로 원하는 정보 추출

In [6]:
# Take the first entry and read the link target of the first <a> inside
# its cover-image block; this is the site-relative detail-page URL.
li = lis[0]
cover = li.select_one('.coverImage')
href = cover.find('a')['href']
href

'/product/BookDisplay.do?_method=detail&sc.shopNo=0000400000&sc.prdNo=354069935&sc.saNo=003003001&bid1=Best_zone&bid2=LiveRanking&bid3=PRD&bid4=001'

In [7]:
# Text of the item-name element, with surrounding whitespace removed.
title_tag = li.select_one('.itemName')
title = title_tag.get_text().strip()
title

'트렌드 코리아 2022'

In [8]:
# Text of the author element, taken as-is (no whitespace stripping).
author_tag = li.select_one('.author')
author = author_tag.get_text()
author

'김난도(金蘭都), 전미영, 최지혜, 이향은, 이준영'

In [9]:
# Text of the publisher ("company") element, taken as-is.
company_tag = li.select_one('.company')
company = company_tag.get_text()
company

'미래의창'

In [11]:
# Read the price text, then remove the comma separators so that it can be
# converted to an integer.
price_text = li.select_one('.price > em').get_text().strip()
price = int(price_text.replace(',', ''))
price

16200

- 1 ~ 9위 랭크 찾기

In [12]:
# Inspect the first rank badge element of the current entry.
li.select_one('.rankBtn_ctrl')

<span class="rankBtn_ctrl rkNum_B01"></span>

In [13]:
# The badge's class attribute comes back as a list of class names.
li.select_one('.rankBtn_ctrl')['class']

['rankBtn_ctrl', 'rkNum_B01']

In [14]:
# The second class name is the one that encodes the rank.
li.select_one('.rankBtn_ctrl')['class'][1]

'rkNum_B01'

In [15]:
# The last character of that class name is the rank digit (as a string).
li.select_one('.rankBtn_ctrl')['class'][1][-1]

'1'

In [16]:
# Single-badge case: the rank is the last character of the badge's second
# class name, converted to an integer.
badge_classes = li.select_one('.rankBtn_ctrl')['class']
rank = int(badge_classes[1][-1:])
rank

1

- 10위 이상

In [17]:
# Switch to the entry at index 14 to look at a rank of 10 or above; in the
# output captured below such an entry has two badge spans, one per digit.
li = lis[14]
li.select('.rankBtn_ctrl')

[<span class="rankBtn_ctrl rkNum_M01"></span>,
 <span class="rankBtn_ctrl rkNum_M04"></span>]

In [18]:
# Class names of the first badge span.
li.select('.rankBtn_ctrl')[0]['class']

['rankBtn_ctrl', 'rkNum_M01']

In [19]:
# Last character of the first badge's second class name: the leading rank digit.
li.select('.rankBtn_ctrl')[0]['class'][1][-1]

'1'

In [20]:
# Same lookup on the second badge span: the trailing rank digit.
li.select('.rankBtn_ctrl')[1]['class'][1][-1]

'4'

- 모든 경우에 대처

In [21]:
# Build the rank from however many badge spans the entry has. Each span's
# second class name ends in one digit (e.g. 'rkNum_M01' -> '1'); the digits
# are concatenated in document order and converted to an integer.
# A single select() replaces the repeated queries of the two-branch version,
# and the same expression covers one-digit, two-digit and longer ranks.
# An entry with no badge span at all raises ValueError from int('').
badges = li.select('.rankBtn_ctrl')
rank = int(''.join(badge['class'][1][-1] for badge in badges))
rank

14

- sub page

In [22]:
# Fetch the detail page for the link currently held in `href`
# (a site-relative path, so it is appended to the base URL).
url = site + href
# requests has no default timeout, so a stalled connection would hang the
# cell; raise_for_status() stops on an HTTP error response instead of
# letting an error page be parsed as a book detail page.
res = requests.get(url, timeout=10)
res.raise_for_status()
sub_soup = BeautifulSoup(res.text, 'html.parser')

In [23]:
# Collect the <li> entries that are direct children of <ul class="bInfo_txt">
# on the detail page.
sub_lis = sub_soup.select('ul.bInfo_txt > li')
# Uncomment the next line to inspect the raw entries.
# sub_lis

In [24]:
# The third-from-last entry is read as "label : value" text: split on ':'
# and keep the piece right after the first colon, trimmed. In the run
# captured below this is the publication date.
date = sub_lis[-3].get_text().split(':')[1].strip()
date

'2021년 10월 06일'

In [25]:
import re
# Reduce the date text to its digits only.
date = re.sub('[^0-9]','',date)     # '^' inside [] negates the set: every non-digit character is removed
date

'20211006'

In [26]:
# The second-from-last entry is read the same way ("label : value"); in the
# run captured below the value is the page count, still as a string here.
page = sub_lis[-2].get_text().split(':')[1].strip()
page

'452'

### 4. DataFrame 만들기

In [27]:
# Build one row per ranked entry: the fields available on the listing page
# plus the publication date and page count read from each detail page.
lines = []
for li in lis:
    # Fields taken directly from the listing entry.
    href = li.select_one('.coverImage').find('a')['href']
    title = li.select_one('.itemName').get_text().strip()
    author = li.select_one('.author').get_text()
    company = li.select_one('.company').get_text()
    # Remove the comma separators before converting the price to int.
    price = int(li.select_one('.price > em').get_text().strip().replace(',', ''))

    # Each rank badge span contributes one digit (the last character of its
    # second class name). Joining the digits of all spans handles one-digit
    # and multi-digit ranks with a single query instead of two duplicated
    # branches. An entry with no badge span raises ValueError from int('').
    badges = li.select('.rankBtn_ctrl')
    rank = int(''.join(badge['class'][1][-1] for badge in badges))

    # One extra request per entry for its detail page. The timeout keeps a
    # stalled connection from hanging the whole loop, and raise_for_status()
    # stops on an HTTP error instead of parsing an error page.
    url = site + href
    res = requests.get(url, timeout=10)
    res.raise_for_status()
    sub_soup = BeautifulSoup(res.text, 'html.parser')
    sub_lis = sub_soup.select('ul.bInfo_txt > li')

    # Third-from-last entry: keep only the digits of the text after the
    # first colon. Second-from-last entry: the value after the colon as int.
    date = re.sub('[^0-9]', '', sub_lis[-3].get_text().split(':')[1])
    page = int(sub_lis[-2].get_text().split(':')[1].strip())

    # date[2:] drops the first two characters (the century of an 8-digit
    # date), leaving a YYMMDD string.
    lines.append([rank, title, author, company, price, date[2:], page])
    

In [28]:
# Column headers, in the same order as the fields of each row in `lines`.
columns = ['No','제목','저자','출판사','가격','발행일','쪽수']
df = pd.DataFrame(data=lines, columns=columns)
df

Unnamed: 0,No,제목,저자,출판사,가격,발행일,쪽수
0,1,트렌드 코리아 2022,"김난도(金蘭都), 전미영, 최지혜, 이향은, 이준영",미래의창,16200,211006,452
1,2,불편한 편의점(15만부 기념 윈터 에디션),김호연,나무옆의자,12600,210420,268
2,3,오은영의 화해,오은영,코리아닷컴,14400,190110,320
3,4,아직 오지 않은 날들을 위하여,파스칼 브뤼크네르/이세진 역,인플루엔셜,14400,211112,320
4,5,달러구트 꿈 백화점,이미예,팩토리나인,12420,200708,300
5,6,오십에 읽는 논어,최종엽,유노북스,14400,211103,300
6,6,거꾸로 읽는 세계사,유시민(柳時敏),돌베개,15750,211029,404
7,8,달러구트 꿈 백화점. 2,이미예,팩토리나인,12420,210727,308
8,9,웰씽킹(Wealthinking),켈리 최,다산북스,14400,211110,316
9,9,어떻게 말해줘야 할까,오은영/차상미 그림,김영사,15750,201025,400


In [29]:
# Write the table to a CSV file in the working directory, without the
# DataFrame index column.
# NOTE(review): the file name carries a fixed date suffix; confirm whether it
# should follow the day the notebook is run.
df.to_csv('인터파크베스트셀러순위_211207.csv', index=False)