In [1]:
from bs4 import BeautifulSoup
from urllib.request import urlopen, Request
from user_agent import generate_user_agent
from tqdm import tqdm
import openpyxl
import re
from urllib.parse import quote
from datetime import datetime

In [2]:
try:
    wb = openpyxl.load_workbook(f'../data/crawling_all_drama_20230914.xlsx')
except:
    wb = openpyxl.Workbook()
w1 = wb.worksheets[0]

w1.cell(1, 2).value = 'drama_id'
w1.cell(1, 3).value = 'drama_name'
w1.cell(1, 4).value = 'kor_name'
w1.cell(1, 5).value = 'year'
w1.cell(1, 6).value = 'director'
w1.cell(1, 7).value = 'screenwriter'
w1.cell(1, 8).value = 'country'
w1.cell(1, 9).value = 'type'
w1.cell(1, 10).value = 'tot_eps'
w1.cell(1, 11).value = 'duration'
w1.cell(1, 12).value = 'start_dt'
w1.cell(1, 13).value = 'end_dt'
w1.cell(1, 14).value = 'aired_on'
w1.cell(1, 15).value = 'org_net'
w1.cell(1, 16).value = 'content_rt'
w1.cell(1, 17).value = 'synopsis'
w1.cell(1, 18).value = 'rank'
w1.cell(1, 19).value = 'pop'
w1.cell(1, 20).value = 'genres'
w1.cell(1, 21).value = 'watchers'
w1.cell(1, 22).value = 'score'
w1.cell(1, 23).value = 'evaluators'
wb.save(f'../data/crawling_all_drama_20230914.xlsx')

In [3]:
header = {'User-Agent': generate_user_agent()}

# 드라마 링크, ID 받아서 리턴하는 함수
def get_dramas(page):
    tmp_id = []
    tmp_href = []

    url = "https://mydramalist.com/search?adv=titles&ty=68&co=3&re=2015,2023&st=3&so=relevance&page=" + str(page)
    req = Request(url, headers=header)
    res = urlopen(req)
    soup = BeautifulSoup(res.read(), 'html.parser')

    drama_box = soup.find("div", "m-t nav-active-border b-primary")
    dramas = drama_box.findAll("div")
    for drama in dramas:
        try:
            drama_id = re.search("mdl-\d+", drama['id']).group().split("-")[1]
            if drama_id != None:
                tmp_id.append(drama_id)
                drama_href = drama.find('div', "col-xs-9 row-cell content").find('a')['href']
                tmp_href.append(drama_href)
        except:
            continue

    return tmp_id, tmp_href

# 드라마 상세페이지에서 각종 정보 리턴하는 함수
def search_drama(idx, id, search_href):
    drama_id, drama_name, kor_name, year, director, screenwriter, country, type, tot_eps, duration\
        , start_dt, end_dt, aired_on, org_net, content_rt, synopsis, rank, pop, genres, watchers\
        , score, evaluators = [""] * 22
    type = "Drama"

    drama_url = "https://mydramalist.com" + search_href
    req = Request(drama_url, headers=header)
    res = urlopen(req)
    drama_soup = BeautifulSoup(res.read(), 'html.parser')

    side_soup = drama_soup.findAll("ul", "list m-b-0")

    # 이름, 나라, 에피소드수, 방영시작일, 종료일
    detail_soup = side_soup[0].findAll("li")
    drama_name = detail_soup[0].find("span").text
    country = detail_soup[1].text.split(":")[1].strip()
    tot_eps = int(detail_soup[2].text.split(":")[1].strip())
    aired_dt = detail_soup[3].text.split(":")[1].strip().split("-")
    if len(aired_dt) ==2:
        start_dt = aired_dt[0].strip()
        end_dt = aired_dt[1].strip()
    else:
        start_dt = aired_dt[0].strip()

    # 방영요일, 중계플랫폼, 플레이타임, 제한연령
    for li in detail_soup[4:]:
        tp = li.find("b").text
        if tp == "Aired On:":
            aired_on = li.text.split(":")[1].strip()
        if tp == "Original Network:":
            org_net = "[" + li.text.split(":")[1].strip() + "]"
        if tp == "Duration:":
            duration = li.text.split(":")[1].strip().replace("min.", "").split("hr.")
            if len(duration) == 2:
                duration = int(duration[0]) * 60 + int(duration[1])
            else:
                duration = int(duration[0])
            duration *= 60  # 초단위 변환
        if tp == "Content Rating:":
            content_rt = li.text.split(":")[1].strip()

    # 평점, 기여자수, 순위, 인기순위, 시청자수
    statistics_soup = side_soup[1].findAll("li")
    score = float(statistics_soup[0].text.split(":")[1][:4].strip())
    evaluators = int(re.search("\d*,?\d+", statistics_soup[0].find("span").text).group().replace(',', ''))
    rank = int(statistics_soup[1].text.split(":")[1].replace("#", ""))
    pop = int(statistics_soup[2].text.split(":")[1].replace("#", ""))
    watchers = int(statistics_soup[3].text.split(":")[1].replace(",", ""))

    # 한국명, 장르, 작가, 감독
    infos = drama_soup.find("div", "show-detailsxss")
    kor_name = infos.find("li", "list-item p-a-0").text.split(":")[1].strip()
    try:
        genres = infos.find("li", "list-item p-a-0 show-genres").findAll("a")
    except:
        genres = []
    
    screenwriter = []
    director = []
    info_li = infos.findAll("li", "list-item p-a-0")
    for li in info_li[2:]:
        tp = li.find("b").text
        if tp == "Screenwriter:":
            for a in li.findAll("a"):
                screenwriter.append(a.text)
        if tp == "Director:":
            for a in li.findAll("a"):
                director.append(a.text)
        if tp == "Screenwriter & Director:":
            for a in li.findAll("a"):
                director.append(a.text)
                screenwriter.append(a.text)

    # 작가, 감독, 장르 리스트 전처리
    screenwriter = "[" + ",".join(screenwriter) + "]"
    director = "[" + ",".join(director) + "]"
    
    genre_list = []
    for genre in genres:
        genre_list.append(genre.text)
    genre_list = ", ".join(genre_list)
    
    # 시놉시스
    synopsis_soup = drama_soup.find("div", "show-synopsis").findAll("span")
    for s in synopsis_soup:
        synopsis += s.text

    # 방영시작일, 종료일 전처리
    try:
        year = datetime.strptime(start_dt, "%b %d, %Y").year
    except:
        year = ""

    drama_id = id

    return [idx, drama_id, drama_name, kor_name, year, director, screenwriter, country, type, tot_eps
            , duration, start_dt, end_dt, aired_on, org_net, content_rt, synopsis, rank, pop
            , genre_list, watchers, score, evaluators]

# 엑셀 저장
def write_data(w1, datas):
    pos = datas[0]
    w1.cell(pos+1, 1).value = pos
    for i, data in enumerate(datas):
        w1.cell(pos+1, i+1).value = data

try:
    wb = openpyxl.load_workbook(f'../data/crawling_all_drama_20230914.xlsx')
except:
    wb = openpyxl.Workbook()
w1 = wb.worksheets[0]

cnt = 1
for i in tqdm(range(1, 93)):
    tmp_id, tmp_href = get_dramas(i)

    for id, href in zip(tmp_id, tmp_href):
        if w1.cell(cnt+1, 1).value == None:
            datas = search_drama(cnt, id, href)
            write_data(w1, datas)
            wb.save(f'../data/crawling_all_drama_20230914.xlsx')
        cnt += 1
        # time.sleep(0.5)

 92%|█████████▏| 85/92 [01:40<00:08,  1.18s/it]


ValueError: could not convert string to float: 'N/A'

# 스코어 없는 것부터는 데이터 의미 없다고 생각 -> 크롤링 종료


# 아래는 테스트용

In [None]:
# def search_drama(idx, id, search_href):
#     drama_id, drama_name, kor_name, year, director, screenwriter, country, type, tot_eps, duration\
#         , start_dt, end_dt, aired_on, org_net, content_rt, synopsis, rank, pop, genres, watchers\
#         , score, evaluators = [""] * 22
#     type = "Drama"

#     drama_url = "https://mydramalist.com" + search_href
#     req = Request(drama_url, headers=header)
#     res = urlopen(req)
#     drama_soup = BeautifulSoup(res.read(), 'html.parser')

#     side_soup = drama_soup.findAll("ul", "list m-b-0")

#     detail_soup = side_soup[0].findAll("li")
#     drama_name = detail_soup[0].find("span").text
#     country = detail_soup[1].text.split(":")[1].strip()
#     tot_eps = int(detail_soup[2].text.split(":")[1].strip())
#     aired_dt = detail_soup[3].text.split(":")[1].strip().split("-")
#     if len(aired_dt) ==2:
#         start_dt = aired_dt[0].strip()
#         end_dt = aired_dt[1].strip()
#     else:
#         start_dt = aired_dt[0].strip()
#     aired_on = detail_soup[4].text.split(":")[1].strip()


#     for li in detail_soup[5:]:
#         tp = li.find("b").text
#         if tp == "Original Network:":
#             org_net = "[" + li.text.split(":")[1].strip() + "]"
#         if tp == "Duration:":
#             duration = li.text.split(":")[1].strip().replace("min.", "").split("hr.")
#             if len(duration) == 2:
#                 duration = int(duration[0]) * 60 + int(duration[1])
#             else:
#                 duration = int(duration[0])
#             duration *= 60
#         if tp == "Content Rating:":
#             content_rt = li.text.split(":")[1].strip()

#     statistics_soup = side_soup[1].findAll("li")
#     score = float(statistics_soup[0].text.split(":")[1][:4].strip())
#     evaluators = int(re.search("\d*,?\d+", statistics_soup[0].find("span").text).group().replace(',', ''))
#     rank = int(statistics_soup[1].text.split(":")[1].replace("#", ""))
#     pop = int(statistics_soup[2].text.split(":")[1].replace("#", ""))
#     watchers = int(statistics_soup[3].text.split(":")[1].replace(",", ""))


#     infos = drama_soup.find("div", "show-detailsxss")
#     kor_name = infos.find("li", "list-item p-a-0").text.split(":")[1].strip()
#     genres = infos.find("li", "list-item p-a-0 show-genres").findAll("a")

#     screenwriter = []
#     director = []
#     info_li = infos.findAll("li", "list-item p-a-0")
#     for li in info_li[2:]:
#         tp = li.find("b").text
#         if tp == "Screenwriter:":
#             for a in li.find("a"):
#                 screenwriter.append(a.text)
#         if tp == "Director:":
#             for a in li.find("a"):
#                 director.append(a.text)
#         if tp == "Screenwriter & Director:":
#             for a in li.find("a"):
#                 director.append(a.text)
#                 screenwriter.append(a.text)
#     screenwriter = "[" + ",".join(screenwriter) + "]"
#     director = "[" + ",".join(director) + "]"
    
#     genre_list = []
#     for genre in genres:
#         genre_list.append(genre.text)
#     genre_list = ", ".join(genre_list)

#     watchers = drama_soup.findAll("div", "box clear hidden-sm-down")[-1].findAll("li", "list-item p-a-0")[-2].text
#     watchers = watchers.split(":")[1].split(",")
#     watchers = int("".join(watchers))
    
#     synopsis_soup = drama_soup.find("div", "show-synopsis").findAll("span")
#     for s in synopsis_soup:
#         synopsis += s.text

#     year = datetime.strptime(start_dt, "%b %d, %Y").year

#     drama_id = id

#     return [idx, drama_id, drama_name, kor_name, year, director, screenwriter, country, type, tot_eps
#             , duration, start_dt, end_dt, aired_on, org_net, content_rt, synopsis, rank, pop
#             , genre_list, watchers, score, evaluators]

# print(search_drama(1, 40257, "/49231-move-to-heaven"))

[1, 40257, 'Move to Heaven', '무브 투 헤븐', 2021, '[Kim Sung Ho]', '[Yoon Ji Ryun]', 'South Korea', 'Drama', 10, 3120, 'May 14, 2021', '', 'Friday', '[Netflix]', '18+ Restricted (violence & profanity)', 'Han Geu Roo is an autistic 20-year-old. He works for his father’s business “Move To Heaven,” a company that specializes in crime scene cleanup, where they also collect and arrange items left by deceased people, and deliver them to the bereaved family.\n\nWhen Geu Roo\'s father dies, Geu Roo\'s guardianship passes to his uncle, ex-convict Cho Sang Gu, who is a martial arts fighter in underground matches. Per the father\'s will, Sang Gu must care for and work with Geu Roo in “Move To Heaven” for three months to gain full guardianship and claim the inheritance. Eying money, Sang Gu agrees to the conditions and moves in.\n\n(Source: MyDramaList)\n\n~~ Adapted from the nonfiction essay "Things Left Behind" by professional trauma cleaner Kim Sae Byul. Gu must care for and work with Geu Roo in “M