# 네이버 영화 크롤링
- 사용자 영화 추천 시스템을 만들기 위한 Raw Movie Data 크롤링 소스
- 작성 일시: 2018-06-07
- 수정 일시: 2018-06-07
- 작성자: 부현경 (hyunkyung.boo@gmail.com)

In [1]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import mysql.connector
import logging
import time

In [2]:
# 수집된 데이터를 데이터 프레임으로 만들고 저장한다.
def save_to_excel(df, save_path):
    """Write ``df`` to an Excel workbook at ``save_path``.

    The frame goes to a sheet named ``Sheet1`` with its header row and
    without the index column.

    Args:
        df: DataFrame to store.
        save_path: Destination path of the workbook.
    """
    # The context manager saves and closes the workbook on exit, also when
    # writing fails; ``ExcelWriter.save()`` no longer exists in pandas 2.x.
    with pd.ExcelWriter(save_path) as writer:
        df.to_excel(writer, sheet_name='Sheet1', index=False, header=True)
    
# 사용자가 평가한 영화 url 수집
def get_movie_link(url):
    """Collect the movie links found on a review-list page.

    Args:
        url: Address of the review-list page to scan.

    Returns:
        A list of absolute URLs in document order, one for every anchor whose
        ``href`` contains ``st=mcode&sword`` and ends with ``&target=after``.
    """
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    movie_urls = []
    for link in soup.select('a[href]'):
        href = str(link['href'])
        # The two patterns are searched separately: joining the pattern
        # strings with ``and`` inside a single re.search() call would only
        # ever test the second one.
        if re.search(r'st=mcode&sword', href) and re.search(r'&target=after$', href):
            movie_urls.append('http://movie.naver.com/movie/point/af/list.nhn' + href)
    return movie_urls


# 영화 정보 수집
def _movie_row(texts, width):
    """Build one row of ``width`` fields from the cleaned cells of one movie."""
    blank = [""] * width
    # The expected layout is four cells: "a|b|c", director, cast, "rating/...".
    if len(texts) != 4:
        return blank
    row = texts[0].split("|") + [texts[1], texts[2], texts[3].split("/")[0]]
    return row if len(row) == width else blank


def get_detailed_movie_info(url):
    """Scrape the info table of every movie linked from a review-list page.

    Each link returned by ``get_movie_link(url)`` is fetched, the ``td`` cells
    of its info table are stripped of markup, and the cells are folded into
    one six-field row.

    Cells are grouped per movie rather than across all movies, so a movie
    with an unusual number of cells cannot shift the rows of the movies that
    follow it.

    Args:
        url: Review-list page whose linked movies are looked up.

    Returns:
        DataFrame with the columns ``genre``, ``running_time``,
        ``release date``, ``director``, ``actor/actress`` and
        ``movie_rating``. A movie whose cells do not fit the expected layout
        yields a row of empty strings; a page without any non-empty info
        cell yields no row.
    """
    columns = ['genre', 'running_time', 'release date', 'director',
               'actor/actress', 'movie_rating']
    tag = re.compile(r'<.*?>')
    whitespace_run = re.compile(r'(\s){2,}')

    rows = []
    for movie_url in get_movie_link(url):
        response = requests.get(movie_url)
        soup = BeautifulSoup(response.text, 'html.parser')
        # NOTE: the movie title could be collected here as well and used to
        # join this frame with the user reviews by title.
        cells = soup.select('#old_content > div.choice_movie_box > div.choice_movie_info > table > tr > td')

        # Replace tags with a space, remove runs of whitespace, and keep only
        # the cells that still contain text.
        texts = []
        for cell in cells:
            text = whitespace_run.sub("", tag.sub(" ", str(cell)))
            if text != '':
                texts.append(text)

        # A page without info cells contributes nothing.
        if not texts:
            continue
        rows.append(_movie_row(texts, len(columns)))

    return pd.DataFrame(rows, columns=columns)


# Collect the links to a user's review pages.
# At most 10 page links are collected; modify this function if more are needed.
def get_user_page(url):
    """Collect the pagination links of a user's review list.

    Args:
        url: Address of the user's review-list page.

    Returns:
        A list of absolute page URLs in document order, built from every
        anchor whose ``href`` contains both ``st=nickname&sword`` and
        ``page``. When two or more links are found the last one is dropped;
        per the original author's note it is the "next" link, which would
        add one page too many.
    """
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    user_page_urls = []
    for link in soup.select('a[href]'):
        href = str(link['href'])
        # The two patterns are searched separately: joining the pattern
        # strings with ``and`` inside a single re.search() call would only
        # ever test the second one.
        if re.search(r'st=nickname&sword', href) and re.search(r'page', href):
            user_page_urls.append('http://movie.naver.com' + href)

    # A single link is kept as is; an empty list has nothing to drop.
    if len(user_page_urls) > 1:
        user_page_urls.pop()

    return user_page_urls

In [3]:
# 위에서 설정한 함수를 이용해 사용자 정보와 영화 정보를 크롤링하는 함수이다.
def do_crawl(url):
    """Crawl all review pages of one user together with the movie details.

    For every page returned by ``get_user_page(url)`` the review table is
    parsed and joined column-wise with ``get_detailed_movie_info(page)``.

    Args:
        url: Review-list page of the user.

    Returns:
        DataFrame with the rows of all pages stacked in crawl order. The
        column-wise concat uses ``ignore_index=True``, so the columns carry
        positional labels (0, 1, 2, ...) instead of names. An empty DataFrame
        is returned when fewer than two page links are found, which the
        original author uses to leave out users with ten or fewer ratings.
    """
    new_df = pd.DataFrame()
    urls = get_user_page(url)

    if len(urls) < 2:
        return new_df

    for user_url in urls:
        response = requests.get(user_url)
        soup = BeautifulSoup(response.text, 'html.parser')

        detailed_movie_info = get_detailed_movie_info(user_url)

        # The title cell holds both the movie title (in its <a>) and the
        # review text, so it is queried once and used for both columns.
        title_cells = soup.find_all('td', class_='title')

        user_post_id_list = [cell.get_text() for cell in soup.find_all('td', class_='ac num')]
        user_id_list = [anchor.get_text() for anchor in soup.find_all('a', class_='author')]
        title_list = [cell.a.get_text() for cell in title_cells]

        review_list = []
        for cell in title_cells:
            text = re.sub(r'<td class=(.*?)>|</td>', " ", str(cell))
            text = re.sub(r'<br>|<br/>', " ", text)
            # Drop every remaining element together with its content.
            text = re.sub(r'<.*?>(.*?)<.*?>', " ", text)
            text = re.sub(r'(\s){2,}', "", text)
            review_list.append(text)

        score_list = [re.sub(r'<td class=(.*?)>|</td>', " ", str(cell))
                      for cell in soup.find_all('td', class_='point')]

        postDate_list = []
        for cell in soup.find_all('td', class_='num'):
            # Cells with class "ac num" are matched by this query too; the
            # first substitution collapses them to " " so that the filter
            # below can discard them.
            text = re.sub(r'<td class="ac num">(.*?)<.*?>', " ", str(cell))
            text = re.sub(r'<td class=(.*?)>|</td>|<br/>', " ", text)
            text = re.sub(r'<.*?>(.*?)<.*?>', " ", text)
            text = re.sub(r'(\s){2,}', "", text)
            postDate_list.append(text)
        postDate_list = [d.replace(" ", "") for d in postDate_list if d != " "]

        df = pd.DataFrame()
        df['user_post_id'] = user_post_id_list
        df['user_id'] = user_id_list
        df['movie'] = title_list
        df['review'] = review_list
        df['user movie raitng'] = score_list
        df['date of posting'] = postDate_list

        # Rows are paired with the movie details by position.
        df = pd.concat([df, detailed_movie_info], axis=1, ignore_index=True)
        print(user_url, " 크롤링 완료")

        new_df = pd.concat([new_df, df], axis=0, ignore_index=True)

    return new_df

In [4]:
# start must be >= end: numbers are crawled from start down to end, both inclusive.
def getData(start, end):
    """Crawl every number from ``start`` down to ``end``, both inclusive.

    Each number is substituted into the ``sword`` query parameter of the
    review-list URL and the resulting page is passed to ``do_crawl``.

    Args:
        start: First (highest) number to crawl.
        end: Last (lowest) number to crawl.

    Returns:
        One DataFrame with the results of all numbers stacked in crawl order,
        or an empty DataFrame when ``start < end`` and nothing is crawled.
    """
    url_template = "http://movie.naver.com/movie/point/af/list.nhn?st=nickname&sword={}&target=after"

    total_crawling = [
        do_crawl(url_template.format(page))
        for page in range(start, end - 1, -1)
    ]

    # pd.concat() raises ValueError when given no frames at all.
    if not total_crawling:
        return pd.DataFrame()

    return pd.concat(total_crawling, ignore_index=True)

In [6]:
# start, end page 숫자를 넣어주세요. 
# 중간 오류 저장 실패를 방지하기 위해 3번으로 분할
# [start, end] pairs handed to getData(), which crawls from start down to end
# and includes both ends. Adjacent ranges therefore must not share a boundary
# number, otherwise that number is crawled and saved twice.
crawling_page_list = [[14180480, 14180464], [14180463, 14180400], [14180399, 14180350]]

# Each range is written to its own numbered workbook.
for num, (start_page, end_page) in enumerate(crawling_page_list, start=1):
    save_path = "D:\\naver_movie_{}_second_version.xlsx".format(num)
    save_to_excel(getData(start_page, end_page), save_path)
    print("진짜 저장? 저장 위치는: ", save_path)

http://movie.naver.com/movie/point/af/list.nhn?st=nickname&sword=14180477&target=after&page=1  크롤링 완료
http://movie.naver.com/movie/point/af/list.nhn?st=nickname&sword=14180477&target=after&page=2  크롤링 완료
http://movie.naver.com/movie/point/af/list.nhn?st=nickname&sword=14180477&target=after&page=3  크롤링 완료
http://movie.naver.com/movie/point/af/list.nhn?st=nickname&sword=14180477&target=after&page=4  크롤링 완료
http://movie.naver.com/movie/point/af/list.nhn?st=nickname&sword=14180477&target=after&page=5  크롤링 완료
http://movie.naver.com/movie/point/af/list.nhn?st=nickname&sword=14180470&target=after&page=1  크롤링 완료
http://movie.naver.com/movie/point/af/list.nhn?st=nickname&sword=14180470&target=after&page=2  크롤링 완료
http://movie.naver.com/movie/point/af/list.nhn?st=nickname&sword=14180470&target=after&page=3  크롤링 완료
http://movie.naver.com/movie/point/af/list.nhn?st=nickname&sword=14180470&target=after&page=4  크롤링 완료
http://movie.naver.com/movie/point/af/list.nhn?st=nickname&sword=14180468&target=a

http://movie.naver.com/movie/point/af/list.nhn?st=nickname&sword=14180436&target=after&page=9  크롤링 완료
http://movie.naver.com/movie/point/af/list.nhn?st=nickname&sword=14180436&target=after&page=10  크롤링 완료
http://movie.naver.com/movie/point/af/list.nhn?st=nickname&sword=14180435&target=after&page=1  크롤링 완료
http://movie.naver.com/movie/point/af/list.nhn?st=nickname&sword=14180435&target=after&page=2  크롤링 완료
http://movie.naver.com/movie/point/af/list.nhn?st=nickname&sword=14180435&target=after&page=3  크롤링 완료
http://movie.naver.com/movie/point/af/list.nhn?st=nickname&sword=14180435&target=after&page=4  크롤링 완료
http://movie.naver.com/movie/point/af/list.nhn?st=nickname&sword=14180435&target=after&page=5  크롤링 완료
http://movie.naver.com/movie/point/af/list.nhn?st=nickname&sword=14180435&target=after&page=6  크롤링 완료
http://movie.naver.com/movie/point/af/list.nhn?st=nickname&sword=14180435&target=after&page=7  크롤링 완료
http://movie.naver.com/movie/point/af/list.nhn?st=nickname&sword=14180435&target=

http://movie.naver.com/movie/point/af/list.nhn?st=nickname&sword=14180395&target=after&page=2  크롤링 완료
http://movie.naver.com/movie/point/af/list.nhn?st=nickname&sword=14180395&target=after&page=3  크롤링 완료
http://movie.naver.com/movie/point/af/list.nhn?st=nickname&sword=14180395&target=after&page=4  크롤링 완료
http://movie.naver.com/movie/point/af/list.nhn?st=nickname&sword=14180395&target=after&page=5  크롤링 완료
http://movie.naver.com/movie/point/af/list.nhn?st=nickname&sword=14180395&target=after&page=6  크롤링 완료
http://movie.naver.com/movie/point/af/list.nhn?st=nickname&sword=14180395&target=after&page=7  크롤링 완료
http://movie.naver.com/movie/point/af/list.nhn?st=nickname&sword=14180395&target=after&page=8  크롤링 완료
http://movie.naver.com/movie/point/af/list.nhn?st=nickname&sword=14180395&target=after&page=9  크롤링 완료
http://movie.naver.com/movie/point/af/list.nhn?st=nickname&sword=14180395&target=after&page=10  크롤링 완료
http://movie.naver.com/movie/point/af/list.nhn?st=nickname&sword=14180389&target=

http://movie.naver.com/movie/point/af/list.nhn?st=nickname&sword=14180372&target=after&page=6  크롤링 완료
http://movie.naver.com/movie/point/af/list.nhn?st=nickname&sword=14180372&target=after&page=7  크롤링 완료
http://movie.naver.com/movie/point/af/list.nhn?st=nickname&sword=14180372&target=after&page=8  크롤링 완료
http://movie.naver.com/movie/point/af/list.nhn?st=nickname&sword=14180372&target=after&page=9  크롤링 완료
http://movie.naver.com/movie/point/af/list.nhn?st=nickname&sword=14180370&target=after&page=1  크롤링 완료
http://movie.naver.com/movie/point/af/list.nhn?st=nickname&sword=14180370&target=after&page=2  크롤링 완료
http://movie.naver.com/movie/point/af/list.nhn?st=nickname&sword=14180369&target=after&page=1  크롤링 완료
http://movie.naver.com/movie/point/af/list.nhn?st=nickname&sword=14180369&target=after&page=2  크롤링 완료
http://movie.naver.com/movie/point/af/list.nhn?st=nickname&sword=14180369&target=after&page=3  크롤링 완료
http://movie.naver.com/movie/point/af/list.nhn?st=nickname&sword=14180369&target=a

http://movie.naver.com/movie/point/af/list.nhn?st=nickname&sword=14180348&target=after&page=6  크롤링 완료
http://movie.naver.com/movie/point/af/list.nhn?st=nickname&sword=14180348&target=after&page=7  크롤링 완료
http://movie.naver.com/movie/point/af/list.nhn?st=nickname&sword=14180348&target=after&page=8  크롤링 완료
http://movie.naver.com/movie/point/af/list.nhn?st=nickname&sword=14180348&target=after&page=9  크롤링 완료
http://movie.naver.com/movie/point/af/list.nhn?st=nickname&sword=14180348&target=after&page=10  크롤링 완료
http://movie.naver.com/movie/point/af/list.nhn?st=nickname&sword=14180344&target=after&page=1  크롤링 완료
http://movie.naver.com/movie/point/af/list.nhn?st=nickname&sword=14180344&target=after&page=2  크롤링 완료
http://movie.naver.com/movie/point/af/list.nhn?st=nickname&sword=14180341&target=after&page=1  크롤링 완료
http://movie.naver.com/movie/point/af/list.nhn?st=nickname&sword=14180341&target=after&page=2  크롤링 완료
http://movie.naver.com/movie/point/af/list.nhn?st=nickname&sword=14180341&target=

http://movie.naver.com/movie/point/af/list.nhn?st=nickname&sword=14180302&target=after&page=10  크롤링 완료
http://movie.naver.com/movie/point/af/list.nhn?st=nickname&sword=14180300&target=after&page=1  크롤링 완료
http://movie.naver.com/movie/point/af/list.nhn?st=nickname&sword=14180300&target=after&page=2  크롤링 완료
http://movie.naver.com/movie/point/af/list.nhn?st=nickname&sword=14180300&target=after&page=3  크롤링 완료
http://movie.naver.com/movie/point/af/list.nhn?st=nickname&sword=14180300&target=after&page=4  크롤링 완료
http://movie.naver.com/movie/point/af/list.nhn?st=nickname&sword=14180298&target=after&page=1  크롤링 완료
http://movie.naver.com/movie/point/af/list.nhn?st=nickname&sword=14180298&target=after&page=2  크롤링 완료
http://movie.naver.com/movie/point/af/list.nhn?st=nickname&sword=14180298&target=after&page=3  크롤링 완료
http://movie.naver.com/movie/point/af/list.nhn?st=nickname&sword=14180298&target=after&page=4  크롤링 완료
http://movie.naver.com/movie/point/af/list.nhn?st=nickname&sword=14180292&target=

http://movie.naver.com/movie/point/af/list.nhn?st=nickname&sword=14180265&target=after&page=8  크롤링 완료
http://movie.naver.com/movie/point/af/list.nhn?st=nickname&sword=14180265&target=after&page=9  크롤링 완료
http://movie.naver.com/movie/point/af/list.nhn?st=nickname&sword=14180265&target=after&page=10  크롤링 완료
http://movie.naver.com/movie/point/af/list.nhn?st=nickname&sword=14180264&target=after&page=1  크롤링 완료
http://movie.naver.com/movie/point/af/list.nhn?st=nickname&sword=14180264&target=after&page=2  크롤링 완료
http://movie.naver.com/movie/point/af/list.nhn?st=nickname&sword=14180263&target=after&page=1  크롤링 완료
http://movie.naver.com/movie/point/af/list.nhn?st=nickname&sword=14180263&target=after&page=2  크롤링 완료
http://movie.naver.com/movie/point/af/list.nhn?st=nickname&sword=14180263&target=after&page=3  크롤링 완료
http://movie.naver.com/movie/point/af/list.nhn?st=nickname&sword=14180263&target=after&page=4  크롤링 완료
http://movie.naver.com/movie/point/af/list.nhn?st=nickname&sword=14180263&target=

http://movie.naver.com/movie/point/af/list.nhn?st=nickname&sword=14180238&target=after&page=5  크롤링 완료
http://movie.naver.com/movie/point/af/list.nhn?st=nickname&sword=14180238&target=after&page=6  크롤링 완료
http://movie.naver.com/movie/point/af/list.nhn?st=nickname&sword=14180238&target=after&page=7  크롤링 완료
http://movie.naver.com/movie/point/af/list.nhn?st=nickname&sword=14180238&target=after&page=8  크롤링 완료
http://movie.naver.com/movie/point/af/list.nhn?st=nickname&sword=14180238&target=after&page=9  크롤링 완료
http://movie.naver.com/movie/point/af/list.nhn?st=nickname&sword=14180238&target=after&page=10  크롤링 완료
http://movie.naver.com/movie/point/af/list.nhn?st=nickname&sword=14180237&target=after&page=1  크롤링 완료
http://movie.naver.com/movie/point/af/list.nhn?st=nickname&sword=14180237&target=after&page=2  크롤링 완료
http://movie.naver.com/movie/point/af/list.nhn?st=nickname&sword=14180237&target=after&page=3  크롤링 완료
http://movie.naver.com/movie/point/af/list.nhn?st=nickname&sword=14180237&target=

In [94]:
# 저장된 raw data 엑셀 파일을 불러오는 방법
# Read the numbered workbooks back and stack them into one frame.
num_list = [1, 2, 3]
path_template = "D:\\naver_movie_{}_second_version.xlsx"

data_df = pd.DataFrame()
for num in num_list:
    load_path = path_template.format(num)
    df = pd.read_excel(load_path, 'Sheet1')
    # Append below the rows loaded so far and renumber the index.
    data_df = pd.concat([data_df, df], axis=0, ignore_index=True)

# Show the first two rows as a quick check.
print(data_df.head(2))

         0         1           2   \
0  14180477  love****        툼레이더   
1  14160223  love****  어쌔신: 더 비기닝   

                                                  3   4         5        6   \
0                                툼레이더 아니고 인디아나라라 입니다   6  18.06.05   액션 ,모험   
1  평점 넘 낮다 8점 정도지만 평점 높히려고 10 줌 평점 낮아서 안볼까 하다가 캐치...  10  18.05.28  액션 ,스릴러   

     7               8         9                        10    11  
0  118분  개봉2018 . 03.08   로아 우다우그                알리시아 비칸데르  6.81  
1  112분  개봉2017 . 12.07  마이클 쿠에스타  딜런 오브라이언 ,마이클 키튼 ,산나 라단  7.01  


In [95]:
# data_df의 열 이름이 0 ~ 11로 되어 있으므로 바꿔주겠습니다. 
# Descriptive names for the positional column labels 0-11, in column order.
# rename() returns a new frame and the result is not assigned here, so
# data_df itself keeps its numeric labels.
column_names = ['post_id', 'user_id', 'movie_title', 'user_review',
                'user_rating', 'post_date', 'movie_genre', 'running_time',
                'release_date', 'director', 'actor', 'movie_rating']
data_df.rename(columns=dict(enumerate(column_names)))

Unnamed: 0,post_id,user_id,movie_title,user_review,user_rating,post_date,movie_genre,running_time,release_date,director,actor,movie_rating
0,14180477,love****,툼레이더,툼레이더 아니고 인디아나라라 입니다,6,18.06.05,"액션 ,모험",118분,개봉2018 . 03.08,로아 우다우그,알리시아 비칸데르,6.81
1,14160223,love****,어쌔신: 더 비기닝,평점 넘 낮다 8점 정도지만 평점 높히려고 10 줌 평점 낮아서 안볼까 하다가 캐치...,10,18.05.28,"액션 ,스릴러",112분,개봉2017 . 12.07,마이클 쿠에스타,"딜런 오브라이언 ,마이클 키튼 ,산나 라단",7.01
2,14069333,love****,하우스 오브 데스,아놔 이거 귀신영화 아니었어? 아니아니 저여자가 귀신이었군 반전 대박쓰,6,18.05.01,"드라마 ,공포 ,스릴러",90분,개봉2018 . 01.25,애덤 쉰들러,"베스 리스그래프 ,로리 컬킨 ,잭 케시",5.86
3,14035620,love****,테이큰 비긴즈,재미없음 글고 아줌마 옷이 멀쩡했다가 찢어져있다가 또 피투성이였다가 옥의티 많음 ㅜ,4,18.04.25,"드라마 ,스릴러",93분,개봉2017 . 04.13,하워드 J. 포드,안젤라 딕슨,4.37
4,13971073,love****,레디 플레이어 원,간만에 굿~~! 잼게봐따,9,18.04.04,"액션 ,SF ,모험",140분,개봉2018 . 03.28,스티븐 스필버그,"마크 라이런스 ,사이먼 페그 ,올리비아 쿡",8.58
5,13614268,love****,지오스톰,평점 높혀야지 난 괜찮게봤음 근데 예상 가능하긴 함 스토리 전개 보면서 다맞힘 신랑...,10,18.01.05,"액션 ,SF ,스릴러",109분,개봉2017 . 10.19,딘 데블린,"제라드 버틀러 ,짐 스터게스 ,애비 코니쉬",7.46
6,13527973,love****,키드냅,와 속터져 지금 초중반인데 암걸릴거같애,8,17.12.23,"액션 ,스릴러",94분,개봉2017 . 11.22,루이스 프리에토,할리 베리,6.97
7,13368485,love****,그것,아니 애들이 뭔 연기를 이리 잘해!!!!,8,17.11.07,"공포 ,드라마",135분,개봉2017 . 09.06,안드레스 무시에티,"빌 스카스가드 ,제이든 리버허",6.98
8,13313680,love****,윈드 리버,잔잔하게 묵묵히 보다가 끝에서 혼자 열받아서 땀을 한바가지 흘리면서 봣다...,8,17.10.22,"서스펜스 ,액션 ,스릴러",111분,개봉2017 . 09.14,테일러 쉐리던,"제레미 레너 ,엘리자베스 올슨",8.47
9,13275303,love****,청년경찰,난 넘 잼게봣다 ㅋㅋㅋ 박서준 넘 좋아 근데 하늘이 연기가 더 웃겼음 ㅋㅋㅋㅋ,9,17.10.11,액션,109분,개봉2017 . 08.09,김주환,"박서준 ,강하늘",8.37
