# Watcha Pedia 박스 오피스 영화 정보 수집하기

## 필요한 라이브러리 설치

In [None]:
# Install the third-party packages this notebook relies on.
# The leading "!" is Jupyter/IPython syntax that runs the rest of the line as a shell command.
# lxml: imported below as `from lxml import html` for XPath-based HTML parsing.
!pip install lxml
# requests: HTTP client used to download the pages and images.
!pip install requests
# bs4: provides BeautifulSoup, used in the alternative parsing example.
!pip install bs4
# openpyxl: never imported directly in this notebook; presumably the engine pandas uses for the .xlsx export (verify).
!pip install openpyxl
# selenium: browser automation used in the Instagram section.
!pip install selenium

## 목표 URL 수집하기

In [None]:
import requests  # library that makes issuing HTTP requests simple

# Address of the page to crawl (Watcha Pedia, movie domain).
watcha_url = "https://pedia.watcha.com/ko-KR/?domain=movie"

# Always pass a timeout: without one, requests waits indefinitely on a stalled
# connection and the notebook cell never finishes.
response = requests.get(watcha_url, timeout=10)

# Leave the Response object as the last expression so the notebook displays it.
# Available attributes: https://docs.python-requests.org/en/latest/api/#requests.Response
response

In [None]:
from lxml import html  # library that parses XML or HTML text

# XPath selecting the href attribute of every link inside the list on the landing page.
BOX_OFFICE_LINK_XPATH = '//*[@id="root"]/div/div[1]/section/div/section/div[1]/div[2]/div/div[1]/div/div/ul//li//a/@href'

landing_tree = html.fromstring(response.text)
bo_urls = []
for href in landing_tree.xpath(BOX_OFFICE_LINK_XPATH):
    # Prefix each href with the site origin to build the full movie-page URL.
    bo_urls.append("https://pedia.watcha.com" + href)
bo_urls

### [Xpath 더 알아보기](https://www.w3schools.com/xml/xpath_syntax.asp)

## 각 URL에서 원하는 정보 획득하기

In [None]:
# Visit every collected movie URL and gather one dict of fields per movie.
# Each field takes the FIRST node its XPath matches; if an XPath matches
# nothing (e.g. the page layout changed), the `[0]` lookup raises IndexError.
bo_list = []
for bo_url in bo_urls:
    # A timeout keeps one stalled request from hanging the whole loop.
    movie_page = requests.get(bo_url, timeout=10)
    # Parse the HTML once and run all five XPath queries against the same tree
    # instead of re-parsing the document for every field.
    page_tree = html.fromstring(movie_page.text)
    # Text of the <h1> element.
    movie_title = page_tree.xpath('//*[@id="root"]/div/div[1]/section/div/div[2]/div/section/div[2]/div/div/div/div/h1')[0].text
    # First text node of the first <div> next to the title (split into year/genre/nation later).
    movie_detail = page_tree.xpath('//*[@id="root"]/div/div[1]/section/div/div[2]/div/section/div[2]/div/div/div/div/div[1]/text()')[0]
    # First text node of the article body in the second section.
    movie_desc = page_tree.xpath('//*[@id="root"]/div/div[1]/section/div/div[2]/div/div/div/div[1]/div[1]/div/div/section[2]/div[2]/div/article/div[2]/div/text()')[0]
    # First text node of the second <div> next to the title (the rating is extracted from it later).
    movie_rate = page_tree.xpath('//*[@id="root"]/div/div[1]/section/div/div[2]/div/section/div[2]/div/div/div/div/div[2]/text()')[0]
    # `src` attribute of the first matching <img>.
    movie_poster = page_tree.xpath('//*[@id="root"]/div/div[1]/section/div/div[2]/div/section/div[1]/div[2]/div/div/div/div[1]/img')[0].get('src')
    movie_dict = {"movie_title": movie_title, "movie_detail": movie_detail,
                  "movie_desc": movie_desc, "movie_rate": movie_rate, "movie_poster": movie_poster}
    bo_list.append(movie_dict)


In [None]:
# Display the collected list (one dict per movie) to inspect the scrape result.
bo_list

In [None]:
# Check that the data was collected properly by rendering one poster inline.
from IPython.display import Image, display  # tools for showing images inside a Jupyter notebook
# Index 1 is the second movie in the collected list.
second_movie = bo_list[1]
Image(url=second_movie['movie_poster'])

## BeautifulSoup4로 한다면?

In [None]:
from bs4 import BeautifulSoup  # https://www.crummy.com/software/BeautifulSoup/bs4/doc/

# CSS selector for the title <h1> on a movie page.
TITLE_SELECTOR = '#root > div > div.css-1xm32e0 > section > div > div.css-1ihluk0-Content.e1ezac431 > div > section > div.css-1p7n6er-Pane.e1svyhwg15 > div > div > div > div > h1'

# Same title extraction as the lxml version, done with BeautifulSoup: print each movie's title.
for page_url in bo_urls:
    page_html = requests.get(page_url).text
    title_tag = BeautifulSoup(page_html, 'html.parser').select_one(TITLE_SELECTOR)
    print(title_tag.get_text())

## pandas DataFrame으로 만들기

In [None]:
import pandas as pd

In [None]:
# Build a DataFrame from the list of per-movie dicts; the dict keys become the column names.
box_office = pd.DataFrame(bo_list)
# Display the resulting table.
box_office

## movie_detail에서 연도, 장르, 국적 분리하기

In [None]:
# Split movie_detail on " ・ " once and fan the pieces out into separate
# columns: part 0 -> year, part 1 -> genre, part 2 -> nation.
detail_parts = box_office['movie_detail'].str.split(" ・ ", expand=True)
# Make sure columns 0-2 always exist, so a row with fewer than three parts
# gets missing values instead of raising IndexError.
detail_parts = detail_parts.reindex(columns=range(3))
box_office['movie_year'] = detail_parts[0]
box_office['movie_genre'] = detail_parts[1]
box_office['movie_nation'] = detail_parts[2]
# The combined column is no longer needed once it has been split.
box_office = box_office.drop('movie_detail', axis=1)
box_office

## movie_rate에서 평점만 추출하기

In [None]:
# Replace each movie_rate string with the list of all decimal numbers
# (digits, a dot, digits) found in it. The pattern is a raw string so that
# `\d` and `\.` reach the regex engine untouched; in a normal string they are
# invalid escape sequences that newer Python versions warn about.
box_office['movie_rate'] = box_office['movie_rate'].str.findall(r"\d+\.\d+")  # regular expression; to be covered later
box_office

In [None]:
def _first_match(matches):
    """Return the first item of *matches*, or None when it is an empty list."""
    # Conditional (ternary) expression: <value> if <condition> else <other value>
    return None if matches == [] else matches[0]


# Collapse each list of regex matches down to its first match.
box_office['movie_rate'] = box_office['movie_rate'].apply(_first_match)
box_office

## 파일로 저장하기

In [None]:
# Write the table to box_office.csv in the working directory; index=False leaves the row index out of the file.
box_office.to_csv("box_office.csv", index=False)

In [None]:
# Write the same table to an Excel workbook, again without the row index.
box_office.to_excel("box_office.xlsx", index=False)

## 파일 불러오기

In [None]:
# Read the CSV saved above back into a new DataFrame, which is used for plotting below.
bodf = pd.read_csv('box_office.csv')

## 폰트 설정하기

In [None]:
# Choose a Korean-capable matplotlib font depending on the operating system.
import platform
import matplotlib.pyplot as plt

# platform.system() value -> font family; other systems keep matplotlib's default font.
KOREAN_FONT_BY_OS = {
    'Darwin': 'AppleGothic',     # Mac
    'Windows': 'Malgun Gothic',  # Windows
}

korean_font = KOREAN_FONT_BY_OS.get(platform.system())
if korean_font is not None:
    plt.rc('font', family=korean_font)

## 영화별 평점 Bar plot 그리기

In [None]:
# Draw a bar chart with one bar per movie: titles on the x-axis, ratings as bar heights.
bodf.plot.bar(x='movie_title',y='movie_rate')

## 인스타그램은 어떻게 크롤링할까?

### Selenium chrome driver 설치하기 [Chrome driver 다운로드](https://sites.google.com/chromium.org/driver/)
#### Windows : C:\Windows 에 넣기
#### Mac : /usr/local/bin 에 넣기

In [7]:
from selenium import webdriver
from selenium.webdriver.common.by import By
import os

chrome_options = webdriver.ChromeOptions()
# Give the browser an arbitrary User-Agent so the visit does not reveal that it comes from Python code.
chrome_options.add_argument('--user-agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36"')
# Pass the options through `options=`: the older `chrome_options=` keyword is
# deprecated and is no longer accepted by recent Selenium releases.
driver = webdriver.Chrome(options=chrome_options)
# Start page opened by the driver in the next cell.
URL = "https://www.instagram.com/"

  import sys


In [8]:
# Load the Instagram start page in the automated browser.
driver.get(URL)

In [9]:
import login_info

In [10]:
# Credentials come from the local login_info module, so they are not hard-coded in the notebook.
ID = login_info.ID
PW = login_info.PW
# Type each value into the form input whose `name` attribute matches, username first.
for field_name, field_value in (("username", ID), ("password", PW)):
    driver.find_element(By.NAME, field_name).send_keys(field_value)

In [11]:
# Click the first element with class "L3NKy" - presumably the log-in submit button.
# NOTE(review): the class name is opaque and looks auto-generated; confirm against the live page.
driver.find_element(By.CLASS_NAME, "L3NKy").click()

In [12]:
# Click an element with class "L3NKy" once more - presumably a prompt shown after logging in
# uses the same class for its button (TODO confirm which dialog this is).
driver.find_element(By.CLASS_NAME, "L3NKy").click()

In [13]:
# Click the first element with class "aOOlW" - presumably dismisses another post-login dialog
# (TODO confirm; the class name is opaque).
driver.find_element(By.CLASS_NAME, "aOOlW").click()

In [14]:
# Navigate to the logged-in account's own profile page.
profile_url = 'https://www.instagram.com/' + login_info.ID
driver.get(profile_url)

In [None]:
# Download every image found on the current page into ./profile_images,
# naming each file after its position in the match list (0.jpg, 1.jpg, ...).

# Create the output folder once, before the loop, instead of on every iteration.
os.makedirs("./profile_images", exist_ok=True)

image_urls = [i.get_attribute('src') for i in driver.find_elements(By.XPATH, '//div[@class="KL4Bh"]/img')]
for idx, img in enumerate(image_urls):
    if not img:
        # The element has no usable src, so there is nothing to download.
        # The index is still consumed, which keeps the file numbering unchanged.
        continue
    # A timeout keeps one stalled download from hanging the loop.
    img_data = requests.get(img, timeout=10).content
    with open(os.path.join("./profile_images", str(idx) + '.jpg'), 'wb') as handler:
        handler.write(img_data)


In [None]:
# End the whole WebDriver session. close() only closes the current window and
# can leave the chromedriver process running; quit() closes every window and
# shuts the driver down.
driver.quit()