## 숨고 데이터 크롤링

* <b> 목차 </b>
    1. 데이터 크롤링(BeautifulSoup, Selenium)
    2. 데이터 전처리
    3. 인사이트 분석 및 시각화

In [301]:
# 필요 라이브러리 import

import requests
from bs4 import BeautifulSoup

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import NoAlertPresentException
from selenium.webdriver.chrome.options import Options

In [302]:
# Chrome options meant to speed Selenium up by switching off browser features
# the crawl does not need.
# NOTE(review): nothing in the visible code passes `options` to
# webdriver.Chrome(...), so these settings may currently have no effect.

options = Options()

# Every listed content setting gets the value 2
# (presumably Chrome's "block" setting -- TODO confirm).
prefs = {
    'profile.default_content_setting_values': dict.fromkeys(
        [
            'cookies', 'images', 'plugins', 'popups', 'geolocation',
            'notifications', 'auto_select_certificate', 'fullscreen',
            'mouselock', 'mixed_script', 'media_stream', 'media_stream_mic',
            'media_stream_camera', 'protocol_handlers', 'ppapi_broker',
            'automatic_downloads', 'midi_sysex', 'push_messaging',
            'ssl_cert_decisions', 'metro_switch_to_desktop',
            'protected_media_identifier', 'app_banner', 'site_engagement',
            'durable_storage',
        ],
        2,
    )
}

options.add_experimental_option('prefs', prefs)
for chrome_flag in ("start-maximized", "disable-infobars", "--disable-extensions"):
    options.add_argument(chrome_flag)

> 숨고 사이트는 목록 페이지가 무한 스크롤로 구현되어 있다. 따라서 스크롤을 제어하기 위해 Selenium을 사용한다. <br>

> 1차적으로 전체 유저의 고유 id를 파싱한 후 파싱한 id를 바탕으로 유저별 정보 파싱

In [303]:
import time

# Parse the user ids from a page that lists the users of one top-level
# service category (eight categories in total).
# The listing is loaded by JavaScript infinite scroll, so Selenium drives it.
def get_user_list(base_url, chromedriver='C:/selenium/chromedriver.exe',
                  chrome_options=None, scroll_pause=10):
    """Collect the user ids linked from one infinitely scrolling listing page.

    Opens ``base_url`` in Chrome, keeps scrolling to the bottom until the
    document height stops growing, then parses the final page source and
    reads the ``href`` of every ``div.list-item > a`` anchor.

    Args:
        base_url: URL of the listing page to crawl.
        chromedriver: Path to the chromedriver executable.
        chrome_options: Optional selenium ``Options`` object handed to the
            driver. ``None`` (the default) starts Chrome exactly as before,
            without any options.
        scroll_pause: Seconds to sleep after each scroll so that newly
            requested items can load.

    Returns:
        list[str]: for each anchor, the fourth "/"-separated piece of its
        href with any "?query" part removed.
    """
    # Only pass ``options`` when one was supplied so the default call stays
    # identical to the original invocation.
    if chrome_options is None:
        driver = webdriver.Chrome(chromedriver)
    else:
        driver = webdriver.Chrome(chromedriver, options=chrome_options)

    try:
        driver.get(base_url)

        last_height = driver.execute_script("return document.body.scrollHeight")
        while True:
            # Jump to the bottom and give the page time to load more items.
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(scroll_pause)
            # Scroll again to just above the bottom (presumably to re-trigger
            # the page's scroll handler -- TODO confirm) and wait once more.
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight-50);")
            time.sleep(scroll_pause)

            # An unchanged height means no further items were appended.
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height

        html = driver.page_source
    finally:
        # quit() ends the whole WebDriver session, and doing it in ``finally``
        # releases the browser even when the crawl raises part-way through.
        driver.quit()

    soup = BeautifulSoup(html, 'html.parser')
    return [
        anchor['href'].split('/')[3].split('?')[0]
        for anchor in soup.select('div.list-item > a')
    ]

In [304]:
# URLs of the eight top-level service category listing pages.
# Each slug is the percent-encoded (UTF-8) Korean category name.
base_url_list = [
    'https://soomgo.com/search/pro/service/address/' + category_slug + '/last_login'
    for category_slug in (
        '%EB%A0%88%EC%8A%A8',                                      # lessons
        '%ED%99%88-%EB%A6%AC%EB%B9%99',                            # home / living
        '%EC%9D%B4%EB%B2%A4%ED%8A%B8',                             # events
        '%EB%B9%84%EC%A6%88%EB%8B%88%EC%8A%A4',                    # business
        '%EB%94%94%EC%9E%90%EC%9D%B8-%EA%B0%9C%EB%B0%9C',          # design / development
        '%EA%B1%B4%EA%B0%95-%EB%AF%B8%EC%9A%A9',                   # health / beauty
        '%EC%95%8C%EB%B0%94',                                      # part-time jobs
        '%EA%B8%B0%ED%83%80',                                      # other
    )
]

In [305]:
# Run the crawler once per category page, in list order; every element of
# tmp_user_list is the id list returned for one page.
tmp_user_list = []
tmp_user_list.extend(map(get_user_list, base_url_list))

In [314]:
# A user can offer services in several top-level categories, so the same
# user_id may show up in more than one per-category list; drop the duplicates.
# dict.fromkeys keeps the first-seen order, so the result is the same on every
# run for the same crawl output (iteration order of a set of strings is not),
# which keeps positional batches such as res_user_id_list[:5000] reproducible.
# Flattening in one pass also avoids the repeated list copying of
# sum(lists, []).
res_user_id_list = list(dict.fromkeys(
    user_id for user_ids in tmp_user_list for user_id in user_ids
))

> 수집한 유저별 고유 id를 이용해 각 유저의 프로필 페이지에서 정보를 크롤링한다.

> 크롤링한 결과는 JSON 형식으로 저장한다.

In [413]:
# Module-level accumulator: main_parsing appends every parsed record here and
# returns this same list, so callers rebind it (tmp_data = []) between batches.
tmp_data = []
# No longer used by main_parsing; kept so existing references still resolve.
tmp_dict = {}


def _text_or_none(soup, selector):
    """Return the text of the first element matching ``selector``, or None."""
    node = soup.select_one(selector)
    return None if node is None else node.text


def main_parsing(user_id_list, timeout=10):
    """Download each user's profile page and extract one record per user.

    Args:
        user_id_list: Iterable of user id strings used to build profile URLs.
        timeout: Seconds passed to ``requests.get`` so that a stalled
            connection raises instead of blocking the whole crawl.

    Returns:
        list[dict]: the module-level ``tmp_data`` list, extended with one dict
        per user holding the keys ``user_id``, ``category``, ``grade``,
        ``hired_cnt``, ``review_cnt``, ``address``, ``career``,
        ``business_size``, ``auth_business``, ``auth_personal`` and
        ``payment``. A field whose element is absent from the page is ``None``
        (``career`` is ``0``, as before).
    """
    for user_id in user_id_list:
        print(user_id)  # progress output
        res = requests.get(
            "https://soomgo.com/profile/users/" + user_id
            + "?from=%EC%A7%80%EC%A0%95%EC%9A%94%EC%B2%AD%EC%84%9C",
            timeout=timeout,
        )
        soup = BeautifulSoup(res.content, 'html.parser')

        category = [item.text for item in soup.select('div.view ul li')]

        # The original test compared the select_one result with the string
        # 'None', which is never equal, so every user was stored as "신규고수".
        # An identity check restores the two branches.
        # span.badge presumably marks a newly registered pro -- TODO confirm.
        hired_text = _text_or_none(soup, 'li.hired')
        if soup.select_one('span.badge') is not None:
            hired_cnt = "신규고수"
        elif hired_text is not None:
            hired_cnt = hired_text.split("회")[0]
        else:
            hired_cnt = None

        career_text = _text_or_none(soup, 'li.career')
        career = 0 if career_text is None else career_text.split(" ")[1][:-1]

        business_size_text = _text_or_none(soup, 'li.business-size')
        if business_size_text is None:
            business_size = None
        else:
            business_size = business_size_text.split(" ")[1][:-1]

        # The two auth fields only record whether the element is present.
        if soup.select_one('li.auth-business') is None:
            auth_business = None
        else:
            auth_business = "사업자등록증 등록완료"

        if soup.select_one('li.auth-personal') is None:
            auth_personal = None
        else:
            auth_personal = "본인 인증"

        # A missing review element yields None instead of raising.
        review_text = _text_or_none(soup, 'span.review_count')
        if review_text is None:
            review_cnt = None
        else:
            review_cnt = review_text.split("개")[0].split("(")[1]

        tmp_data.append({
            'user_id': user_id,
            'category': category,
            'grade': _text_or_none(soup, 'span.point'),
            'hired_cnt': hired_cnt,
            'review_cnt': review_cnt,
            'address': _text_or_none(soup, 'li.address'),
            'career': career,
            'business_size': business_size,
            'auth_business': auth_business,
            'auth_personal': auth_personal,
            'payment': _text_or_none(soup, 'li.payment'),
        })

    return tmp_data

In [424]:
# res_data_1 = main_parsing(res_user_id_list[:5000])
# tmp_data = []

# res_data_2 = main_parsing(res_user_id_list[5000:10000])
# tmp_data = []

# res_data_3 = main_parsing(res_user_id_list[10000:])
# tmp_data = []

In [422]:
import json
from collections import OrderedDict

# Assemble the export payload. OrderedDict keeps the keys in the order they
# are listed here.
# NOTE(review): res_data_3 is only assigned in commented-out code in the
# visible source -- make sure it exists before running this cell.
to_json = OrderedDict([
    ('name', "유저별 정보"),   # label for this file
    ('version', "200310"),     # date stamp
    ('data', res_data_3),
])

with open('유저별정보_200310_3', 'w', encoding="utf-8") as out_file:
    json.dump(to_json, out_file, ensure_ascii=False, indent="\t")

In [437]:
def _read_json(path):
    """Parse the UTF-8 encoded JSON file at ``path`` and return its content."""
    with open(path, encoding='utf-8') as json_file:
        return json.load(json_file)


# Load the three saved batches back, one file per batch.
json_data_1 = _read_json('유저별정보_200310_1')
json_data_2 = _read_json('유저별정보_200310_2')
json_data_3 = _read_json('유저별정보_200310_3')