In [1]:
import requests
from bs4 import BeautifulSoup
import mysql.connector
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

In [None]:
# Connection settings for the MySQL database (shown here as masked placeholders).
db_config = {
    "host": "*****",
    "port": "****",
    "user": "****",
    "password": "****",
    "database": "****",
}
conn = mysql.connector.connect(**db_config)

# A buffered cursor is requested so result rows are fetched eagerly by the connector.
cursor = conn.cursor(buffered=True)

In [8]:
# Shared HTTP session: transient 5xx responses are retried with backoff.
# Every request below must go through `session` for the retry policy to apply.
session = requests.Session()
retries = Retry(total=5, backoff_factor=1, status_forcelist=[500, 502, 503, 504])
session.mount('https://', HTTPAdapter(max_retries=retries))
session.headers.update({'User-Agent': 'Mozilla/5.0'})

# Seconds to wait on a single request before giving up (without it a stalled
# connection would block the whole crawl indefinitely).
REQUEST_TIMEOUT = 10

comp_info_base_url = "https://www.saramin.co.kr/zf_user/company-info/view?csn="
comp_info_sal_base_url = "https://www.saramin.co.kr/zf_user/company-info/view-inner-salary?csn="
comp_info_fin_base_url = "https://www.saramin.co.kr/zf_user/company-info/view-inner-finance?csn="

# Year label -> UPDATE statement for the matching revenue column.
# Insertion order is the order in which year labels are matched.
revenue_update_sql = {
    "2021": "update company set revenue2021=%s where comp_sid=%s",
    "2022": "update company set revenue2022=%s where comp_sid=%s",
    "2023": "update company set revenue2023=%s where comp_sid=%s",
}


def fetch_soup(url):
    """Download ``url`` through the retrying session and return it parsed as HTML.

    Network failures propagate as ``requests.RequestException``.
    """
    res = session.get(url, timeout=REQUEST_TIMEOUT)
    return BeautifulSoup(res.text, "html.parser")


def parse_int(text):
    """Return ``text`` as an int with ',' separators removed, or None if it is missing or not numeric."""
    if text is None:
        return None
    try:
        return int(str(text).replace(",", ""))
    except ValueError:
        return None


cursor.execute("select comp_sid from company")
results = cursor.fetchall()

for row in results:
    comp_sid = row[0]
    if not comp_sid:
        # Nothing to build a URL from.
        continue

    # --- Company info page: benefits and employee count ---
    comp_info_url = comp_info_base_url + comp_sid
    print(comp_info_url)
    try:
        soup = fetch_soup(comp_info_url)
    except requests.RequestException as e:
        print(f"Failed to fetch company info for {comp_sid}: {e}")
        continue

    if "죄송합니다. 기업정보를 찾을수 없습니다." in soup.get_text():
        print(f"company info not found for {comp_sid}")
        continue

    # Benefits list (ul.list_welfare), stored as a comma-joined string.
    benefits = soup.find("ul", class_="list_welfare")
    if benefits is not None:
        benefit_str = ",".join(
            item.string for item in benefits.find_all("li", "cate_item") if item.string
        )
        print("복리후생:", benefit_str)
        cursor.execute("update company set benefits=%s where comp_sid=%s", (benefit_str, comp_sid))
    else:
        print("No benefits")

    # Employee count (div.worker_info > div.col > strong.num); any missing
    # element or non-numeric text is reported instead of raising.
    worker_info = soup.find("div", "worker_info")
    col = worker_info.find("div", class_="col") if worker_info is not None else None
    num_tag = col.find("strong", class_="num") if col is not None else None
    employee_num = parse_int(num_tag.string) if num_tag is not None else None
    if employee_num is not None:
        cursor.execute("update company set employee_num=%s where comp_sid=%s", (employee_num, comp_sid))
        print("사원수:", employee_num)
    else:
        print("No employee number")

    # --- Salary page: average salary ---
    try:
        soup = fetch_soup(comp_info_sal_base_url + comp_sid)
    except requests.RequestException as e:
        print(f"Failed to fetch salary info for {comp_sid}: {e}")
        # As before, a failed salary request also skips the finance page.
        continue

    wrap_average = soup.find("div", class_="wrap_average")
    currency = wrap_average.find("p", class_="average_currency") if wrap_average is not None else None
    amount = None
    if currency is not None and currency.contents:
        # Only the first child node of the paragraph carries the number.
        amount = parse_int(currency.contents[0].string)
    if amount is not None:
        # The page value is scaled by 10,000 before storing
        # (presumably listed in units of 10,000 KRW -- TODO confirm).
        avg_sal = amount * 10000
        cursor.execute("update company set avg_salary=%s where comp_sid=%s", (avg_sal, comp_sid))
        print(avg_sal)
    else:
        print("No average salary")

    # --- Finance page: 2021-2023 revenue ---
    try:
        soup = fetch_soup(comp_info_fin_base_url + comp_sid)
    except requests.RequestException as e:
        print(f"Failed to fetch financial info for {comp_sid}: {e}")
        continue

    for box in soup.find_all("div", class_="box_finance"):
        title = box.find("div", class_="area_title")
        if title is None or "매출액" not in title.get_text():
            continue

        for year_tag in box.find_all("em", class_="tit_graph"):
            label = year_tag.get_text()
            year = next((y for y in revenue_update_sql if y in label), None)
            if year is None:
                continue

            value_div = year_tag.find_next_sibling("div")
            value_span = value_div.find("span", class_="txt_value") if value_div is not None else None
            if value_span is None or value_span.string is None:
                continue

            # Pass a plain str to the driver rather than a bs4 NavigableString.
            revenue = str(value_span.string)
            cursor.execute(revenue_update_sql[year], (revenue, comp_sid))
            print(f"{year} 매출액: {revenue}")

        # Only the first revenue box is used.
        break

https://www.saramin.co.kr/zf_user/company-info/view?csn=a1h4dXJUaWxOK2g3eUpsazAwYTB0Zz09
복리후생: 건강검진,직원대출제도,각종 경조사 지원,체력단련실운영,본인학자금,자녀학자금,퇴직연금,인센티브제,장기근속자 포상,우수사원포상,스톡옵션,퇴직금,야근수당,휴일(특근)수당,4대 보험,명절선물/귀향비,생일선물/파티,창립일행사,우수사원시상식,워크샵,신규 입사자 교육(OJT),직무능력향상교육,리더십 강화교육,자기계발비 지원,점심식사 제공,저녁식사 제공,사내동호회 운영,음료제공(차, 커피),우리사주제도,휴게실,회의실,공기청정기,카페테리아,전용 사옥,사내 정원,건물 내 경사로,휠체어용 난간,유도점자블록,장애인 화장실,장애인 전용주차장,유니폼지급,노트북,사원증,콘도/리조트 이용권,사무용품 지급,노조/노사협의회,자유복장,기숙사 운영,주차장제공,연차,여름휴가,경조휴가제,반차,근로자의 날 휴무,휴가비지원,산전 후 휴가,육아휴직,남성출산휴가,휴양시설 제공
사원수: 382
61330000
https://www.saramin.co.kr/zf_user/company-info/view?csn=a1M3bmpodFQrTDJQSDdZaStzemZvUT09
No benefits
사원수: 15
47080000
2021 매출액:  3억 3,870만원
2022 매출액:  5억 33만원
2023 매출액:  6억 8,212만원
https://www.saramin.co.kr/zf_user/company-info/view?csn=a1U0ajBCdVFDRVFPYzFEV1B1YVpWUT09
복리후생: 각종 경조사 지원,통신비 지원,퇴직연금,장기근속자 포상,우수사원포상,야근수당,휴일(특근)수당,4대 보험,우수사원시상식,플레이샵,저녁식사 제공,사우회(경조사회),음료제공(차, 커피),휴게실,회의실,노트북,사원증,사무용품 지급,자유복장,차량유류비지급,주차장제공,회사차량 있음,연차,경조휴가제,근로자의 날 휴무,포상휴가,남성출산휴가
사원수: 25
418600

In [9]:
# Persist all updates made by the scraping cell, then release the cursor and
# the connection even if the commit itself fails.
try:
    conn.commit()
finally:
    cursor.close()
    conn.close()