In [2]:
import pandas as pd
import datetime
import pymysql
from sqlalchemy import create_engine
import sqlalchemy as db
import time
from bs4 import BeautifulSoup
import re
import requests
import sys

db_connection_str = 'mysql+pymysql://root:{password}@localhost/{database_name}'
engine = db.create_engine(db_connection_str, encoding = 'utf-8')
conn = engine.connect()

date = datetime.datetime.now().strftime("%Y-%m-%d")
print(date)

2023-01-06


In [3]:
# 버전 확인
print("python version:", sys.version)
print("pandas version:", pd.__version__)
print("pymysql version:", pymysql.__version__)
print("re version:", re.__version__)
print("requests version:", requests.__version__)

python version: 3.6.13 |Anaconda, Inc.| (default, Mar 16 2021, 12:57:03) [MSC v.1916 32 bit (Intel)]
pandas version: 1.1.5
pymysql version: 1.0.2
re version: 2.2.1
requests version: 2.27.1


In [4]:
# 업종 크롤링

url = 'https://finance.naver.com/sise/sise_group.naver?type=upjong'
response = requests.get(url)
html = response.text
soup = BeautifulSoup(html, 'html.parser')

upjong = []

for i in soup.select("a[href^='/sise/sise_group_detail']"):
    # print(str(i.attrs['href']), i.text)
    link = 'https://finance.naver.com' + str(i.attrs['href'])
    detail_response = requests.get(link)
    detail_html = detail_response.text
    detail_soup = BeautifulSoup(detail_html, 'html.parser')
    for c in detail_soup.select("td > div > a"):
        code = re.findall('(?=)[0-9]+', str(c.attrs['href']))
        upjong.append([date, code[0], str(i.text), str(c.text)])

In [5]:
upjong_df = pd.DataFrame(upjong, columns = ['last_update', 'upjong_name', 'code', 'company'])
upjong_df = upjong_df.drop_duplicates()   # 혹시 모를 중복 방지
upjong_df.head()

Unnamed: 0,last_update,upjong_name,code,company
0,2023-01-06,24120,자동차부품,KB오토시스
1,2023-01-06,53270,자동차부품,구영테크
2,2023-01-06,71850,자동차부품,캐스텍코리아
3,2023-01-06,45520,자동차부품,크린앤사이언스
4,2023-01-06,38110,자동차부품,에코플라스틱


In [6]:
upjong_df.to_sql(name = "company_upjong_info", con = conn, if_exists = 'replace', index = False)

In [7]:
pd.read_sql("select * from company_upjong_info limit 5", con = conn)

Unnamed: 0,last_update,upjong_name,code,company
0,2023-01-06,24120,자동차부품,KB오토시스
1,2023-01-06,53270,자동차부품,구영테크
2,2023-01-06,71850,자동차부품,캐스텍코리아
3,2023-01-06,45520,자동차부품,크린앤사이언스
4,2023-01-06,38110,자동차부품,에코플라스틱


In [8]:
theme = []

for p in range(1, 10, 1):
    url = 'https://finance.naver.com/sise/theme.naver?&page='+str(p)
    response = requests.get(url)
    html = response.text
    soup = BeautifulSoup(html, 'html.parser')

    for i in soup.select("a[href^='/sise/sise_group_detail']"):
        link = 'https://finance.naver.com' + str(i.attrs['href'])
        detail_response = requests.get(link)
        detail_html = detail_response.text
        detail_soup = BeautifulSoup(detail_html, 'html.parser')
        
        for c in detail_soup.select("td[class='name'] > div > a"):
            code = re.findall('(?=)[0-9]+', str(c.attrs['href']))
            theme.append([date, "A" + code[0], str(i.text), str(c.text)])

In [9]:
theme_df = pd.DataFrame(theme, columns = ['last_update', 'theme_name', 'code', 'company'])
theme_df = theme_df.drop_duplicates()   # 혹시 모를 중복 방지
theme_df.head()

Unnamed: 0,last_update,theme_name,code,company
0,2023-01-06,A339950,일자리(취업),아이비김영
1,2023-01-06,A133750,일자리(취업),메가엠디
2,2023-01-06,A143240,일자리(취업),사람인에이치알
3,2023-01-06,A376980,일자리(취업),원티드랩
4,2023-01-06,A241520,일자리(취업),DSC인베스트먼트


In [10]:
theme_df.to_sql(name = "company_theme_info", con = conn, if_exists = 'replace', index = False)

In [11]:
pd.read_sql("select * from company_theme_info limit 5", con = conn)

Unnamed: 0,last_update,theme_name,code,company
0,2023-01-06,A339950,일자리(취업),아이비김영
1,2023-01-06,A133750,일자리(취업),메가엠디
2,2023-01-06,A143240,일자리(취업),사람인에이치알
3,2023-01-06,A376980,일자리(취업),원티드랩
4,2023-01-06,A241520,일자리(취업),DSC인베스트먼트
