In [1]:
from bs4 import BeautifulSoup
from urllib.request import urlopen
import pandas as pd
from collections import defaultdict
from tqdm import tqdm

# 종목별 비슷한 회사 묶음

In [2]:
html = urlopen(None)
bsObject = BeautifulSoup(html, "html.parser") 

classification_of_business = defaultdict(list)

path = None
for ref in bsObject.select_one('table').select('a'):
    if ref.text == '기타':
        continue
    link = urlopen(path + ref.get('href'))
    link_page = BeautifulSoup(link, "html.parser")
    for name in link_page.select('table tbody .name'):
        text = name.text
        if text.find('*') != -1:
            text = text[:-2]
        classification_of_business['name'].append(text)
        classification_of_business['business_category'].append(ref.text)

classification_of_business = pd.DataFrame(dict(classification_of_business))
    

In [21]:
classification_of_business

Unnamed: 0,name,business_category
0,루닛,건강관리기술
1,제이엘케이,건강관리기술
2,뷰노,건강관리기술
3,라이프시맨틱스,건강관리기술
4,비트컴퓨터,건강관리기술
...,...,...
2723,제일테크노스,건축제품
2724,금강공업우,건축제품
2725,덕신하우징,건축제품
2726,뉴보텍,건축제품


# 회사를 그룹별로 묶음

In [22]:
html = urlopen(None)
bsObject = BeautifulSoup(html, "html.parser") 

group_of_business = defaultdict(list)

path = None
for ref in bsObject.select_one('table').select('a'):
    link = urlopen(path + ref.get('href'))
    link_page = BeautifulSoup(link, "html.parser")
    for name in link_page.select('table tbody .name'):
        text = name.text
        if text.find('*') != -1:
            text = text[:-2]
        group_of_business['name'].append(text)
        group_of_business['group'].append(ref.text)

group_of_business = pd.DataFrame(dict(group_of_business))
    

In [23]:
group_of_business

Unnamed: 0,name,group
0,에코프로,에코프로
1,에코프로에이치엔,에코프로
2,에코프로비엠,에코프로
3,포스코인터내셔널,포스코
4,포스코엠텍,포스코
...,...,...
338,웅진씽크빅,웅진
339,미래에셋증권,미래에셋
340,미래에셋벤처투자,미래에셋
341,미래에셋생명,미래에셋


# 회사 테마별로 묶음

In [None]:
theme_of_business = defaultdict(list)
for page in range(1, 8):
    html = urlopen(None + str(page))
    bsObject = BeautifulSoup(html, "html.parser") 

    path = None
    for ref in tqdm(bsObject.select_one('table').select('a')):
        link = urlopen(path + ref.get('href'))
        link_page = BeautifulSoup(link, "html.parser")
        for name in link_page.select('table tbody .name'):
            text = name.text
            if text.find('*') != -1:
                text = text[:-2]
            theme_of_business['name'].append(text)
            theme_of_business['theme'].append(ref.text)
theme_of_business = pd.DataFrame(dict(theme_of_business))


In [29]:
theme_of_business

Unnamed: 0,name,theme
0,금양,리튬
1,코스모화학,리튬
2,강원에너지,리튬
3,이브이첨단소재,리튬
4,리튬포어스,리튬
...,...,...
6974,범양건영,모듈러주택
6975,GS건설,모듈러주택
6976,덕신하우징,모듈러주택
6977,자연과환경,모듈러주택


In [30]:
folder = None
classification_of_business.to_csv(folder + 'classification_of_business.csv', index=False)
group_of_business.to_csv(folder + 'group_of_business.csv', index=False)
theme_of_business.to_csv(folder + 'theme_of_business.csv', index=False)

# 멀티 프로세싱, 멀티 쓰레딩

In [16]:
from bs4 import BeautifulSoup
from urllib.request import urlopen
import pandas as pd
from collections import defaultdict
from tqdm import tqdm
from multiprocessing import Pool
from concurrent.futures import ThreadPoolExecutor
import time
import concurrent.futures
import requests
import os

def do_html_crawl(url: str, ref: str):
    theme_of_business = dict()
    link = requests.get(url)
    link_page = BeautifulSoup(link.text, "html.parser")
    for name in link_page.select('table tbody .name'):
        text = name.text
        if text.find('*') != -1:
            text = text[:-2]
        theme_of_business[text] = ref.text
    return theme_of_business

def do_process_crawl(urls: str):
    return do_thread_crawl(urls)

def do_thread_crawl(url: str):
    html = requests.get(url)
    bsObject = BeautifulSoup(html.text, "html.parser")
    refs = bsObject.select_one('table').select('a')
    thread_list = []
    with ThreadPoolExecutor(max_workers=8) as executor:
        for ref in refs:
            path = None
            url = naver + ref.get('href')
            thread_list.append(executor.submit(do_html_crawl, url, ref))
        results = []
        for execution in concurrent.futures.as_completed(thread_list):
            result = execution.result()
            results.append(result)
    return results

start_time = time.time()
urls = [None for page in range(1, 8)]
results = []
with Pool(processes=7) as pool:
    results.append(pool.map(do_process_crawl, urls))
print(results)
print(f"elapsed time = {time.time() - start_time}")

[[[{}, {'비에이치아이': '원자력발전소 해체', '우진 ': '원자력발전소 해체', '한전기술 ': '원자력발전소 해체', '우리기술': '원자력발전소 해체', '두산에너빌리티 ': '원자력발전소 해체', '에이비프로바이오': '원자력발전소 해체', '오르비텍': '원자력발전소 해체', '휴림로봇': '원자력발전소 해체', '한전KPS ': '원자력발전소 해체', '에스앤더블류': '원자력발전소 해체', '현대건설 ': '원자력발전소 해체', '비츠로테크': '원자력발전소 해체', '휴비스 ': '원자력발전소 해체', '한국테크놀로지': '원자력발전소 해체', '대창솔루션': '원자력발전소 해체'}, {}, {}, {'한미반도체 ': 'LED장비', '이오테크닉스': 'LED장비', '프로텍': 'LED장비', '주성엔지니어링': 'LED장비', '레이저쎌': 'LED장비', '코디': 'LED장비', '예스티': 'LED장비', '기가레인': 'LED장비', '티씨케이': 'LED장비', '미래컴퍼니': 'LED장비', '네온테크': 'LED장비', '탑엔지니어링': 'LED장비', '엘아이에스': 'LED장비', '티에스이': 'LED장비'}, {}, {}, {}, {}, {'알에프세미': 'MLCC(적층세라믹콘덴서)', '아모텍': 'MLCC(적층세라믹콘덴서)', '윈텍': 'MLCC(적층세라믹콘덴서)', '삼성전기 ': 'MLCC(적층세라믹콘덴서)', '삼화콘덴서 ': 'MLCC(적층세라믹콘덴서)', '네온테크': 'MLCC(적층세라믹콘덴서)', '아바텍': 'MLCC(적층세라믹콘덴서)', '대주전자재료': 'MLCC(적층세라믹콘덴서)', '코스모신소재 ': 'MLCC(적층세라믹콘덴서)'}, {}, {'알에프세미': '지능형로봇/인공지능(AI)', '제이엘케이': '지능형로봇/인공지능(AI)', '뷰노': '지능형로봇/인공지능(AI)', '딥노이드': '지능형로봇/인공지능(AI)', '라온피플': '지능형로봇/인공지능(AI)', '엠로': '지능