In [5]:
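"""
    Function that extracts the links contained in an HTML document.
        - Collects the target of every <a href> link, resolved to an absolute URL
          (the code below selects only 'a[href]'; stylesheet <link> entries are not collected)
"""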

from bs4 import BeautifulSoup
from urllib import parse
from urllib import request

def enum_links(html,base):
    """Return the absolute URL of every ``<a href>`` link found in *html*.

    html: markup handed to BeautifulSoup (callers in this file pass either a
          string or an open HTTP response).
    base: URL that relative ``href`` values are resolved against.

    Returns a list of URL strings in document order; duplicates and
    ``#fragment`` parts are kept as they appear.
    """
    document = BeautifulSoup(html, 'html.parser')
    # urljoin resolves each href against base to produce the final URL.
    return [
        parse.urljoin(base, anchor.attrs['href'])
        for anchor in document.select('a[href]')
    ]


if __name__ == '__main__':
    # Demo: list every link found on the Python 3.7 library index page.
    url = 'https://docs.python.org/3.7/library/'
    # The response object is handed straight to BeautifulSoup inside
    # enum_links; the with-block closes the HTTP connection once parsing is
    # done instead of leaving it open.
    with request.urlopen(url) as response:
        result = enum_links(response, url)
    print(result)

https://docs.python.org/3.7/genindex.html
https://docs.python.org/3.7/py-modindex.html
https://docs.python.org/3.7/library/intro.html
https://docs.python.org/3.7/reference/grammar.html
https://www.python.org/
https://docs.python.org/3.7/index.html
https://docs.python.org/3.7/library/#the-python-standard-library
https://docs.python.org/3.7/reference/index.html#reference-index
https://pypi.org
https://docs.python.org/3.7/library/intro.html
https://docs.python.org/3.7/library/intro.html#notes-on-availability
https://docs.python.org/3.7/library/functions.html
https://docs.python.org/3.7/library/constants.html
https://docs.python.org/3.7/library/constants.html#constants-added-by-the-site-module
https://docs.python.org/3.7/library/stdtypes.html
https://docs.python.org/3.7/library/stdtypes.html#truth-value-testing
https://docs.python.org/3.7/library/stdtypes.html#boolean-operations-and-or-not
https://docs.python.org/3.7/library/stdtypes.html#comparisons
https://docs.python.org/3.7/library/std

In [13]:
"""
    파일을 다운받고 저장하는 함수

     [참고] 파이썬 정규식 표현 : https://wikidocs.net/4308
"""
from bs4 import BeautifulSoup
from urllib import parse
from urllib import request
import os, time, re  # re : 정규식

def download_file(url):
    """Download *url* to a local path that mirrors its host and path.

    The target is ``./<netloc><path>``. When the URL path is empty or ends
    with '/', ``index.html`` is used as the file name. A file that already
    exists locally is not downloaded again.

    url: absolute URL to fetch.

    Returns the local path of the file, or None when the download failed.
    Progress lines prefixed with '1-', '2-' and '3-' are printed as a trace.
    """
    p = parse.urlparse(url)
    print('1-',p)
    # An empty path (e.g. 'https://host') denotes the site root. Treating it
    # as '/' keeps './host' a directory; otherwise the page would be written
    # to a *file* named './host', blocking every later './host/...' download.
    savepath = './' + p.netloc + (p.path or '/')
    print('2-', savepath)

    # No file name when the path ends with '/': fall back to index.html.
    if savepath.endswith('/'):
        savepath += 'index.html'
        print('3-',savepath)

    # Already on disk: return it without touching the network.
    if os.path.exists(savepath):
        return savepath

    try:
        # Create the whole directory chain in one call; exist_ok avoids the
        # check-then-create race of testing os.path.exists first.
        os.makedirs(os.path.dirname(savepath), exist_ok=True)
        request.urlretrieve(url,savepath)
    except Exception:
        # Best-effort download: report and carry on. Catching Exception (not
        # a bare except) lets Ctrl-C / SystemExit still stop a long crawl.
        print('download failed : ', url)
        # savepath did not exist before this call, so any file there now is a
        # partial download; remove it so a retry is not skipped as "done".
        if os.path.isfile(savepath):
            try:
                os.remove(savepath)
            except OSError:
                pass  # cleanup is best-effort only
        return None

    # Pause after each successful download (presumably to avoid hammering
    # the server during a crawl).
    time.sleep(1)
    return savepath
        
    
    
if __name__ == '__main__':
    # Demo: fetch the Python 3.6 library index and show where it was saved.
    target = 'https://docs.python.org/3.6/library/'
    saved_to = download_file(target)
    print(saved_to)





1- ParseResult(scheme='https', netloc='docs.python.org', path='/3.6/library/', params='', query='', fragment='')
2- ./docs.python.org/3.6/library/
3- ./docs.python.org/3.6/library/index.html
./docs.python.org/3.6/library/index.html


In [28]:
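"""
    Python treats each file as a module, so functions defined in another file
    can be called directly instead of being copied into this one.

    [Note] The `import Ex07_alldown1` line (and the code after it) reports an
           error, but the program still runs.

"""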


from bs4 import BeautifulSoup
from urllib.parse import *
from urllib.request import *
import os, time, re

# 에러 발생해도 실행은됨
import Ex07_alldown1
import Ex07_alldown2


# Local paths that have already been analyzed, so each page is processed once.
proc_files = {}

# Download an HTML page and analyze the links it contains.
def analyze_html(url, root_url):
    """Download *url* and recursively fetch what it links to under *root_url*.

    url:      page to download and scan for links.
    root_url: only links whose URL starts with this prefix are followed.

    Links ending in '.html' are analyzed recursively; any other link under
    *root_url* is only downloaded. Pages already recorded in ``proc_files``
    are skipped. Returns None.

    NOTE(review): download_file and enum_links are called unqualified, while
    the imports visible here bind only the module names Ex07_alldown1 /
    Ex07_alldown2 -- presumably the functions come from earlier notebook
    cells; confirm before running this as a standalone script.
    """
    savepath = download_file(url)
    if savepath is None:
        return
    if savepath in proc_files:
        return
    proc_files[savepath] = True

    # Read inside a with-block so the handle is closed before recursing;
    # otherwise every level of the crawl would keep one file open.
    with open(savepath,'r',encoding='utf-8') as f:
        html = f.read()

    for url_link in enum_links(html,url):
        # Stay inside the site section being mirrored.
        if not url_link.startswith(root_url):
            continue
        # Literal suffix test: the former regex ".html$" had an unescaped
        # dot and therefore also matched names such as "fooxhtml".
        if url_link.endswith('.html'):
            analyze_html(url_link,root_url)  # recurse into linked pages
        else:
            download_file(url_link)
    
if __name__ == "__main__":
    # Download everything reachable beneath the start URL; the same URL is
    # used as the crawl root.
    start_url = "https://docs.python.org/3.5/library/"
    analyze_html(start_url, start_url)

1- ParseResult(scheme='https', netloc='docs.python.org', path='/3.5/library/', params='', query='', fragment='')
2- ./docs.python.org/3.5/library/
3- ./docs.python.org/3.5/library/index.html
https://docs.python.org/3.5/genindex.html
https://docs.python.org/3.5/py-modindex.html
https://docs.python.org/3.5/library/intro.html
https://docs.python.org/3.5/reference/grammar.html
https://www.python.org/
https://docs.python.org/3.5/index.html
https://docs.python.org/3.5/library/#the-python-standard-library
https://docs.python.org/3.5/reference/index.html#reference-index
https://pypi.python.org/pypi
https://docs.python.org/3.5/library/intro.html
https://docs.python.org/3.5/library/functions.html
https://docs.python.org/3.5/library/constants.html
https://docs.python.org/3.5/library/constants.html#constants-added-by-the-site-module
https://docs.python.org/3.5/library/stdtypes.html
https://docs.python.org/3.5/library/stdtypes.html#truth-value-testing
https://docs.python.org/3.5/library/stdtypes.ht