In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
import logging
import re
from urllib.parse import urljoin
import json

def get_document_type(driver):
    """Return the document type stored in the page's ``listAll`` JavaScript variable.

    Args:
        driver: Object exposing ``execute_script`` (a Selenium WebDriver in
            this file) that has already loaded the viewer page.

    Returns:
        The third field of the first ``listAll`` entry (the document type,
        e.g. 'PDF' or 'XSL'), or None when ``listAll`` is missing, empty,
        malformed, or the script call fails.
    """
    try:
        # Read the page-level listAll variable.
        list_all = driver.execute_script("return listAll;")
        if list_all:
            return list_all[0][2]  # document type (PDF or XSL)
    except Exception:
        # Best-effort probe: any script or indexing failure means "unknown".
        # Exception (not a bare except) so Ctrl-C / SystemExit still propagate.
        return None
    return None

def get_real_document_url(viewer_url):
    """Resolve a viewer-page URL to the URL of the underlying document.

    Starts a headless Chrome session, loads ``viewer_url`` and reads the
    document type through ``get_document_type``:

    * 'XSL': builds a ``DocsXEditPrintAgent.do`` URL from the name and id
      found in the first ``listAll`` entry of the page.
    * otherwise: navigates to the ``viewerFrame`` iframe's ``src`` and, when
      the type is 'PDF', searches that page's source for a ``filepath`` or
      ``downloadURL`` value ending in ``.pdf``.

    Progress and errors are printed; errors are not raised to the caller.

    Args:
        viewer_url: URL of the viewer page to open.

    Returns:
        The document URL as a string, or None when it cannot be determined
        or when any step fails.
    """
    base_url = 'https://pcms.samsunglife.com'

    # Bound before the try block so the finally clause can tell whether a
    # browser was actually started. If driver setup itself fails, there is
    # nothing to quit and the function must still return None.
    driver = None
    try:
        # Chrome options
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")

        # Chrome driver setup
        driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)

        # Load the viewer page
        driver.get(viewer_url)

        # Determine the document type
        # NOTE(review): the run recorded below this cell printed a document
        # type of None and then failed to find viewerFrame; possibly the page
        # was not fully populated when queried. Confirm whether an explicit
        # wait is needed here.
        doc_type = get_document_type(driver)
        print(f"문서 타입: {doc_type}")

        if doc_type == 'XSL':
            try:
                # Fetch the listAll data before moving into the iframe
                list_all = driver.execute_script("return listAll;")
                if list_all:
                    doc_name = list_all[0][0]  # document name
                    doc_id = list_all[0][1]    # document ID

                    # NOTE(review): doc_name is inserted without
                    # percent-encoding; confirm what the server expects.
                    doc_url = f"{base_url}/partnerpage/DocsXEditPrintAgent.do?id={doc_id}&code=&name={doc_name}"
                    print(f"XSL 문서 URL 찾음: {doc_url}")
                    return doc_url
            except Exception as e:
                print(f"XSL 문서 URL 추출 중 오류: {str(e)}")
                return None

        # Locate the iframe element
        iframe = driver.find_element(By.ID, 'viewerFrame')
        src = iframe.get_attribute('src')

        if not src:
            print("iframe의 src 속성을 찾을 수 없습니다")
            return None

        # Convert the src into an absolute URL
        iframe_url = urljoin(base_url, src)
        print(f"iframe URL: {iframe_url}")

        # Navigate to the iframe page itself
        driver.get(iframe_url)
        page_source = driver.page_source

        if doc_type == 'PDF':
            # PDF handling: look for the PDF path in the page source
            patterns = [
                r'"filepath"\s*:\s*[\'"](.+?\.pdf)[\'"]',
                r'"downloadURL"\s*:\s*[\'"](.+?\.pdf)[\'"]'
            ]

            for pattern in patterns:
                match = re.search(pattern, page_source)
                if match:
                    pdf_path = match.group(1)
                    doc_url = urljoin(base_url, pdf_path)
                    print(f"PDF 다운로드 URL 찾음: {doc_url}")
                    return doc_url

        print("문서 URL을 찾을 수 없음")
        return None

    except Exception as e:
        print(f"문서 URL 추출 중 오류 발생: {str(e)}")
        return None
    finally:
        # Only quit a browser that was actually created.
        if driver is not None:
            driver.quit()

# Usage example: resolve one viewer page to its document URL and print it.
# Alternative viewer URL (CustomerPage_Corp.jsp with explicit path/fname), kept for manual testing:
# viewer_url = "https://pcms.samsunglife.com/partnerpage/CustomerPage_Corp.jsp?title=%EC%82%BC%EC%84%B1%EC%83%9D%EB%AA%85%20%EA%B0%9C%EC%9D%B8%ED%98%95%20%ED%87%B4%EC%A7%81%EC%97%B0%EA%B8%88%EB%B3%B4%ED%97%98(%EA%B0%9C%EC%9D%B8%ED%98%95,%EB%AC%B4%EB%B0%B0%EB%8B%B9)%20&path=/uploadDir/corp/2012/1101/701/&fname=1351665972367-701-.pdf&pageGubun=prdt"
viewer_url = "https://pcms.samsunglife.com/partnerpage/CustomerPage_Unit.jsp?goodsCode=LP0345010&docType=301&saleDate=20240401&pageGubun=prdt"
# Alternative viewer URL (CustomerPage_Unit.jsp with a different goodsCode/docType/saleDate), kept for manual testing:
# viewer_url = "https://pcms.samsunglife.com/partnerpage/CustomerPage_Unit.jsp?goodsCode=03510&docType=401&saleDate=20001110&pageGubun=prdt"
doc_url = get_real_document_url(viewer_url)
print(doc_url)
# Output recorded from an earlier run in which the document type was XSL
# (program output quoted verbatim):
# 문서 타입: XSL
# XSL 문서 URL 찾음: https://pcms.samsunglife.com/partnerpage/DocsXEditPrintAgent.do?id=1074045873890&code=&name=무배당비즈라이프플랜보험(매년갱신형) 사업방법서
# https://pcms.samsunglife.com/partnerpage/DocsXEditPrintAgent.do?id=1074045873890&code=&name=무배당비즈라이프플랜보험(매년갱신형) 사업방법서


문서 타입: None
문서 URL 추출 중 오류 발생: Message: no such element: Unable to locate element: {"method":"css selector","selector":"[id="viewerFrame"]"}
  (Session info: chrome=131.0.6778.85); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
#0 0x5556ea46b31a <unknown>
#1 0x5556e9f816e0 <unknown>
#2 0x5556e9fd03e6 <unknown>
#3 0x5556e9fd0681 <unknown>
#4 0x5556ea015b04 <unknown>
#5 0x5556e9ff448d <unknown>
#6 0x5556ea012ed7 <unknown>
#7 0x5556e9ff4203 <unknown>
#8 0x5556e9fc2cc0 <unknown>
#9 0x5556e9fc3c9e <unknown>
#10 0x5556ea438d0b <unknown>
#11 0x5556ea43cc92 <unknown>
#12 0x5556ea425b3c <unknown>
#13 0x5556ea43d807 <unknown>
#14 0x5556ea40b0df <unknown>
#15 0x5556ea45a578 <unknown>
#16 0x5556ea45a740 <unknown>
#17 0x5556ea46a196 <unknown>
#18 0x7f0169f4eac3 <unknown>

None
