In [38]:
from langchain.document_loaders import UnstructuredPDFLoader
from bs4 import BeautifulSoup
from langchain.document_loaders import PDFMinerPDFasHTMLLoader
from langchain.docstore.document import Document
import re
import json
import html2markdown

In [20]:
def link_header(semantic_snippets):
    """Attach each section to the chain of larger-font headings that precede it.

    Args:
        semantic_snippets: sequence of objects exposing ``metadata['heading']``,
            ``metadata['heading_font']`` and ``page_content``.

    Returns:
        A JSON string (non-ASCII characters kept as-is) holding one entry per
        snippet: ``{"heading": [...], "content": ...}``. The heading list starts
        with the snippet's own heading, followed by every earlier heading whose
        font is strictly larger than the largest one collected so far, walking
        backwards through the document.
    """
    fonts = [snip.metadata['heading_font'] for snip in semantic_snippets]
    titles = [snip.metadata['heading'] for snip in semantic_snippets]

    # Rank the distinct font sizes: the largest font gets index 0.
    rank_of_font = {
        size: rank for rank, size in enumerate(sorted(set(fonts), reverse=True))
    }

    def describe(position, size):
        return {
            "font_size": size,
            "heading": titles[position],
            "fontsize_idx": rank_of_font[size],
        }

    linked = []
    for position, snip in enumerate(semantic_snippets):
        largest_seen = fonts[position]
        chain = [describe(position, largest_seen)]
        # Walk backwards; only a strictly larger font can be an ancestor heading.
        for earlier in range(position - 1, -1, -1):
            if fonts[earlier] > largest_seen:
                largest_seen = fonts[earlier]
                chain.append(describe(earlier, largest_seen))
        linked.append({"heading": chain, "content": snip.page_content})

    return json.dumps(linked, ensure_ascii=False)

In [102]:
def parse_pdf_to_json(file_content):
    """Split PDF-as-HTML text into font-size based sections and return them as JSON.

    Args:
        file_content: HTML text of the PDF (in this notebook it is the
            ``page_content`` produced by ``PDFMinerPDFasHTMLLoader``).

    Returns:
        The JSON string built by ``link_header`` from the detected sections.
        When no ``<div>`` carries a ``font-size:<n>px`` span, the result is an
        empty JSON array.
    """
    soup = BeautifulSoup(file_content, 'html.parser')

    # Pass 1: merge consecutive <div>s whose first <span> has the same font size.
    font_size_re = re.compile(r'font-size:(\d+)px')
    cur_fs = None
    cur_text = ''
    snippets = []   # list of (text, font_size) runs
    for div in soup.find_all('div'):
        span = div.find('span')
        if not span:
            continue
        style = span.get('style')
        if not style:
            continue
        sizes = font_size_re.findall(style)
        if not sizes:
            continue
        fs = int(sizes[0])
        if not cur_fs:
            cur_fs = fs
        if fs == cur_fs:
            cur_text += div.text
        else:
            snippets.append((cur_text, cur_fs))
            cur_fs = fs
            cur_text = div.text
    # Flush the last run, but only if at least one sized <div> was seen;
    # otherwise a ('', None) placeholder would turn into a bogus section.
    if cur_fs is not None:
        snippets.append((cur_text, cur_fs))

    def new_section(heading_text, heading_font):
        metadata = {'heading': heading_text, 'content_font': 0, 'heading_font': heading_font}
        return Document(page_content='', metadata=metadata)

    # Pass 2: group the runs into sections.
    # Assumption: headings have a higher font size than their respective content.
    semantic_snippets = []
    for text, font in snippets:
        if not semantic_snippets:
            semantic_snippets.append(new_section(text, font))
            continue

        current = semantic_snippets[-1].metadata

        # Larger than the current section's heading => it is a new heading.
        if font > current['heading_font']:
            semantic_snippets.append(new_section(text, font))
            continue

        # No content yet, or not larger than the content seen so far => the text
        # belongs to the current section.
        if not current['content_font'] or font <= current['content_font']:
            semantic_snippets[-1].page_content += text
            current['content_font'] = max(font, current['content_font'])
            continue

        # Larger than the section's content but not larger than its heading:
        # also start a new section (e.g. the document title has the largest
        # font, but it should not subsume every following section).
        semantic_snippets.append(new_section(text, font))

    return link_header(semantic_snippets)

In [103]:
# Alternative input document (kept for reference):
# pdf_path='~/Downloads/LLM/wiki/Network and Edge (GCR_SSO.Network.WebHome) - XWiki.pdf'
pdf_path='~/Downloads/LLM/wiki/FOOBquickguide.pdf'
# Load the PDF as HTML and take the page_content of the first returned document.
loader = PDFMinerPDFasHTMLLoader(pdf_path)
file_content = loader.load()[0].page_content
# Convert the HTML into the heading-linked JSON string.
json_content = parse_pdf_to_json(file_content)

In [104]:
# Decode the JSON string back into Python objects for inspection.
content = json.loads(json_content)

In [114]:
# Inspect one parsed section: its heading chain and its body text.
content[4]

{'heading': [{'font_size': 13, 'heading': '！！注意事项！！：\n', 'fontsize_idx': 2},
  {'font_size': 25,
   'heading': '1. FOOB Ticket快速指南 (For EC2)\n1.1 步骤\n',
   'fontsize_idx': 0}],
 'content': "如果申请超过 $500K 或者 3 racks,则需要Sales L8 Director 或者 local business L8 Director的ap‐\nproval（在Ticket中附上approval邮件截图即可）。\n即使FOOB申请被capacity team approved了，仍然有⼀定的⻛险会延迟交付 “While Capacity\nPlanning does its best to use forecasts and levers to deliver on requests, we cannot make any\n'guarantees' on delivering capacity on a future date.”\n请保持关注提交的Ticket，并及时响应处理⼈员提出的question。\n再三检查MatrieD的信息是否填写正确，如instance type，AZ， NBD，Instance Eligibility等\nCapacity预留conﬁrm之后，请提醒客户在Need By Date之前在后台接受，否则资源会被回收。并在\nTicket中回复客户是否接受，Close the Loop。\n"}

## Using Selenium to crawl the website directly

In [117]:
!pip install -U selenium



In [417]:
from __future__ import annotations

from datetime import datetime
from typing import Any, Dict, List, Optional, Tuple

from pydantic import BaseModel, Extra
from selenium import webdriver
from selenium.common import exceptions
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

In [539]:
# Option A: headless Google Chrome (disabled)
# options = Options()
# options.add_argument("--headless=new")
# driver = webdriver.Chrome(options=options)

# Option B: Firefox, which can be used when Midway permission problems occur
options = webdriver.FirefoxOptions()
driver = webdriver.Firefox(options=options)

In [540]:
import re
import os, json

    


class CrawlerWorkder(BaseModel):
    """Depth-limited crawler that collects page text through a Selenium driver.

    ``start()`` repeatedly pops a URL from ``url_map``, loads it with ``driver``,
    stores the text of the elements whose id is ``body_id`` and queues the links
    found on the page (optionally filtered by ``url_validate_callback``) one
    level deeper. After every popped URL the accumulated content is dumped to a
    new timestamped file under ``save_path``.
    """
    url_map: Dict[str, int]   # pending work list: {url: depth}
    driver: Any               # Selenium WebDriver instance used for every request
    max_depth: int = 1        # URLs deeper than this are popped but not fetched
    content: List[Any] = []   # collected results, one {url: [text, ...]} per page
    save_path: str = 'temp_webcontent/'
    prefix: Optional[str] = None   # per-run sub-directory name, set by start()
    body_id: str              # id of the element(s) whose text is extracted
    url_validate_callback: Optional[Any] = None   # callable(href) -> truthy to keep the link

    class Config:
        """Configuration for this pydantic object."""
        extra = Extra.forbid

    @property
    def _url_maps(self) -> Dict[str, int]:
        """Return the map of pending URLs to their crawl depth."""
        return self.url_map

    def _savefile(self):
        """Write everything collected so far to a new timestamped JSON file."""
        out_dir = f'{self.save_path}{self.prefix}/'
        os.makedirs(os.path.dirname(out_dir), exist_ok=True)
        str_date = datetime.now().strftime("%Y-%m-%d-%H:%M:%S")
        # The JSON keeps non-ASCII characters (ensure_ascii=False), so the file
        # encoding is pinned instead of relying on the platform default.
        with open(f'{out_dir}file-{str_date}.wiki', 'w', encoding='utf-8') as f:
            json.dump(self.content, f, ensure_ascii=False)

    def _call(self, url: str, current_depth: int) -> Tuple[List[str], Dict[str, int]]:
        """Load one page and return its text plus the links to follow next.

        Args:
            url: page to load.
            current_depth: depth of ``url``; discovered links get ``current_depth + 1``.

        Returns:
            ``(text, url_map)`` where ``text`` lists the text of every element
            with id ``body_id`` and ``url_map`` maps each accepted link to its
            depth. Both are empty when no such element exists on the page.
        """
        text = []
        url_map = {}
        print(f'processing:{url}   depth:{current_depth}')
        # Use the driver owned by this instance rather than a notebook global.
        self.driver.get(url)
        self.driver.implicitly_wait(1)
        main_container = self.driver.find_elements(By.ID, self.body_id)
        if not main_container:
            return text, url_map
        for element in main_container:
            text.append(element.text)
        links = self.driver.find_elements(By.TAG_NAME, 'a')
        for link in links:
            try:
                # Skip links without visible text.
                if not link.text:
                    continue
                href = link.get_attribute('href')
            except exceptions.StaleElementReferenceException as e:
                # The element was detached from the DOM after it was collected.
                print(e)
                continue
            if not href:
                continue
            # Let the caller-supplied filter veto the URL.
            if self.url_validate_callback and not self.url_validate_callback(href):
                continue
            url_map[href] = current_depth + 1
        return text, url_map

    def destroy(self):
        """Quit the underlying driver."""
        print('quite driver')
        self.driver.quit()

    def start(self) -> List[Any]:
        """Crawl until ``url_map`` is empty and return the collected content."""
        self.prefix = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
        visited = set()   # URLs that have actually been fetched in this run
        while self.url_map:
            url, depth = self.url_map.popitem()
            print(len(self.url_map), self.url_map)
            # Only fetch pages that are within the depth limit.
            if depth <= self.max_depth:
                visited.add(url)
                text, new_url_map = self._call(url, depth)
                self.content.append({url: text})
                for new_url, new_depth in new_url_map.items():
                    # Never re-queue a page that was already fetched
                    # (this also covers links from a page to itself).
                    if new_url in visited:
                        continue
                    # Keep the shallowest known depth so a URL already queued
                    # within the limit is not pushed beyond it by a deeper link.
                    known_depth = self.url_map.get(new_url)
                    if known_depth is None or new_depth < known_depth:
                        self.url_map[new_url] = new_depth
            self._savefile()
        return self.content


                


        
    

In [541]:

# Wiki pages under /bin on w.amazon.com or wiki.amazon.com, without whitespace
# or a '#' fragment. The dots are escaped so they only match a literal '.'.
_WIKI_URL_RE = re.compile(r'^https://w(?:iki)?\.amazon\.com/bin(?:/[^\s#]*)?$')

# Wiki areas under /bin/view that must not be crawled (matched case-insensitively
# as a prefix of the URL).
_EXCLUDED_WIKI_AREA_RE = re.compile(
    r'https://w(?:iki)?\.amazon\.com/bin/view/'
    r'(?:Main|KnowledgeTech|WikiManager|AmazonWiki/Wiki/Help|XWiki|Users/|Bindles/)',
    re.I,
)


def validate_url(url):
    """Return True when ``url`` is a wiki page that should be crawled.

    A URL is accepted when it is an ``https://w.amazon.com/bin...`` or
    ``https://wiki.amazon.com/bin...`` address without whitespace or ``#``,
    does not contain ``WebHome`` and does not start with one of the excluded
    ``/bin/view/`` areas. Everything else (ticket links on other hosts
    included) is rejected by the first check.
    """
    if not _WIKI_URL_RE.match(url):
        return False
    if 'WebHome' in url:
        return False
    if _EXCLUDED_WIKI_AREA_RE.match(url):
        return False
    return True
    

In [542]:
# Spot-check the validator on a few representative wiki URLs.
print(validate_url('https://wiki.amazon.com/bin/view/RegionBuildAutomation/FAQ'))
print(validate_url('https://w.amazon.com/bin/view/RegionBuildAutomation/FAQ'))
print(validate_url('https://wiki.amazon.com/bin/viewrev/RegionBuildAutomation/FAQ/WebHome?viewer=code&rev=33.1'))
print(validate_url('https://w.amazon.com/bin/view/AWS_GCR_GTMS/ServiceLaunch/FOOBguidance'))

True
True
False
True


In [548]:
# Seed page for the crawl; the commented-out URLs are alternative seeds.
# url = 'https://w.amazon.com/bin/view/AWS_GCR_GTMS/ServiceLaunch/FOOBguidance'
# url = 'https://wiki.amazon.com/bin/view/RegionBuildAutomation/FAQ'
url = 'https://w.amazon.com/bin/view/EC2_Capacity_Planning_-_External_Capacity_Runbook'
# Crawl the seed (depth 0) and the pages it links to (depth 1), following only
# links accepted by validate_url and reading text from the 'contentcontainer' element.
cwoker = CrawlerWorkder(url_map={url:0},driver=driver,max_depth=1,url_validate_callback=validate_url,body_id='contentcontainer')

In [549]:
# Run the crawl; the result is the crawler's list of {url: [text, ...]} entries.
# NOTE(review): this rebinds `content`, which an earlier cell used for the parsed PDF sections.
content = cwoker.start()

0 {}
processing:https://w.amazon.com/bin/view/EC2_Capacity_Planning_-_External_Capacity_Runbook   depth:0
15 {'https://w.amazon.com/bin/view/EC2_Capacity_Planning_-_External_Capacity_Runbook/': 1, 'https://w.amazon.com/bin/view/EC2_Capacity_Planning/Customer_Planning_Group/': 1, 'https://w.amazon.com/bin/view/EC2_capacity_faqs/': 1, 'https://w.amazon.com/bin/view/EC2_Capacity_Planning/AWS_Internal_Planning/': 1, 'https://w.amazon.com/bin/view/EC2_Capacity_Planning/CDO_Native/': 1, 'https://w.amazon.com/bin/view/EC2_Capacity_Planning/Customer_Planning_Group/Customer_Planning_Resources/': 1, 'https://w.amazon.com/bin/view/EC2_Capacity_Planning/Customer_Planning_Group/Customer_Planning_Resources/AWS-Assisted_CR/': 1, 'https://w.amazon.com/bin/view/EC2/DemandShaping/MaitreD/UserGuide/': 1, 'https://w.amazon.com/bin/view/EC2/DemandShaping/MaitreD/UserGuide/LaunchAnnouncements/targetedOdcr/': 1, 'https://w.amazon.com/bin/view/AWS/Teams/Core_Services/EC2_Capacity_Escalation_Approvals/': 1, 'h

In [550]:
# Look up the content container on the page currently loaded in the driver.
elements = driver.find_elements(By.ID,'contentcontainer')

In [551]:
# Show the text list stored for the first crawled page.
list(content[0].values())[0]

['External Capacity Runbook\nView Source\nTools\n<\nCustomer Planning Group\nExternal Capacity Planning Runbook\nCPG Home FAQs\nExternal Capacity Planning Runbook AWS Internal Capacity Planning Runbook CDO Native Capacity Planning Runbook Internal Team Resources\nObjective\nThe purpose of this Runbook is to provide a step by step operating guideline for TAM or the account teams to request additional EC2 capacity across any AZ globally for AWS customers. \nOverview\nAWS uses historical usage data, organic demand signals, and manual complements to our forecast in order to anticipate customer requirements and ensure that we provision enough capacity. We are constantly working to expand the capacity within our data centers, the physical capacity of our availability zones and regions, and our global geographical coverage (new regions).\n\nEC2 Capacity Planning assists in preparing the future capacity needs of external customers, adds them into AWS demand signals through the FOOB process, an

In [547]:
# Reload a previously saved crawl dump. The dump keeps non-ASCII characters
# unescaped, so decode it explicitly as UTF-8 instead of the platform default.
with open('temp_webcontent/2023-08-24-09-57-50/file-2023-08-24-09:58:42.wiki','r', encoding='utf-8') as f:
    print(json.load(f))

[{'https://w.amazon.com/bin/view/AWS_GCR_GTMS/ServiceLaunch/FOOBguidance': ["AWS_GCR_GTMSServiceLaunchFOOB 提交快速指南\nFOOB 提交快速指南\nPrimary Contact chuanxie (user) How do I change this value?\nLast modified 16 hours ago by chuanxie.\nView Source Edit Edit Source\nTools\n1. FOOB Ticket快速指南 (For EC2)\n1.1 步骤\na) 什么场景需要提FOOB？\n新增的需求，如客户迁移新的workload，workload突增（6/18，11/11等），客户计划的机型升级（例如C4->C5， x86→Gravition) , Region切换等\n有些特殊机型如GPU/Inf1实例，U型实例，可提前去看下Baywatch现在的剩余资源情况。\n另外全球高端GPU资源很紧张，P4/P5 机型的申请，除了要提FOOB，还需要额外流程。见1.3 P4/P5附加流程 \nb) 提前多久时间？\n需要提前8周提交。建议先去baywatch查看一下可用的capacity是否有剩余，如果free slots较多，则fullfill时间会较短些，如果slots很少，则可能需要涉及采购新硬件, 需要预留足够的buffer时间。\nc) FOOB ticket 提交模板\n需求类型 quicklink 相应SLA 提交时请填写自己base地对应的location，例如您base在PEK17，就填PEK17\n普通EC2需求（包括GPU） https://t.corp.amazon.com/create/templates/a55f20c1-8bda-447d-86b4-5172493fa7a0 72小时（Sea工作日时间）\nHigh Memory实例 (U-6tb/U-9tb/U-12tb等) https://t.corp.amazon.com/create/templates/8a4fd90d-791d-48c2-88ad-bdfcd75d3f49  \nHigh Memory bare metal实例（u-

In [333]:
# Display everything the crawler has collected so far.
cwoker.content

[{'https://w.amazon.com/bin/view/AWS_GCR_GTMS/ServiceLaunch/FOOBguidance': ["AWS_GCR_GTMSServiceLaunchFOOB 提交快速指南\nFOOB 提交快速指南\nPrimary Contact chuanxie (user) How do I change this value?\nLast modified 20 days ago by chuanxie.\nView Source Edit Edit Source\nTools\n1. FOOB Ticket快速指南 (For EC2)\n1.1 步骤\na) 什么场景需要提FOOB？\n新增的需求，如客户迁移新的workload，workload突增（6/18，11/11等），客户计划的机型升级（例如C4->C5， x86→Gravition) , Region切换等\n有些特殊机型如GPU/Inf1实例，U型实例，可提前去看下Baywatch现在的剩余资源情况。\n另外全球高端GPU资源很紧张，P4/P5 机型的申请，除了要提FOOB，还需要额外流程。见1.3 P4/P5附加流程 \nb) 提前多久时间？\n需要提前8周提交。建议先去baywatch查看一下可用的capacity是否有剩余，如果free slots较多，则fullfill时间会较短些，如果slots很少，则可能需要涉及采购新硬件, 需要预留足够的buffer时间。\nc) FOOB ticket 提交模板\n需求类型 quicklink 相应SLA 提交时请填写自己base地对应的location，例如您base在PEK17，就填PEK17\n普通EC2需求（包括GPU） https://t.corp.amazon.com/create/templates/a55f20c1-8bda-447d-86b4-5172493fa7a0 72小时（Sea工作日时间）\nHigh Memory实例 (U-6tb/U-9tb/U-12tb等) https://t.corp.amazon.com/create/templates/8a4fd90d-791d-48c2-88ad-bdfcd75d3f49  \nHigh Memory bare metal实例（u-6

In [536]:
# Close the browser session.
# NOTE(review): the execution counts show that the cells below were run before
# this one; on a top-to-bottom run they would use a driver that is already closed.
driver.quit()

In [282]:
# Page used for the manual Selenium experiments below.
url = 'https://w.amazon.com/bin/view/RegionBuildAutomation/Guidelines'

In [283]:
# Load the page in the browser.
driver.get(url)

In [284]:
# Set the driver's implicit wait for element lookups to 1 second.
driver.implicitly_wait(1)

In [299]:
# List every anchor on the current page with its visible text and target URL,
# to see which links need to be filtered out before crawling.
links = driver.find_elements(By.TAG_NAME,'a')
for link in links:
    href = link.get_attribute('href')
    text = link.text
    print(f"Text: {text}, URL: {href}")

Text: , URL: https://w.amazon.com/bin/view/Main
Text: Create a Page, URL: https://w.amazon.com/bin/create/AWS_GCR_GTMS/ServiceLaunch/FOOBguidance
Text: User Profile, URL: https://w.amazon.com/bin/view/Users/chuanxie
Text: Wiki Help, URL: https://w.amazon.com/bin/view/AmazonWiki/Wiki/Help
Text: EN, URL: None
Text: , URL: None
Text: , URL: None
Text: , URL: None
Text: , URL: None
Text: , URL: None
Text: , URL: None
Text: , URL: None
Text: , URL: None
Text: , URL: None
Text: , URL: None
Text: Contributions, URL: https://w.amazon.com/bin/view/XWiki/UserContributions
Text: , URL: https://w.amazon.com/bin/view/XWiki/chuanxie
Text: , URL: https://w.amazon.com/bin/view/XWiki/chuanxie
Text: , URL: https://w.amazon.com/bin/view/WikiManager/
Text: , URL: https://w.amazon.com/bin/view/Main/AllDocs
Text: , URL: https://w.amazon.com/bin/view/Main/UserDirectory
Text: , URL: https://w.amazon.com/bin/view/AWS_GCR_GTMS/ServiceLaunch/FOOBguidance#
Text: Take the Tour, URL: https://w.amazon.com/bin/view/M

In [296]:
# Find the element(s) whose id is "contentcontainer" on the current page.
main_container = driver.find_elements(By.ID, "contentcontainer")

In [297]:
# Show the matched WebElement objects.
main_container

[<selenium.webdriver.remote.webelement.WebElement (session="19874776c077df483fd05ebf10c0df25", element="E6E9D2C0D094BC996B213F1A3912632A_element_87")>]

In [298]:
# Print the visible text of each matched container element.
for e in main_container:
    print(e.text)

AWS_GCR_GTMSServiceLaunchFOOB 提交快速指南
FOOB 提交快速指南
Primary Contact chuanxie (user) How do I change this value?
Last modified 20 days ago by chuanxie.
View Source Edit Edit Source
Tools
1. FOOB Ticket快速指南 (For EC2)
1.1 步骤
a) 什么场景需要提FOOB？
新增的需求，如客户迁移新的workload，workload突增（6/18，11/11等），客户计划的机型升级（例如C4->C5， x86→Gravition) , Region切换等
有些特殊机型如GPU/Inf1实例，U型实例，可提前去看下Baywatch现在的剩余资源情况。
另外全球高端GPU资源很紧张，P4/P5 机型的申请，除了要提FOOB，还需要额外流程。见1.3 P4/P5附加流程 
b) 提前多久时间？
需要提前8周提交。建议先去baywatch查看一下可用的capacity是否有剩余，如果free slots较多，则fullfill时间会较短些，如果slots很少，则可能需要涉及采购新硬件, 需要预留足够的buffer时间。
c) FOOB ticket 提交模板
需求类型 quicklink 相应SLA 提交时请填写自己base地对应的location，例如您base在PEK17，就填PEK17
普通EC2需求（包括GPU） https://t.corp.amazon.com/create/templates/a55f20c1-8bda-447d-86b4-5172493fa7a0 72小时（Sea工作日时间）
High Memory实例 (U-6tb/U-9tb/U-12tb等) https://t.corp.amazon.com/create/templates/8a4fd90d-791d-48c2-88ad-bdfcd75d3f49  
High Memory bare metal实例（u-6tb1.metal, u-9tb1.metal, u-12tb1.metal, u-18tb1.metal, u-24tb1.metal https://t.corp.amazon.com/

In [186]:
# Close the browser session used for the manual experiments.
driver.quit()