## Using Selenium to crawl a website directly

In [None]:
!pip install -U selenium

In [1066]:
from __future__ import annotations
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from typing import Any, Dict, List, Optional
from pydantic import BaseModel,Extra
from datetime import datetime
from selenium.common import exceptions  
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import re


In [1067]:
# Create the Selenium driver with Google Chrome (default Chrome options).
options = Options()

# Uncomment to run Chrome without a visible window:
# options.add_argument("--headless=new") #running in headless mode
driver = webdriver.Chrome(options=options)

# Alternative: use Firefox instead of Chrome.
# options = webdriver.FirefoxOptions()
# driver = webdriver.Firefox(options=options)

In [1068]:
import re
import os, json
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

    


class CrawlerWorkder(BaseModel):
    """Depth-limited page crawler driven by a Selenium WebDriver.

    Starting from ``url_map``, each pending URL is opened with ``driver``, the
    text of the element whose id is ``body_id`` is collected, and the links
    found inside that element are queued one level deeper. After every URL
    taken from the queue the accumulated content is written to disk.
    """

    # Pending URLs mapped to the depth at which they were discovered: {'url': depth}
    url_map: Dict[str, int]
    # URLs already opened, mapped to the depth at which they were crawled.
    url_processed: Dict[str, int] = {}
    # Selenium WebDriver instance used for every page load.
    driver: Any
    # URLs deeper than this are taken off the queue but never opened.
    max_depth: int = 1
    # Crawl result: a list of {url: [page_text]} entries.
    content: List[Any] = []
    # Root directory for the dump files.
    save_path: str = 'temp_webcontent/'
    # Per-run sub-directory name; set by start().
    prefix: Optional[str] = None
    # DOM id of the element that holds the page body.
    body_id: str
    # When True, wiki-source links are re-inserted into the text as markdown.
    extract_wiki_href: bool = False
    # Number of characters kept before a wiki link as matching context.
    offset: int = 10
    # Optional callable(href) -> bool; links it rejects are not queued.
    url_validate_callback: Optional[Any] = None

    class Config:
        """Configuration for this pydantic object."""
        extra = Extra.forbid

    def _savefile(self):
        """Write everything crawled so far to a new timestamped JSON file."""
        out_dir = os.path.join(self.save_path, f'{self.prefix}')
        os.makedirs(out_dir, exist_ok=True)
        # NOTE(review): the ':' characters in this timestamp are not valid in
        # Windows file names; kept as-is so existing dump names stay the same.
        str_date = datetime.now().strftime("%Y-%m-%d-%H:%M:%S")
        out_file = os.path.join(out_dir, f'file-{str_date}.wiki')
        # ensure_ascii=False emits raw non-ASCII characters, so the encoding
        # must be explicit instead of the platform default.
        with open(out_file, 'w', encoding='utf-8') as f:
            f.write(json.dumps(self.content, ensure_ascii=False))

    def _find_hrefs(self, url: str) -> List[Any]:
        """Open the wiki page in source-code view and extract its links.

        Args:
            url: Page URL without the ``/WebHome?viewer=code`` suffix.

        Returns:
            A list of ``[context, label, target]`` triples, one per
            ``[[label>>target]]`` link found. ``context`` is up to ``offset``
            characters of text preceding the link. The list is empty when the
            body element does not appear within 5 seconds.
        """
        suffix = '/WebHome?viewer=code'
        self.driver.get(url + suffix)
        current_hrefs = []
        try:
            main_container = WebDriverWait(self.driver, 5).until(
                EC.presence_of_element_located((By.ID, self.body_id)))
        except exceptions.TimeoutException:
            print(f"{url} error: timeout, please check the div ID, or network connection")
            return current_hrefs
        if not main_container:
            return current_hrefs
        text = main_container.text
        leading_blank = re.compile(r'(\s)+')
        for match in re.finditer(r'\[\[([^\]]+)>>((url:)?https?:\/\/[^\]]+)\]\]', text, re.M):
            start = match.start()
            context = text[max(0, start - self.offset):start]
            # Drop whatever belongs to a previous link that ends inside the context window.
            key = re.sub(r'(.*)]]', '', context)

            # Skip links whose label starts with whitespace, such as:
            # [[ >>https://ec2-maitre-d-prod-iad.iad.proxy.amazon.com/]]
            if not leading_blank.match(match.group(1)):
                current_hrefs.append([key, match.group(1), match.group(2)])
        return current_hrefs

    def _call(self, url: str, current_depth: int) -> tuple[List[Any], Dict[str, int]]:
        """Crawl one page.

        Args:
            url: Page to open.
            current_depth: Depth of ``url``; discovered links get depth + 1.

        Returns:
            ``(text, current_url_map)`` where ``text`` is a list holding the
            body text (empty if the body element did not appear within 5
            seconds) and ``current_url_map`` maps each accepted link to
            ``current_depth + 1``.
        """
        text = []
        current_url_map = {}
        print(f'processing:{url}   depth:{current_depth}')
        self.url_processed[url] = current_depth
        self.driver.get(url)
        try:
            main_container = WebDriverWait(self.driver, 5).until(
                EC.presence_of_element_located((By.ID, self.body_id)))
        except exceptions.TimeoutException:
            print(f"{url} error: timeout, please check the div ID, or network connection")
            return text, current_url_map
        if not main_container:
            return text, current_url_map
        origintext = main_container.text

        # Collect the links for the next level.
        links = main_container.find_elements(By.TAG_NAME, 'a')
        for link in links:
            # Both reads touch the live DOM element, so either one can raise
            # if the element went stale.
            try:
                # Skip links without visible text.
                if not link.text:
                    continue
                href = link.get_attribute('href')
            except exceptions.StaleElementReferenceException as e:
                print(e)
                continue
            if not href:
                continue
            # Let the caller-supplied validator veto the URL.
            if self.url_validate_callback:
                if not self.url_validate_callback(href):
                    continue
            current_url_map[href] = current_depth + 1

        # Re-insert the links found in the wiki source as markdown links.
        # This navigates the driver away, so it must run after the link scan.
        if self.extract_wiki_href:
            for a, b, c in self._find_hrefs(url):
                c = c.replace('url:', '')
                plain = f'{a}{b}'
                # Membership test: str.find() returns -1 (truthy) when absent
                # and 0 (falsy) when the match is at the very start.
                if plain in origintext:
                    origintext = origintext.replace(plain, f'{a}[{b}]({c})', 1)
                    print(f'{a}{b} ==> {a}[{b}]({c})')
        text.append(origintext)

        return text, current_url_map

    def destroy(self):
        """Quit the underlying WebDriver."""
        print('quite driver')
        self.driver.quit()

    def start(self) -> List[Any]:
        """Crawl until the queue is empty and return the collected content.

        Returns:
            ``self.content``: a list of ``{url: [page_text]}`` entries.
        """
        self.prefix = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
        while self.url_map:
            url, depth = self.url_map.popitem()
            url = url.rstrip('/')
            if url in self.url_processed:
                continue
            # Only open the page if it is within the depth limit.
            if depth <= self.max_depth:
                text, new_url_map = self._call(url, depth)
                if text:
                    self.content.append({url: text})
                # A page linking to itself must not be queued again.
                new_url_map.pop(url, None)
                # Merge the new links, keeping the shallowest known depth so
                # a rediscovery from a deeper page cannot push a pending URL
                # past max_depth.
                for new_url, new_depth in new_url_map.items():
                    known_depth = self.url_map.get(new_url)
                    if known_depth is None or new_depth < known_depth:
                        self.url_map[new_url] = new_depth
            self._savefile()
        print(f'processed urls:{self.url_processed}')
        return self.content


                


        
    

In [1069]:

# A wiki "view" URL on w.amazon.com / wiki.amazon.com with no whitespace or
# fragment. The dots are escaped so that look-alike hosts such as
# "wXamazon.com" are rejected.
_WIKI_VIEW_URL = re.compile(r'^https://w(?:iki)?\.amazon\.com/bin/view(?:/[^\s#]*)?$')

# Path prefixes under /bin/view/ that must not be crawled (case-insensitive).
_EXCLUDED_WIKI_PREFIXES = re.compile(
    r'^https://w(?:iki)?\.amazon\.com/bin/view/'
    r'(?:Main|KnowledgeTech|WikiManager|AmazonWiki/Wiki/Help|XWiki|Users/|Bindles/)',
    re.I,
)


def validate_wiki_url(url):
    """Return True if *url* is a wiki page that should be crawled.

    A URL is accepted when it is an ``https://w.amazon.com/bin/view...`` or
    ``https://wiki.amazon.com/bin/view...`` address without whitespace or a
    ``#`` fragment, does not contain ``WebHome``, and does not start with one
    of the excluded path prefixes.

    Any other host (ticketing links included) is already rejected by the
    first check, so no separate ticket-URL test is needed.
    """
    if not _WIKI_VIEW_URL.match(url):
        return False
    if 'WebHome' in url:
        return False
    return not _EXCLUDED_WIKI_PREFIXES.match(url)
    

In [1070]:
# print(validate_wiki_url('https://w.amazon.com/bin/create/AWS_GCR_GTMS/ServiceLaunch/FOOBguidance'))
# print(validate_wiki_url('https://w.amazon.com/bin/view/RegionBuildAutomation/FAQ'))
# print(validate_wiki_url('https://wiki.amazon.com/bin/viewrev/RegionBuildAutomation/FAQ/WebHome?viewer=code&rev=33.1'))
# print(validate_wiki_url('https://w.amazon.com/bin/view/AWS_GCR_GTMS/ServiceLaunch/FOOBguidance'))

In [1073]:
# Seed queue for the crawl: {start_url: depth}; depth 0 marks the entry page.
input_url = {'https://w.amazon.com/bin/view/AWS_GCR_GTMS/ServiceLaunch/FOOBguidance':0}
# Alternative seed pages:
# input_url = {'https://wiki.amazon.com/bin/view/RegionBuildAutomation/FAQ':0}
# input_url = {'https://w.amazon.com/bin/view/EC2_Capacity_Planning_-_External_Capacity_Runbook':0}

# Build the crawler: follow links one level below the seed page, queue only
# URLs accepted by validate_wiki_url, read page text from the element with
# id 'mainContentArea', and rewrite wiki-source links as markdown.
cwoker = CrawlerWorkder(url_map=input_url,
                        driver=driver,
                        max_depth=1,
                        url_validate_callback=validate_wiki_url,
                        body_id='mainContentArea',
                        extract_wiki_href=True
                        )

In [1074]:
# Run the crawl; start() returns the accumulated list of {url: [page_text]} entries.
content = cwoker.start()

processing:https://w.amazon.com/bin/view/AWS_GCR_GTMS/ServiceLaunch/FOOBguidance   depth:0
 via this link ==>  via this [link](https://ec2-maitre-d-prod-iad.iad.proxy.amazon.com/)
your team here  ==> your team [here ](https://permissions.amazon.com/a/team/Maitre%20D%20-%20Users)
' section here ==> ' section [here](https://w.amazon.com/bin/view/EC2/DemandShaping/MaitreD/UserGuide/)
ser guide here ==> ser guide [here](https://w.amazon.com/bin/view/EC2/DemandShaping/MaitreD/UserGuide/)
equest in MaitreD’ ==> equest in [MaitreD’](https://ec2-maitre-d-prod-iad.iad.proxy.amazon.com/batchCreate?noTrigger=true)
processing:https://w.amazon.com/bin/view/EC2_Capacity_Planning_-_External_Capacity_Runbook   depth:1
 via this link ==>  via this [link](https://ec2-maitre-d-prod-iad.iad.proxy.amazon.com/)
your team here  ==> your team [here ](https://permissions.amazon.com/a/team/Maitre%20D%20-%20Users)
' section here ==> ' section [here](https://w.amazon.com/bin/view/EC2/DemandShaping/MaitreD/UserGui

In [1065]:
# Shut down the WebDriver session created above.
driver.quit()

### 估算文档切分token

In [None]:
from langchain.text_splitter import CharacterTextSplitter,RecursiveCharacterTextSplitter
import tiktoken
def num_tokens_from_string(string: str, model: str = "gpt-3.5-turbo") -> int:
    """Return the number of tokens *string* encodes to.

    Args:
        string: Text to tokenize.
        model: Model name passed to ``tiktoken.encoding_for_model`` to pick
            the tokenizer. Defaults to ``"gpt-3.5-turbo"``, the value that
            was previously hard-coded.

    Returns:
        The length of the token sequence produced by the model's encoding.
    """
    encoding = tiktoken.encoding_for_model(model)
    return len(encoding.encode(string))

In [None]:
# Path of a dump file produced by the crawler (a JSON list of {url: [text]}).
doc = '/Users/chuanxie/Downloads/datalake/web/myapp/private-llm-qa-bot/notebooks/knowledge_build/temp_webcontent/2023-08-24-10-18-43/file-2023-08-24-10:20:06.wiki' ##FOOB Runbook 
# doc = 'file-2023-08-23-17:17:02.wiki' ## RBA FAQ
# The dump is JSON written with ensure_ascii=False, so it can contain raw
# non-ASCII characters; decode it explicitly as UTF-8 rather than relying on
# the platform default encoding.
with open(doc, 'r', encoding='utf-8') as f:
    content = f.read()
content_json = json.loads(content)

In [None]:
# Splitter producing chunks of at most 800 characters with a 200-character
# overlap; chunk length is measured with len().
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=800,
    chunk_overlap=200,
    length_function=len,
)

# Flatten every crawled page into one list of text chunks.
texts = []
for page in content_json:
    for url, p in page.items():
        # Pages with no captured text have nothing to split.
        if not p:
            continue
        page_text = p[0]
        print(f'page:{url}, size: {len(page_text)}')
        chunks = text_splitter.split_text(page_text)
        texts.extend(chunks)

In [None]:
# Total number of chunks produced by the splitter.
len(texts)

In [None]:
# Print each chunk under a header showing its token count and a 150-dash rule.
line = '-' * 150
for text in texts:
    token_count = num_tokens_from_string(text)
    print(f'--tokens:{token_count}-{line}')
    print(f'{text}\n\n')