# Download ServiceNow knowledge articles 
## Important pre-requisite 
Before you run this notebook, run the `start_remote_driver` code first. 
## What does it do?
This code will 
* connect to an existing driver where you have already logged on to ServiceNow  
* go to the list of knowledge base articles 
* keep clicking on "Show More" until all articles are listed 
* Make a list of all KB lines 

## Downloading and saving knowledge articles 
For each kb it will:  
* browse to the page, 
* download the content 
* Clean the content
* Download all referenced images and attachments and keep them locally 
* Change the HTML references to point to the local (relative) URLs 


In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options as ChromeOptions
import time 
import os 
from bs4 import BeautifulSoup
import json 
import sys 
import requests 

## Add custom library to path 

In [2]:
# Add the relative directory ../src to the module search path so packages
# stored there can be imported.
parent = os.path.join('..', 'src') 
sys.path.append(parent)

In [3]:
import servicenow.selenium_scrape as sel 

## Get a remote session  (but call this one "driver")

In [4]:
# Base URLs used by this notebook. svc_now_url is the ServiceNow portal root;
# okta_url is not referenced by any of the cells visible here.
okta_url = 'https://taihooncology.okta.com/'
svc_now_url='https://tizona.service-now.com/fna/'

In [5]:
# Obtain a remote WebDriver session through the project's SessionMgmt helper
# (presumably the already-logged-in session from the prerequisites — confirm),
# then open the ServiceNow portal in it.
sess = sel.SessionMgmt() 
driver = sess.get_remote_session() 
driver.get(svc_now_url)

## Shutdown the driver
This should cleanly shut down the driver and its OS processes. 

In [6]:
def shutdown(_driver):
    """Close the browser window and end the WebDriver session.

    Calls ``close()``, ``stop_client()`` and ``quit()`` on *_driver*, in that
    order. ``quit()`` runs in a ``finally`` clause so the session is still
    ended when one of the earlier calls raises; the original exception is
    propagated to the caller afterwards.

    Args:
        _driver: WebDriver-like object exposing ``close``, ``stop_client``
            and ``quit``.
    """
    try:
        _driver.close()
        _driver.stop_client()
    finally:
        # Always end the session, even if closing the window failed.
        _driver.quit()
#shutdown(driver)

## Testing 
The next two cells are a sanity check. 

In [6]:
# Sanity check: count the elements on the current page that match an XPath.
# NOTE: the first x_path (elements whose ng-href contains 'id=kb_view2') is
# overwritten on the next line, so only the 'Taiho Oncology' span is queried.
x_path = "//*[contains(@ng-href,'id=kb_view2')]"
x_path="//span[text()='Taiho Oncology']"
elements = driver.find_elements(By.XPATH, x_path) 
len(elements)

1

In [7]:
elements[0].get_attribute('outerHTML')

'<span ng-if="::data.company_slogan" style="display:inline-block;" class="ng-binding ng-scope">Taiho Oncology</span>'

## Go to list of knowledge articles 
This link should be extracted dynamically from the previous page; hard-coding it here is a hack.

In [7]:
# Open the knowledge-category page directly; the kb_category id is hard-coded.
url = "https://tizona.service-now.com/fna/?id=kb_category&kb_category=25e9fde64febde405165fc828110c7db"
driver.get(url)
# Fixed 5-second pause after navigation (presumably to let the page finish loading).
time.sleep(5) 

## Load list of all knowledge articles 
This code clicks the "Show More" button, waits a few seconds, and 
then clicks again until it can no longer find the button. 

In [8]:
# Expand the article list: wait, look for the "Show More" button
# (CSS class .btn-loadmore), click it, and repeat until it is gone.
break_after = 100  # safety cap on clicks so the loop always terminates
counter = 0        # number of clicks performed so far
while True:
    print("Waiting for 5 seconds for page to fully load")
    time.sleep(5)
    # Look the button up again after every wait instead of reusing the
    # handle found in the previous pass.
    elements = driver.find_elements(By.CSS_SELECTOR, '.btn-loadmore')
    if not elements or counter >= break_after:
        break
    elements[0].click()
    counter += 1
print("Done ...")

Waiting for 5 seconds for page to fully load
Waiting for 5 seconds for page to fully load
Waiting for 5 seconds for page to fully load
Waiting for 5 seconds for page to fully load
Waiting for 5 seconds for page to fully load
Waiting for 5 seconds for page to fully load
Waiting for 5 seconds for page to fully load
Done ...


## Get the list elements, convert to KB  objects
This gets the list of all elements containing knowledge articles. 

In [9]:
# Find every element whose ng-href contains 'id=kb_article' and wrap each one
# in a KBArticleLine, passing the portal base URL and the element's position.
x_path = "//*[contains(@ng-href,'id=kb_article')]"
elements = driver.find_elements(By.XPATH, x_path) 
kb = [sel.KBArticleLine(element, svc_now_url, i) for i, element in enumerate(elements)]
# Give every article object the browser session's cookies.
# NOTE(review): driver.get_cookies() is called once per article; it could
# probably be fetched once before the loop — confirm that set_cookies does
# not mutate or take ownership of the list it receives.
for k in kb:
    k.set_cookies(driver.get_cookies())
len(kb)

67

In [11]:
#driver.get_cookies()

## Save each article 
Goes to the page for each article, downloads and saves. Typically 6-7 seconds per article. 

In [12]:
# NOTE(review): only the first three articles (kb[0:3]) are passed here,
# presumably as a trial run — pass the full `kb` list to process all of them.
sel.make_db(kb[0:3], driver)

waiting for 5 seconds for https://tizona.service-now.com/fna/?id=kb_article&sys_id=a8f46e09871941d080dee58e0ebb35b2 to load
waiting for 5 seconds for https://tizona.service-now.com/fna/?id=kb_article&sys_id=2aa42609871941d080dee58e0ebb359d to load
waiting for 5 seconds for https://tizona.service-now.com/fna/?id=kb_article&sys_id=752025b0dbd1c1900d24138b4b9619a4 to load
No filename found in url:#
Headers:{'X-Frame-Options': 'SAMEORIGIN', 'X-Is-Logged-In': 'true', 'X-Transaction-ID': '97a577898719', 'Set-Cookie': 'glide_session_store=88253789871981D080DEE58E0EBB3584; Max-Age=3600; Expires=Tue, 25-Jan-2022 20:28:50 GMT; Path=/; HttpOnly; SameSite=None; Secure', 'Pragma': 'no-store,no-cache', 'Cache-Control': 'no-cache,no-store,must-revalidate,max-age=-1', 'Expires': '0', 'Referrer-Policy': 'same-origin', 'Content-Encoding': 'gzip', 'Content-Type': 'text/html;charset=UTF-8', 'Transfer-Encoding': 'chunked', 'Date': 'Tue, 25 Jan 2022 19:28:50 GMT', 'Server': 'ServiceNow', 'Strict-Transport-S

## And we are done! 
The rest of the code is for playing with different things. 

In [None]:
# NOTE: this rebinds `kb` from the list of articles to its second entry, so
# `kb` can no longer be indexed as the article list until it is rebuilt.
kb = kb[1]
kb.href

In [None]:
# NOTE(review): `base_url` is not assigned in any cell visible here —
# possibly svc_now_url was meant; confirm before running.
full_url = base_url+kb.href
full_url

In [None]:
driver.get(full_url) 

In [None]:
elements = driver.find_elements(By.CSS_SELECTOR, '.panel-body')

In [None]:
soup = parse_html(elements[1].get_attribute('innerHTML'))

In [None]:
for img in soup.find_all('img'):
    src = img['src'] 
    print( src )
    if src.startswith('/'):
        img['src'] = base_url+ src 

In [None]:
for img in soup.find_all('img'):
    src = img['src'] 
    print( src )

In [None]:
title_header = soup.find_all("h2" , {"class":"kb-title-header"})

In [None]:
len(title_header)

In [None]:
title_header[0].string = "<a href='http://google.com'>{}</a>".format(title_header[0].text)

In [None]:
k=kb[1] 
link=k.link 
driver.get(link)

In [None]:
k.find_article_element(driver)

In [None]:
for img in k.content_soup.find_all('img'):
    print(img['src'])

In [None]:
# Flatten the browser's cookie records into a plain {name: value} mapping.
cookies = {entry['name']: entry['value'] for entry in driver.get_cookies()}

In [None]:
img_src =k.base_url + img['src'] 
r = requests.get( img_src, cookies=cookies)

In [None]:
r.headers['Content-Disposition'].split(';')[1].strip().split(

In [None]:
def get_filename(response):
    """Return the ``filename`` parameter of the Content-Disposition header.

    The value is returned exactly as it appears in the header, including any
    surrounding quotes.

    Args:
        response: Object with a mapping-like ``headers`` attribute.

    Returns:
        The raw filename value, or ``None`` when the header is missing or
        carries no ``filename=`` parameter.
    """
    # Responses without the header are reported as "no filename"
    # instead of raising KeyError.
    cdisp = response.headers.get('Content-Disposition')
    if not cdisp:
        return None
    fname = 'filename='
    for part in cdisp.split(';'):
        part = part.strip()
        # Header parameter names are case-insensitive (RFC 6266).
        if part[:len(fname)].lower() == fname:
            return part[len(fname):].strip()
    return None

In [None]:
get_filename(r)

In [None]:
all_as= k.content_soup.find_all('a')

In [None]:
driver.get(k.link)

In [None]:
elem = driver.page_source

In [None]:
# Parse the page source captured above.
# NOTE(review): no parser is named, so Beautiful Soup chooses one from those
# installed; consider passing one explicitly (e.g. 'html.parser').
soup = BeautifulSoup(elem)

In [None]:
# Print the src of every <img> in the page, skipping tags that have none.
for img in soup.find_all('img'):
    try:
        print(img['src'])
    except KeyError:
        # No src attribute on this tag. The earlier `except e:` referenced an
        # undefined name and raised NameError instead of skipping the tag.
        pass

In [None]:
all_as = soup.find_all('a')
len(all_as)

In [None]:
# Print the href of every anchor, skipping anchors that have none.
for a in all_as:
    try:
        print(a['href'])
    except KeyError:
        # Only a missing href is expected here; a bare `except:` would also
        # swallow unrelated errors and KeyboardInterrupt.
        pass

In [None]:
attachments = soup.find_all("li", {"class":"attached-file"})
len(attachments)
a='https://tizona.service-now.com/sys_attachment.do?sys_id=1cc41e39dba089d00d24138b4b961904' 
attachments[0]

In [None]:
def find_attachement_links(body_soup):
    """Collect the ``ng-href`` of every link inside attached-file list items.

    Args:
        body_soup: Parsed document (BeautifulSoup-like object offering
            ``find_all``) to search.

    Returns:
        List with the ``ng-href`` value of every ``<a>`` found inside an
        ``<li class="attached-file">``, in the order they are found.

    Raises:
        KeyError: If one of those anchors has no ``ng-href`` attribute.
    """
    links = []
    # Search the document that was passed in, not the notebook-level `soup`.
    for attachment in body_soup.find_all("li", {"class": "attached-file"}):
        links.extend(anchor['ng-href'] for anchor in attachment.find_all('a'))
    return links

In [None]:
links = find_attachement_links(soup)
for link in links:
    print(link)

In [None]:
def download_attachment(link, base_url, dirname,  driver):
    """Fetch one attachment over HTTP using the browser session's cookies.

    Args:
        link: Attachment link; its last 10 characters are dropped before use.
        base_url: Base URL; its last character is dropped before ``link`` is
            appended.
        dirname: Not used by this function.
        driver: WebDriver whose cookies are sent with the request.

    Returns:
        The response object returned by ``requests.get``.
    """
    session_cookies = {c['name']: c['value'] for c in driver.get_cookies()}
    # NOTE(review): fixed-width slicing — presumably removes a trailing '/'
    # from base_url and a trailing '&view=true' from link; confirm, since any
    # other input shape is silently truncated.
    target = base_url[:-1] + link[:-10]
    response = requests.get(target, cookies=session_cookies)

    print("Link:{}, status code:{}".format(target, response.status_code))
    return response

In [None]:
r = download_attachment(links[0], k.base_url, '', driver)

In [None]:
r.headers

In [None]:
responses = [download_attachment(link, k.base_url, '', driver) for link in links]

In [None]:
responses[3].headers

In [None]:
len(r.content)

In [None]:
with open('a.pdf', 'wb') as f:
    f.write(responses[0].content)

In [None]:
r=download_attachment('sys_attachment.do?sys_id=1cc41e39dba089d00d24138b4b961904&amp;view=true', k.base_url, '', driver)

In [None]:
r.headers

In [None]:
fname = get_filename(r)
print(fname[1:-1])

In [None]:
with open(fname[1:-1], 'wb') as f:
    f.write(r.content)