# Scraping for vulnerable binaries

### Imports

In [1]:
import json
import time

import pandas as pd
from pyvirtualdisplay import Display
from selenium import webdriver
from selenium.common.exceptions import ElementClickInterceptedException, NoSuchElementException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager


### Defining some functions

In [2]:
# XPath of the <ul> that follows the "Ubuntu 22.04" heading on a notice page.
_UBUNTU_2204_PKG_LIST_XPATH = "//h5[contains(text(), 'Ubuntu 22.04')]/following-sibling::ul"


def _locate_ubuntu_2204_pkg_items():
    """Return the direct children of the "Ubuntu 22.04" package list on the current page.

    Raises:
        NoSuchElementException: if the current page has no such list.
    """
    ul_elem = driver.find_element(By.XPATH, _UBUNTU_2204_PKG_LIST_XPATH)
    return ul_elem.find_elements(By.XPATH, '*')


def _read_jammy_release_version(pkg_name):
    """Read the "release" version shown for Jammy on the currently open package page.

    Looks for the "The Jammy Jellyfish" link, takes the row two levels above it,
    requires the first following row to mention 'security' and the third
    following row to mention 'release', and returns the text of the second
    child of that third row's first cell.
    NOTE(review): the row offsets (1 and 3) and the cell layout are taken over
    from the original code; confirm them against the live page markup.

    Args:
        pkg_name: package name, used only in the diagnostic messages.

    Returns:
        The version text, or None (after printing the reason) when any of the
        expected elements is missing.
    """
    try:
        jammy_link = driver.find_element(By.XPATH, "//a[contains(text(), 'The Jammy Jellyfish')]")
        # Two levels up from the link (presumably <a> -> <td> -> <tr>).
        jammy_row = jammy_link.find_element(By.XPATH, '../..')
        security_row = jammy_row.find_element(By.XPATH, "following-sibling::*[1]")
    except NoSuchElementException:
        print("No jammy jellyfish found! ", pkg_name)
        return None

    try:
        # The leading "." keeps the search inside this row. A bare "//td[...]"
        # is evaluated against the whole document even when called on an
        # element, so it would match a 'security' cell anywhere on the page.
        security_row.find_element(By.XPATH, ".//td[contains(text(), 'security')]")
        release_row = jammy_row.find_element(By.XPATH, "following-sibling::*[3]")
    except NoSuchElementException:
        print("Secure fix to ", pkg_name, "not found!")
        return None

    try:
        release_row.find_element(By.XPATH, ".//td[contains(text(), 'release')]")
        version_elem = release_row.find_element(By.XPATH, "*[1]").find_element(By.XPATH, "*[2]")
        return version_elem.text
    except NoSuchElementException:
        print("No package version in the release version found! ", pkg_name)
        return None


def extract_pkgs_of_one_vulnerability():
    """Record name and release version of every Ubuntu 22.04 package on the open notice.

    Uses the notebook globals ``driver`` (the browser session) and ``vuln_pkgs``
    (the result list). For each entry of the "Ubuntu 22.04" list it clicks the
    entry's first child element, reads the version from the page that opens,
    appends ``{'name': ..., 'version': ...}`` to ``vuln_pkgs`` when a version was
    found, and navigates back. Returns None; returns immediately when the notice
    has no "Ubuntu 22.04" list.
    """
    try:
        pkg_list_elem = _locate_ubuntu_2204_pkg_items()
    except NoSuchElementException:
        return

    num_pkg = len(pkg_list_elem)
    print("Number of packages to check: ", num_pkg)

    for i in range(num_pkg):
        vuln_pkg = {}
        li_item = pkg_list_elem[i]
        a_tag_item = li_item.find_elements(By.XPATH, '*')[0]
        vuln_pkg['name'] = a_tag_item.text
        try:
            a_tag_item.click()
        except ElementClickInterceptedException:
            # Reading this property scrolls the element into view; then retry.
            _ = a_tag_item.location_once_scrolled_into_view
            a_tag_item.click()

        version = _read_jammy_release_version(vuln_pkg['name'])
        if version is not None:
            vuln_pkg['version'] = version
            vuln_pkgs.append(vuln_pkg)

        driver.back()
        time.sleep(1)

        # Elements located before the navigation are re-located after going
        # back, as the original code did on every path.
        pkg_list_elem = _locate_ubuntu_2204_pkg_items()


In [3]:
def scrape_one_result_page(first_item, last_item):
    """Visit every notice listed on the current results page and scrape its packages.

    Uses the notebook global ``driver``. For each notice it opens the link,
    prints the page title, calls ``extract_pkgs_of_one_vulnerability()`` and
    navigates back to the results page.

    Args:
        first_item: number of the first result shown on the page.
        last_item: number of the last result shown on the page; together with
            ``first_item`` it gives how many articles are visited.
    """
    items_on_page = last_item - first_item + 1
    for position in range(1, items_on_page + 1):
        matches = driver.find_elements(
            By.XPATH,
            '//*[@id="main-content"]/section[3]/div[3]/div/article[' + str(position) + ']/h3/a',
        )
        if not matches:
            # The page holds fewer articles than the header announced.
            print("No article found at position", position, "- stopping this page")
            break

        link = matches[0]
        # Reading this property scrolls the link into view before clicking.
        _ = link.location_once_scrolled_into_view

        try:
            link.click()
        except ElementClickInterceptedException:
            # Something covers the link: fall back to a JavaScript click, the
            # same technique this notebook uses for the search button.
            driver.execute_script("arguments[0].click();", link)

        print("Page title is: ")
        print(driver.title)

        extract_pkgs_of_one_vulnerability()

        driver.back()
        time.sleep(1)

### Loading the main page, and selecting the distro

In [4]:
# Start a visible virtual display for the browser.
# For a run without a window use Display(visible=0, size=(width, height)) instead.
display = Display(visible=1)
display.start()

options = webdriver.ChromeOptions()
options.set_capability("goog:loggingPrefs", {'performance': 'ALL'})

# The options must be handed to the driver; otherwise the capability set above
# is never applied to the browser session.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

driver.get('https://ubuntu.com/security/notices')

# Wait (up to 10 s) for the cookie banner button to become clickable, then accept.
WebDriverWait(driver, 10).until(
    EC.element_to_be_clickable((By.ID, 'cookie-policy-button-accept'))
).click()

print("Page title is: ")
print(driver.title)

# Select the distribution from the drop-down menu.
distro = "Ubuntu 22.04 LTS"
release_drop = Select(driver.find_element(By.ID, 'release'))
release_drop.select_by_visible_text(distro)

# Locate the search button and click it through JavaScript.
search_button = driver.find_element(
    By.XPATH, '//*[@id="main-content"]/section[2]/form/div/div[2]/div/div[2]/button'
)
driver.execute_script("arguments[0].click();", search_button)

print("Page title is: ")
print(driver.title)


Page title is: 
Security notices | Ubuntu
Page title is: 
Security notices | Ubuntu


### Initializing some variables from the first results page

In [5]:
# Read the results header of the current page; the captured output shows text
# of the form "1 - 10 of 707 results".
results_count = driver.find_elements(
    By.XPATH,
    '//*[@id="main-content"]/section[3]/div[1]/div[1]/h2',
)

print("Page results: ")
print(results_count[0].text)

# Scraping starts on the first results page.
page_num = 1

# Whitespace-split words: index 0 = first item, index 2 = last item,
# second-to-last word = total number of results.
results_list = results_count[0].text.split()
first_item_on_page, last_item_on_page, total_item_count = (
    int(results_list[position]) for position in (0, 2, -2)
)
print("Number of total items :", total_item_count)
print("Number of items one the current page :", last_item_on_page)


Page results: 
1 - 10 of 707 results
Number of total items : 707
Number of items one the current page : 10


In [6]:
# Result accumulator: extract_pkgs_of_one_vulnerability() appends one
# {'name': ..., 'version': ...} dict per package; the list is written to JSON
# at the end of the notebook. Re-running this cell empties it.
vuln_pkgs = []

### Main flow

In [None]:
# Results page to start scraping from. Set it to 1 to scrape from the beginning,
# or to the page where an interrupted run should resume.
page_num = 37

_RESULTS_HEADER_XPATH = '//*[@id="main-content"]/section[3]/div[1]/div[1]/h2'


def _open_results_page(number):
    """Click the pagination link whose text is ``number`` and wait one second."""
    button = driver.find_element(By.LINK_TEXT, str(number))
    # Reading this property scrolls the link into view before clicking.
    _ = button.location_once_scrolled_into_view
    try:
        button.click()
    except ElementClickInterceptedException:
        # Something covers the link: fall back to a JavaScript click, the same
        # technique this notebook uses for the search button.
        driver.execute_script("arguments[0].click();", button)
    time.sleep(1)


def _read_page_bounds():
    """Return (first item, last item, total items) parsed from the results header.

    The header text is split on whitespace; the captured output shows it in the
    form "1 - 10 of 707 results".
    """
    header_words = driver.find_elements(By.XPATH, _RESULTS_HEADER_XPATH)[0].text.split()
    return int(header_words[0]), int(header_words[2]), int(header_words[-2])


# When resuming, jump to the requested page first; on page 1 the counters read
# from the first results page are still valid.
if page_num > 1:
    _open_results_page(page_num)
    first_item_on_page, last_item_on_page, total_item_count = _read_page_bounds()

while True:
    scrape_one_result_page(first_item_on_page, last_item_on_page)
    # ">=" instead of "==": the loop still ends if the total changes while the
    # (long) scrape is running.
    if last_item_on_page >= total_item_count:
        break

    page_num += 1
    _open_results_page(page_num)
    first_item_on_page, last_item_on_page, total_item_count = _read_page_bounds()

    print("First item on current page :", first_item_on_page)
    print("Last item on current page :", last_item_on_page)
    

Page title is: 
USN-5977-1: Linux kernel (OEM) vulnerabilities | Ubuntu security notices | Ubuntu
Number of packages to check:  2
No package version in the release version found!  linux-image-6.0.0-1013-oem
No package version in the release version found!  linux-image-oem-22.04b
Page title is: 
USN-5976-1: Linux kernel (OEM) vulnerabilities | Ubuntu security notices | Ubuntu
Number of packages to check:  3
Page title is: 
LSN-0093-1: Kernel Live Patch Security Notice | Ubuntu security notices | Ubuntu
Page title is: 
USN-5972-1: Thunderbird vulnerabilities | Ubuntu security notices | Ubuntu
Number of packages to check:  1
Page title is: 
USN-5966-2: amanda regression | Ubuntu security notices | Ubuntu
Number of packages to check:  1
Page title is: 
USN-5966-1: amanda vulnerabilities | Ubuntu security notices | Ubuntu
Number of packages to check:  1
Page title is: 
USN-5968-1: GitPython vulnerability | Ubuntu security notices | Ubuntu
Number of packages to check:  1
No package version i

In [None]:
# Close the browser session (important: sessions left open were what clogged
# the author's RAM after a few days of debugging) and stop the virtual display
# that was started together with the driver.
try:
    driver.quit()
finally:
    display.stop()

In [None]:
# Number of package records collected; use print(vuln_pkgs) to see the records themselves.
print(len(vuln_pkgs))

In [None]:
# Write the collected package records to a JSON file.
# json is imported with the other modules in the imports cell at the top.
with open('vul_pkgs_598.json', 'w', encoding='utf-8') as f:
    json.dump(vuln_pkgs, f)