# Sampling algorithm
Implementation of the Vertex sampling algorithm described in [_Web-scale information extraction with vertex_](https://ieeexplore.ieee.org/abstract/document/5767842)

In [1]:
import requests
from lxml import html
from collections import defaultdict

## Get all xpath

In [2]:
def get_all_xpath(html_src):
    """Return the xpaths of every non-empty piece of text in a page.

    Args:
        html_src: HTML source of the page, in any form accepted by
            ``lxml.html.fromstring``.

    Returns:
        A list of absolute xpaths of the form ``<element path>/text()``,
        one for each element that directly contains non-blank text.
        Elements that are, or are nested inside, ``script``, ``noscript``
        or ``style`` elements are left out.
    """
    # select nodes whose children include text nodes
    XPATH_SELECTOR = "//*[child::text()]"

    # tags whose text is javascript code or css stylesheet, not page content
    NOISY_TAGS = {"script", "noscript", "style"}

    root = html.fromstring(html_src)
    tree = root.getroottree()

    xpath_list = []

    # The selected nodes are not properly "leaf" nodes: they are the parents
    # of text nodes in the DOM.
    for node in root.xpath(XPATH_SELECTOR):
        node_path = tree.getpath(node)

        # Compare whole tag names step by step (dropping any "[n]" position),
        # so that only real script/noscript/style elements are filtered out
        # and not every tag that merely starts with one of those names.
        step_tags = (step.partition("[")[0] for step in node_path.split("/"))
        if any(tag in NOISY_TAGS for tag in step_tags):
            continue

        # Read the text straight from the node already in hand instead of
        # evaluating the absolute xpath again from the root.
        selected_string = "".join(node.xpath("text()")).strip()

        # Filtering out xpaths which extract empty strings
        if selected_string:
            xpath_list.append(node_path + "/text()")

    return xpath_list

In [3]:
r = requests.get("https://www.androidworld.it/schede/redmi-7-2/")

In [4]:
xpath_list = get_all_xpath(r.content)

In [5]:
xpath_list

['/html/head/title/text()',
 '/html/body/div/div[3]/div/header/div/div[1]/ul/li[1]/a/text()',
 '/html/body/div/div[3]/div/header/div/div[1]/ul/li[2]/a/text()',
 '/html/body/div/div[3]/div/header/div/div[1]/ul/li[3]/a/text()',
 '/html/body/div/div[3]/div/header/div/div[2]/span/a/text()',
 '/html/body/div/nav/div/ul/li[1]/a/text()',
 '/html/body/div/nav/div/ul/li[2]/a/text()',
 '/html/body/div/nav/div/ul/li[3]/a/text()',
 '/html/body/div/nav/div/ul/li[4]/a/text()',
 '/html/body/div/nav/div/ul/li[5]/a/text()',
 '/html/body/div/nav/div/ul/li[6]/a/text()',
 '/html/body/div/nav/div/ul/li[7]/a/text()',
 '/html/body/div/nav/div/ul/li[8]/a/text()',
 '/html/body/div/nav/div/ul/li[9]/a/text()',
 '/html/body/div/nav/div/ul/li[10]/a/text()',
 '/html/body/div/nav/div/ul/li[11]/a/text()',
 '/html/body/div/div[4]/div/main/div/section/article/figure/div/div[2]/a[2]/span[2]/text()',
 '/html/body/div/div[4]/div/main/div/section/article/figure/div/div[2]/a[3]/span[2]/text()',
 '/html/body/div/div[4]/div/m

In [6]:
len(xpath_list)

289

In [7]:
# Parse the downloaded page once, then print the stripped text selected by
# each xpath collected above.
root = html.fromstring(r.content)
for xpath in xpath_list:
    list_of_values = root.xpath(xpath)
    # join everything the xpath selects into a single string
    string_value = ''.join(list_of_values)
    clean_value = string_value.strip()
    # debug alternative: print the value before stripping
    #print([string_value])
    # printed as a one-element list, so the value is shown as a quoted repr
    print([clean_value])
    print()

['Redmi 7 (3GB) - Scheda tecnica | AndroidWorld']

['smart']

['mobile']

['android']

['AndroidWorld']

['Recensioni']

['Schede tecniche']

['Smartphone']

['Smartwatch']

['Tablet']

['App']

['Giochi']

['Guide']

['Video']

['Forum']

['📸']

['Specifiche']

['notizie']

['Correlati']

['Confronta']

['Redmi 7 (3GB)']

['Scheda Tecnica']

['Confronta']

['CPU']

['octa 1.8 GHz']

['Display']

['6,26" HD+ / 720 x 1520']

['Fotocamera']

['12 Mpx ƒ/2.2']

['Frontale']

['8 Mpx']

['RAM']

['3 GB']

['Memoria interna']

['32 / 64 GB']

['Android']

['9.0 Pie']

['Batteria']

['4000 mAh']

['è uno smartphone con sistema operativo Android di fascia bassa.']

['Redmi 7 (3GB)']

['La tecnologia del display dello smartphone è IPS LCD. Ha una diagonale di  ed una risoluzione di HD+ / 720 x 1520 e quindi un ppi di 269 ppi.']

['6,26 pollici']

['Abbiamo a che fare con una fotocamera a risoluzione di  con il supporto di un flash Singolo utile ad illuminare le foto in condizioni di scarsa luce

## Compute necessary data structures
### Utility functions
#### Get xpath value
Returns xpath selected value from a given page

In [8]:
def get_xpath_value(html_src, xpath):
    """Evaluate ``xpath`` on a page and return the selected text.

    Args:
        html_src: HTML source of the page, parsed with
            ``lxml.html.fromstring``.
        xpath: xpath expression to evaluate from the parsed root.

    Returns:
        The concatenation of everything the xpath selects, with leading and
        trailing whitespace stripped.
    """
    document = html.fromstring(html_src)
    return ''.join(document.xpath(xpath)).strip()

In [9]:
get_xpath_value(r.content, '/html/head/title/text()')

'Redmi 7 (3GB) - Scheda tecnica | AndroidWorld'

#### xpath to value
Given a page's source code _src_ and a list of xpaths _l_, returns a dict { _xpath_ : _value_ }, where _xpath_ is an xpath from _l_ and _value_ is the string obtained by evaluating that xpath on _src_

In [10]:
def xpath_to_value(html_src, xpath_list):
    """Map each xpath to the text it selects on a page.

    Args:
        html_src: HTML source of the page, parsed with
            ``lxml.html.fromstring``.
        xpath_list: xpaths to evaluate on the page.

    Returns:
        A dict ``{xpath: value}`` where ``value`` is the concatenation of
        everything the xpath selects, stripped of surrounding whitespace.
    """
    # Nothing to evaluate: return early without parsing the page at all.
    if not xpath_list:
        return {}

    # Parse the document once and reuse the tree for every xpath, instead of
    # re-parsing the whole page for each one.
    root = html.fromstring(html_src)

    return {xpath: ''.join(root.xpath(xpath)).strip() for xpath in xpath_list}

In [11]:
xpath_to_value(r.content, ['/html/head/title/text()',
                            '/html/body/div/div[3]/div/header/div/div[1]/ul/li[1]/a/text()',
                            '/html/body/div/div[3]/div/header/div/div[1]/ul/li[2]/a/text()',
                            '/html/body/div/div[3]/div/header/div/div[1]/ul/li[3]/a/text()'])

{'/html/head/title/text()': 'Redmi 7 (3GB) - Scheda tecnica | AndroidWorld',
 '/html/body/div/div[3]/div/header/div/div[1]/ul/li[1]/a/text()': 'smart',
 '/html/body/div/div[3]/div/header/div/div[1]/ul/li[2]/a/text()': 'mobile',
 '/html/body/div/div[3]/div/header/div/div[1]/ul/li[3]/a/text()': 'android'}

#### get html
Given a list of URLs, returns a dictionary { _url_ : _html page_ }. URLs whose request does not return a successful response are left out of the dictionary

In [12]:
def get_html(list_of_urls, timeout=30):
    """Download the pages at the given URLs.

    Args:
        list_of_urls: URLs to fetch with an HTTP GET.
        timeout: seconds to wait for each server before giving up. Passed to
            ``requests.get``; ``None`` waits indefinitely.

    Returns:
        A dict ``{url: content}`` with the raw body of every URL whose
        response was successful (``response.ok``). Other URLs are omitted.

    Raises:
        requests.exceptions.RequestException: propagated from ``requests.get``
            (connection errors, timeouts, ...).
    """
    result = {}

    for url in list_of_urls:
        # Without a timeout a single unresponsive server would block the
        # whole download forever.
        response = requests.get(url, timeout=timeout)
        if response.ok:
            result[url] = response.content

    return result

#### get_data_structures
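Returns the two data structures needed to compute xpath weights: a dict { _url_ : list of xpaths found on that page } and a dict { _xpath_ : list of the values it extracts, one for each page on which it occurs }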

In [13]:
def get_data_structures(list_of_urls):
    """Build the maps needed to compute xpath weights for a set of pages.

    Args:
        list_of_urls: URLs of the pages to download and analyse.

    Returns:
        A tuple ``(url_to_xpaths, xpath_to_value_list)`` where
        ``url_to_xpaths`` maps each analysed URL to the list of xpaths found
        on its page, and ``xpath_to_value_list`` is a ``defaultdict(list)``
        mapping each xpath to the values it extracts, one per page on which
        it was found. URLs with no usable page are absent from both.
    """
    html_pages = get_html(list_of_urls)

    url_to_xpaths = {}
    xpath_to_value_list = defaultdict(list)

    for url in list_of_urls:
        # get_html leaves out URLs whose response was not successful, and a
        # successful response may still carry an empty body: skip both rather
        # than failing on the lookup or on parsing.
        page = html_pages.get(url)
        if not page:
            continue

        xpath_list = get_all_xpath(page)
        url_to_xpaths[url] = xpath_list

        for xpath, value in xpath_to_value(page, xpath_list).items():
            xpath_to_value_list[xpath].append(value)

    return (url_to_xpaths, xpath_to_value_list)

In [14]:
bash_url_to_xpath_map, bash_xpath_to_values_map = get_data_structures(['http://www.tldp.org/LDP/abs/html/part1.html',
                    'http://www.tldp.org/LDP/abs/html/invoking.html'])

In [15]:
bash_url_to_xpath_map

{'http://www.tldp.org/LDP/abs/html/part1.html': ['/html/head/title/text()',
  '/html/body/div[1]/table/tr[1]/th/text()',
  '/html/body/div[1]/table/tr[2]/td[1]/a/text()',
  '/html/body/div[1]/table/tr[2]/td[3]/a/text()',
  '/html/body/div[2]/div/h1/text()',
  '/html/body/div[2]/div/div[1]/table/tr/td[2]/p[1]/i/text()',
  '/html/body/div[2]/div/div[1]/table/tr/td[2]/p[1]/i/em/text()',
  '/html/body/div[2]/div/div[1]/table/tr/td[2]/p[2]/i/text()',
  '/html/body/div[2]/div/div[1]/table/tr/td[2]/p[2]/i/em/text()',
  '/html/body/div[2]/div/div[1]/p[2]/text()',
  '/html/body/div[2]/div/div[1]/p[2]/i/text()',
  '/html/body/div[2]/div/div[1]/p[2]/span/text()',
  '/html/body/div[2]/div/div[2]/dl/dt[1]/b/text()',
  '/html/body/div[2]/div/div[2]/dl/dt[2]/text()',
  '/html/body/div[2]/div/div[2]/dl/dt[2]/a/text()',
  '/html/body/div[2]/div/div[2]/dl/dt[3]/text()',
  '/html/body/div[2]/div/div[2]/dl/dt[3]/a/text()',
  '/html/body/div[2]/div/div[2]/dl/dd/dl/dt[1]/text()',
  '/html/body/div[2]/div/di

In [16]:
bash_xpath_to_values_map

defaultdict(list,
            {'/html/head/title/text()': ['Introduction',
              'Invoking the script'],
             '/html/body/div[1]/table/tr[1]/th/text()': ['Advanced Bash-Scripting Guide:',
              'Advanced Bash-Scripting Guide:'],
             '/html/body/div[1]/table/tr[2]/td[1]/a/text()': ['Prev', 'Prev'],
             '/html/body/div[1]/table/tr[2]/td[3]/a/text()': ['Next', 'Next'],
             '/html/body/div[2]/div/h1/text()': ['Part 1. Introduction'],
             '/html/body/div[2]/div/div[1]/table/tr/td[2]/p[1]/i/text()': ['Script:'],
             '/html/body/div[2]/div/div[1]/table/tr/td[2]/p[1]/i/em/text()': ['A writing; a written\n        document. [Obs.]'],
             '/html/body/div[2]/div/div[1]/table/tr/td[2]/p[2]/i/text()': ['--, 1913 ed.'],
             '/html/body/div[2]/div/div[1]/table/tr/td[2]/p[2]/i/em/text()': ["Webster's Dictionary"],
             '/html/body/div[2]/div/div[1]/p[2]/text()': ["The shell is a command interpreter. More than

## Compute weights

### Compute frequency
Given the list of values extracted by an xpath _Xi_ (one value for each page on which _Xi_ occurs), returns the frequency of _Xi_, i.e. the number of values in the list

In [17]:
def compute_frequency(values_list):
    """Return the frequency of an xpath: how many values it extracted.

    Args:
        values_list: values extracted by the xpath.

    Returns:
        The number of entries in ``values_list`` (duplicates included).
    """
    occurrences = len(values_list)
    return occurrences

In [18]:
selected_xpaths = ['/html/head/title/text()', 
                   '/html/body/div[1]/table/tr[1]/th/text()', 
                   '/html/body/div[2]/p[4]/b[2]/text()']

compute_frequency(bash_xpath_to_values_map[selected_xpaths[0]]) #should be 2

2

### Compute informativeness
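Given the cluster size _M_ and the list of values extracted by an xpath _Xi_, returns the informativeness of _Xi_: $I(X_i) = 1 - \frac{F(X_i)}{M \cdot |T_i|}$, where $F(X_i)$ is the frequency of _Xi_ and $T_i$ is the set of distinct values it extracts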

In [19]:
def compute_informativeness(M, values_list):
    """Return the informativeness of an xpath within a cluster of pages.

    Computed as ``1 - F / (M * T)``, where ``F`` is the frequency of the
    xpath (number of extracted values) and ``T`` is the number of distinct
    values.

    Args:
        M: cluster size (number of pages in the cluster).
        values_list: values extracted by the xpath; must be hashable.

    Returns:
        The informativeness as a float. An xpath with no values at all
        carries no information and yields ``0.0``.

    Raises:
        ZeroDivisionError: if ``M`` is 0 and ``values_list`` is not empty.
    """
    sum_F_Xi = compute_frequency(values_list)

    # No values means no distinct values either: return early instead of
    # dividing by zero below.
    if sum_F_Xi == 0:
        return 0.0

    Ti = len(set(values_list))

    return 1 - sum_F_Xi/(M*Ti)
    

In [20]:
compute_informativeness(10, [1,2,1,1,3,5,5]) #expected: 0.825

0.825

### xpath weight
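Given the cluster size and the list of values extracted by an xpath _Xi_, returns the weight of _Xi_: $w(X_i) = F(X_i) \cdot I(X_i)$, i.e. its frequency multiplied by its informativeness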

In [21]:
def xpath_weight(cluster_size, list_of_values):
    """Return the weight of an xpath: frequency times informativeness.

    Args:
        cluster_size: number of pages in the cluster.
        list_of_values: values extracted by the xpath.

    Returns:
        ``compute_frequency(list_of_values)`` multiplied by
        ``compute_informativeness(cluster_size, list_of_values)``.
    """
    frequency = compute_frequency(list_of_values)
    informativeness = compute_informativeness(cluster_size, list_of_values)
    return frequency * informativeness

In [22]:
xpath_weight(2, bash_xpath_to_values_map[selected_xpaths[0]]) #should be 1

1.0

In [23]:
xpath_weight(2, bash_xpath_to_values_map[selected_xpaths[1]]) #should be 0

0.0

In [24]:
xpath_weight(2, bash_xpath_to_values_map[selected_xpaths[2]]) #should be 0.5

0.5

### page_weight
Arguments:
- **list of xpath**: list of xpath of a given page
- **xpath_to_values_map**: dictionary where keys are xpath and values are values retrieved from the xpath
- **cluster_size**
- **intersection** (optional): if None, every xpath in **list of xpath** contributes to the weight. Otherwise only the xpaths in **list of xpath** $\cap$ **intersection** are considered when computing the weight

In [50]:
def page_weight(list_of_xpath, xpath_to_values_map, cluster_size, intersection=None):
    """Return the weight of a page: the sum of the weights of its xpaths.

    Args:
        list_of_xpath: xpaths of the page.
        xpath_to_values_map: dict mapping each xpath to the list of values
            it extracts.
        cluster_size: number of pages in the cluster.
        intersection: optional collection of xpaths. When given, only the
            xpaths of the page that also appear in it are counted; when
            ``None``, every xpath of the page is counted.

    Returns:
        The sum of ``xpath_weight`` over the counted xpaths, or ``0`` when
        none is counted. Xpaths with no recorded values contribute nothing.
    """
    # Build the lookup set once: testing membership against a list would make
    # the loop below quadratic.
    allowed = None if intersection is None else set(intersection)

    weight = 0

    for xpath in list_of_xpath:
        if allowed is not None and xpath not in allowed:
            continue

        # .get() rather than indexing: indexing a defaultdict would insert an
        # empty entry for every unknown xpath as a side effect.
        list_of_values = xpath_to_values_map.get(xpath)
        if not list_of_values:
            continue

        weight += xpath_weight(cluster_size, list_of_values)

    return weight

In [51]:
page_weight(selected_xpaths, bash_xpath_to_values_map, 2) # should be 1.5

1.5

In [52]:
page_weight(selected_xpaths[:2], bash_xpath_to_values_map, 2) # should be 1

1.0

In [53]:
page_weight([selected_xpaths[0], selected_xpaths[2]], bash_xpath_to_values_map, 2) # should be 1.5

1.5

In [54]:
page_weight(selected_xpaths, bash_xpath_to_values_map, 2, selected_xpaths) #should be 1.5

1.5

In [56]:
page_weight(selected_xpaths, bash_xpath_to_values_map, 2, []) #should be 0

0

In [57]:
page_weight(selected_xpaths, bash_xpath_to_values_map, 2, selected_xpaths[:2]) #should be 1

1.0

In [58]:
page_weight(selected_xpaths, bash_xpath_to_values_map, 2, [selected_xpaths[2]]) #should be 0.5

0.5

In [59]:
selected_urls = ['http://www.tldp.org/LDP/abs/html/part1.html',
                'http://www.tldp.org/LDP/abs/html/invoking.html']

In [60]:
page_weight(bash_url_to_xpath_map[selected_urls[0]], bash_xpath_to_values_map, 2)

11.5

In [61]:
page_weight(bash_url_to_xpath_map[selected_urls[1]], bash_xpath_to_values_map, 2)

25.0

### Max weight page
Arguments:
- **url_to_xpaths_map**: dictionary where keys are urls and values are xpaths extracted from urls
- **xpath_to_values_map**: dictionary where keys are xpaths and values are values retrieved from the xpath
- **cluster_size**
- **intersection** (optional): if None, all the xpaths of each page contribute to its weight. Otherwise only the xpaths of a page that also appear in **intersection** are considered when computing its weight

In [62]:
def max_weight_page(url_to_xpaths_map, xpath_to_values_map, cluster_size, intersection=None):
    """Return the URL of the page with the highest weight.

    Args:
        url_to_xpaths_map: dict mapping each URL to the xpaths of its page.
        xpath_to_values_map: dict mapping each xpath to the list of values
            it extracts.
        cluster_size: number of pages in the cluster.
        intersection: optional collection of xpaths forwarded to
            ``page_weight`` to restrict which xpaths are counted.

    Returns:
        The URL whose page has the largest ``page_weight``. Ties go to the
        first such URL in iteration order. ``None`` if the map is empty.
    """
    best_url = None
    best_weight = None

    for url, xpaths in url_to_xpaths_map.items():
        weight = page_weight(xpaths, xpath_to_values_map, cluster_size, intersection)

        # The first page always becomes the initial candidate, so a URL is
        # returned even when every page weighs 0.
        if best_weight is None or weight > best_weight:
            best_weight = weight
            best_url = url

    # Return the tracked best page, not the last URL the loop happened to visit.
    return best_url

In [63]:
max_weight_page(bash_url_to_xpath_map, bash_xpath_to_values_map, 2) #expected: invoking.html

'http://www.tldp.org/LDP/abs/html/invoking.html'

## Sampling algorithm

In [69]:
def sampling(list_of_urls, k=20):
    """Greedily pick a sample of pages from a cluster.

    At each step the page with the highest weight, computed only over the
    xpaths not yet covered by the sample, is added to the sample and its
    xpaths are marked as covered.

    Args:
        list_of_urls: URLs of the pages in the cluster.
        k: maximum number of pages in the sample.

    Returns:
        A list of at most ``k`` URLs, in the order they were selected.
        Selection stops early once every xpath is covered.
    """
    cluster_size = len(list_of_urls)

    url_to_xpaths_map, xpath_to_values_map = get_data_structures(list_of_urls)

    X = list(xpath_to_values_map)  # xpaths not yet covered by the sample
    result = []

    # Strictly fewer than k, so the sample never grows beyond k pages.
    while X and url_to_xpaths_map and len(result) < k:
        max_weight_url = max_weight_page(url_to_xpaths_map, xpath_to_values_map, cluster_size, X)
        result.append(max_weight_url)

        # Remove the chosen page from the candidates and mark its xpaths as
        # covered; a set keeps the filtering linear in the size of X.
        covered = set(url_to_xpaths_map.pop(max_weight_url))
        X = [xpath for xpath in X if xpath not in covered]

    return result

In [70]:
sampling(selected_urls)

['http://www.tldp.org/LDP/abs/html/invoking.html',
 'http://www.tldp.org/LDP/abs/html/part1.html']