<a href="https://colab.research.google.com/github/lucapas/VERTEX/blob/master/sampling_algorithm_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Sampling algorithm
Implementation of Vertex sampling algorithm described in [_Web-scale information extraction with vertex_](https://ieeexplore.ieee.org/abstract/document/5767842)

In [0]:
import requests
from lxml import html
from collections import defaultdict

## Get all xpath

In [0]:
def get_all_xpath(html_src):
    
    # select nodes whose children include text nodes
    XPATH_SELECTOR = "//*[child::text()]" 
        
    root = html.fromstring(html_src)
    
    tree = root.getroottree()
    
    # leaf_nodes is not properly a list of all leaf nodes. 
    # It contains nodes which are parent of text elements in the DOM
    leaf_nodes = root.xpath(XPATH_SELECTOR)
    
    xpath_list = []
    
    # extract xpath from previously selected nodes and filter out "noisy" nodes
    for leaf in leaf_nodes:
        
        xpath = tree.getpath(leaf) + "/text()"
        
        # Filtering out xpaths which extract javascript code or css stylesheet
        if  "/script" not in xpath and "/noscript" not in xpath and "/style" not in xpath:
                        
            selected_values = root.xpath(xpath)
            selected_string = ''.join(selected_values).strip()
            
            # Filtering out xpaths which extract empty strings
            if selected_string:
                xpath_list.append(xpath)

    return xpath_list    

## Compute necessary data structures
### Utility functions
#### Get xpath value
Returns xpath selected value from a given page

In [0]:
def get_xpath_value(html_src, xpath):
    
    root = html.fromstring(html_src)
    selected_values = root.xpath(xpath)
    selected_string = ''.join(selected_values).strip()
    
    return selected_string

#### xpath to value
Given page source code _src_ and a list of xpath l returns a dict { _xpath_ : _value_ }, where _xpath_ is an xpath and _value_ is the string retrieved from the xpath on _src_

In [0]:
def xpath_to_value(html_src, xpath_list):
    
    result = {}
    
    for xpath in xpath_list:
        value = get_xpath_value(html_src, xpath)
        result.update({xpath: value})
        
    return result

#### get html
Given a list of URLs, returns a dictionary _url_, _html page_

In [0]:
def get_html(list_of_urls):
    
    result = {}
    
    for url in list_of_urls:
        r = requests.get(url)
        if r.ok:
            result[url] = r.content
            
    return result

#### get_data_structures
Return necessary data structures for computing xpaths weights

In [0]:
def get_data_structures(list_of_urls):
    
    html_pages = get_html(list_of_urls)
    
    url_to_xpaths = {}
    xpath_to_value_list = defaultdict(list)
    
    for url in list_of_urls:
        page = html_pages[url]
        xpath_list = get_all_xpath(page)
        
        url_to_xpaths[url] = xpath_list
        
        xpath_to_single_value = xpath_to_value(page, xpath_list)
        
        for xpath in xpath_to_single_value:
            value = xpath_to_single_value[xpath]
            xpath_to_value_list[xpath].append(value)
    
    return (url_to_xpaths, xpath_to_value_list)

## Compute weights

### Compute frequency
Given a list of values extracted from a xpath _Xi_ returns the frequency of _Xi_

In [0]:
def compute_frequency(values_list):
    return len(values_list)

### Compute informativeness
Given cluster size and a list of values extracted from a xpath _Xi_ returns the informativeness of _Xi_

In [0]:
def compute_informativeness(M, values_list):

    values_set = set(values_list)
    Ti = len(values_set)
    
    sum_F_Xi = compute_frequency(values_list)

    return 1 - sum_F_Xi/(M*Ti)
    

### xpath weight
Given a list of values extracted from a xpath _Xi_ returns the weight of _Xi_

In [0]:
def xpath_weight(cluster_size, list_of_values):
    return compute_frequency(list_of_values)*compute_informativeness(cluster_size, list_of_values)

### calcola_pesi
Arguments:
- **xpath_to_values_map**: list of xpath of a given page
- **list_of_urls**: list of uri


In [0]:
def F(xpath,second_data):
  return len(second_data.get(xpath))  
   

def I(xpath,second_data,list_of_urls):
  return 1 - ( F(xpath,second_data) / ( len(list_of_urls) * len(set(second_data.get(xpath))) ) )

def w(xpath,second_data,list_of_urls):
  return F(xpath,second_data)*I(xpath,second_data,list_of_urls)



def calcola_pesi(xpath_to_values_map,list_of_urls):
  w_list_xpath={}
  for xpath in xpath_to_values_map:
    w_list_xpath.update({xpath: w(xpath,xpath_to_values_map,list_of_urls)})
  return w_list_xpath
  


### Max weight page
Arguments:
- **list_of_urls**: dictionary where keys are urls and values are xpaths extracted from urls
- **xpath_to_values_map**: dictionary where keys are xpaths and values are values retrieved from the xpath
- **list_xpath_weight**: xpath weight's


In [0]:
def max_weight_page(list_of_urls, xpath_to_values_map, list_xpath_weight):
  massimo_url=''
  massimo=0

  for uri in list_of_urls:
    peso_pagina=0
    for xpath in xpath_to_values_map[uri]:
      w=list_xpath_weight.get(xpath)
      if bool(w):
        peso_pagina=peso_pagina+w 
        
    if massimo<peso_pagina:
      massimo=peso_pagina
      massimo_url=uri
      
  return massimo_url


In [0]:
def del_xpath(uri,list_xpath_weight,url_to_xpaths_map):
  for xpath in url_to_xpaths_map[uri]:
    if bool(list_xpath_weight.get(xpath)):
      list_xpath_weight.pop(xpath)

In [0]:
def page_weight(list_of_xpath, xpath_to_values_map, cluster_size, intersection = None):

    weight = 0
    
    if intersection is None:
        intersection = list_of_xpath
        
    for xpath in list_of_xpath:
        if xpath in intersection:
            list_of_values = xpath_to_values_map[xpath]
            weight_of_xpath = xpath_weight(cluster_size, list_of_values)
            weight += weight_of_xpath
    return weight

In [0]:
def coverage(X, sample_pages_urls, cluster_pages_urls, url_to_xpaths_map, xpath_to_values_map):
    covered = 0
    cluster_size = len(cluster_pages_urls)
    for url in cluster_pages_urls:
        if url not in sample_pages_urls:
            xpaths = url_to_xpaths_map[url]
            weight = page_weight(xpaths, xpath_to_values_map, cluster_size, X)
            if weight == 0:
                covered = covered + 1
    
    return (covered + len(sample_pages_urls))/cluster_size

## Sampling algorithm

In [0]:
def sampling(list_of_urls, k = 20):

    cluster_size = len(list_of_urls)
    url_to_xpaths_map, xpath_to_values_map = get_data_structures(list_of_urls)

    list_xpath_weight = calcola_pesi(xpath_to_values_map,list_of_urls)

    #X = list(xpath_to_values_map) #insert dictionary keys into a list
    result = []
    covered_page=0
    
    while list_xpath_weight and len(result) <= k:
        max_weight_url = max_weight_page(list_of_urls, url_to_xpaths_map, list_xpath_weight)
        result.append(max_weight_url)
        del_xpath(max_weight_url,list_xpath_weight,url_to_xpaths_map)
        
        coverage_value = coverage(list_xpath_weight, result, list_of_urls, url_to_xpaths_map, xpath_to_values_map)
        print(coverage_value)
    return result,coverage_value

In [0]:
#def sampling(url_to_html_map, k = 20):
#    
#   cluster_size = len(url_to_html_map)
#   
#   url_to_xpaths_map, xpath_to_values_map = get_data_structures(url_to_html_map)
#    
#    X = list(xpath_to_values_map) #insert dictionary keys into a list
#    result = []
#    
#    iteration_no = 1
#    
#    while X and len(result) <= k:
#        max_weight_url = max_weight_page(url_to_xpaths_map, xpath_to_values_map, cluster_size, X)
#        result.append(max_weight_url)
#        X = [xpath for xpath in X if xpath not in url_to_xpaths_map[max_weight_url]]
#        url_to_xpaths_map.pop(max_weight_url)
#        print("-------------------")
#        print("Iteration {}".format(iteration_no))
#        coverage_value = coverage(X, result, list(url_to_html_map), url_to_xpaths_map, xpath_to_values_map)
#        print("Coverage is {}".format(coverage_value))
#        print("-------------------")
#        print("-------------------")
#        iteration_no = iteration_no +1


#    return result,coverage_value

In [0]:
#mettere k=1 se no non funziona
list_uri=["https://www.androidworld.it/schede/redmi-7-2/", "https://www.androidworld.it/schede/samsung-galaxy-a70/"]
#sampling(list_uri)

In [18]:
from lxml import html
import requests
url = "http://www.europarl.europa.eu/news/en/press-room/page/"
list_of_links = []
for page in range(2):
    r = requests.get(url + str(page))
    source = r.content
    page_source = html.fromstring(source)
    list_of_links.extend(page_source.xpath('//a[@title="Read more"]/@href'))
print(list_of_links)

['http://www.europarl.europa.eu/news/en/press-room/20190404IPR35103/eu-member-states-test-cybersecurity-preparedness-for-free-and-fair-eu-elections', 'http://www.europarl.europa.eu/news/en/press-room/20190405IPR35201/the-european-parliament-launches-a-website-on-european-election-results', 'http://www.europarl.europa.eu/news/en/press-room/20190402IPR34671/mobility-package-parliament-adopts-position-on-overhaul-of-road-transport-rules', 'http://www.europarl.europa.eu/news/en/press-room/20190402IPR34670/meps-adopted-measures-to-reconcile-work-and-family-life', 'http://www.europarl.europa.eu/news/en/press-room/20190402IPR34682/meps-back-first-eu-management-plan-for-fish-stocks-in-the-western-mediterranean', 'http://www.europarl.europa.eu/news/en/press-room/20190402IPR34683/schengen-meps-adopt-their-position-on-temporary-checks-at-national-borders', 'http://www.europarl.europa.eu/news/en/press-room/20190402IPR34673/natural-gas-parliament-extends-eu-rules-to-pipelines-from-non-eu-countries'

In [19]:
sampling(list_of_links)

0.03333333333333333
0.1
0.13333333333333333
0.26666666666666666
0.3
0.3333333333333333
0.36666666666666664
0.4
0.5333333333333333
0.6
0.6333333333333333
0.6666666666666666
0.7
0.7333333333333333
0.7666666666666667
0.8
0.8666666666666667
0.9
0.9333333333333333
0.9666666666666667
1.0


(['http://www.europarl.europa.eu/news/en/press-room/20190321IPR32114/meps-approve-eu-s-spending-in-2017',
  'http://www.europarl.europa.eu/news/en/press-room/20190321IPR32118/venezuela-meps-demand-free-presidential-elections-and-an-end-to-repression',
  'http://www.europarl.europa.eu/news/en/press-room/20190402IPR34671/mobility-package-parliament-adopts-position-on-overhaul-of-road-transport-rules',
  'http://www.europarl.europa.eu/news/en/press-room/20190111IPR23225/questions-and-answers-on-issues-about-the-digital-copyright-directive',
  'http://www.europarl.europa.eu/news/en/press-room/20190405IPR35201/the-european-parliament-launches-a-website-on-european-election-results',
  'http://www.europarl.europa.eu/news/en/press-room/20190401IPR34530/fairer-simpler-more-flexible-eu-farm-policy-meps-vote-on-post-2020-reform',
  'http://www.europarl.europa.eu/news/en/press-room/20190321IPR32113/mobility-package-postponed',
  'http://www.europarl.europa.eu/news/en/press-room/20190321IPR32111/p

In [0]:
len(list_of_links)

In [20]:
from lxml import html
import requests
url = "http://www.europarl.europa.eu/news/en/press-room/page/"
list_of_links = []
for page in range(20):
    r = requests.get(url + str(page))
    source = r.content
    page_source = html.fromstring(source)
    list_of_links.extend(page_source.xpath('//a[@title="Read more"]/@href'))
print(list_of_links)

['http://www.europarl.europa.eu/news/en/press-room/20190404IPR35103/eu-member-states-test-cybersecurity-preparedness-for-free-and-fair-eu-elections', 'http://www.europarl.europa.eu/news/en/press-room/20190405IPR35201/the-european-parliament-launches-a-website-on-european-election-results', 'http://www.europarl.europa.eu/news/en/press-room/20190402IPR34671/mobility-package-parliament-adopts-position-on-overhaul-of-road-transport-rules', 'http://www.europarl.europa.eu/news/en/press-room/20190402IPR34670/meps-adopted-measures-to-reconcile-work-and-family-life', 'http://www.europarl.europa.eu/news/en/press-room/20190402IPR34682/meps-back-first-eu-management-plan-for-fish-stocks-in-the-western-mediterranean', 'http://www.europarl.europa.eu/news/en/press-room/20190402IPR34683/schengen-meps-adopt-their-position-on-temporary-checks-at-national-borders', 'http://www.europarl.europa.eu/news/en/press-room/20190402IPR34673/natural-gas-parliament-extends-eu-rules-to-pipelines-from-non-eu-countries'

In [21]:
sampling(list_of_links)

0.0033333333333333335
0.016666666666666666
0.023333333333333334
0.03333333333333333
0.05
0.05333333333333334
0.056666666666666664
0.1
0.18666666666666668
0.21666666666666667
0.28
0.2866666666666667
0.33666666666666667
0.3566666666666667
0.36333333333333334
0.37333333333333335
0.39
0.3933333333333333
0.4066666666666667
0.41
0.43333333333333335


(['http://www.europarl.europa.eu/news/en/press-room/20190321IPR32135/new-rules-to-help-consumers-join-forces-to-seek-compensation',
  'http://www.europarl.europa.eu/news/en/press-room/20190307IPR30738/uk-must-make-clear-what-it-wants-meps-say-in-brexit-debate',
  'http://www.europarl.europa.eu/news/en/press-room/20190220IPR27656/safer-roads-more-life-saving-technology-to-be-mandatory-in-vehicles',
  'http://www.europarl.europa.eu/news/en/press-room/20181116IPR19218/european-parliament-to-host-human-rights-week',
  'http://www.europarl.europa.eu/news/en/press-room/20181205IPR20934/meps-want-to-fund-crucial-areas-to-stimulate-european-growth',
  'http://www.europarl.europa.eu/news/en/press-room/20181017BKG16357/european-parliament-press-kit-for-the-european-council-of-17-and-18-october-2018',
  'http://www.europarl.europa.eu/news/en/press-room/20190121IPR23915/consumers-rights-against-defective-digital-content-agreed-by-eu-lawmakers',
  'http://www.europarl.europa.eu/news/en/press-room/2

In [22]:
len(list_of_links)

300

In [24]:
from lxml import html
import requests
url = "http://www.europarl.europa.eu/news/en/press-room/page/"
list_of_links = []
for page in range(50):
    r = requests.get(url + str(page))
    source = r.content
    page_source = html.fromstring(source)
    list_of_links.extend(page_source.xpath('//a[@title="Read more"]/@href'))
print(list_of_links)

['http://www.europarl.europa.eu/news/en/press-room/20190404IPR35103/eu-member-states-test-cybersecurity-preparedness-for-free-and-fair-eu-elections', 'http://www.europarl.europa.eu/news/en/press-room/20190405IPR35201/the-european-parliament-launches-a-website-on-european-election-results', 'http://www.europarl.europa.eu/news/en/press-room/20190402IPR34671/mobility-package-parliament-adopts-position-on-overhaul-of-road-transport-rules', 'http://www.europarl.europa.eu/news/en/press-room/20190402IPR34670/meps-adopted-measures-to-reconcile-work-and-family-life', 'http://www.europarl.europa.eu/news/en/press-room/20190402IPR34682/meps-back-first-eu-management-plan-for-fish-stocks-in-the-western-mediterranean', 'http://www.europarl.europa.eu/news/en/press-room/20190402IPR34683/schengen-meps-adopt-their-position-on-temporary-checks-at-national-borders', 'http://www.europarl.europa.eu/news/en/press-room/20190402IPR34673/natural-gas-parliament-extends-eu-rules-to-pipelines-from-non-eu-countries'

In [25]:
sampling(list_of_links)

0.0026666666666666666
0.005333333333333333
0.012
0.02
0.021333333333333333
0.02266666666666667
0.036
0.037333333333333336
0.08266666666666667
0.10933333333333334
0.13333333333333333
0.18266666666666667
0.18933333333333333
0.19733333333333333
0.21866666666666668
0.22533333333333333
0.24
0.252
0.25733333333333336
0.284
0.2946666666666667


(['http://www.europarl.europa.eu/news/en/press-room/20180607IPR05244/parliament-votes-for-EU1-billion-in-aid-to-ukraine',
  'http://www.europarl.europa.eu/news/en/press-room/20171127IPR88936/eu-budget-2018-approved-support-for-youth-growth-security',
  'http://www.europarl.europa.eu/news/en/press-room/20180412IPR01606/meps-urge-facebook-ceo-to-come-to-european-parliament',
  'http://www.europarl.europa.eu/news/en/press-room/20190307IPR30738/uk-must-make-clear-what-it-wants-meps-say-in-brexit-debate',
  'http://www.europarl.europa.eu/news/en/press-room/20180214IPR97814/european-parliamentary-week-digital-economy-taxes-and-future-of-work',
  'http://www.europarl.europa.eu/news/en/press-room/20181017BKG16357/european-parliament-press-kit-for-the-european-council-of-17-and-18-october-2018',
  'http://www.europarl.europa.eu/news/en/press-room/20190111IPR23225/questions-and-answers-on-issues-about-the-digital-copyright-directive',
  'http://www.europarl.europa.eu/news/en/press-room/20180524I

In [26]:
len(list_of_links)

750