# Setup

In [None]:
# Kaggle API credentials: create a token as described at https://www.kaggle.com/docs/api
# and substitute the two <...> placeholders below. Do not share the notebook with a real token in it.
# -p keeps the cell re-runnable when the directory already exists.
!mkdir -p /root/.kaggle/
! echo '{"username":"<kaggle_username>","key":"<Kaggle_api_token>"}' > /root/.kaggle/kaggle.json
# Restrict the token file to its owner.
!chmod 600 /root/.kaggle/kaggle.json

In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install selectolax networkx

In [None]:
import pandas as pd
import json
import kaggle
from tqdm import tqdm
from time import time
from selectolax.parser import HTMLParser
import networkx as nx

# Download Collection

In [3]:
# Fetch the collection archive from Kaggle into /content/ and unzip it there.
kaggle.api.dataset_download_files(
    'html-br-collection',
    path='/content/',
    unzip=True,
    quiet=False,
)

  0%|          | 5.00M/16.0G [00:00<06:27, 44.3MB/s]

Downloading html-br-collection.zip to /content


100%|██████████| 16.0G/16.0G [03:11<00:00, 89.5MB/s]





# Parser Methods


In [4]:
def get_text_selectolax(html):
    """
    Count the whitespace-separated words in the body text of an HTML page.

    The page is parsed with selectolax's HTMLParser. Every <script> and
    <style> element is removed from the tree before the text is extracted.

    :param html: html content
    :return: the number of words in the body text, or -1 when the parsed
        document has no body.
    """
    document = HTMLParser(html)
    if document.body is None:
        return -1

    # Remove script and style elements so their contents are not counted as words.
    for selector in ('script', 'style'):
        for node in document.css(selector):
            node.decompose()

    words = document.body.text(separator=' ').split()
    return len(words)


def get_url_size(url):
    """
    Measure a url by its "/"-separated pieces.

    :param url: url string
    :return: the number of pieces obtained by splitting url on "/", minus 2
        (equivalently, the number of "/" characters minus 1).
    """
    return url.count("/") - 1

def extract_urls(html_content):
    """
    Collect the href value of every <a> tag in an HTML document.

    Values are returned exactly as stored in the attribute; no resolution
    or filtering is done here. Anchors without an href attribute are skipped.

    :param html_content: html content
    :return: list of href values, in the order the anchors are returned by the parser.
    """
    document = HTMLParser(html_content)
    # Read each anchor's attribute mapping once, then keep those that carry an href.
    attr_maps = (anchor.attributes for anchor in document.tags('a'))
    return [attrs['href'] for attrs in attr_maps if 'href' in attrs]

# PageRank

In [5]:
# create collection graph G(V,E): this pass registers the url of every collected page as a node
g = nx.DiGraph()
# The JSON Lines file is decoded as UTF-8 explicitly instead of relying on the platform default.
with open("/content/collection.jl", "r", encoding="utf-8") as collection_file:
    # total is the expected number of lines; it only sizes the progress bar
    for line in tqdm(collection_file, total=1000068):
        data = json.loads(line)
        g.add_node(data["url"])


100%|██████████| 1000068/1000068 [18:29<00:00, 900.98it/s]


In [6]:
def update_graph(u, v):
    """
    Record one link from u to v in the module-level graph g.

    Nothing happens when v is not already a node of g. Otherwise the
    "weight" of an existing u -> v edge is incremented by one, or the edge
    is created with weight 1 if it does not exist yet.

    :param u: source node
    :param v: target node
    """
    if not g.has_node(v):
        return
    if not g.has_edge(u, v):
        g.add_edge(u, v, weight=1)
        return
    g[u][v]["weight"] += 1


# Edge pass: for every page, hand each extracted href to update_graph, which only
# records links whose target is already a node of g.
# NOTE(review): hrefs are passed on verbatim, so a link is recorded only when the href
# string equals a node url exactly — confirm whether relative links should be resolved first.
# The JSON Lines file is decoded as UTF-8 explicitly instead of relying on the platform default.
with open("/content/collection.jl", "r", encoding="utf-8") as collection_file:
    # total is the expected number of lines; it only sizes the progress bar
    for line in tqdm(collection_file, total=1000068):
        data = json.loads(line)
        source_url = data["url"]
        for target_url in extract_urls(data["html_content"]):
            update_graph(source_url, target_url)



100%|██████████| 1000068/1000068 [50:02<00:00, 333.10it/s]


In [None]:
# Compute PageRank over the graph g; pr is iterated below via pr.items() as (url, score) pairs.
# NOTE(review): edges in g carry a "weight" attribute (link count) — presumably nx.pagerank
# takes it into account by default; confirm that a weighted ranking is intended.
pr = nx.pagerank(g)

In [None]:
# Turn the pr mapping into one {"url", "pagerank"} record per entry.
prs = [
    {"url": url, "pagerank": pagerank}
    for url, pagerank in pr.items()
]

In [None]:
# Build a DataFrame from the prs records and show it as the cell output.
pr_df = pd.DataFrame(data=prs)
pr_df