In [112]:
import os
import random
import re
import sys
import copy
import math

# Damping factor handed to both sample_pagerank and iterate_pagerank by main():
# the probability that the random surfer follows a link on the current page
# rather than jumping to a page chosen from the whole corpus.
DAMPING = 0.85
# Number of samples main() asks sample_pagerank to draw.
SAMPLES = 10000


def main():
    """
    Crawl the hard-coded "corpus2" directory and print the PageRank of
    every page twice: first estimated by sampling, then computed by
    iteration. After each listing the sum of the printed ranks is shown
    as a sanity check.
    """

    def report(title, ranks):
        # Print the heading, one line per page in sorted order, then the
        # running total of the ranks that were printed.
        print(title)
        total = 0
        for name in sorted(ranks):
            print(f"  {name}: {ranks[name]:.4f}")
            total += ranks[name]
        print(f"soma={total}")

    # The corpus directory is fixed here instead of being read from sys.argv.
    corpus = crawl("corpus2")

    report(
        f"PageRank Results from Sampling (n = {SAMPLES})",
        sample_pagerank(corpus, DAMPING, SAMPLES),
    )
    report(
        "PageRank Results from Iteration",
        iterate_pagerank(corpus, DAMPING),
    )


def crawl(directory):
    """
    Scan `directory` for files ending in ".html" and extract the targets
    of their double-quoted `<a ... href="...">` links.

    Return a dictionary mapping each HTML filename to the set of other
    filenames in the same directory that it links to. Links from a page
    to itself and links to anything that is not one of the crawled
    files are dropped.
    """
    href_pattern = r"<a\s+(?:[^>]*?)href=\"([^\"]*)\""
    raw_links = {}

    # First pass: collect every href found in each HTML file.
    for name in os.listdir(directory):
        if name.endswith(".html"):
            with open(os.path.join(directory, name)) as handle:
                html = handle.read()
            targets = set(re.findall(href_pattern, html))
            # A page linking to itself does not count as a link.
            targets.discard(name)
            raw_links[name] = targets

    # Second pass: keep only the links that point at crawled pages.
    return {
        name: {target for target in targets if target in raw_links}
        for name, targets in raw_links.items()
    }


def transition_model(corpus, page, damping_factor):
    """
    Return a probability distribution over which page to visit next,
    given a current page.

    With probability `damping_factor`, choose a link at random
    linked to by `page`. With probability `1 - damping_factor`, choose
    a link at random chosen from all pages in the corpus.

    If `page` has no outgoing links (or is not a key of `corpus`), it is
    treated as linking to every page in the corpus, itself included, so
    the result is the uniform distribution.

    `corpus` maps each page name to the collection of page names it
    links to; the links are expected to be keys of `corpus` as well.

    Returns a dict with one entry per page in `corpus`; the values sum
    to 1 when every link is a key of `corpus`.

    Raises ZeroDivisionError if `corpus` is empty.
    """
    page_count = len(corpus)
    if page_count == 0:
        raise ZeroDivisionError("Given corpus has no pages to process!")

    links = corpus.get(page)

    # A page without links behaves as if it linked to every page, so every
    # page is equally likely. Returning early keeps the total at 1.
    if not links:
        return dict.fromkeys(corpus, 1 / page_count)

    # Share every page receives from the "jump anywhere" case.
    random_share = (1 - damping_factor) / page_count
    # Extra share each linked page receives from the "follow a link" case.
    link_share = damping_factor / len(links)

    return {
        candidate: random_share + link_share if candidate in links else random_share
        for candidate in corpus
    }



def sample_pagerank(corpus, damping_factor, n):
    """
    Return PageRank values for each page by sampling `n` pages
    according to transition model, starting with a page at random.

    Return a dictionary where keys are page names, and values are
    their estimated PageRank value (a value between 0 and 1). All
    PageRank values should sum to 1.

    Raises ValueError if `n` is not a whole number >= 1, or if
    `damping_factor` is not strictly between 0 and 1.
    """
    # A fractional n can never be counted down to exactly zero, so it is
    # rejected up front together with zero and negative values.
    try:
        sample_count = int(n)
    except (TypeError, ValueError, OverflowError):
        raise ValueError(f"'n' must be a whole number >= 1, got {n!r}") from None
    if sample_count != n or sample_count < 1:
        raise ValueError(f"'n' must be a whole number >= 1, got {n!r}")

    if not 0 < damping_factor < 1.0:
        raise ValueError(
            "'damping_factor' must be strictly between 0 and 1, "
            f"got {damping_factor!r}"
        )

    pages = list(corpus)
    # Number of times each page was visited; only the count is needed.
    visits = dict.fromkeys(pages, 0)

    # The first sample is a page chosen uniformly at random.
    current_page = random.choices(pages)[0]
    visits[current_page] += 1

    # Every further sample is drawn from the transition model of the
    # previous one.
    for _ in range(sample_count - 1):
        model = transition_model(corpus, current_page, damping_factor)
        current_page = random.choices(
            list(model.keys()), weights=list(model.values())
        )[0]
        visits[current_page] += 1

    # The visit counts add up to sample_count, so the fractions add up to 1.
    return {page: count / sample_count for page, count in visits.items()}


def iterate_pagerank(corpus, damping_factor, threshold=0.001):
    """
    Return PageRank values for each page by iteratively updating
    PageRank values until convergence.

    Every page starts at rank 1 / N. On each pass the rank of a page p
    is recomputed from the ranks of the previous pass as

        (1 - damping_factor) / N
            + damping_factor * sum(rank(i) / num_links(i))

    where i ranges over the pages linking to p. A page with no links is
    treated as linking to every page in the corpus, itself included.
    Iteration stops once no rank changed by more than `threshold`
    during a whole pass.

    Return a dictionary where keys are page names, and values are
    their estimated PageRank value (a value between 0 and 1). All
    PageRank values should sum to 1. An empty corpus yields an empty
    dictionary. `corpus` itself is not modified.
    """
    page_count = len(corpus)
    if page_count == 0:
        return {}

    every_page = set(corpus)

    # Outgoing links per page, with link-less pages pointing everywhere.
    out_links = {
        page: (set(links) if links else every_page)
        for page, links in corpus.items()
    }

    # Reverse index: for each page, the pages that link to it.
    incoming = {page: set() for page in corpus}
    for page, links in out_links.items():
        for target in links:
            incoming[target].add(page)

    base_rank = (1 - damping_factor) / page_count
    ranks = dict.fromkeys(corpus, 1 / page_count)

    while True:
        # Build the new ranks purely from the previous pass so the result
        # does not depend on the order in which pages are visited.
        updated = {
            page: base_rank + damping_factor * sum(
                ranks[source] / len(out_links[source])
                for source in incoming[page]
            )
            for page in corpus
        }

        # Convergence is judged over the whole pass: every page must be
        # stable at the same time.
        converged = all(
            abs(updated[page] - ranks[page]) <= threshold for page in corpus
        )
        ranks = updated
        if converged:
            break

    # Rescale to absorb floating-point drift so the ranks sum to 1.
    total = sum(ranks.values())
    return {page: rank / total for page, rank in ranks.items()}


# Run the demo only when this module is executed directly, not when imported.
if __name__ == "__main__":
    main()


PageRank Results from Sampling (n = 10000)
  ai.html: 0.1968
  algorithms.html: 0.1090
  c.html: 0.1211
  inference.html: 0.1344
  logic.html: 0.0266
  programming.html: 0.2210
  python.html: 0.1169
  recursion.html: 0.0742
soma=1.0
PageRank Results from Iteration
  ai.html: 0.1888
  algorithms.html: 0.1065
  c.html: 0.1243
  inference.html: 0.1288
  logic.html: 0.0262
  programming.html: 0.2300
  python.html: 0.1240
  recursion.html: 0.0715
soma=1.0
