In [4]:
import os
import random
import re
import sys

def crawl(directory):
    """
    Build a link graph from the HTML files found directly in `directory`.

    Every file whose name ends in ".html" becomes a key. Its value is the
    set of href targets found in that file's <a> tags, excluding the file
    itself and excluding any target that is not also a key of the result.
    """
    href_pattern = r"<a\s+(?:[^>]*?)href=\"([^\"]*)\""
    html_names = [
        name for name in os.listdir(directory) if name.endswith(".html")
    ]

    # First pass: collect every href per page, dropping self-links.
    raw_links = {}
    for name in html_names:
        with open(os.path.join(directory, name)) as handle:
            markup = handle.read()
        raw_links[name] = set(re.findall(href_pattern, markup)) - {name}

    # Second pass: keep only targets that are themselves crawled pages.
    return {
        name: {target for target in targets if target in raw_links}
        for name, targets in raw_links.items()
    }

In [65]:
def transition_model(corpus, page, damping_factor):
    """
    Return a probability distribution over which page to visit next,
    given a current page.

    With probability `damping_factor`, choose a link at random
    linked to by `page`. With probability `1 - damping_factor`, choose
    a page at random from all pages in the corpus. If `page` has no
    outgoing links, every page in the corpus is equally likely.

    Args:
        corpus: dict mapping each page name to the collection of page
            names it links to.
        page: the current page; must be a key of `corpus`.
        damping_factor: probability of following one of `page`'s links.

    Returns:
        dict mapping every page in `corpus` to its transition
        probability. Values are left unrounded so they sum to 1 up to
        floating-point error.

    Raises:
        KeyError: if `page` is not in `corpus`, or if `page` links to a
            name that is not a key of `corpus`.
    """
    num_pages = len(corpus)
    links = corpus[page]

    # A page without outgoing links behaves as if it linked to every page.
    if not links:
        return {corpus_page: 1 / num_pages for corpus_page in corpus}

    # Every page gets the random-jump share; linked pages get an extra share.
    model = {
        corpus_page: (1 - damping_factor) / num_pages for corpus_page in corpus
    }
    link_share = damping_factor / len(links)
    for linked_page in links:
        model[linked_page] += link_share
    return model

In [214]:
def sample_pagerank(corpus, damping_factor, n):
    """
    Return PageRank values for each page by sampling `n` pages
    according to transition model, starting with a page at random.

    Args:
        corpus: dict mapping each page name to the pages it links to.
        damping_factor: passed through to `transition_model`.
        n: total number of samples to draw, including the random
            starting page; must be at least 1.

    Returns:
        dict mapping every page in `corpus` to the fraction of samples
        that landed on it. Fractions are left unrounded so they sum to 1
        up to floating-point error.

    Raises:
        ValueError: if `n` is less than 1.
    """
    if n < 1:
        raise ValueError("n must be at least 1")

    # Tally visits as we go instead of storing every sample and re-counting.
    visits = {corpus_page: 0 for corpus_page in corpus}

    current = random.choice(list(corpus))
    visits[current] += 1

    drawn = 1
    while drawn < n:
        drawn += 1
        model = transition_model(corpus, current, damping_factor)
        # random.choices returns a list; take its single element.
        current = random.choices(list(model), weights=list(model.values()))[0]
        visits[current] += 1

    return {corpus_page: count / n for corpus_page, count in visits.items()}

In [324]:
def iterate_pagerank(corpus, damping_factor, tolerance=0.001):
    """
    Return PageRank values for each page by iteratively updating
    PageRank values until convergence.

    Each pass recomputes every rank from the previous pass's ranks:
    a base share of `(1 - damping_factor) / N`, plus `damping_factor`
    times the rank passed on by each page that links to it. A page with
    no outgoing links spreads its rank evenly over all N pages.

    Args:
        corpus: dict mapping each page name to the set of page names it
            links to.
        damping_factor: weight given to rank received through links.
        tolerance: iteration stops once no rank changes by more than
            this amount (in absolute value) between two passes.

    Returns:
        dict mapping every page in `corpus` to its estimated PageRank.
        Values are left unrounded; when every link target is a page in
        `corpus` they sum to 1 up to floating-point error. An empty
        corpus yields an empty dict.
    """
    num_pages = len(corpus)
    if num_pages == 0:
        return {}

    ranks = {corpus_page: 1 / num_pages for corpus_page in corpus}
    base_rank = (1 - damping_factor) / num_pages

    while True:
        # Rank held by link-less pages is shared equally by every page.
        dangling_rank = sum(
            ranks[source] for source, links in corpus.items() if not links
        )
        start_value = base_rank + damping_factor * dangling_rank / num_pages
        new_ranks = dict.fromkeys(corpus, start_value)

        # Push each linking page's rank to its targets (one pass over links).
        for source, links in corpus.items():
            if not links:
                continue
            share = damping_factor * ranks[source] / len(links)
            for target in links:
                # Targets outside the corpus receive nothing.
                if target in new_ranks:
                    new_ranks[target] += share

        # Compare absolute changes: a rank that jumps upward is not converged.
        converged = all(
            abs(new_ranks[corpus_page] - ranks[corpus_page]) <= tolerance
            for corpus_page in corpus
        )
        ranks = new_ranks
        if converged:
            return ranks


In [325]:
# Hand-built three-page corpus for smoke-testing iterate_pagerank:
# page 1 links to pages 2 and 3, while pages 2 and 3 link to each other.
test_corpus = {"1.html": {"2.html", "3.html"}, "2.html": {"3.html"}, "3.html": {"2.html"}}
test_damping = 0.85
test_page = "1.html"  # not used by the call below
test_sample = iterate_pagerank(test_corpus,test_damping)

In [326]:
# Display whatever test_sample currently holds in the kernel.
test_sample

{'1.html': 0.05, '2.html': 0.475, '3.html': 0.475}

In [80]:
# Scratch check: list(dict) yields the keys, so this picks one page name
# uniformly at random. The result varies between runs (no seed is set here).
random.choice(list(test_corpus))

'2.html'

In [49]:
# Scratch check: round() to 5 places leaves a value with fewer decimals unchanged.
round(45.23,5)

45.23

In [54]:
# Scratch check: number of outgoing links of test_page.
len(test_corpus[test_page])

2

In [81]:
# Scratch check: converting a dict to a list gives its keys (the page names).
list(test_corpus)

['1.html', '2.html', '3.html']

In [89]:
# Scratch check: index -1 returns the most recently appended element.
# NOTE(review): this rebinds the global name `samples`, which a later
# scratch cell appends to.
samples =["a","b", "c"]
samples[-1]

'c'

In [108]:
# NOTE(review): test_dic is not assigned in any visible cell, so this relies
# on leftover kernel state. Presumably it was a transition_model result for
# "1.html"; confirm or define it explicitly.
test_dic

{'1.html': 0.05, '2.html': 0.475, '3.html': 0.475}

In [163]:
# Scratch check: weighted draw of one key, using the dict's values as weights.
# random.choices returns a list, so callers need [0] to get the element.
# NOTE(review): depends on test_dic, which no visible cell defines.
random.choices(list(test_dic.keys()), test_dic.values())

['3.html']

In [164]:
# Same fixture values as the cell that calls iterate_pagerank; re-running
# this just rebinds the three names.
# NOTE(review): the NameError recorded beneath this cell mentions `count`,
# which this code never references, so that output looks stale.
test_corpus = {"1.html": {"2.html", "3.html"}, "2.html": {"3.html"}, "3.html": {"2.html"}}
test_damping = 0.85
test_page = "1.html"


NameError: name 'count' is not defined

In [182]:
# Scratch copy of the sampling step inside sample_pagerank: draw one page
# using model's values as weights and append it to samples.
# NOTE(review): `model` is not defined at top level in any visible cell, so
# this relies on kernel state.
samples.append(random.choices(list(model),model.values())[0])

In [265]:
# Print the fixture corpus and the current contents of test_sample.
print(test_corpus)
print(test_sample)


{'1.html': {'2.html', '3.html'}, '2.html': {'3.html'}, '3.html': {'2.html'}}
{'1.html': 0.33333, '2.html': 0.33333, '3.html': 0.33333}


In [266]:
# Scratch copy of iterate_pagerank's initialisation: every page starts with
# an equal rank of 1/N, rounded to 5 decimals.
ranks = dict()
num_pages = len(test_corpus)
for corpus_page in test_corpus:
    ranks[corpus_page] = round(1/num_pages,5)

In [277]:
# Scratch check of the convergence expression. Because ranks is zipped with
# itself, every difference is 0 and the result is always True.
# NOTE(review): the comparison uses a signed difference (no abs()), so a
# negative difference of any size would also pass.
all([i-j <= 0.001 for i,j in zip(ranks.values(),ranks.values())])

True

In [258]:
# Nested comprehension: compares every element of `a` against every element
# of `a` (all ordered pairs).
# NOTE(review): `a` is not defined in any visible cell. A later output shows
# it as a dict, in which case iteration yields its keys, not its values.
[(value>value2) for value in a for value2 in a]

[False, False, True, False]

In [250]:
# Precedence gotcha: commas bind looser than `in`, so this is the tuple
# (1, (2 in a), a), not a membership test of (1, 2) in (a, a).
1,2 in a,a

(1, False, {'1': 1, '2': 2})

In [263]:
# Prints each pair produced by zipping `a` with itself. The displayed list of
# None values is just print()'s return value collected by the comprehension.
[print(i) for i in zip(a,a)]

('1', '1')
('2', '2')


[None, None]

In [281]:
# Self-assignment: rebinds the name to the same object, so nothing changes.
test_sample = test_sample

In [282]:
# Plain assignment creates a second name for the same dict, not a copy.
test_sample2 = test_sample

In [286]:
# Mutating through the alias also changes test_sample (same underlying dict).
test_sample2["1.html"] = 40

In [292]:
# Display test_sample to see the effect of writing through test_sample2.
test_sample

{'1.html': 40, '2.html': 0.33333, '3.html': 0.33333}

In [288]:
# dict.copy() creates a new dict, so later key assignments on test_sample3
# do not affect test_sample.
test_sample3 = test_sample.copy()

In [291]:
# Display the copy.
# NOTE(review): the recorded output shows a value of 20 for "1.html", which
# no visible cell assigns; presumably it was set in an edited or deleted cell.
test_sample3

{'1.html': 20, '2.html': 0.33333, '3.html': 0.33333}

In [302]:
# Shallow copy: `test` is a new dict, but its values are the same set objects
# that test_corpus holds.
test = test_corpus.copy()

In [305]:
# Scratch check: number of outgoing links of "1.html" in the copied corpus.
len(test["1.html"])

2