diff --git a/nlp.py b/nlp.py index 365d726c2..bd26d0a7b 100644 --- a/nlp.py +++ b/nlp.py @@ -356,13 +356,13 @@ def detect(self): def getInlinks(page): if not page.inlinks: page.inlinks = determineInlinks(page) - return [p for addr, p in pagesIndex.items() if addr in page.inlinks] + return [addr for addr, p in pagesIndex.items() if addr in page.inlinks] def getOutlinks(page): if not page.outlinks: page.outlinks = findOutlinks(page) - return [p for addr, p in pagesIndex.items() if addr in page.outlinks] + return [addr for addr, p in pagesIndex.items() if addr in page.outlinks] # ______________________________________________________________________________ @@ -389,9 +389,11 @@ def HITS(query): p.authority = 1 p.hub = 1 while True: # repeat until... convergence - for p in pages.values(): - p.authority = sum(x.hub for x in getInlinks(p)) # p.authority ← ∑i Inlinki(p).Hub - p.hub = sum(x.authority for x in getOutlinks(p)) # p.hub ← ∑i Outlinki(p).Authority + authority = {p: pages[p].authority for p in pages} + hub = {p: pages[p].hub for p in pages} + for p in pages: + pages[p].authority = sum(hub[x] for x in getInlinks(pages[p])) # p.authority ← ∑i Inlinki(p).Hub + pages[p].hub = sum(authority[x] for x in getOutlinks(pages[p])) # p.hub ← ∑i Outlinki(p).Authority normalize(pages) if convergence(): break diff --git a/tests/test_nlp.py b/tests/test_nlp.py index 3dc5a57aa..99803d025 100644 --- a/tests/test_nlp.py +++ b/tests/test_nlp.py @@ -2,7 +2,7 @@ import nlp from nlp import loadPageHTML, stripRawHTML, findOutlinks, onlyWikipediaURLS from nlp import expand_pages, relevant_pages, normalize, ConvergenceDetector, getInlinks -from nlp import getOutlinks, Page +from nlp import getOutlinks, Page, determineInlinks, HITS from nlp import Rules, Lexicon # Clumsy imports because we want to access certain nlp.py globals explicitly, because # they are accessed by function's within nlp.py @@ -61,9 +61,9 @@ def test_stripRawHTML(): def test_determineInlinks(): - # TODO - assert True - + assert set(determineInlinks(pA)) == set(['B', 'C', 'E']) + assert set(determineInlinks(pE)) == set([]) + assert set(determineInlinks(pF)) == set(['E']) def test_findOutlinks_wiki(): testPage = pageDict[pA.address] @@ -122,17 +122,20 @@ def test_detectConvergence(): def test_getInlinks(): inlnks = getInlinks(pageDict['A']) - assert sorted([page.address for page in inlnks]) == pageDict['A'].inlinks + assert sorted(inlnks) == pageDict['A'].inlinks def test_getOutlinks(): outlnks = getOutlinks(pageDict['A']) - assert sorted([page.address for page in outlnks]) == pageDict['A'].outlinks + assert sorted(outlnks) == pageDict['A'].outlinks def test_HITS(): - # TODO - assert True # leave for now + HITS('inherit') + auth_list = [pA.authority, pB.authority, pC.authority, pD.authority, pE.authority, pF.authority] + hub_list = [pA.hub, pB.hub, pC.hub, pD.hub, pE.hub, pF.hub] + assert max(auth_list) == pD.authority + assert max(hub_list) == pE.hub if __name__ == '__main__':