From 1663464025b19e66aad63bd55d2ebb888583076f Mon Sep 17 00:00:00 2001
From: Chipe1
Date: Thu, 30 Mar 2017 01:37:56 +0530
Subject: [PATCH 1/4] Add test for determineInlinks()

---
 tests/test_nlp.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/test_nlp.py b/tests/test_nlp.py
index 43f71f163..9f2f1244b 100644
--- a/tests/test_nlp.py
+++ b/tests/test_nlp.py
@@ -2,7 +2,7 @@
 import nlp
 from nlp import loadPageHTML, stripRawHTML, findOutlinks, onlyWikipediaURLS
 from nlp import expand_pages, relevant_pages, normalize, ConvergenceDetector, getInlinks
-from nlp import getOutlinks, Page
+from nlp import getOutlinks, Page, determineInlinks
 from nlp import Rules, Lexicon
 # Clumsy imports because we want to access certain nlp.py globals explicitly, because
 # they are accessed by function's within nlp.py
@@ -61,9 +61,9 @@ def test_stripRawHTML():
 
 
 def test_determineInlinks():
-    # TODO
-    assert True
-
+    assert set(determineInlinks(pA)) == set(['B', 'C', 'E'])
+    assert set(determineInlinks(pE)) == set([])
+    assert set(determineInlinks(pF)) == set(['E'])
 
 def test_findOutlinks_wiki():
     testPage = pageDict[pA.address]

From 038d4e10334df55f884311e7adbaa4985c17dbce Mon Sep 17 00:00:00 2001
From: Chipe1
Date: Thu, 30 Mar 2017 01:39:40 +0530
Subject: [PATCH 2/4] Add test for HITS()

---
 tests/test_nlp.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/tests/test_nlp.py b/tests/test_nlp.py
index 9f2f1244b..ff79d3cb3 100644
--- a/tests/test_nlp.py
+++ b/tests/test_nlp.py
@@ -2,7 +2,7 @@
 import nlp
 from nlp import loadPageHTML, stripRawHTML, findOutlinks, onlyWikipediaURLS
 from nlp import expand_pages, relevant_pages, normalize, ConvergenceDetector, getInlinks
-from nlp import getOutlinks, Page, determineInlinks
+from nlp import getOutlinks, Page, determineInlinks, HITS
 from nlp import Rules, Lexicon
 # Clumsy imports because we want to access certain nlp.py globals explicitly, because
 # they are accessed by function's within nlp.py
@@ -131,8 +131,11 @@ def test_getOutlinks():
 
 
 def test_HITS():
-    # TODO
-    assert True  # leave for now
+    HITS('inherit')
+    auth_list = [pA.authority, pB.authority, pC.authority, pD.authority, pE.authority, pF.authority]
+    hub_list = [pA.hub, pB.hub, pC.hub, pD.hub, pE.hub, pF.hub]
+    assert max(auth_list) == pD.authority
+    assert max(hub_list) == pE.hub
 
 
 if __name__ == '__main__':

From 6ad618eeca49ae05895e1e446e2a4cd6a30e1d61 Mon Sep 17 00:00:00 2001
From: Chipe1
Date: Thu, 13 Apr 2017 11:33:25 +0530
Subject: [PATCH 3/4] fixed premature updation

---
 nlp.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/nlp.py b/nlp.py
index 365d726c2..5a74f4d76 100644
--- a/nlp.py
+++ b/nlp.py
@@ -389,9 +389,14 @@ def HITS(query):
         p.authority = 1
         p.hub = 1
     while True:  # repeat until... convergence
-        for p in pages.values():
-            p.authority = sum(x.hub for x in getInlinks(p))  # p.authority ← ∑i Inlinki(p).Hub
-            p.hub = sum(x.authority for x in getOutlinks(p))  # p.hub ← ∑i Outlinki(p).Authority
+        updated_authority = {}
+        updated_hub = {}
+        for p in pages:
+            updated_authority[p] = sum(x.hub for x in getInlinks(pages[p]))  # p.authority ← ∑i Inlinki(p).Hub
+            updated_hub[p] = sum(x.authority for x in getOutlinks(pages[p]))  # p.hub ← ∑i Outlinki(p).Authority
+        for p in pages:
+            pages[p].authority = updated_authority[p]
+            pages[p].hub = updated_hub[p]
         normalize(pages)
         if convergence():
             break

From 911b57a7aed9f3456830e68e3a2208f4d4cba097 Mon Sep 17 00:00:00 2001
From: Chipe1
Date: Thu, 13 Apr 2017 15:56:26 +0530
Subject: [PATCH 4/4] Refactor code to match pseudocode

---
 nlp.py            | 15 ++++++---------
 tests/test_nlp.py |  4 ++--
 2 files changed, 8 insertions(+), 11 deletions(-)

diff --git a/nlp.py b/nlp.py
index 5a74f4d76..bd26d0a7b 100644
--- a/nlp.py
+++ b/nlp.py
@@ -356,13 +356,13 @@ def detect(self):
 def getInlinks(page):
     if not page.inlinks:
         page.inlinks = determineInlinks(page)
-    return [p for addr, p in pagesIndex.items() if addr in page.inlinks]
+    return [addr for addr, p in pagesIndex.items() if addr in page.inlinks]
 
 
 def getOutlinks(page):
     if not page.outlinks:
         page.outlinks = findOutlinks(page)
-    return [p for addr, p in pagesIndex.items() if addr in page.outlinks]
+    return [addr for addr, p in pagesIndex.items() if addr in page.outlinks]
 
 
 # ______________________________________________________________________________
@@ -389,14 +389,11 @@ def HITS(query):
         p.authority = 1
         p.hub = 1
     while True:  # repeat until... convergence
-        updated_authority = {}
-        updated_hub = {}
+        authority = {p: pages[p].authority for p in pages}
+        hub = {p: pages[p].hub for p in pages}
         for p in pages:
-            updated_authority[p] = sum(x.hub for x in getInlinks(pages[p]))  # p.authority ← ∑i Inlinki(p).Hub
-            updated_hub[p] = sum(x.authority for x in getOutlinks(pages[p]))  # p.hub ← ∑i Outlinki(p).Authority
-        for p in pages:
-            pages[p].authority = updated_authority[p]
-            pages[p].hub = updated_hub[p]
+            pages[p].authority = sum(hub[x] for x in getInlinks(pages[p]))  # p.authority ← ∑i Inlinki(p).Hub
+            pages[p].hub = sum(authority[x] for x in getOutlinks(pages[p]))  # p.hub ← ∑i Outlinki(p).Authority
         normalize(pages)
         if convergence():
             break
diff --git a/tests/test_nlp.py b/tests/test_nlp.py
index af5b280ce..99803d025 100644
--- a/tests/test_nlp.py
+++ b/tests/test_nlp.py
@@ -122,12 +122,12 @@ def test_detectConvergence():
 
 def test_getInlinks():
     inlnks = getInlinks(pageDict['A'])
-    assert sorted([page.address for page in inlnks]) == pageDict['A'].inlinks
+    assert sorted(inlnks) == pageDict['A'].inlinks
 
 
 def test_getOutlinks():
     outlnks = getOutlinks(pageDict['A'])
-    assert sorted([page.address for page in outlnks]) == pageDict['A'].outlinks
+    assert sorted(outlnks) == pageDict['A'].outlinks
 
 
 def test_HITS():