From 25df9a25259f69dc21402447a0a9e0ffe8e43a0e Mon Sep 17 00:00:00 2001 From: Chipe1 Date: Tue, 18 Apr 2017 18:36:10 +0530 Subject: [PATCH 1/2] Modified relevant_pages() --- nlp.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/nlp.py b/nlp.py index bd26d0a7b..268a2b155 100644 --- a/nlp.py +++ b/nlp.py @@ -301,15 +301,17 @@ def expand_pages(pages): def relevant_pages(query): - """Relevant pages are pages that contain the query in its entireity. - If a page's content contains the query it is returned by the function.""" - relevant = {} - print("pagesContent in function: ", pagesContent) - for addr, page in pagesIndex.items(): - if query.lower() in pagesContent[addr].lower(): - relevant[addr] = page - return relevant - + """Relevant pages are pages that contain all of the query words. They are obtained by + intersecting the hit lists of the query words.""" + hit_intersection = {addr for addr in pagesIndex} + query_words = query.split() + for query_word in query_words: + hit_list = set() + for addr in pagesIndex: + if query_word.lower() in pagesContent[addr].lower(): + hit_list.add(addr) + hit_intersection = hit_intersection.intersection(hit_list) + return {addr: pagesIndex[addr] for addr in hit_intersection} def normalize(pages): """From the pseudocode: Normalize divides each page's score by the sum of From 0429539a21f70c4dc0d58c29c19016663bae3deb Mon Sep 17 00:00:00 2001 From: Chipe1 Date: Tue, 18 Apr 2017 18:38:57 +0530 Subject: [PATCH 2/2] Additional tests for relevant_pages() --- tests/test_nlp.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/tests/test_nlp.py b/tests/test_nlp.py index 81eef882d..d0ce46fbc 100644 --- a/tests/test_nlp.py +++ b/tests/test_nlp.py @@ -30,7 +30,7 @@ def test_lexicon(): href="https://google.com.au" < href="/wiki/TestThing" > href="/wiki/TestBoy" href="/wiki/TestLiving" href="/wiki/TestMan" >""" -testHTML2 = "Nothing" +testHTML2 = "a mom and a dad" testHTML3 = """ @@ -106,9 +106,13 @@ def test_expand_pages(): def test_relevant_pages(): - pages = relevant_pages("male") - assert all((x in pages.keys()) for x in ['A', 'C', 'E']) + pages = relevant_pages("his dad") + assert all((x in pages) for x in ['A', 'C', 'E']) assert all((x not in pages) for x in ['B', 'D', 'F']) + pages = relevant_pages("mom and dad") + assert all((x in pages) for x in ['A', 'B', 'C', 'D', 'E', 'F']) + pages = relevant_pages("philosophy") + assert all((x not in pages) for x in ['A', 'B', 'C', 'D', 'E', 'F']) def test_normalize():