Collect sentences containing "african" from scraped pages. Had to install an older websockets: miyakogi/pyppeteer#171
bomanimc · Apr 18, 2019 · 0d8661b · 0d8661b
1 parent eaaa627
commit 0d8661b
Showing 1 changed file with 35 additions and 0 deletions.
diff --git a/race_phrases.py b/race_phrases.py
@@ -0,0 +1,35 @@
+import re
+import time
+from requests_html import HTMLSession
+
+# Text file that main() reads line by line; each stripped line is passed to
+# session.get(), i.e. it is treated as one URL per line.
+# NOTE(review): the name suggests these are WebMD search results - confirm.
+FILE_PATH = 'webmd_results.txt'
+# Term interpolated into the sentence regex in main(). The page text is
+# lowercased before matching, so this value should be lowercase too.
+SEARCH_TERM = 'african'
+
+def main():
+    session = HTMLSession()
+
+    lines_of_interest = []
+
+    with open(FILE_PATH) as fp:  
+        for cnt, line in enumerate(fp):
+            print(line + "\n")
+
+            resp = session.get(line.strip())
+            resp.html.render()
+            article_p = resp.html.find('p')
+
+            for p in article_p:
+                # TODO: Us a better approach for splittng the string into sentences
+                sentences = re.findall(r"([^.]*?%s[^.]*\.)" % SEARCH_TERM, p.text.lower())
+                print(sentences);
+                lines_of_interest += sentences
+
+            time.sleep(5)
+
+            if (cnt > 50):
+                break;
+
+
+    print(lines_of_interest)
+
+main()