Skip to content

Commit

Permalink
Collect sentences containing african from scraped pages. Had to insta…
Browse files Browse the repository at this point in the history
…ll an older websockets: miyakogi/pyppeteer#171
  • Loading branch information
bomanimc committed Apr 18, 2019
1 parent eaaa627 commit 0d8661b
Showing 1 changed file with 35 additions and 0 deletions.
35 changes: 35 additions & 0 deletions race_phrases.py
@@ -0,0 +1,35 @@
import re
import time
from requests_html import HTMLSession

# Path of the text file that main() opens and reads line by line; each
# stripped line is passed to session.get() as the URL of a page to fetch.
FILE_PATH = 'webmd_results.txt'
# Term that main() interpolates into its sentence-matching regex. The regex is
# applied to lowercased paragraph text, so this value must be lowercase to match.
SEARCH_TERM = 'african'

def main():
    """Fetch each URL listed in FILE_PATH and collect sentences with SEARCH_TERM.

    Every non-blank line of FILE_PATH is treated as a URL. The page is fetched
    and rendered through an HTMLSession, the text of each ``<p>`` element is
    lowercased, and every "sentence" (a run of non-period characters that
    contains SEARCH_TERM and ends in a period) is appended to a running list.
    The per-paragraph matches are printed as they are found, and the full list
    is printed once all pages have been processed.

    Only the lines with index 0 through 51 of the file are considered, and the
    function sleeps 5 seconds after each fetched page.

    Returns:
        None. Results are written to standard output only.
    """
    # Escape the term so that regex metacharacters in SEARCH_TERM are matched
    # literally, and compile once instead of rebuilding per paragraph.
    # TODO: Use a better approach for splitting the string into sentences;
    # splitting on "." breaks on abbreviations and decimal numbers.
    sentence_re = re.compile(r"([^.]*?%s[^.]*\.)" % re.escape(SEARCH_TERM))

    lines_of_interest = []

    # The session is used as a context manager so that it is closed (and any
    # browser started by render() is shut down) even if a request fails.
    with HTMLSession() as session, open(FILE_PATH) as fp:
        for cnt, line in enumerate(fp):
            url = line.strip()

            # Skip blank lines: an empty URL would make session.get() raise
            # and abort the whole run.
            if url:
                print(line + "\n")

                resp = session.get(url)
                # Execute the page's JavaScript so dynamically inserted
                # paragraphs are present before searching.
                resp.html.render()

                for p in resp.html.find('p'):
                    sentences = sentence_re.findall(p.text.lower())
                    print(sentences)
                    lines_of_interest.extend(sentences)

                # Pause between requests to avoid hammering the server.
                time.sleep(5)

            # Stop after the line with index 51 has been handled.
            if cnt > 50:
                break

    print(lines_of_interest)

# Run the scrape only when this file is executed as a script, so that
# importing the module does not trigger network requests.
if __name__ == "__main__":
    main()

0 comments on commit 0d8661b

Please sign in to comment.