In [None]:
!pip install requests
!pip install beautifulsoup4
!pip install asyncio
!pip install pynamodb
!pip install cloudscraper
!pip install unicodedata2

In [262]:
async def unique(list1):
    """Return the items of ``list1`` with duplicates removed, keeping first-seen order.

    The function is declared ``async`` only so that existing callers can keep
    writing ``await unique(...)``; it performs no awaiting itself.

    Args:
        list1: Any iterable of items to de-duplicate.

    Returns:
        A new list containing each distinct item once, in order of first appearance.
    """
    items = list(list1)  # materialize once so the fallback below sees every item
    try:
        # dicts preserve insertion order, giving an O(n) order-preserving de-dup
        return list(dict.fromkeys(items))
    except TypeError:
        # At least one item is unhashable (e.g. a list): fall back to the
        # equality-based linear scan, which works for any comparable items.
        unique_list = []
        for x in items:
            if x not in unique_list:
                unique_list.append(x)
        return unique_list

### DynamoDB model for saving URLs to a database

In [274]:
from pynamodb.models import Model, DoesNotExist
from pynamodb.attributes import ( UnicodeAttribute, UTCDateTimeAttribute )
import datetime
import os
# AWS credentials are taken from the environment and never hardcoded in the notebook.
# Values that are already set are left untouched; missing ones are prompted for
# interactively so they do not end up in the saved notebook or in version control.
import getpass
for _credential_name in ('AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'):
    if not os.environ.get(_credential_name):
        os.environ[_credential_name] = getpass.getpass(f'{_credential_name}: ')

class Page(Model):
    """PynamoDB model for one scraped page, stored in the ``Pages`` DynamoDB table.

    Items are keyed by ``website`` (hash key) and ``url`` (range key); the
    remaining attributes are optional and are filled in as pages get scraped.
    """
    class Meta:
        region = 'us-west-2'
        table_name = 'Pages'
    # Partition key; this notebook always uses 'webmd'.
    website = UnicodeAttribute(hash_key=True)
    # Sort key: the page URL as found in an href.
    url = UnicodeAttribute(range_key=True)
    # Topic label; this notebook sets it to 'health' when saving discovered links.
    subject = UnicodeAttribute(null=True)
    title = UnicodeAttribute(null=True)
    # Extracted page text; None until the page has been scraped.
    text = UnicodeAttribute(null=True)
    # The default must be a callable: a bare `datetime.datetime.now()` would be
    # evaluated once, when the class is defined, stamping every item with the same
    # time. The value is timezone-aware UTC to match the attribute type instead of
    # a naive local time.
    updatedAt = UTCDateTimeAttribute(default=lambda: datetime.datetime.now(datetime.timezone.utc))

In [234]:
# Create the DynamoDB table backing `Page` with 1 read / 1 write capacity unit.
# `wait=True` is passed so the call should block until the table is usable
# (per PynamoDB's documented behaviour — confirm for the installed version).
Page.create_table(
    read_capacity_units=1,
    write_capacity_units=1,
    wait=True,
)

### Extract links from WebMD's directory

In [458]:
import cloudscraper
import unicodedata2
from bs4 import BeautifulSoup
import asyncio
from IPython.display import Markdown, display
# Shared HTTP client used by all scrape* functions in this notebook.
scraper = cloudscraper.create_scraper(
    delay=10,
    browser='chrome',
)

In [375]:
async def scrapeLinks(letter):
    """Collect the distinct hrefs found on one page of WebMD's A-to-Z health-topics directory.

    Args:
        letter: Value substituted into the directory URL's ``pg`` query parameter.

    Returns:
        A list of href strings in first-seen order with duplicates removed
        (de-duplication is delegated to ``unique``).
    """
    response = scraper.get(f'https://www.webmd.com/a-to-z-guides/health-topics?pg={letter}')
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, "html.parser")

    # Drop scripts, styles and page-chrome elements so only links from the main content remain.
    for unwanted in soup(["script", "style", "noscript", "head", "meta", "img", "aside", "header", "footer", "nav"]):
        unwanted.extract()

    # Anchors carrying a data-metrics-link attribute are discarded before collecting hrefs.
    for tracked_anchor in soup.findAll("a", {"data-metrics-link" : True}):
        tracked_anchor.decompose()

    hrefs = [anchor['href'] for anchor in soup.find_all('a', href=True)]
    return await unique(hrefs)

In [None]:
letters = ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z"]
allLinks = []
for letter in letters:
    print(letter)
    allLinks = allLinks + await scrapeLinks(letter)
allLinks = await unique(allLinks)
for link in allLinks:
    await createPage(link)

In [None]:
# Display every link collected from the directory crawl (the output may be long).
allLinks

### Function to scrape basic content pages

In [459]:
async def scrapeLink(url):
    """Scrape the article text of a regular content page and store it on its ``Page`` item.

    Nothing is fetched when the stored item already has text. Otherwise the page is
    downloaded, links found in the article's ``<aside>`` are registered as new
    ``Page`` items (if not already present), and the article body is flattened to
    plain text and written to the item's ``text`` attribute.

    Args:
        url: URL of the page; it must already exist as a ``Page('webmd', url)`` item.

    Raises:
        DoesNotExist: If no ``Page`` item exists for ``url``.
    """
    page = Page.get('webmd', url)
    if page.text is not None:
        return  # already scraped

    html = scraper.get(url)
    html.encoding = 'utf-8'
    soup = BeautifulSoup(html.text, "html.parser")
    for tag in soup(["script", "style", "noscript", "head", "meta", "img"]):
        tag.extract()

    article = soup.find("article")
    if article is None:
        return

    # Register related links listed in the article's <aside>.
    # The lookup and the new item must use the link's own href: the previous code
    # looked up `url` (which always exists at this point), so no related link was
    # ever saved, and it passed the bs4 Tag itself as the range key.
    articleLinksContainer = article.find("aside")
    if articleLinksContainer is not None:
        for link in articleLinksContainer.find_all('a', href=True):
            href = link['href']
            try:
                Page.get('webmd', href)
            except DoesNotExist:
                Page('webmd', href, subject='health').save()

    # The body container is looked up under either of two class names.
    articleBody = article.find("div", {"class": "article__body"})
    if articleBody is None:
        articleBody = article.find("div", {"class": "article-body"})
    if articleBody is None:
        return

    # Flatten the body: keep text nodes, turn block-level tags into blank lines
    # and list items into "- " bullets.
    parts = []
    for e in articleBody.descendants:
        if isinstance(e, str):
            parts.append(e)
        elif e.name in ['br', 'p', 'h1', 'h2', 'h3', 'h4', 'tr', 'th']:
            parts.append('\n\n')
        elif e.name == 'li':
            parts.append('\n- ')
    text = unicodedata2.normalize("NFKD", ''.join(parts))
    # Reuse the item fetched at the top instead of reading it from DynamoDB again.
    page.update(actions=[Page.text.set(text)])

### Function to scrape content from slideshow pages

In [420]:
async def scrapeSlides(url):
    """Scrape the text of a slideshow page and store it on its ``Page`` item.

    Does nothing when the stored item already has text. Otherwise every
    ``div.slide`` element is flattened to plain text (each text node stripped,
    block-level tags rendered as blank lines, list items as "- " bullets) and
    the result is written to the item's ``text`` attribute.

    Args:
        url: URL of the slideshow; it must already exist as a ``Page('webmd', url)`` item.
    """
    if Page.get('webmd', url).text is not None:
        return

    response = scraper.get(url)
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, "html.parser")

    # Remove non-content markup, then the per-slide counter elements.
    for unwanted in soup(["script", "style", "noscript", "head", "meta", "img", "picture"]):
        unwanted.extract()
    for counter in soup.find_all("div", {"class": "slide-count-bbl"}):
        counter.extract()

    paragraph_tags = ['br', 'p', 'h1', 'h2', 'h3', 'h4','tr', 'th']
    pieces = []
    for slide in soup.find_all("div", {"class": "slide"}):
        for node in slide.descendants:
            if isinstance(node, str):
                pieces.append(node.strip())
            elif node.name in paragraph_tags:
                pieces.append('\n\n')
            elif node.name == 'li':
                pieces.append('\n- ')

    slide_text = unicodedata2.normalize("NFKD", ''.join(pieces).strip())
    record = Page.get('webmd', url)
    record.update(actions=[Page.text.set(slide_text)])

### Function to scrape links from resource centers

In [451]:
async def scrapeResource(url):
    """Register the links listed on a resource-center page as new ``Page`` items.

    The page is downloaded and its link container is located (``div#key-links``,
    falling back to ``section#cncr-cncrtab``). Every href inside it that is not
    yet stored is saved as ``Page('webmd', href, subject='health')``.

    Args:
        url: URL of the resource-center page; it is printed before fetching.
    """
    print(url)
    response = scraper.get(url)
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, "html.parser")
    for unwanted in soup(["script", "style", "noscript", "head", "meta", "img"]):
        unwanted.extract()

    container = soup.find("div", {"id": "key-links"})
    if container is None:
        container = soup.find("section", {"id": "cncr-cncrtab"})
    if container is None:
        return  # neither known container is present on this page

    for anchor in container.find_all('a', href=True):
        try:
            Page.get('webmd', anchor['href'])
        except DoesNotExist:
            # Not stored yet: create the item so a later cell can scrape it.
            Page('webmd', anchor['href'], subject='health').save()

### Query links saved in DynamoDB and scrape their content

In [461]:
# Load every stored WebMD URL, then split the URLs into three work lists:
#   newLinks    - URLs containing '.htm' (handled by scrapeResource)
#   slideLinks  - remaining URLs containing 'slideshow' (handled by scrapeSlides)
#   normalLinks - everything else (handled by scrapeLink)
allLinks = []
for item in Page.query('webmd'):
    allLinks += [item.url]

newLinks, slideLinks, normalLinks = [], [], []
for link in allLinks:
    # The '.htm' test takes precedence over the 'slideshow' test.
    if '.htm' in link:
        bucket = newLinks
    elif 'slideshow' in link:
        bucket = slideLinks
    else:
        bucket = normalLinks
    bucket.append(link)

In [None]:
# Crawl each resource-center page (the URLs containing '.htm') and save the links it lists.
# Re-run the link-classification cell above afterwards so the newly saved links are picked up.
for link in newLinks:
    await scrapeResource(link)

In [460]:
# Scrape article text for every link that is neither a '.htm' page nor a slideshow.
for link in normalLinks:
    await scrapeLink(link)

In [430]:
# Scrape slide text for every slideshow link.
for link in slideLinks:
    await scrapeSlides(link)

In [None]:
notWorkingLinks = ["https://www.webmd.com/parenting/baby/default.htm", 'https://www.webmd.com/connect-to-care/addiction-treatment-recovery/default.htm', "https://www.webmd.com/healthy-aging/default.htm", "https://www.webmd.com/beauty/default.htm", "https://www.webmd.com/breast-cancer/default.htm", "https://www.webmd.com/cancer/default.htm", 'https://www.webmd.com/breast-cancer/default.htm']
for link in newLinks:
    if link in notWorkingLinks:
        continue
    else:
        await scrapeResource(link)

In [464]:
# Write all scraped text to a single file that can be used to build a Faiss index
# or feed other query methods. Items without text are skipped; documents are
# separated by a blank line.
texts = [item.text.strip() for item in Page.query('webmd') if item.text]
text = "\n\n".join(texts)
# An explicit UTF-8 encoding is required: the scraped text contains non-ASCII
# characters, and the platform default encoding (e.g. cp1252 on Windows) can
# raise UnicodeEncodeError or produce a file other tools misread.
with open(r"webmd.txt", 'w', encoding='utf-8') as fp:
    fp.write(text.strip())