### How To Scrape Medical Spa Owner Emails using URLLib and Beautiful Soup

1) Open our CSV file and write the header row

2) Loop through a list of zip codes representing the geographical center of each state in the U.S.

3) Use URLLib to fetch the search results page that lists a "provider card" for every SculpSure provider within a 500-mile radius of each zip code, and use Beautiful Soup to parse the HTML.

4) Open the web page associated with each "provider card" and use Beautiful Soup to parse its HTML, from which we can extract the website URL and all email addresses for the provider.

5) Close the CSV file

DONE!

In [2]:
import urllib.request
from bs4 import BeautifulSoup as soup

# Scrape SculpSure provider names, website URLs and contact emails into a CSV.
#
# For every zip code in zip_list the script fetches the provider search
# results page, follows each provider card to the provider's own page, and
# writes one CSV row per provider: name, website URL, then one column per
# email address.  Commas inside names/emails are replaced with "|" so the
# hand-built CSV rows stay aligned.

# Output file and its header row
filename = "sculpsure_providers5.csv"
headers = "name,url,email\n"

# Providers already written; the same provider can appear in the results of
# more than one zip code, and should only be scraped once
provider_names = []

#zip_list = [1]
zip_list = [99777]
#zip_list = [1604,2893,3217,4443,5669,6037,8618,13501,16823,19901,21035,23921,26601,27330,29229,31204,34609,35045,37130,39051,40422,43081,46123,49601,50010,54449,56401,57501,58463,59457,62563,65109,67530,68822,71351,72204,73160,76825,80918,82520,83226,84642,86322,87063,89310,93643,96763,97754,98801,99777]

# Emails containing any of these substrings are left out of the CSV
# (presumably vendor addresses rather than the provider's own -- confirm)
excluded_email_domains = ('@hologic.com', '@cynosure.com')


def _extract_provider_emails(provider_page_soup):
    """Return the provider's email addresses found on its parsed page.

    The addresses are read from the second <script> inside the provider
    contact form: the text that starts 18 characters after the beginning of
    'providersemail' and runs up to the next double quote, split on ";".
    Commas are replaced with "|" and blank entries are dropped.

    Returns an empty list when the form, the script, the marker or the
    closing quote cannot be found, instead of raising or slicing garbage.
    """
    provider_forms = provider_page_soup.findAll("div",{"class":"provider-form provider-form--left-aligned"})
    if not provider_forms:
        return []

    form_scripts = provider_forms[0].findAll("script")
    if len(form_scripts) < 2 or form_scripts[1].string is None:
        return []
    email_script = form_scripts[1].string

    marker_pos = email_script.find('providersemail')
    if marker_pos == -1:
        return []

    # 18 = len('providersemail') plus 4 characters of punctuation before the
    # value (NOTE(review): exact punctuation is not visible here -- confirm)
    email_start = marker_pos + 18
    email_end = email_script.find('"', email_start)
    if email_end == -1:
        return []

    provider_emails = email_script[email_start:email_end].replace(",","|")
    return [email.strip() for email in provider_emails.split(";") if email.strip()]


# The "with" block guarantees the CSV is flushed and closed even if a request
# or a parsing step raises part-way through the run
with open(filename, "w", encoding='utf8', errors='ignore') as f:
    f.write(headers)

    for zip_int in zip_list:
        # Zip codes are stored as ints, so restore any leading zeros
        zip_str = str(zip_int).rjust(5,'0')

        # Process zip code
        print("Processing ZIP: " + zip_str)
        print()

        my_url = 'https://www.sculpsure.com/results/?campaign-code=default&session-id=default&country=us&treatment=33&zipcode=' + zip_str + '&proximity=500'

        page = urllib.request.Request(my_url,headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'})
        with urllib.request.urlopen(page) as response:
            infile = response.read()
        # NOTE(review): pages are decoded as ISO-8859-1 regardless of what the
        # server declares -- confirm this matches the site's real encoding
        page_html = infile.decode('ISO-8859-1')

        # Get "page soup" from Beautiful Soup for the search results page
        page_soup = soup(page_html, "html.parser")

        # If we get a "notification", it means no providers for this geographical area
        notifications = page_soup.findAll("div",{"class":"notification__content"})
        if notifications:
            print("No providers found for zip code: " + zip_str)
            continue

        # Get all "provider cards" for each SculpSure provider for this geographical area
        provider_cards = page_soup.findAll("div",{"class":"provider-card"})

        # Extract data from url associated with each provider card
        for provider_card in provider_cards:
            provider_name = provider_card.h3.a.text.strip().replace(",","|")

            # Skip providers already handled for an earlier card or zip code
            if provider_name in provider_names:
                continue

            # Add provider to list of providers
            provider_names.append(provider_name)
            print(provider_name)

            # Get provider page html and turn it into parsed soup
            provider_url = provider_card.h3.a['href']
            provider_page = urllib.request.Request(provider_url,headers={'User-Agent': 'Mozilla/5.0'})
            with urllib.request.urlopen(provider_page) as provider_response:
                provider_infile = provider_response.read()
            provider_page_html = provider_infile.decode('ISO-8859-1')
            provider_page_soup = soup(provider_page_html, "html.parser")

            # Get the provider's Website URL (first link in the contact details list)
            provider_website_urls = provider_page_soup.findAll("ul",{"class":"provider-contact__details"})
            website_link = provider_website_urls[0].a if provider_website_urls else None

            if website_link is None:
                # No contact details (or no link in them): record the name only
                f.write(provider_name + "\n")
                print("")
                continue

            provider_website_url = website_link['href']

            # Write provider name, website URL and emails to csv file
            f.write(provider_name + "," + provider_website_url)
            for provider_email in _extract_provider_emails(provider_page_soup):
                if not any(domain in provider_email for domain in excluded_email_domains):
                    f.write("," + provider_email)
                    print(provider_email)
            print("")
            f.write("\n")

print("SCRAPING COMPLETE!")


Processing ZIP: 99777

Revive Aesthetics
reviveak@gmail.com

Nick Sarrimanolis| MD LLC
dr.sarrimanolis@icloud.com

Integrated Wellness & Center for Birth| LLC
shantelhoversten@akbirth.com

Alaska Center for Dermatology
sculpsure@dermalaska.com

Fortson Dermatology and Skin Care Center

Borealis Laser
borealislaser@gmail.com
info@borealislaser.com

All Seasons Family Health Care
allseasonsfamilyhealthcare@gmail.com
office@allseasonsfhc.com

Dale Joseph Trombley II| MD
info@alaskaprivatepractice.com
ConciergeDoctor@quixnet.net

SCRAPING COMPLETE!
