In [1]:
import sys

# Let this kernel import packages installed under the Python 3.8 site-packages directory.
sys.path += ['c:/program files/python38/lib/site-packages']

In [2]:
import csv
import logging
import os.path
import time
from urllib.parse import quote

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from webdriver_manager.chrome import ChromeDriverManager

In [5]:
# Read the 'Hospitals' sheet of the workbook into a DataFrame, then show its
# schema summary followed by the first rows (the last expression is the cell output).
hospitals = pd.read_excel(
    io=r'C:\Users\achopra\Documents\Aarete\Provider Transparency Project\Hospitals.xlsx',
    sheet_name='Hospitals',
)
hospitals.info()
hospitals.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7597 entries, 0 to 7596
Data columns (total 44 columns):
 #   Column                                        Non-Null Count  Dtype         
---  ------                                        --------------  -----         
 0   X                                             7596 non-null   float64       
 1   Y                                             7596 non-null   float64       
 2   OBJECTID                                      7596 non-null   float64       
 3   ID                                            7596 non-null   float64       
 4   NAME                                          7597 non-null   object        
 5   ADDRESS                                       7597 non-null   object        
 6   CITY                                          7597 non-null   object        
 7   STATE                                         7597 non-null   object        
 8   ZIP                                           7597 non-null   int64 

Unnamed: 0,X,Y,OBJECTID,ID,NAME,ADDRESS,CITY,STATE,ZIP,ZIP4,...,Downloadable Data,Type,Price URL,ChargeMaster,Hospital Standard Charges,Average Charges billed per DRg,Hospital Price Transparency charge info,Hospital price transparency case information,COVID Test,Shoppable Services
0,-13318890.0,4346975.0,1.0,5793230.0,CENTRAL VALLEY GENERAL HOSPITAL,1025 NORTH DOUTY STREET,HANFORD,CA,93230,NOT AVAILABLE,...,,,,,,,,,,
1,-13226510.0,4049626.0,2.0,53391362.0,LOS ROBLES HOSPITAL & MEDICAL CENTER - EAST CA...,150 VIA MERIDA,WESTLAKE VILAGE,CA,91362,NOT AVAILABLE,...,,,,,,,,,,
2,-13156200.0,4031978.0,3.0,11190023.0,EAST LOS ANGELES DOCTORS HOSPITAL,4060 WHITTIER BOULEVARD,LOS ANGELES,CA,90023,NOT AVAILABLE,...,,,,,,,,,,
3,-13171900.0,4041752.0,4.0,17090028.0,SOUTHERN CALIFORNIA HOSPITAL AT HOLLYWOOD,6245 DE LONGPRE AVENUE,HOLLYWOOD,CA,90028,NOT AVAILABLE,...,,,,,,,,,,
4,-13132080.0,4037270.0,5.0,23691706.0,KINDRED HOSPITAL BALDWIN PARK,14148 FRANCISQUITO AVENUE,BALDWIN PARK,CA,91706,NOT AVAILABLE,...,,,,,,,,,,


In [6]:
# Keep only the rows whose STATE is 'NY' and renumber them from 0;
# the trailing expression displays the (rows, columns) shape.
ny_hospitals = hospitals.loc[hospitals['STATE'].eq('NY')].reset_index(drop=True)
ny_hospitals.shape

(275, 44)

In [15]:
def get_prov_info(driver, wait_time):
    """Scrape the provider name and address from the currently open rate page.

    The function looks for a tab labelled exactly ``Provider`` among the
    ``<li>`` items of the "more-information-tabs" block and clicks it before
    reading the details. If no such tab exists, nothing is clicked and the
    details are read from the page as it is currently displayed.

    Parameters
    ----------
    driver
        Selenium driver (anything exposing ``find_element_by_xpath``).
    wait_time : float
        Seconds to sleep after clicking the tab.

    Returns
    -------
    list
        ``[name, address]`` where ``address`` is the second text line of the
        "hospital-detail-map" block.

    Raises
    ------
    Exception
        Element-lookup errors from the driver are not caught here, nor is the
        ``IndexError`` raised when the map block has fewer than two text lines.
    """
    logger.debug('Finding information tabs for provider. Common options are "Provider" and "Disclaimer".')
    tabs_block = driver.find_element_by_xpath('//div[contains(@class, "more-information-tabs")]')
    info_tabs = tabs_block.find_element_by_tag_name('ul').find_elements_by_tag_name('li')

    # Index of the first tab labelled "Provider", or None when there is none.
    ind = next((pos for pos, tab in enumerate(info_tabs) if tab.text == 'Provider'), None)

    if ind is None:
        # Never click an unrelated tab (the old code clicked the last one, or
        # hit a NameError when the tab list was empty).
        logger.warning('No "Provider" tab found; reading provider details from the page as displayed.')
    else:
        logger.debug(f'Found "Provider" tab at index {ind}.')
        info_tabs[ind].click()
        time.sleep(wait_time)

    logger.debug('Fetching provider name...')
    name = driver.find_element_by_xpath('//div[contains(@class, "hospital-info-title")]').text
    logger.debug('Fetching provider address...')
    address = driver.find_element_by_xpath('//div[contains(@class, "hospital-detail-map")]').text.split('\n')[1]
    return [name, address]

def get_comparison_stats(driver):
    """Collect the price-comparison figures shown on the current rate page.

    Every ``<ul>`` inside the "hospital-pricing-comparison" block contributes
    one entry: the text of its ``<h3>`` is the key and the text of its
    ``<h4>`` is the value.

    Returns
    -------
    dict
        Mapping of statistic label to displayed value (empty when the block
        holds no ``<ul>`` elements).
    """
    logger.debug('Fetching comparison statistics...')
    comparison_block = driver.find_element_by_xpath('//div[contains(@class, "hospital-pricing-comparison")]')
    comparison_lists = comparison_block.find_elements_by_tag_name('ul')

    collected = {}
    for entry in comparison_lists:
        # Read the value before the label, matching the original lookup order.
        figure = entry.find_element_by_tag_name('h4').text
        label = entry.find_element_by_tag_name('h3').text
        collected[label] = figure
    return collected
    
def get_cash_price(driver):
    """Return the text of the ``cashPriceAmount`` element inside the cost-estimation calculator.

    Lookup errors raised by the driver are not handled here.
    """
    logger.debug('Fetching cash price...')
    calculator = driver.find_element_by_xpath('//div[contains(@class, "cost-estimation-calculator")]')
    amount_element = calculator.find_element_by_id('cashPriceAmount')
    return amount_element.text
    
def get_insurance_price(driver, wait_time):
    """Read the displayed price for each insurance plan offered on the current rate page.

    The insurance ("payment-select-button") control is clicked first; if that
    fails for any reason a warning is logged and ``None`` is returned. Otherwise
    every option of the ``insurance-plan-selection`` dropdown except the first
    and the last is selected in turn, and the price shown afterwards is
    recorded under the option's text. A plan name that already has a recorded
    price is skipped; a plan whose price cannot be read is logged and left out.

    Parameters
    ----------
    driver
        Selenium driver showing a rate page.
    wait_time : float
        Half of this many seconds is slept after selecting each plan.

    Returns
    -------
    dict or None
        Plan name -> price text, or ``None`` when the insurance control could
        not be clicked.
    """
    try:
        # click insurance price button
        driver.find_element_by_xpath('//div[contains(@class, "payment-select-button")]').click()
    except Exception:
        logger.warning('Insurance prices not available! Expect missing value in insurance price field.')
        return None

    plan_prices = {}
    dropdown = Select(driver.find_element_by_id('insurance-plan-selection'))
    for option in dropdown.options[1:-1]:
        label = option.text
        if label not in plan_prices:
            logger.debug(f'Fetching insurance price for {label}...')
            option.click()  # select plan in dropdown
            time.sleep(wait_time / 2)
            try:
                calculator = driver.find_element_by_xpath('//div[contains(@class, "estimation-calculator")]')
                price_span = calculator.find_element_by_xpath('//*[@id="app"]/div[1]/div[1]/div[2]/div/div[1]/ul/li[1]/span')
                plan_prices[label] = price_span.text
            except Exception:
                logger.warning(f'Insurance price not available for {label}!')

    return plan_prices
    
def _zip_for_url(zip_code):
    """Render a ZIP code for the search URL, restoring leading zeros lost by integer storage."""
    if isinstance(zip_code, str):
        return zip_code
    try:
        return f'{int(zip_code):05d}'
    except (TypeError, ValueError, OverflowError):
        return str(zip_code)


def _fetch_or_none(description, fetch, prov_name, service_label):
    """Call ``fetch()``; on any error log a warning naming ``description`` and return None."""
    try:
        return fetch()
    except Exception:
        logger.warning(f'Could not fetch {description} for provider: {prov_name} for service: {service_label}!')
        return None


def get_data_for_provider(driver, prov_name, prov_zip_code, wait_time=0.5, max_pages=float('inf'), max_services=float('inf'), output_csv_path=None):
    """Scrape every listed service for one provider from the Turquoise Health search results.

    Result pages are requested one after another (``page=1, 2, ...``) until a
    page lists no services or ``max_pages`` is reached. For each service the
    rate page is opened and the provider info, comparison statistics, cash
    price and insurance prices are collected; any of those that fails is
    logged and stored as ``None``.

    Parameters
    ----------
    driver
        Selenium driver used for all navigation and lookups.
    prov_name : str
        Provider name; lower-cased, spaces replaced by ``-`` and URL-encoded
        for the ``provider_name`` query parameter.
    prov_zip_code : int or str
        ZIP code for the ``location`` query parameter. Integers are zero-padded
        to five digits in the URL; the value is written to the output unchanged.
    wait_time : float
        Seconds to sleep after opening a rate page and after navigating back.
    max_pages, max_services
        Upper bounds on result pages visited and on services scraped per page.
    output_csv_path : str, optional
        When given, each row is appended to this CSV file as soon as it is
        scraped and nothing is accumulated in memory.

    Returns
    -------
    pandas.DataFrame or None
        ``None`` when the first result page lists no services, or when
        ``output_csv_path`` is given. Otherwise a DataFrame with one row per
        scraped service and the nine columns Provider, Zip Code, CPT, Service,
        Cash Price, Insurance Price, Comparison Statistics, TQH Name, TQH Address.
    """
    columns = ['Provider', 'Zip Code', 'CPT', 'Service', 'Cash Price', 'Insurance Price', 'Comparison Statistics', 'TQH Name', 'TQH Address']
    data = []

    # Loop-invariant URL pieces. Encoding keeps characters such as '&' in a
    # provider name from being parsed as a query-string separator.
    prov_name_url = quote('-'.join(prov_name.lower().split(' ')), safe='')
    location = _zip_for_url(prov_zip_code)

    i = 1
    while i <= max_pages:
        logger.debug(f'Opening TH URL for provider: {prov_name} (page {i}).')
        driver.get(f'https://turquoise.health/service_offerings?q=&service_name=&location={location}&provider_name={prov_name_url}&page={i}&distance=10')

        num_services = min(len(driver.find_elements_by_xpath('//div[contains(@class, "service-info-cont")]')), max_services)

        if num_services == 0:
            if i == 1:
                logger.warning(f'No information available for provider: {prov_name}! Moving to next provider...')
                return None
            # An empty page after the first one just means the listing is
            # exhausted; keep whatever has been collected so far.
            logger.debug(f'No further services for provider: {prov_name} on page {i}; stopping.')
            break

        for j in range(num_services):
            logger.debug(f'Scraping service # {j} on page {i}.')
            logger.debug('Scraping service information...')

            try:
                serv_info = driver.find_elements_by_xpath('//div[contains(@class, "service-info-cont")]')[j].text.split('\n')
            except Exception:
                logger.warning(f'No service information available for provider: {prov_name}! Expect missing value for service name and CPT fields!')
                continue

            # Always exactly two fields so the row width matches the header
            # even when the service card shows a single line of text.
            service_fields = (serv_info + [None, None])[:2]
            service_label = service_fields[1]

            logger.debug('Scraping rate information...')
            try:
                driver.find_elements_by_xpath('//a[contains(@class, "rate-button")]')[j].click()
                time.sleep(wait_time)
            except Exception:
                logger.warning(f'No rates available for provider: {prov_name} for service: {service_label}! This provider will not be added to output table.')
                continue

            prov_info = _fetch_or_none('provider info', lambda: get_prov_info(driver, wait_time), prov_name, service_label)
            comp_stats = _fetch_or_none('comparison stats', lambda: get_comparison_stats(driver), prov_name, service_label)
            cash_price = _fetch_or_none('cash price', lambda: get_cash_price(driver), prov_name, service_label)
            ins_price = _fetch_or_none('insurance price', lambda: get_insurance_price(driver, wait_time), prov_name, service_label)

            # Spread [name, address] over the two 'TQH' columns instead of
            # nesting the list inside a single cell.
            prov_fields = list(prov_info) if prov_info else [None, None]
            row = [prov_name, prov_zip_code] + service_fields + [cash_price, ins_price, comp_stats] + prov_fields
            logger.debug('Appening row to table...')
            if output_csv_path:
                # newline='' is what the csv module expects; without it
                # Windows output gets a blank line after every row.
                with open(output_csv_path, 'a', newline='') as f:
                    csv.writer(f).writerow(row)
            else:
                data.append(row)

            driver.back()
            time.sleep(wait_time)

        i += 1

    if not output_csv_path:
        return pd.DataFrame(data, columns=columns)
    return None
        

In [16]:
# --- Logging -----------------------------------------------------------------
logger = logging.getLogger('webscraper')
logger.setLevel(logging.DEBUG)
# getLogger returns the same object on every run of this cell, so drop handlers
# left over from a previous run; otherwise each message is written once per run.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
    stale_handler.close()
fh = logging.FileHandler('webscraper_progress.log', mode='w')
fh.setLevel(logging.DEBUG)
fh.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
logger.addHandler(fh)

# --- Output file: start fresh with only the header row -------------------------
# Rows are appended to this file by get_data_for_provider as they are scraped.
# NOTE(review): a previous run ended with PermissionError on this path —
# presumably the CSV was open in another application; confirm before re-running.
output_csv_path = r'C:\Users\achopra\Documents\Aarete\Provider Transparency Project\prov_data.csv'
# newline='' is what the csv module expects; without it Windows output gets
# a blank line after every row.
with open(output_csv_path, 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['Provider', 'Zip Code', 'CPT', 'Service', 'Cash Price', 'Insurance Price', 'Comparison Statistics', 'TQH Name', 'TQH Address'])

# --- Browser -------------------------------------------------------------------
chrome_options = Options()
chrome_options.add_argument('--incognito')
chrome_options.add_argument("--window-size=1920,1080")
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--disable-extensions")
chrome_options.add_argument("--proxy-server='direct://'")
chrome_options.add_argument("--proxy-bypass-list=*")
chrome_options.add_argument("--start-maximized")
chrome_options.add_argument("--headless")
chrome_options.add_argument("no-sandbox")

logger.info('Installing and setting up Chrome driver.')
driver = webdriver.Chrome(ChromeDriverManager().install(),options=chrome_options)

# --- Scrape --------------------------------------------------------------------
# NOTE(review): this loops over every row of `hospitals`; the NY-only frame
# `ny_hospitals` built in an earlier cell is not used here — confirm which is intended.
try:
    for ind, row in hospitals.iterrows():
        name = row['NAME']
        zip_c = row['ZIP']
        logger.info(f'Fetching data for provider: {name} with zipcode: {zip_c}')
        prov_data = get_data_for_provider(driver, name, zip_c, wait_time=1.5, output_csv_path=output_csv_path)

    logger.info('Webscraping complete!')
except Exception:
    # Record the failure in the progress log before it surfaces in the notebook.
    logger.exception('Webscraping aborted by an unexpected error.')
    raise
finally:
    # Always shut the browser down, even on failure, so no Chrome process is left running.
    driver.quit()



Current google-chrome version is 93.0.4577
Get LATEST driver version for 93.0.4577
Driver [C:\Users\achopra\.wdm\drivers\chromedriver\win32\93.0.4577.63\chromedriver.exe] found in cache


PermissionError: [Errno 13] Permission denied: 'C:\\Users\\achopra\\Documents\\Aarete\\Provider Transparency Project\\prov_data.csv'