In [1]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
from selenium import webdriver
import time
import re
import pandas as pd

In [2]:
# Scheme + host of the DermQuest site; relative hrefs scraped from its pages are appended to this.
rootPath = 'https://www.dermquest.com'

In [3]:
def _scrape_image_page(image_page_url):
    """Fetch one image detail page and extract its full-size URL and lesion labels.

    Args:
        image_page_url: Absolute URL of the second-layer (per-image) page.

    Returns:
        ``(image_url, lesion_labels)`` where ``lesion_labels`` is a list of
        stripped strings (empty when the page has no 'Primary Lesions' list),
        or ``None`` when the page has no selected full-size image link.
    """
    # Close the HTTP response deterministically; BeautifulSoup reads the whole
    # body while constructing the tree, so nothing is needed from it afterwards.
    with urlopen(image_page_url) as subpage:
        subsoup = BeautifulSoup(subpage, 'lxml')

    # The first <li class="selected"> holds the <figure><a href=...> of the large image.
    selected = subsoup.find('li', attrs={'class': 'selected'})
    figure = selected.figure if selected is not None else None
    link = figure.a if figure is not None else None
    href = link.get('href') if link is not None else None
    if not href:
        return None

    # Lesion types live in the first <ul> following the node whose text
    # contains 'Primary Lesions'.
    heading = subsoup.find(text=re.compile('Primary Lesions'))
    lesion_ul = heading.parent.find_next('ul') if heading is not None else None
    lesion_items = lesion_ul.find_all('li') if lesion_ul is not None else []
    lesion_labels = [li.getText().strip(' \t\r\n') for li in lesion_items]

    return rootPath + href, lesion_labels


def Get_URLandLabel(dx_label,labelPath,max_imgs):
    """Scrape image URLs and lesion labels for one diagnosis from DermQuest.

    Opens the image-search page in Chrome, selects the diagnosis facet, then
    walks the result pages, opening each image's detail page to read its
    full-size image URL and its 'Primary Lesions' list.

    Args:
        dx_label: Diagnosis name; only its first character is used, to pick
            the alphabet tab (``#alpha-<LETTER>``) in the diagnosis facet.
        labelPath: XPath of the element to click to select the diagnosis.
        max_imgs: Stop once this many images have been collected.

    Returns:
        dict mapping image URL -> list of lesion label strings. Because it is
        keyed by URL, an image that appears twice collapses into one entry, so
        the dict can hold fewer than ``max_imgs`` items.

    Raises:
        selenium.common.exceptions.WebDriverException: if any element of the
            navigation phase (overlays, facet, alphabet tab, label, view
            button) cannot be found or clicked.
    """
    # Local imports keep this definition self-contained; `find_element(By...)`
    # is used because the `find_element_by_*` helpers were removed in Selenium 4.
    from selenium.common.exceptions import WebDriverException
    from selenium.webdriver.common.by import By

    browser = webdriver.Chrome()
    try:
        browser.get("https://www.dermquest.com/image-library/image-search")
        # Two overlay controls are located first and then clicked, in this order
        # (presumably a popup close button and a consent/confirm button -- verify
        # against the live page; the first may be ".fancybox-item.fancybox-close").
        closeBtn1 = browser.find_element(By.CSS_SELECTOR, ".fancybox-item")
        closeBtn2 = browser.find_element(By.CSS_SELECTOR, ".btn.btn-minWidth.btn-onDarkBg")
        closeBtn1.click()
        closeBtn2.click()

        # Open the "diagnosis" facet.
        browser.find_element(By.XPATH, '//a[@href="#diagnosis"]').click()

        # Click the initial character on the banner to activate that alphabet tab.
        bannerBtn_path = '//a[@href="#alpha-{}"]'.format(str(dx_label[0]).upper())
        browser.find_element(By.XPATH, bannerBtn_path).click()

        # Select the exact diagnosis (kept in its own name so the `dx_label`
        # argument is not overwritten by a WebElement).
        browser.find_element(By.XPATH, labelPath).click()

        # Go to view images.
        browser.find_element(By.CSS_SELECTOR, ".btn.search-trigger").click()

        img_list = []
        lesion_label_list = []
        while len(img_list) < max_imgs:
            # Snapshot the rendered DOM of the current result page.
            page = browser.find_element(By.XPATH, "//*").get_attribute("outerHTML")
            soup = BeautifulSoup(page, 'lxml')

            for d in soup.find_all('div', attrs={'class': 'inner'}):
                # Tiles without the <p><span><a href> chain carry no image link; skip them.
                paragraph = d.p
                span = paragraph.span if paragraph is not None else None
                anchor = span.a if span is not None else None
                href = anchor.get('href') if anchor is not None else None
                if not href:
                    continue

                record = _scrape_image_page(rootPath + href)
                if record is None:
                    continue
                image_url, lesion_labels = record
                img_list.append(image_url)
                lesion_label_list.append(lesion_labels)
                if len(img_list) >= max_imgs:
                    break

            # Quota reached: do not load (and wait for) a page that will not be read.
            if len(img_list) >= max_imgs:
                break

            try:
                browser.find_element(By.CSS_SELECTOR, ".next.active").click()
            except WebDriverException:
                # No active "next" control (or it cannot be clicked): last page.
                break
            time.sleep(2)  # give the next result page time to render
    finally:
        # Always shut the driver down so repeated calls do not leak Chrome processes.
        browser.quit()

    # Return the two parallel lists as a dictionary.
    return dict(zip(img_list,lesion_label_list))

In [4]:
# XPath of the <label> to click for each diagnosis, built from its facet id:
#   109493 corresponds to acne vulgaris, 109831 corresponds to keratoacanthoma
label_path_1, label_path_2 = (
    '//label[@for="facet_{}"]'.format(facet_id) for facet_id in (109493, 109831)
)

In [5]:
# Scrape up to 100 acne vulgaris images (URL -> lesion labels).
D1 = Get_URLandLabel(
    dx_label='acne vulgaris',
    labelPath=label_path_1,
    max_imgs=100,
)

In [6]:
# Number of distinct image URLs collected for acne vulgaris.
print(len(D1))

100


In [7]:
# Scrape up to 100 keratoacanthoma images and report how many distinct URLs came back.
D2 = Get_URLandLabel(
    dx_label='keratoacanthoma',
    labelPath=label_path_2,
    max_imgs=100,
)
print(len(D2))

100


In [8]:
def exportToDF(dx_label,dict_data,col1,col2,col3):
    """Turn a ``{image_url: lesion_labels}`` mapping into a three-column DataFrame.

    Args:
        dx_label: Diagnosis label repeated on every row.
        dict_data: Mapping of image URL -> iterable of lesion label strings
            (a plain string is used as-is; ``None`` becomes an empty string).
        col1: Column name for the URLs (the mapping's keys).
        col2: Column name for the repeated ``dx_label``.
        col3: Column name for the lesion labels, joined with ``", "``.

    Returns:
        pandas.DataFrame with one row per key of ``dict_data``, in the mapping's
        iteration order, and columns ordered exactly ``[col1, col2, col3]``.
    """
    columns = [col1,col2,col3]
    urls = list(dict_data.keys())

    lesion_lbls = []
    for url in urls:
        labels = dict_data.get(url)
        if labels is None:
            lesion_lbls.append("")
        elif isinstance(labels, str):
            lesion_lbls.append(labels)
        else:
            # Join the labels directly rather than post-processing repr(list):
            # that approach drops apostrophes inside labels, leaves stray double
            # quotes and repr-escapes non-printable characters.
            lesion_lbls.append(", ".join(str(label) for label in labels))

    # Passing `columns` to the constructor fixes the column order; the previous
    # `df_result.reindex(columns=...)` call discarded its result and had no effect.
    return pd.DataFrame(
        {col1: urls, col2: [dx_label] * len(urls), col3: lesion_lbls},
        columns=columns,
    )

In [9]:
# One frame per diagnosis, sharing the same three column names.
df1 = exportToDF(
    dx_label='acne vulgaris',
    dict_data=D1,
    col1='image_url',
    col2='dx_label',
    col3='lesion_label',
)
df2 = exportToDF(
    dx_label='keratoacanthoma',
    dict_data=D2,
    col1='image_url',
    col2='dx_label',
    col3='lesion_label',
)

In [10]:
# Stack both diagnosis frames. ignore_index=True renumbers the rows 0..n-1 so
# the combined frame does not carry each input's 0-based labels twice, which
# would make label lookups such as big_df.loc[5] return two rows.
big_df = pd.concat([df1, df2], ignore_index=True)
big_df.shape

(200, 3)

In [16]:
# Relabel the rows 0..n-1 in place, then preview the last five rows.
big_df.index = range(big_df.shape[0])
big_df.tail()

Unnamed: 0,dx_label,image_url,lesion_label
195,keratoacanthoma,https://www.dermquest.com/imagelibrary/large/0...,"Erythema, Nodule, Papule / pearly, Telangiecta..."
196,keratoacanthoma,https://www.dermquest.com/imagelibrary/large/0...,"Papule / erythematous, Erythema"
197,keratoacanthoma,https://www.dermquest.com/imagelibrary/large/0...,"Papule / hyperpigmented, Hyperkeratosis"
198,keratoacanthoma,https://www.dermquest.com/imagelibrary/large/0...,"Papule / hyperpigmented, Hyperkeratosis"
199,keratoacanthoma,https://www.dermquest.com/imagelibrary/large/0...,"Papule / hyperpigmented, Hyperkeratosis"


In [None]:
# Persist the combined table (row index included) to a CSV in the working directory.
big_df.to_csv(path_or_buf="ScrapingResult_imgUrl.csv")