In [1]:
from bs4 import BeautifulSoup
import requests

In [113]:
# Substrings that clean_escapes() deletes and that convert_html_list() treats
# as items to skip. NOTE: clean_escapes() applies them in list order, so '/'
# is stripped before '/n' is tried and the '/n' entry can never match there.
escape_chars = ['/','\n','/n','\r']

def get_soup(url, timeout=30):
    """Download *url* and return its HTML parsed into a BeautifulSoup tree.

    A desktop-browser User-Agent header is sent with the request.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before ``requests`` raises a
            timeout error. Pass ``None`` to wait indefinitely, which is what
            this function did before the parameter existed.

    Returns:
        A ``BeautifulSoup`` object built from the response body with the
        built-in ``html.parser`` backend.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36'}
    # Without a timeout a stalled server would block this call forever.
    page = requests.get(url, headers=headers, timeout=timeout)
    # The former bare soup.prettify() call was dropped: it only returns a
    # formatted string and its result was being thrown away.
    return BeautifulSoup(page.content, 'html.parser')

def clean_string(string: str) -> str:
    """Return *string* after passing it through ``clean_escapes``.

    Punctuation clean-up is not applied here.
    """
    return clean_escapes(string)

def clean_escapes(string: str):
    """Return *string* with every substring listed in ``escape_chars`` removed.

    The entries are removed one after another, in list order.
    """
    cleaned = string
    for token in escape_chars:
        cleaned = cleaned.replace(token, '')
    return cleaned

def clean_punctuation(string: str, keep_emo_punc=False) -> str:
    """Return *string* with the HTML ampersand entity decoded.

    Args:
        string: Text that may contain the ``&amp;`` entity.
        keep_emo_punc: Currently unused; accepted so existing call sites keep
            working.

    Returns:
        The text with each ``&amp;`` replaced by ``&``.
    """
    # The result must be returned explicitly: the previous version only
    # rebound the local name and therefore always yielded None.
    return string.replace('&amp;', '&')

def convert_html_list(li: list):
    """Turn a list of strings and/or tag-like objects into clean text items.

    Items equal to an entry of ``escape_chars`` are skipped. A string item is
    stripped directly; any other item contributes its stripped ``.text``.
    Items whose stripped text is empty are dropped. If handling an item
    raises, the error is printed and that item is skipped.
    """
    cleaned = []
    for item in li:
        # Bare separator entries between list elements carry no content.
        if item in escape_chars:
            continue
        try:
            text = item.strip() if isinstance(item, str) else item.text.strip()
            if text:
                cleaned.append(text)
        except Exception as e:
            print(f'Error caught: {e}')
    return cleaned

def goodtx_get_all_data(soup: BeautifulSoup) -> dict:
    """Collect every extracted profile field from *soup* into one dict.

    Each value comes from the matching ``goodtx_get_*`` helper. The list-style
    fields (issues, orientations, services, ages, professions) are
    additionally passed through ``convert_html_list``.
    """
    # Dict literals evaluate entries top to bottom, so the helpers run in the
    # same order as the keys are listed.
    return {
        'name': goodtx_get_name(soup),
        'writing_sample': goodtx_get_writing_sample(soup),
        'issues': convert_html_list(goodtx_get_tx_issues(soup)),
        'orientations': convert_html_list(goodtx_get_orientations(soup)),
        'services': convert_html_list(goodtx_get_services(soup)),
        'ages': convert_html_list(goodtx_get_client_ages(soup)),
        'professions': convert_html_list(goodtx_get_professions(soup)),
        'credential': goodtx_get_primary_credential(soup),
        'license_status': goodtx_get_license_status(soup),
        'website': goodtx_get_website(soup),
        'address': goodtx_get_address(soup),
        'phone': goodtx_get_phone(soup),
    }

def goodtx_get_name(soup: BeautifulSoup) -> str:
    """Return the text of the second child of ``h1#profileTitle_id``.

    The text is passed through ``clean_escapes`` before being returned.
    """
    heading = soup.find('h1', id='profileTitle_id')
    name_node = heading.contents[1]
    return clean_escapes(name_node.get_text())

def goodtx_get_writing_sample(soup: BeautifulSoup) -> str:
    """Return the concatenated paragraph text of the profile's lower-left panel.

    Only ``<p>`` elements that are direct children of a ``div.text`` inside
    the first ``div.profileBottomLeft`` are used; their texts are joined with
    no separator.
    """
    panel = soup.find_all('div', class_='profileBottomLeft')[0]
    text_blocks = panel.find_all('div', class_='text')
    paragraphs = [
        node.get_text()
        for block in text_blocks
        for node in block.children
        if node.name == 'p'
    ]
    return ''.join(paragraphs)

def goodtx_get_tx_issues(soup: BeautifulSoup) -> list:
    """Return the direct children of the first ``ul#issuesData`` as a list.

    The children are returned unprocessed.
    """
    issues_ul = soup.find_all('ul', id='issuesData')[0]
    return list(issues_ul.children)

def goodtx_get_orientations(soup: BeautifulSoup) -> list:
    """Return the direct children of the first ``ul#modelsData`` as a list.

    The children are returned unprocessed.
    """
    models_ul = soup.find_all('ul', id='modelsData')[0]
    return list(models_ul.children)

def goodtx_get_services(soup: BeautifulSoup) -> list:
    """Return the direct children of the first ``ul#servicesprovidedData``.

    The children are returned unprocessed, as a list.
    """
    services_ul = soup.find_all('ul', id='servicesprovidedData')[0]
    return list(services_ul.children)

def goodtx_get_client_ages(soup: BeautifulSoup) -> list:
    """Return the direct children of the first ``ul#agesData`` as a list.

    The children are returned unprocessed.
    """
    ages_ul = soup.find_all('ul', id='agesData')[0]
    return list(ages_ul.children)

def goodtx_get_professions(soup: BeautifulSoup) -> list:
    """Return the profession names listed in ``span#professionsDefined``.

    The span's text is split on commas and each piece is stripped of
    surrounding whitespace.
    """
    span = soup.find('span', id='professionsDefined')
    raw_text = span.get_text()
    return [piece.strip() for piece in raw_text.split(',')]

def goodtx_get_primary_credential(soup: BeautifulSoup) -> str:
    """Return the text of ``span#licenceinfo1`` with escape substrings removed.

    Args:
        soup: Parsed profile page.

    Returns:
        The span's text after ``clean_escapes`` has been applied.

    Raises:
        AttributeError: If the page has no ``span#licenceinfo1`` (``find``
            returns ``None``).
    """
    # A single binding is enough; the former chained assignment also created
    # an alias (credential_type_str) that was never read.
    credential = soup.find('span', id='licenceinfo1').get_text()
    return clean_escapes(credential)

def goodtx_get_license_status(soup: BeautifulSoup) -> str:
    """Return the text of ``span#license_status_id`` after ``clean_escapes``."""
    status_span = soup.find('span', id='license_status_id')
    return clean_escapes(status_span.get_text())

def goodtx_get_website(soup: BeautifulSoup) -> str:
    """Return the ``href`` of ``a#edit_website``, or ``'None'`` when absent.

    Args:
        soup: Parsed profile page.

    Returns:
        The link target, or the literal string ``'None'`` (kept as a string
        for compatibility with existing callers) when the page has no such
        anchor or the anchor has no ``href`` attribute.
    """
    link = soup.find('a', id='edit_website')
    try:
        return link['href']
    except (TypeError, KeyError):
        # TypeError: find() returned None (no such anchor).
        # KeyError: the anchor exists but carries no href attribute.
        # A bare ``except:`` would also have swallowed KeyboardInterrupt and
        # SystemExit, so only these two cases are handled.
        return 'None'

def goodtx_get_address(soup: BeautifulSoup) -> dict:
    """Return the first street/city/state/zip found on the page as a dict.

    Args:
        soup: Parsed profile page.

    Returns:
        A dict with keys ``'street'``, ``'city'``, ``'state'`` and ``'zip'``,
        each holding the text of the first ``<span>`` whose ``itemprop`` is
        ``streetAddress``, ``addressLocality``, ``addressRegion`` and
        ``postalCode`` respectively.

    Raises:
        AttributeError: If any of those spans is missing (``find`` returns
            ``None``).
    """
    # NOTE(review): the earlier version also looked up div#editOffice1 but
    # never used the result, so that dead search was removed. The lookups
    # below remain document-wide; confirm whether they were meant to be
    # scoped to that office block.
    itemprops = (
        ('street', 'streetAddress'),
        ('city', 'addressLocality'),
        ('state', 'addressRegion'),
        ('zip', 'postalCode'),
    )
    return {
        key: soup.find('span', itemprop=prop).get_text()
        for key, prop in itemprops
    }

def goodtx_get_phone(soup: BeautifulSoup) -> str:
    """Return the text of the first ``span.profilePhone`` after ``clean_string``."""
    phone_span = soup.find('span', {'class': 'profilePhone'})
    return clean_string(phone_span.text)

In [59]:
# Scratch check: pull the professions list from the currently loaded profile.
li_test = goodtx_get_professions(soup)
li_test

['Psychotherapist']

In [55]:
# Scratch check of convert_html_list on li_test.
# NOTE(review): the recorded output below lists services rather than
# professions, so it presumably comes from an earlier run - confirm.
result = convert_html_list(li_test)
result

Error caught: 'NavigableString' object has no attribute 'text'
Error caught: 'NavigableString' object has no attribute 'text'


['Individual Therapy & Counseling', 'Telehealth']

In [75]:
# Load one GoodTherapy profile page into `soup`, which the other scratch
# cells read. Alternative URLs are kept commented out for quick switching.
#if __name__ == '__main__':
#url = 'https://www.goodtherapy.org/therapists/profile/clare-comstock-20170911'
url = 'https://www.goodtherapy.org/therapists/profile/morgan-dingle-20191203'
#therapists = ['https://www.goodtherapy.org/therapists/profile/andrea-risi-20130730']
#url = 'http://www.inbetweentherapy.com'
soup = get_soup(url = url)


#soup_no_website = get_soup('https://www.goodtherapy.org/therapists/profile/julie-reichenberger-20200327')

In [7]:
# Print the text of every <h2> heading to survey the sections of the page.
for h in list(soup.find_all('h2')):
    print(h.get_text())

Clare Comstock
LCSW
LCSW
LCSW
Offices
My Approach to Helping
Services I Provide
Ages I Work With
Languages
Client Concerns I Treat
Types of Therapy
About GoodTherapy
Resources
Subscribe to Newsletter
Follow GoodTherapy


In [115]:
# Run the full extraction on the loaded profile and display the resulting dict.
therapist_info = goodtx_get_all_data(soup)
therapist_info

{'name': 'Morgan Dingle',
 'writing_sample': 'The days are blurring together. There’s a pressure in your chest that won’t go away. It’s hard to breathe because you are so overwhelmed. You see everything that’s not working but feel out of control. Distracting yourself with friends, alcohol, exercise, etc. used to work. Now, you’re stuck at home, unable to distract yourself for long. You hate who you are. Up to this point, you were surviving, but you want more. You think, “if only I wasn’t so sensitive.” Your whole life you’ve felt small, judged, shamed. There has to be more. You don’t want to be that person anymore, and you need help. You need someone to see the real you.\r If you could add one person to your life that would’ve changed everything who would that be? A parent, sibling, mentor? Together, we’ll take on whatever is keeping you trapped in your depression and anxiety. My process dives into how your mind and body keep you spinning in downward cycles. You will experience relief 

In [80]:
# Inspect the raw comma-split professions; as the output below shows, the
# pieces keep their leading spaces until they are stripped.
profs_str = soup.find('span', id='professionsDefined').text
profs_list = profs_str.split(',')
profs_list

['Counselor', ' Mental Health Counselor', ' Psychotherapist']

In [82]:
# Probe the nested telephone markup; per the output below this path yields
# only a newline.
soup.find('span', itemprop='telephone').contents[1].contents[0]

'\n'

In [100]:
# The span's .text holds the number padded with newlines (see output below).
soup.find('span', {'class':'profilePhone'}).text

'\n\n720-605-3679\n\n'

In [114]:
# goodtx_get_phone runs the text through clean_string, which removes the
# surrounding newlines seen in the raw span text.
goodtx_get_phone(soup)

'720-605-3679'