In [1]:
from bs4 import BeautifulSoup
import requests

In [2]:
def dec_check_none(func):
    """Decorate a scraper getter so a missing page element yields ``None``.

    The wrapped callable is invoked with whatever positional and keyword
    arguments the caller supplies.  Lookup failures that occur when an
    expected element is absent from the parsed page are converted into a
    ``None`` result:

    * ``AttributeError`` - e.g. calling ``.get_text()`` on a ``None`` result
    * ``IndexError``     - e.g. indexing an empty result list with ``[0]``
    * ``KeyError``       - e.g. reading an attribute a tag does not carry
    * ``TypeError``      - e.g. subscripting ``None``

    Any other exception propagates so genuine programming errors are not
    silently turned into ``None``.
    """
    import functools

    # Failures that simply mean "the element is not on this page".
    missing_element_errors = (AttributeError, IndexError, KeyError, TypeError)

    @functools.wraps(func)  # keep the getter's name/docstring for debugging
    def func_wrapper(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except missing_element_errors:
            return None

    return func_wrapper

class GoodTherapySoupScraper(object):
    """Scrape one GoodTherapy therapist profile page with BeautifulSoup.

    Typical use: build the scraper with a profile URL, call ``get_soup()`` to
    download and parse the page, then hand that soup to ``get_all_data()`` or
    to any individual ``get_*`` method.

    The ``get_*`` / ``sub_get_*`` methods wrapped in ``dec_check_none`` return
    ``None`` instead of raising when the element they look for is missing.
    """

    # Seconds to wait for the profile page before giving up on the request.
    REQUEST_TIMEOUT = 30

    def __init__(self, starting_url):
        """Store the profile URL and the substrings the cleaners strip out."""
        self.starting_url = starting_url
        # NOTE: '/' is removed before '/n' is tried, so the '/n' entry can
        # never match in clean_escapes(); it is kept for compatibility.
        self.escape_chars = ['/','\n','/n','\r', '\t']

    def get_soup(self):
        """Download ``self.starting_url`` and return it parsed as BeautifulSoup."""
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36'}
        # Without a timeout a stalled connection would block indefinitely.
        page = requests.get(self.starting_url, headers=headers,
                            timeout=self.REQUEST_TIMEOUT)
        # prettify() only returns a formatted string; the previous call
        # discarded that result, so it has been dropped.
        return BeautifulSoup(page.content, 'html.parser')

    def clean_string(self, string: str) -> str:
        """Remove every escape substring from ``string`` and trim whitespace."""
        return self.clean_escapes(string).strip()

    def clean_escapes(self, string: str):
        """Return ``string`` with each entry of ``self.escape_chars`` removed.

        Entries are removed one after another in list order.
        """
        for esc in self.escape_chars:
            string = string.replace(esc, '')
        return string

    def clean_punctuation(self, string: str, keep_emo_punc=False) -> str:
        """Return ``string`` with the HTML entity ``&amp;`` replaced by ``&``.

        ``keep_emo_punc`` is accepted for interface compatibility but is not
        used yet.
        """
        # The result used to be computed and then dropped (implicit None).
        return string.replace('&amp;', '&')

    def convert_html_list(self, li: list):
        """Turn a list of strings/tags into a list of non-empty stripped texts.

        Elements equal to one of ``self.escape_chars`` are skipped.  Plain
        strings are stripped directly; anything else contributes its ``.text``.
        An element that cannot be converted is reported with ``print`` and
        skipped so the remaining elements are still returned.
        """
        clean_li = []
        for elem in li:
            if elem in self.escape_chars:
                continue
            try:
                string = elem.strip() if isinstance(elem, str) else elem.text.strip()
            except Exception as e:
                print(f'Error caught: {e}')
                continue
            # Drop entries that are empty after stripping.
            if string:
                clean_li.append(string)
        return clean_li

    def get_all_data(self, soup: BeautifulSoup) -> dict:
        """Run every getter on ``soup`` and collect the results in one dict."""
        all_data = {}
        all_data['name'] = self.get_name(soup)
        all_data['writing_sample'] = self.get_writing_sample(soup)
        all_data['issues'] = self.get_tx_issues(soup)
        all_data['orientations'] = self.get_orientations(soup)
        all_data['services'] = self.get_services(soup)
        all_data['ages'] = self.get_client_ages(soup)
        all_data['professions'] = self.get_professions(soup)
        all_data['credential'] = self.get_primary_credential(soup)
        all_data['license_status'] = self.get_license_status(soup)
        all_data['website'] = self.get_website(soup)
        all_data['address'] = self.get_address(soup)
        all_data['phone'] = self.get_phone(soup)
        all_data['verified'] = self.get_verification(soup)

        return all_data

    def _list_items(self, soup, list_id):
        """Return the cleaned child texts of the first ``<ul>`` with ``list_id``."""
        matches = soup.find_all('ul', id=list_id)
        # Indexing raises IndexError when the page has no such list; the
        # decorated public getters are responsible for handling that.
        return self.convert_html_list(list(matches[0].children))

    def _itemprop_text(self, soup, itemprop):
        """Return the text of the first ``<span>`` carrying ``itemprop``."""
        return soup.find('span', itemprop=itemprop).get_text()

    @dec_check_none
    def get_name(self, soup: BeautifulSoup) -> str:
        """Return the text of the second child of ``<h1 id="profileTitle_id">``."""
        name = soup.find('h1', id='profileTitle_id').contents[1].get_text()
        return self.clean_escapes(name)

    @dec_check_none
    def get_writing_sample(self, soup: BeautifulSoup) -> str:
        """Return the concatenated ``<p>`` texts of the profile description.

        Only ``<p>`` elements that are direct children of a ``div.text``
        inside the first ``div.profileBottomLeft`` are included.
        """
        desc = soup.find_all('div', class_='profileBottomLeft')
        all_text = desc[0].find_all('div', class_='text')
        paragraphs = [
            child.get_text()
            for txt in all_text
            for child in txt.children
            if child.name == 'p'
        ]
        return ''.join(paragraphs)

    @dec_check_none
    def get_tx_issues(self, soup: BeautifulSoup)-> list:
        """Return the entries of the ``<ul id="issuesData">`` list."""
        return self._list_items(soup, 'issuesData')

    @dec_check_none
    def get_orientations(self, soup: BeautifulSoup)-> list:
        """Return the entries of the ``<ul id="modelsData">`` list."""
        return self._list_items(soup, 'modelsData')

    @dec_check_none
    def get_services(self, soup: BeautifulSoup)-> list:
        """Return the entries of the ``<ul id="servicesprovidedData">`` list."""
        return self._list_items(soup, 'servicesprovidedData')

    @dec_check_none
    def get_client_ages(self, soup: BeautifulSoup) -> list:
        """Return the entries of the ``<ul id="agesData">`` list."""
        return self._list_items(soup, 'agesData')

    @dec_check_none
    def get_professions(self, soup: BeautifulSoup) -> list:
        """Return the comma-separated ``span#professionsDefined`` text as a list."""
        profs_str = soup.find('span', id='professionsDefined').get_text()
        return [prof.strip() for prof in profs_str.split(',')]

    @dec_check_none
    def get_primary_credential(self, soup: BeautifulSoup) -> str:
        """Return the escape-cleaned text of ``span#licenceinfo1``."""
        credential = soup.find('span', id='licenceinfo1').get_text()
        return self.clean_escapes(credential)

    @dec_check_none
    def get_license_status(self, soup: BeautifulSoup) -> str:
        """Return the escape-cleaned text of ``span#license_status_id``."""
        license_status = soup.find('span', id='license_status_id').get_text()
        return self.clean_escapes(license_status)

    @dec_check_none
    def get_website(self, soup: BeautifulSoup) -> str:
        """Return the ``href`` of ``a#edit_website``, or the string ``'None'``.

        The placeholder is the literal string ``'None'`` (not the ``None``
        object); that is kept so existing consumers see the same value.
        """
        try:
            website = soup.find('a', id='edit_website')['href']
        except (AttributeError, TypeError, KeyError):
            # Link missing or without an href. A bare ``except`` here would
            # also have trapped KeyboardInterrupt/SystemExit.
            website = 'None'
        return website

    def get_address(self, soup: BeautifulSoup) -> dict:
        """Return a dict with ``street``, ``city``, ``state`` and ``zip`` entries."""
        address = {}

        address['street'] = self.sub_get_street(soup)
        address['city'] = self.sub_get_city(soup)
        address['state'] = self.sub_get_state(soup)
        address['zip'] = self.sub_get_zip(soup)

        return address

    @dec_check_none
    def sub_get_street(self, soup: BeautifulSoup) -> str:
        """Return the text of the ``itemprop="streetAddress"`` span."""
        return self._itemprop_text(soup, 'streetAddress')

    @dec_check_none
    def sub_get_city(self, soup: BeautifulSoup) -> str:
        """Return the text of the ``itemprop="addressLocality"`` span."""
        return self._itemprop_text(soup, 'addressLocality')

    @dec_check_none
    def sub_get_state(self, soup: BeautifulSoup) -> str:
        """Return the text of the ``itemprop="addressRegion"`` span."""
        return self._itemprop_text(soup, 'addressRegion')

    @dec_check_none
    def sub_get_zip(self, soup: BeautifulSoup) -> str:
        """Return the text of the ``itemprop="postalCode"`` span."""
        return self._itemprop_text(soup, 'postalCode')

    @dec_check_none
    def get_phone(self, soup: BeautifulSoup) -> str:
        """Return the cleaned text of the first ``span.profilePhone``."""
        phone = soup.find('span', {'class': 'profilePhone'}).text
        return self.clean_string(phone)

    @dec_check_none
    def get_verification(self, soup: BeautifulSoup) -> bool:
        """Return True when the cleaned ``div.profileVer`` text is ``'Verified'``."""
        verified = soup.find('div', {'class': 'profileVer'}).text
        return self.clean_string(verified) == 'Verified'

In [3]:
# Sample therapist profile used by the cells that follow.
start_urls = 'https://www.goodtherapy.org/therapists/profile/jessica-fern-cooley-20170717'
# Constructing the scraper only stores the URL; the page is fetched by get_soup().
good_scraper = GoodTherapySoupScraper(starting_url=start_urls)

In [4]:
# Download the profile page (HTTP request) and parse it once; later cells reuse `soup`.
soup = good_scraper.get_soup()

In [5]:
# Spot-check: professions parsed from the comma-separated span#professionsDefined text.
profs = good_scraper.get_professions(soup)
profs

['Psychotherapist']

In [6]:
# Spot-check: entries of the ul#issuesData list.
issues = good_scraper.get_tx_issues(soup)
issues

['Adjusting to Change / Life Transitions',
 'Anger',
 'Attachment Issues',
 'Blended Family Issues',
 'Breakup',
 'Communication Problems',
 'Creative Blocks',
 'Emotional Intelligence',
 'Family of Origin Issues',
 'Forgiveness',
 'Grief, Loss, and Bereavement',
 'Identity Issues',
 'Individuation',
 'LGBT (Lesbian, Gay, Bisexual, and Transgender) Issues',
 'Life Purpose / Meaning / Inner-Guidance',
 'Perfectionism',
 'Polyamory / Nonmonogamous Relationships',
 'Relationships and Marriage',
 'Self-Actualization',
 'Self-Compassion',
 'Self-Doubt',
 'Self-Love',
 'Shame',
 'Spirituality',
 'Trust Issues',
 'Values Clarification',
 "Women's Issues"]

In [7]:
# Spot-check: entries of the ul#servicesprovidedData list.
services = good_scraper.get_services(soup)
services

['Coaching',
 'Consultation',
 'Individual Therapy & Counseling',
 'Marriage, Couples, or Relationship Counseling',
 'Mediation',
 'Telehealth']

In [8]:
# Spot-check: entries of the ul#agesData list.
groups = good_scraper.get_client_ages(soup)
groups

['Adults']

In [12]:
# Run every getter on the already-parsed page and keep the combined result.
data_dict = good_scraper.get_all_data(soup)

In [18]:
# Display the raw dict returned by get_all_data().
data_dict

{'name': 'Jessica Fern Cooley',
 'writing_sample': "I work with people are looking to feel connected to themselves and others, no longer limited by the patterning, conditioning, stories, triggers, attachment styles and past pains that have been interfering with who and how you want to be. The foundation of my therapy and coaching is to create a safe, non-judgemental space for you to explore, heal and grow so that you can live your preferred expressions of self.  Our work together is an opportunity for you to upgrade your emotional and relational operating systems so that you can live more fully in the embodiment of who and how you want to be in life and in love.  I integrate different therapeutic approaches to assist you in the growth and changes you seek and my practice is Queer and Poly Friendly.\r For people in relationships, I know how easy it is to get triggered, stuck in old patterns of relating, and caught in cycles of reactivity with the ones we love the most. However, the chal

In [17]:
# Print one "KEY : value" line per scraped field, with the key upper-cased.
for k, v in data_dict.items():
    print(f'{k.upper()} : {v}')

NAME : Jessica Fern Cooley
WRITING_SAMPLE : I work with people are looking to feel connected to themselves and others, no longer limited by the patterning, conditioning, stories, triggers, attachment styles and past pains that have been interfering with who and how you want to be. The foundation of my therapy and coaching is to create a safe, non-judgemental space for you to explore, heal and grow so that you can live your preferred expressions of self.  Our work together is an opportunity for you to upgrade your emotional and relational operating systems so that you can live more fully in the embodiment of who and how you want to be in life and in love.  I integrate different therapeutic approaches to assist you in the growth and changes you seek and my practice is Queer and Poly Friendly. For people in relationships, I know how easy it is to get triggered, stuck in old patterns of relating, and caught in cycles of reactivity with the ones we love the most. However, the challenges an

In [None]:
# NOTE(review): `name` is not assigned in any visible cell - presumably a
# leftover from exploring the profile title's child nodes; confirm (or define
# it) before running this cell.
for t in name:
    print(type(t))
    print(t.get_text())

In [None]:
# Re-run the full extraction and display the resulting dict.
good_scraper.get_all_data(soup)

In [None]:
# Substrings removed by clean_escapes() and skipped by convert_html_list().
# NOTE(review): '/' is removed first, so the '/n' entry can never match
# afterwards; unlike the class version above, '\t' is not listed here.
escape_chars = ['/','\n','/n','\r']

def get_soup(url):
    """Download ``url`` and return the page parsed as a BeautifulSoup tree.

    A browser-like User-Agent header is sent with the request.  The request
    is given a 30 second timeout so a stalled connection cannot block forever.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36'}
    page = requests.get(url, headers=headers, timeout=30)
    # prettify() only returns a formatted string; the previous call discarded
    # that result, so it has been dropped.
    return BeautifulSoup(page.content, 'html.parser')

def clean_string(string: str) -> str:
    """Strip the escape substrings from ``string`` and trim outer whitespace."""
    without_escapes = clean_escapes(string)
    trimmed = without_escapes.strip()
    return trimmed

def clean_escapes(string: str):
    """Return ``string`` with every entry of ``escape_chars`` removed.

    Entries are removed one after another, in the order they appear in the
    module-level ``escape_chars`` list.
    """
    cleaned = string
    for unwanted in escape_chars:
        cleaned = cleaned.replace(unwanted, '')
    return cleaned

def clean_punctuation(string: str, keep_emo_punc=False) -> str:
    """Return ``string`` with the HTML entity ``&amp;`` replaced by ``&``.

    ``keep_emo_punc`` is accepted for interface compatibility but is not used
    yet.
    """
    # The replacement used to be computed and then dropped, so callers always
    # received None despite the ``-> str`` annotation.
    return string.replace('&amp;', '&')

def convert_html_list(li: list):
    """Turn a list of strings/tags into a list of non-empty stripped texts.

    Elements equal to one of ``escape_chars`` are skipped.  Plain strings are
    stripped directly; any other element contributes its ``.text``.  An
    element that fails to convert is reported with ``print`` and skipped.
    """
    texts = []
    for item in li:
        if item in escape_chars:
            continue
        try:
            if isinstance(item, str):
                text = item.strip()
            else:
                text = item.text.strip()
            # Keep only entries that still have content after stripping.
            if text:
                texts.append(text)
        except Exception as e:
            print(f'Error caught: {e}')
    return texts
    
def dec_check_none(func):
    """Decorate a scraper getter so a missing page element yields ``None``.

    The wrapped callable is invoked with whatever positional and keyword
    arguments the caller supplies (the single-argument call style used by the
    module-level getters keeps working).  Lookup failures that occur when an
    expected element is absent from the parsed page are converted into a
    ``None`` result:

    * ``AttributeError`` - e.g. calling ``.get_text()`` on a ``None`` result
    * ``IndexError``     - e.g. indexing an empty result list with ``[0]``
    * ``KeyError``       - e.g. reading an attribute a tag does not carry
    * ``TypeError``      - e.g. subscripting ``None``

    Any other exception propagates so genuine programming errors are not
    silently turned into ``None``.
    """
    import functools

    # Failures that simply mean "the element is not on this page".
    missing_element_errors = (AttributeError, IndexError, KeyError, TypeError)

    @functools.wraps(func)  # keep the getter's name/docstring for debugging
    def func_wrapper(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except missing_element_errors:
            return None

    return func_wrapper

def goodtx_get_all_data(soup: BeautifulSoup) -> dict:
    """Run every ``goodtx_get_*`` getter on ``soup`` and collect the results.

    The list-valued fields (issues, orientations, services, ages,
    professions) are passed through ``convert_html_list``.  A field whose
    getter returned ``None`` stays ``None`` in the result instead of being
    handed to ``convert_html_list``.
    """
    def as_clean_list(items):
        # The decorated getters return None when their section is missing;
        # iterating over None inside convert_html_list would raise TypeError
        # and abort the whole extraction.
        if items is None:
            return None
        return convert_html_list(items)

    all_data = {}
    all_data['name'] = goodtx_get_name(soup)
    all_data['writing_sample'] = goodtx_get_writing_sample(soup)
    all_data['issues'] = as_clean_list(goodtx_get_tx_issues(soup))
    all_data['orientations'] = as_clean_list(goodtx_get_orientations(soup))
    all_data['services'] = as_clean_list(goodtx_get_services(soup))
    all_data['ages'] = as_clean_list(goodtx_get_client_ages(soup))
    all_data['professions'] = as_clean_list(goodtx_get_professions(soup))
    all_data['credential'] = goodtx_get_primary_credential(soup)
    all_data['license_status'] = goodtx_get_license_status(soup)
    all_data['website'] = goodtx_get_website(soup)
    all_data['address'] = goodtx_get_address(soup)
    all_data['phone'] = goodtx_get_phone(soup)
    all_data['verified'] = goodtx_get_verification(soup)

    return all_data

@dec_check_none
def goodtx_get_name(soup: BeautifulSoup) -> str:
    """Return the text of the second child of ``<h1 id="profileTitle_id">``.

    The text is passed through ``clean_escapes`` before being returned.
    """
    title_tag = soup.find('h1', id='profileTitle_id')
    name_node = title_tag.contents[1]
    return clean_escapes(name_node.get_text())

@dec_check_none
def goodtx_get_writing_sample(soup: BeautifulSoup) -> str:
    """Return the concatenated ``<p>`` texts of the profile description.

    Only ``<p>`` elements that are direct children of a ``div.text`` inside
    the first ``div.profileBottomLeft`` are included; they are joined with no
    separator.
    """
    left_column = soup.find_all('div', class_='profileBottomLeft')[0]
    paragraphs = [
        node.get_text()
        for text_div in left_column.find_all('div', class_='text')
        for node in text_div.children
        if node.name == 'p'
    ]
    return ''.join(paragraphs)

@dec_check_none
def goodtx_get_tx_issues(soup: BeautifulSoup)-> list:
    """Return the raw child nodes of the first ``<ul id="issuesData">``.

    The nodes are returned unconverted; the caller is expected to turn them
    into text (e.g. with ``convert_html_list``).
    """
    first_match = soup.find_all('ul', id='issuesData')[0]
    return [*first_match.children]

@dec_check_none
def goodtx_get_orientations(soup: BeautifulSoup)-> list:
    """Return the raw child nodes of the first ``<ul id="modelsData">``.

    The nodes are returned unconverted; the caller is expected to turn them
    into text (e.g. with ``convert_html_list``).
    """
    first_match = soup.find_all('ul', id='modelsData')[0]
    return [*first_match.children]

@dec_check_none
def goodtx_get_services(soup: BeautifulSoup)-> list:
    """Return the raw child nodes of the first ``<ul id="servicesprovidedData">``."""
    first_match = soup.find_all('ul', id='servicesprovidedData')[0]
    return [*first_match.children]

@dec_check_none
def goodtx_get_client_ages(soup: BeautifulSoup) -> list:
    """Return the raw child nodes of the first ``<ul id="agesData">``."""
    first_match = soup.find_all('ul', id='agesData')[0]
    return [*first_match.children]

@dec_check_none
def goodtx_get_professions(soup: BeautifulSoup) -> list:
    """Split the ``span#professionsDefined`` text on commas and strip each part."""
    span_text = soup.find('span', id='professionsDefined').get_text()
    stripped_parts = []
    for part in span_text.split(','):
        stripped_parts.append(part.strip())
    return stripped_parts

@dec_check_none
def goodtx_get_primary_credential(soup: BeautifulSoup) -> str:
    """Return the escape-cleaned text of ``span#licenceinfo1``."""
    # The former chained assignment also bound an unused
    # ``credential_type_str`` alias; only one name is needed.
    credential = soup.find('span', id='licenceinfo1').get_text()

    return clean_escapes(credential)

@dec_check_none
def goodtx_get_license_status(soup: BeautifulSoup) -> str:
    """Return the escape-cleaned text of ``span#license_status_id``."""
    status_span = soup.find('span', id='license_status_id')
    return clean_escapes(status_span.get_text())

@dec_check_none
def goodtx_get_website(soup: BeautifulSoup) -> str:
    """Return the ``href`` of ``a#edit_website``, or the string ``'None'``.

    The placeholder is the literal string ``'None'`` (not the ``None``
    object); that is kept so existing consumers see the same value.
    """
    try:
        website = soup.find('a', id='edit_website')['href']
    except (AttributeError, TypeError, KeyError):
        # Link missing or without an href.  A bare ``except`` here would also
        # have trapped KeyboardInterrupt/SystemExit.
        website = 'None'
    return website

def goodtx_get_address(soup: BeautifulSoup) -> dict:
    """Return the office address as a dict.

    The dict has the keys ``street``, ``city``, ``state`` and ``zip``, each
    filled by the matching ``sub_get_*`` helper.
    """
    return {
        'street': sub_get_street(soup),
        'city': sub_get_city(soup),
        'state': sub_get_state(soup),
        'zip': sub_get_zip(soup),
    }

@dec_check_none
def sub_get_street(soup: BeautifulSoup) -> str:
    """Return the text of the first ``<span itemprop="streetAddress">``."""
    street_span = soup.find('span', itemprop='streetAddress')
    return street_span.get_text()

@dec_check_none
def sub_get_city(soup: BeautifulSoup) -> str:
    """Return the text of the first ``<span itemprop="addressLocality">``."""
    city_span = soup.find('span', itemprop='addressLocality')
    return city_span.get_text()

@dec_check_none
def sub_get_state(soup: BeautifulSoup) -> str:
    """Return the text of the first ``<span itemprop="addressRegion">``."""
    state_span = soup.find('span', itemprop='addressRegion')
    return state_span.get_text()

@dec_check_none
def sub_get_zip(soup: BeautifulSoup) -> str:
    """Return the text of the first ``<span itemprop="postalCode">``."""
    zip_span = soup.find('span', itemprop='postalCode')
    return zip_span.get_text()

@dec_check_none
def goodtx_get_phone(soup: BeautifulSoup) -> str:
    """Return the cleaned text of the first ``span.profilePhone``."""
    phone_span = soup.find('span', {'class':'profilePhone'})
    raw_phone = phone_span.text
    return clean_string(raw_phone)

@dec_check_none
def goodtx_get_verification(soup: BeautifulSoup) -> bool:
    """Return True when the cleaned ``div.profileVer`` text is ``'Verified'``."""
    badge = soup.find('div', {'class':'profileVer'})
    badge_text = clean_string(badge.text)
    return badge_text == 'Verified'

@dec_check_none
def goodtx_get_verification(soup: BeautifulSoup) -> bool:
    """Return True when the cleaned ``div.profileVer`` text is ``'Verified'``.

    NOTE(review): this rebinds ``goodtx_get_verification`` to a second copy of
    the definition directly above it; one of the two is presumably a paste
    leftover.
    """
    verification_div = soup.find('div', {'class':'profileVer'})
    return 'Verified' == clean_string(verification_div.text)

def dec_check_none(func):
    """Decorate a scraper getter so a missing page element yields ``None``.

    The wrapped callable is invoked with whatever positional and keyword
    arguments the caller supplies (the single-argument call style used by the
    module-level getters keeps working).  Lookup failures that occur when an
    expected element is absent from the parsed page are converted into a
    ``None`` result:

    * ``AttributeError`` - e.g. calling ``.get_text()`` on a ``None`` result
    * ``IndexError``     - e.g. indexing an empty result list with ``[0]``
    * ``KeyError``       - e.g. reading an attribute a tag does not carry
    * ``TypeError``      - e.g. subscripting ``None``

    Any other exception propagates so genuine programming errors are not
    silently turned into ``None``.

    NOTE(review): this cell defines ``dec_check_none`` a second time, after
    the getters above were already decorated; functions decorated earlier
    keep the wrapper they were given at that point.
    """
    import functools

    # Failures that simply mean "the element is not on this page".
    missing_element_errors = (AttributeError, IndexError, KeyError, TypeError)

    @functools.wraps(func)  # keep the getter's name/docstring for debugging
    def func_wrapper(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except missing_element_errors:
            return None

    return func_wrapper


In [None]:
# Spot-check the verification getter; the commented line below is the
# equivalent inline lookup without the equality test.
#clean_string(soup.find('div', {'class':'profileVer'}).text)
goodtx_get_verification(soup)

In [None]:
# Spot-check: address dict with street / city / state / zip entries.
goodtx_get_address(soup)

In [None]:
# Professions as a list of stripped strings; reused as input for the
# convert_html_list() check in the next cell.
li_test = goodtx_get_professions(soup)
li_test

In [None]:
# convert_html_list() accepts plain strings as well as tags; entries that are
# empty after stripping are dropped.
result = convert_html_list(li_test)
result

In [None]:
# Pick the profile page to parse with the module-level get_soup().
# The commented-out lines are alternative pages kept for manual testing.
#if __name__ == '__main__':
#url = 'https://www.goodtherapy.org/therapists/profile/clare-comstock-20170911'
#url = 'https://www.goodtherapy.org/therapists/profile/morgan-dingle-20191203'
url = 'https://www.goodtherapy.org/therapists/profile/jessica-fern-cooley-20170717'
#therapists = ['https://www.goodtherapy.org/therapists/profile/andrea-risi-20130730']
#url = 'http://www.inbetweentherapy.com'
soup = get_soup(url = url)


# Presumably a profile without a website link, for exercising the 'None'
# fallback of goodtx_get_website() - confirm before relying on it.
#soup_no_website = get_soup('https://www.goodtherapy.org/therapists/profile/julie-reichenberger-20200327')

In [None]:
# Search-results URL; its query string carries Denver, Colorado location
# values and a 25-mile radius.
listing_page_url = 'https://www.goodtherapy.org/search2.html?search%5Btherapist_search%5D=Find+a+Therapist&search%5Bstate%5D=&search%5Bzipcode%5D=denver%2C+colorado&search%5Blat%5D=39.7392358&search%5Blon%5D=-104.990251&search%5Bmiles%5D=25&search%5Bcity_log%5D=Denver&search%5Bcity_log_short%5D=Denver&search%5Bstate_log%5D=Colorado&search%5Bstate_log_short%5D=CO&search%5Bcountry_log%5D=United+States&search%5Bcountry_log_short%5D=US&TOS_agreement=P&fromheader=1'

# Fetch and parse the listing page separately from the profile `soup`.
soup_listing = get_soup(url = listing_page_url)

In [None]:
# Dump every div.therapist_middle_section found on the listing page.
print(soup_listing.find_all('div', {'class' : 'therapist_middle_section'}))

In [None]:
# Print the text of every <h2> heading on the profile page.
# find_all() already returns an iterable result set, so it is looped over directly.
for h in soup.find_all('h2'):
    print(h.get_text())

In [None]:
# Full extraction with the module-level functions; displays the combined dict.
therapist_info = goodtx_get_all_data(soup)
therapist_info

In [None]:
# Manual version of goodtx_get_professions() without the per-item strip, to
# inspect the raw comma-separated pieces.
profs_str = soup.find('span', id='professionsDefined').text
profs_list = profs_str.split(',')
profs_list

In [None]:
# Exploratory lookup: first child of the second child of the
# itemprop="telephone" span.
soup.find('span', itemprop='telephone').contents[1].contents[0]

In [None]:
# Raw, uncleaned text of span.profilePhone - the lookup goodtx_get_phone() uses.
soup.find('span', {'class':'profilePhone'}).text

In [None]:
# Same lookup through the getter, which also applies clean_string().
goodtx_get_phone(soup)