In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import requests

def scrape_ssmhealth(url, timeout=10):
    """Download one SSM Health page and pull out its title and content.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout
        ``requests.get`` can block indefinitely on a stalled connection.

    Returns
    -------
    dict or None
        A record with the keys ``'Index'`` (always ``None`` here; the caller
        fills it in), ``'Title'``, ``'Content'``, ``'HTML Content'`` and
        ``'Link'``. ``None`` is returned, after printing a message, when the
        request fails or the main container div is not present on the page.
        Individual fields are ``None`` when their element is missing.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as exc:
        # Keep the "report and skip" contract instead of crashing the whole run.
        print(f"Request failed for {url}: {exc}")
        return None

    soup = BeautifulSoup(response.content, 'lxml')

    # Main content container; its absence means the page has nothing to scrape.
    ssm = soup.find('div', class_='col col-md-8 pt-sm-0')
    if ssm is None:
        print(f"No data found for {url}")
        return None

    def _stripped_text(tag):
        # find() returns None when nothing matches; avoid AttributeError on .text.
        return tag.text.strip() if tag is not None else None

    title_tag = soup.find('div', class_='fr-view')
    # NOTE(review): 'Content' is read from the first div.col in the whole
    # document, whereas 'HTML Content' is read from the first div.col inside
    # the main container -- confirm that these are meant to differ.
    content_tag = soup.find('div', class_='col')
    html_tag = ssm.find('div', class_='col')

    return {
        'Index': None,  # Placeholder; assigned by the caller
        'Title': _stripped_text(title_tag),
        'Content': _stripped_text(content_tag),
        # Store a real missing value rather than the string 'None'.
        'HTML Content': str(html_tag) if html_tag is not None else None,
        'Link': url,
    }

# Pages to scrape, in the order they should be numbered.
urls = [
    'https://www.ssmhealth.com/cardinal-glennon/services/developmental-pediatrics/adhd',
    'https://www.ssmhealth.com/cardinal-glennon/services/developmental-pediatrics/autism',
    'https://www.ssmhealth.com/cardinal-glennon/services/developmental-pediatrics/developmental-delay'
]

# Column order for the output table, with "Index" first.
column_order = ['Index', 'Title', 'Content', 'HTML Content', 'Link']

data = []

# Index reflects the position in `urls` (1-based), so a skipped page leaves a gap.
for index, url in enumerate(urls, start=1):
    result = scrape_ssmhealth(url)
    if result:
        result['Index'] = index
        data.append(result)

# Passing `columns` fixes the column order and, unlike selecting the columns
# afterwards, still yields a valid (empty) frame when every scrape failed.
df = pd.DataFrame(data, columns=column_order)

# Adding "Date Fetched" column
df['Date Fetched'] = pd.to_datetime('today').strftime('%Y-%m-%d')

print(df)

   Index                                            Title  \
0      1  Attention-Deficit/Hyperactivity Disorder (ADHD)   
1      2                             Understanding Autism   
2      3                              Developmental Delay   

                                             Content  \
0  Home\n / \nHealth Services & Programs\n / \nDe...   
1  Home\n / \nHealth Services & Programs\n / \nDe...   
2  Home\n / \nHealth Services & Programs\n / \nDe...   

                                        HTML Content  \
0  <div class="col">\n<div class="pb-3">\n<div ar...   
1  <div class="col">\n<div class="pb-3">\n<div ar...   
2  <div class="col">\n<div class="pb-3">\n<div ar...   

                                                Link Date Fetched  
0  https://www.ssmhealth.com/cardinal-glennon/ser...   2024-03-19  
1  https://www.ssmhealth.com/cardinal-glennon/ser...   2024-03-19  
2  https://www.ssmhealth.com/cardinal-glennon/ser...   2024-03-19  


In [3]:
# Persist the scraped table; the DataFrame's row labels are not written.
output_path = 'ssmhealth.csv'
df.to_csv(output_path, index=False)