In [3]:
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

def scrape_seattlechildren_news(max_items=20, timeout=30):
    """Scrape the Autism Blog landing page into a DataFrame of post links.

    The first ``<div class="block">`` on the page is searched for ``<li>``
    elements; each one that contains a ``<p class="body-large">`` title and a
    link becomes one row.

    Parameters
    ----------
    max_items : int or None, default 20
        Maximum number of ``<li>`` elements to inspect (not the number of
        rows returned: items without a title or link are skipped but still
        counted). ``None`` inspects every item.
    timeout : float, default 30
        Seconds to wait for the HTTP response before giving up.

    Returns
    -------
    pandas.DataFrame
        Columns ``Index`` (1-based position of the ``<li>``), ``Title`` and
        ``Link`` (absolute URL). Empty, but with the same columns, when no
        usable post is found.

    Raises
    ------
    requests.HTTPError
        If the blog page responds with a 4xx/5xx status.
    ValueError
        If the page has no ``<div class="block">`` to read posts from.
    """
    base_url = 'https://www.seattlechildrens.org'
    page_url = base_url + '/clinics/autism-center/the-autism-blog/'

    # Without a timeout requests can wait forever on a stalled connection.
    response = requests.get(page_url, timeout=timeout)
    # Fail loudly instead of parsing an error page as if it were the blog.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'lxml')

    listing = soup.find('div', class_='block')
    if listing is None:
        raise ValueError(
            f"No <div class='block'> found at {page_url}; "
            "the page layout may have changed"
        )

    posts_data = []

    for index, item in enumerate(listing.find_all('li'), start=1):
        if max_items is not None and index > max_items:
            break

        title_element = item.find('p', class_='body-large')
        if title_element is None:
            print(f"No title found for post {index}")
            continue

        # Only accept anchors that actually carry an href.
        anchor = item.find('a', href=True)
        if anchor is None:
            print(f"No link found for post {index}")
            continue

        posts_data.append({
            'Index': index,
            'Title': title_element.text.strip(),
            # urljoin leaves absolute URLs untouched and resolves every kind
            # of relative link (with or without a leading slash) correctly.
            'Link': urljoin(page_url, anchor['href']),
        })

    # Explicit columns keep the schema stable even when nothing was scraped.
    return pd.DataFrame(posts_data, columns=['Index', 'Title', 'Link'])


def _first_child_text(soup, div_class, child_tag, default='N/A'):
    """Return the stripped text of the first ``child_tag`` inside the first
    ``<div class=div_class>``, or ``default`` when either element is missing."""
    container = soup.find('div', class_=div_class)
    child = container.find(child_tag) if container is not None else None
    return child.text.strip() if child is not None else default


def scrape_articles(url, timeout=30):
    """Fetch one article page and extract its main fields.

    Any element that cannot be located on the page is reported as the string
    ``'N/A'`` rather than raising, so one oddly structured article does not
    abort a whole scraping run.

    Parameters
    ----------
    url : str
        Absolute URL of the article page.
    timeout : float, default 30
        Seconds to wait for the HTTP response before giving up.

    Returns
    -------
    dict
        Keys ``Title``, ``Link``, ``Published date``, ``Image URL``,
        ``Text content``, ``HTML Content`` and ``Date Fetched``.
        ``HTML Content`` is the parsed ``main-content-body`` element itself
        (not a string) when present; ``Date Fetched`` is today's date.

    Raises
    ------
    requests.HTTPError
        If the article page responds with a 4xx/5xx status.
    """
    # Without a timeout requests can wait forever on a stalled connection.
    response = requests.get(url, timeout=timeout)
    # An error page would otherwise be scraped into a row of 'N/A' values.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'lxml')

    title = _first_child_text(soup, 'hd', 'h1')
    date = _first_child_text(soup, 'bd', 'p')

    # Guard both the wrapper div and the <img> inside it; either may be absent.
    wrapper = soup.find('div', class_='rich-text-wrapper')
    img_tag = wrapper.find('img') if wrapper is not None else None
    img_src = img_tag.get('src') if img_tag is not None else None
    # Resolve against the article URL so root-relative, relative and already
    # absolute src values all yield a valid address (no doubled slashes).
    imgurl = urljoin(url, img_src) if img_src else 'N/A'

    content_tag = soup.find('div', class_='main-content-body')
    content = content_tag.text.strip() if content_tag is not None else 'N/A'
    html_content = content_tag if content_tag is not None else 'N/A'

    return {
        'Title': title,
        'Link': url,
        'Published date': date,
        'Image URL': imgurl,
        'Text content': content,
        'HTML Content': html_content,
        'Date Fetched': pd.to_datetime('today').date(),
    }

# Scrape Seattle Children's news
MAX_ARTICLES = 20  # upper bound on the number of article pages fetched per run

posts_df = scrape_seattlechildren_news()
print("Posts DataFrame:")
print(posts_df)
print("\n")

# Fetch every listed article. 'Index' is inserted first in each record so it
# ends up as the leading column without a separate reordering step, which
# used to raise KeyError when no article had been scraped.
articles_data = []

for _, row in posts_df.head(MAX_ARTICLES).iterrows():
    article_data = {'Index': row['Index'], **scrape_articles(row['Link'])}
    articles_data.append(article_data)

articles_df = pd.DataFrame(articles_data)

print("Articles DataFrame:")
print(articles_df)

Posts DataFrame:
    Index                                              Title  \
0       1               10 Autism-Friendly Summer Activities   
1       2                Autism Acceptance Month: April 2023   
2       3  Naturalistic Developmental Behavioral Interven...   
3       4   Patients with Autism in the Emergency Department   
4       5                     Suggestions for Pica Treatment   
5       6    The Function of Stealing and Cognitive Rigidity   
6       7                          Autism and Theory of Mind   
7       8                                   Twins and Autism   
8       9                                      Autism and IQ   
9      10  Alternative and Augmentative Communication (AA...   
10     11                   Autism and Parent Support Groups   
11     12      Wisdom Tooth Surgery and My Child with Autism   
12     13  Obsessive Compulsive Disorder and Autism Spect...   
13     14                           Autism and Tic Disorders   
14     15  How to Reque

In [4]:
# Write the scraped articles to a CSV file, leaving out the DataFrame's row index.
articles_df.to_csv(path_or_buf='seattlechildren.csv', index=False)