In [1]:
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

def scrape_autismspeaks_news(url=None, timeout=30):
    """Scrape a news listing page into a DataFrame of posts.

    Parameters
    ----------
    url : str, optional
        Listing page to fetch. When omitted, the filtered Autism Speaks news
        listing hard-coded below is used.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    pandas.DataFrame
        One row per post with the columns ``Index`` (1-based position among
        the posts that were kept), ``Title`` (stripped text of the card
        summary) and ``Link`` (absolute URL). The three columns are present
        even when no post is found.

    Raises
    ------
    requests.RequestException
        If the request fails, times out, or the server answers with an
        error status.
    """
    if url is None:
        url = 'https://www.autismspeaks.org/news?article_type[981]=981&life_stage[1041]=1041&article_type[981]=981'

    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page as a listing.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'lxml')

    posts_data = []
    for card in soup.find_all('div', class_='col-12 col-md-3 grid__col'):
        summary = card.find('div', class_='card__summary')
        anchor = summary.a if summary is not None else None
        href = anchor.get('href') if anchor is not None else None
        if not href:
            # A card without a summary link cannot be followed; skip it
            # rather than abort the whole listing.
            continue

        posts_data.append({
            'Index': len(posts_data) + 1,
            'Title': summary.text.strip(),
            # urljoin copes with root-relative, path-relative and absolute
            # hrefs; plain concatenation produced "//" or malformed URLs.
            'Link': urljoin(response.url, href),
        })

    # Naming the columns keeps the frame's shape stable for an empty listing.
    return pd.DataFrame(posts_data, columns=['Index', 'Title', 'Link'])

def scrape_articles(url, timeout=30):
    """Fetch one article page and extract its fields into a dict.

    Parameters
    ----------
    url : str
        Address of the article page.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    dict
        Keys, in order: ``Index`` (always ``None`` here; the caller fills it
        in), ``Title``, ``Link`` (the ``url`` argument as given),
        ``Published date``, ``Image URL``, ``Text content``,
        ``HTML content`` and ``Date Fetched``. Fields whose element is
        missing from the page are ``None``; the two content fields are then
        empty strings.

    Raises
    ------
    requests.RequestException
        If the request fails, times out, or the server answers with an
        error status.
    """
    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of recording an error page as an article.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'lxml')

    article_data = {
        'Index': None,  # Placeholder for the index
        'Title': None,
        'Link': url,
        'Published date': None,
        'Image URL': None,
        'Text content': "",
        'HTML content': "",
        'Date Fetched': pd.to_datetime('today').date()
    }

    article = soup.find('div', class_='article-page')
    if article is None:
        # Page has no article container: return the empty record instead of
        # failing on a None lookup.
        return article_data

    heading = article.find('h1')
    if heading is not None:
        article_data['Title'] = heading.text.strip()

    # NOTE(review): the value stored as 'Published date' is the text of the
    # element classed 'article-page__author' -- confirm it holds the date.
    published = article.find('strong', class_='article-page__author')
    if published is not None:
        article_data['Published date'] = published.text.strip()

    content_div = article.find('div', class_='article-page__inner')
    if content_div is not None:
        image = content_div.img
        # .get() tolerates an <img> without a src attribute.
        src = image.get('src') if image is not None else None
        if src:
            # Resolve against the page URL so absolute and relative sources
            # both yield a valid address.
            article_data['Image URL'] = urljoin(response.url, src)

        paragraphs = content_div.find_all('p')
        article_data['Text content'] = "".join(
            p_tag.text.strip() + "\n" for p_tag in paragraphs
        )
        article_data['HTML content'] = "".join(
            str(p_tag) + "\n" for p_tag in paragraphs
        )

    return article_data

# Build the table of news posts from the listing page and show it.
posts_df = scrape_autismspeaks_news()
print("Posts DataFrame:", posts_df, sep="\n")
print("\n")

# Fetch every linked article in listing order; each record carries the
# 'Index' of the listing row it came from so the two tables can be matched.
articles_data = [
    {**scrape_articles(post['Link']), 'Index': post['Index']}
    for row_label, post in posts_df.iterrows()
]

articles_df = pd.DataFrame(articles_data)
print("Articles DataFrame:", articles_df, sep="\n")

Posts DataFrame:
    Index                                              Title  \
0       1  NFL players show support of Autism Speaks duri...   
1       2  Autism Speaks CEO Stands with Chicago Bears Of...   
2       3  Charting a Path to Inclusion: A Call to Action...   
3       4  A Letter from Autism Speaks CEO this World Aut...   
4       5  Thomas Frazier joins the Autism Speaks Board o...   
5       6  Autism Speaks announces 2021 Norma and Malcom ...   
6       7  Autism Speaks applauds introduction of the Sup...   
7       8  How to cope with disrupted family routines dur...   
8       9        Join Us for a Brighter Life on the Spectrum   
9      10  Autism study finds high rates of unmet healthc...   
10     11  Autism Speaks funds promising new treatment an...   
11     12  Autism Research Series – ‘Discovery to Solutions’   
12     13  National survey of parents identifies 1 in 40 ...   
13     14  Autism Speaks and Royal Arch Masons expand fun...   
14     15  This hospita

In [3]:
# Write the scraped articles to a CSV file, leaving out the DataFrame's row index.
articles_df.to_csv(path_or_buf='AutismSpeaks.csv', index=False)