In [2]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
from datetime import datetime


def scrape_nscablog(url):
    html_text = requests.get(url)
    soup = BeautifulSoup(html_text.content, 'lxml')

    data = []

    for post in soup.find_all('article'):
        ncsablog_title = post.find('h1', class_='entry-title').text.strip()
        link = 'https://www.ncsautism.org' + post.find('h1', class_='entry-title').a['href']

        data.append({
            'Title': ncsablog_title,
            'Link': link,
        })

    return data

def scrape_blog_post(links):
    all_articles_data = []

    for link_data in links:
        link = link_data['Link']
        html_text = requests.get(link)
        soup2 = BeautifulSoup(html_text.content, 'lxml')

        Title = soup2.find('h1', class_='entry-title').text.strip() if soup2.find('h1', class_='entry-title') else None
        published_date = soup2.find('time', class_='published dt-published').text.strip() if soup2.find('time', class_='published dt-published') else None
        content = soup2.find('div', class_='entry-content').text.strip() if soup2.find('div', class_='entry-content') else None
        html_content = soup2.find('div', class_='entry-content').prettify() if soup2.find('div', class_='entry-content') else None
        image = soup2.find('div', class_='sqs-image-shape-container-element has-aspect-ratio').img['src'] if soup2.find('div', class_='sqs-image-shape-container-element has-aspect-ratio') else None

        article_data = {
            'Index': link_data['Index'],
            'Title': Title,
            'Link': link_data['Link'],
            'Date': published_date,
            'Image URL': image,
            'Content': content,
            'HTML Content': html_content,
            'Date Fetched': datetime.now().strftime('%Y-%m-%d'),
        }

        all_articles_data.append(article_data)
    return all_articles_data

url = 'https://www.ncsautism.org/blog'


all_data = scrape_nscablog(url)

# Create a DataFrame from the scraped data
df_posts = pd.DataFrame(all_data)
df_posts.insert(0, 'Index', range(1, len(df_posts) + 1))  # Fixing repeating index issue

# Scraping article details
df_articles = pd.DataFrame(scrape_blog_post(df_posts.to_dict(orient='records')))

# Print DataFrames
print("Posts DataFrame:")
print(df_posts)

print("\nArticles DataFrame:")
print(df_articles)

Posts DataFrame:
   Index                                              Title  \
0      1  NCSA Comments for Federal PCPID Meeting, March...   
1      2  California's Autism Population Soars to New He...   
2      3  New Review Article—Profound Autism: An Imperat...   
3      4  NCSA Submits Concerns About the Autism CARES A...   
4      5  NIH Requests Public Comment on "Co-Occurring C...   
5      6  The Un-Jolly Reality of a Profound Autism Chri...   
6      7              Why I Need the Term "Profound Autism"   
7      8    Lost in a "Bermuda Triangle" of Profound Autism   
8      9               A New Reference Guide to I/DD Topics   
9     10  The NIH Proposes Erasing "Reducing Disability"...   

                                                Link  
0  https://www.ncsautism.org/blog//ncsa-comments-...  
1  https://www.ncsautism.org/blog//californias-au...  
2  https://www.ncsautism.org/blog//new-review-art...  
3  https://www.ncsautism.org/blog//ncsa-submits-c...  
4  https://www

In [3]:
df_articles.to_csv('nscablog.csv', index = False )