In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
from datetime import datetime

# Browser-like User-Agent passed along with the HTTP requests below. The value
# is assembled from adjacent string literals purely to keep the lines short.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/58.0.3029.110 Safari/537.3'
    ),
}

def scrape_page(url, start_index, timeout=30):
    """Collect the title and link of every article teaser on one listing page.

    Args:
        url: Address of the listing page to download.
        start_index: Value stored under 'Index' for the first teaser kept;
            later teasers on the same page count up from it.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot hang the notebook.

    Returns:
        A list of dicts with the keys 'Index', 'Title' and 'Link', one per
        teaser that has both an <h2> heading and an anchor with an href.
        The list is empty when the page cannot be fetched (a message is
        printed in that case) or when no teaser matches.
    """
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        # Without this an HTTP error page would be parsed like a real listing.
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error fetching listing page {url}: {e}")
        return []

    soup = BeautifulSoup(response.content, 'lxml')

    data = []

    for article in soup.find_all('div', class_='entry col span_9'):
        heading = article.h2
        anchor = article.find('a', href=True)

        # A teaser without a heading or a usable link cannot be scraped later;
        # skip it instead of letting one malformed entry abort the whole page.
        if heading is None or anchor is None:
            continue

        data.append({
            # Number by rows actually emitted so skipped teasers leave no gaps
            # and the numbering stays in step with len() of the returned list.
            'Index': start_index + len(data),
            'Title': heading.text.strip(),
            'Link': anchor['href'],
        })

    return data

def scrape_articles(links_data, timeout=30):
    """Download each linked article and extract its main fields.

    Args:
        links_data: Iterable of dicts that carry at least 'Index' and 'Link'.
        timeout: Seconds to wait for each HTTP response before giving up.

    Returns:
        A list with one dict per article that was scraped successfully, with
        the keys 'Index', 'Title', 'Link', 'Date', 'Image URL', 'Content',
        'HTML Content' and 'Data Fetched'. A field is None when its element is
        absent from the page. Records without a link, and articles whose
        download or parsing fails, are reported with a printed message and
        left out of the result.
    """
    all_articles_data = []

    for link_data in links_data:
        link = link_data.get('Link')
        if not link:
            # Checked up front: looking the key up again inside the error
            # handler would raise a second, uncaught KeyError.
            print(f"Skipping record without a link: {link_data}")
            continue

        try:
            print(f"Scraping article from: {link}")
            response = requests.get(link, headers=headers, timeout=timeout)
            # Treat HTTP errors as failures rather than recording an error
            # page as an article whose fields are all empty.
            response.raise_for_status()
            soup2 = BeautifulSoup(response.content, 'lxml')

            # Search for each container once and reuse the result.
            entry_div = soup2.find('div', class_='entry col span_9')
            date_div = soup2.find('div', class_='post_date col')
            thumb_div = soup2.find('div', class_='thumb col span_3')
            content_div = soup2.find('div', class_='the_content')

            # The container can exist without the inner tag, so guard both.
            title_tag = entry_div.h1 if entry_div is not None else None
            image_tag = thumb_div.img if thumb_div is not None else None

            article_data = {
                'Index': link_data['Index'],
                'Title': title_tag.text if title_tag is not None else None,
                'Link': link,
                'Date': date_div.text.strip() if date_div is not None else None,
                'Image URL': image_tag.get('src') if image_tag is not None else None,
                'Content': content_div.text.strip() if content_div is not None else None,
                # Keep the markup as a plain string instead of a live Tag
                # object that references the whole parsed document.
                'HTML Content': str(content_div) if content_div is not None else None,
                'Data Fetched': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            }

            all_articles_data.append(article_data)
        except Exception as e:
            # Best effort: one failing article must not stop the others.
            print(f"Error scraping article from {link}: {str(e)}")

    return all_articles_data

# Root of the news listing and the number of listing pages to walk through.
base_url = 'https://autismawarenesscentre.com/autism-news/'
num_pages = 2

# Accumulate the teasers of every listing page; current_index carries the
# running 'Index' value from one page to the next.
all_data = []
current_index = 1

for page_number in range(1, num_pages + 1):
    url = base_url + 'page/' + str(page_number) + '/'
    page_data = scrape_page(url, current_index)

    # Nothing found on this page: leave the accumulator and counter untouched.
    if not page_data:
        continue

    all_data += page_data
    current_index = current_index + len(page_data)

# One frame for the listing teasers, one for the full articles behind them.
df_posts = pd.DataFrame(all_data)

df_articles = pd.DataFrame(
    scrape_articles(df_posts.to_dict(orient='records'))
)

# Show both frames, each under its heading.
print("Posts DataFrame:", df_posts, sep="\n")

print("\nArticles DataFrame:", df_articles, sep="\n")

Scraping article from: https://autismawarenesscentre.com/how-can-we-develop-a-better-understanding-of-behaviors-of-concern/
Scraping article from: https://autismawarenesscentre.com/the-role-of-exercise-in-stress-reduction/
Scraping article from: https://autismawarenesscentre.com/what-is-the-whole-school-saturation-model-and-how-can-it-improve-outcomes-for-autistic-students-in-inclusive-settings/
Scraping article from: https://autismawarenesscentre.com/mealtimes-eating-difficulties-and-the-autism-spectrum/
Scraping article from: https://autismawarenesscentre.com/bullying-and-autism-how-we-can-help/
Scraping article from: https://autismawarenesscentre.com/sweet-dreams-autism-and-sleep/
Scraping article from: https://autismawarenesscentre.com/christmas-with-autism-hold-the-expectations/
Scraping article from: https://autismawarenesscentre.com/what-accommodations-are-helpful-for-an-autistic-student/
Scraping article from: https://autismawarenesscentre.com/transitioning-from-the-family-home

In [2]:
# Write the scraped articles to a CSV file, leaving out the DataFrame row index.
df_articles.to_csv(path_or_buf='AutismAwareness.csv', index=False)