In [6]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
from datetime import datetime

def scrape_tagpage(url, start_index):
    """Collect the title and link of every ``<article>`` on a listing page.

    Args:
        url: Address of the listing page to download.
        start_index: Index given to the first article kept from this page;
            the following articles are numbered consecutively from it.

    Returns:
        A list of dicts with the keys ``'Index'``, ``'Title'`` and ``'Link'``,
        one per article that has both an ``h1.entry-title`` heading and an
        anchor carrying an ``href``. The list is empty when the server
        answers with an error status.
    """
    # Without a timeout a stalled server would block the run indefinitely.
    response = requests.get(url, timeout=30)
    if not response.ok:
        # An error page is not a listing; report it and let the caller's
        # ``if page_data:`` check skip this page.
        print("Skipping", url, "- HTTP status", response.status_code)
        return []

    soup = BeautifulSoup(response.content, 'lxml')

    data = []

    for article in soup.find_all('article'):
        heading = article.find('h1', class_='entry-title')
        anchor = article.find('a', href=True)

        # Skip incomplete articles instead of failing on ``None.text`` or a
        # missing ``href`` and losing everything scraped so far.
        if heading is None or anchor is None:
            continue

        tag_title = heading.text.strip()
        tag_link = anchor['href']

        print("Title:", tag_title)
        print("Link:", tag_link)

        data.append({
            # Numbering from the count of kept rows keeps indices contiguous
            # even when an article was skipped, matching the caller's
            # ``current_index += len(page_data)`` bookkeeping.
            'Index': start_index + len(data),
            'Title': tag_title,
            'Link': tag_link,
        })

    return data

def scrape_articles(links_data):
    """Download each linked article and extract its main fields.

    Args:
        links_data: Iterable of dicts that each provide at least the keys
            ``'Index'`` and ``'Link'``.

    Returns:
        A list with one dict per input entry, holding the keys ``'Index'``,
        ``'Title'``, ``'Link'``, ``'Published Date'``, ``'Image URL'``,
        ``'Content'``, ``'HTML Content'`` and ``'Data Fetched'``. A field
        whose element is not found on the page is set to the string
        ``'NaN'``. ``'HTML Content'`` is the parsed element itself (not a
        string) when it is present.
    """
    all_articles_data = []

    for link_data in links_data:
        link = link_data['Link']
        # Without a timeout one stalled article would block the whole loop.
        response = requests.get(link, timeout=30)
        soup2 = BeautifulSoup(response.content, 'lxml')

        # Look every element up once and reuse the result below.
        header = soup2.find('header', class_='entry-header-site-hero')
        published = soup2.find('time', class_='entry-date published')
        figure = soup2.find('figure', class_='wp-caption aligncenter')
        body = soup2.find('div', class_='entry-content entry-content-single')

        # A caption figure may hold no <img>, or an <img> without ``src``;
        # fall back to the placeholder instead of raising.
        image = figure.img if figure is not None else None

        title = header.text.strip() if header is not None else 'NaN'
        publisheddate = published.text if published is not None else 'NaN'
        imgurl = image.get('src', 'NaN') if image is not None else 'NaN'
        content = body.text if body is not None else 'NaN'
        html_content = body if body is not None else 'NaN'

        article_data = {
            'Index': link_data['Index'],
            'Title': title,
            'Link': link,
            'Published Date': publisheddate,
            'Image URL': imgurl,
            'Content': content,
            'HTML Content': html_content,
            # Day the page was fetched, formatted as day-month-year.
            'Data Fetched': datetime.now().strftime('%d-%m-%Y'),
        }

        all_articles_data.append(article_data)

    return all_articles_data

# Site roots whose paginated listing pages are scraped.
base_urls = [
    'https://thinkingautismguide.com/',
]

# How many listing pages to request for each site root.
num_pages = 3

all_data = []
current_index = 1

for base_url in base_urls:
    # Listing pages are addressed as <root>page/1/ ... <root>page/<num_pages>/.
    for page_number in range(1, num_pages + 1):
        url = f'{base_url}page/{page_number}/'
        page_data = scrape_tagpage(url, current_index)

        # Nothing found on this page: leave the running index untouched.
        if not page_data:
            continue

        all_data += page_data
        # The next page continues numbering after the rows just collected.
        current_index += len(page_data)

# One row per post found on the listing pages.
df_posts = pd.DataFrame(all_data)

# One row per post, enriched with the details from the article page itself.
df_articles = pd.DataFrame(scrape_articles(df_posts.to_dict(orient='records')))

print("Posts DataFrame:", df_posts, sep="\n")

print("\nArticles DataFrame:", df_articles, sep="\n")

Title: The Joy of a Sensory Friendly Home Life
Link: https://thinkingautismguide.com/2024/03/the-joy-of-a-sensory-friendly-home-life.html
Title: 30 Sensory Icks: A Checklist For Autistic and Neurodivergent People
Link: https://thinkingautismguide.com/2024/02/30-sensory-icks-a-checklist-for-autistic-and-neurodivergent-people.html
Title: Educators: How Understanding Autistic Identities Can Help You Help Your Students
Link: https://thinkingautismguide.com/2024/02/educators-how-understanding-autistic-identities-can-help-you-help-your-students.html
Title: Can ABA Therapy Affirm Neurodiversity?
Link: https://thinkingautismguide.com/2024/02/can-aba-therapy-affirm-neurodiversity.html
Title: Unheard Voices’s Shawn Sullivan Talks Autism Advocacy With TPGA
Link: https://thinkingautismguide.com/2024/02/unheard-voicess-shawn-sullivan-talks-autism-advocacy-with-tpga.html
Title: Natalia Speaks and the Struggle for Disability Representation in Documentaries
Link: https://thinkingautismguide.com/2024/0

In [7]:
# Save the per-article details to a CSV file, leaving out the row index.
df_articles.to_csv(path_or_buf='thinkingautismguide.csv', index=False)