In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
from datetime import datetime

def scrape_neuroclastic(url, start_index, timeout=30):
    """Collect the title and link of every article listed on one page.

    The page at ``url`` is downloaded and parsed, and the ``<article>``
    elements inside the ``div`` with class ``elementor elementor-21046`` are
    scanned. An article's title is taken from its ``h3`` (or, failing that,
    ``h4``) element with class ``elementor-post__title``.

    Args:
        url: Address of the listing page to download.
        start_index: Value of ``'Index'`` given to the first returned entry;
            later entries count up from it without gaps.
        timeout: Seconds to wait for the server before ``requests`` gives up.

    Returns:
        A list of dicts with the keys ``'Index'``, ``'Title'`` and ``'Link'``.
        The list is empty when the listing container is not present on the
        page.
    """
    # Without a timeout a stalled connection would block the crawl forever.
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, 'lxml')

    container = soup.find('div', class_='elementor elementor-21046')
    if container is None:
        # find() returns None when nothing matches; report "no entries"
        # instead of failing on container.find_all below.
        return []

    data = []

    for article in container.find_all('article'):
        title_tag = (
            article.find('h3', class_='elementor-post__title')
            or article.find('h4', class_='elementor-post__title')
        )
        if not title_tag:
            continue

        anchor = title_tag.a
        if anchor is None or not anchor.get('href'):
            # A heading without a usable link cannot be followed later.
            continue

        data.append({
            # Number only the entries that are kept, so indices stay
            # contiguous and agree with callers that advance their running
            # index by len(data).
            'Index': start_index + len(data),
            'Title': title_tag.text.strip(),
            'Link': anchor['href'],
        })

    return data

def _stripped_text(tag):
    """Return ``tag.text`` without surrounding whitespace, or None if ``tag`` is None."""
    return None if tag is None else tag.text.strip()


def scrape_articles(links_data, timeout=30):
    """Download every listed article and extract its details.

    Args:
        links_data: Iterable of dicts, each providing at least the keys
            ``'Link'`` (address of the article page) and ``'Index'``.
        timeout: Seconds to wait for the server before ``requests`` gives up.

    Returns:
        A list with one dict per input entry, holding the keys ``'Index'``,
        ``'Title'``, ``'Published Date'``, ``'Image URL'``, ``'Content'``,
        ``'HTML Content'`` (the parsed content element itself) and
        ``'Data Fetched'`` (today's date formatted as ``DD-MM-YYYY``).
        A field whose element is not found on the page is ``None``, so one
        incomplete article no longer aborts the whole run.
    """
    title_class = 'elementor-element elementor-element-c378c0b elementor-widget elementor-widget-theme-post-title elementor-page-title elementor-widget-heading'
    date_class = 'elementor-icon-list-text elementor-post-info__item elementor-post-info__item--type-date'
    image_class = 'elementor-element elementor-element-3ebbe3a elementor-widget elementor-widget-theme-post-featured-image elementor-widget-image'
    content_class = 'elementor-element elementor-element-5de30ff elementor-widget elementor-widget-theme-post-content'

    all_articles_data = []

    for link_data in links_data:
        # Without a timeout a stalled connection would block the crawl forever.
        response = requests.get(link_data['Link'], timeout=timeout)
        soup = BeautifulSoup(response.content, 'lxml')

        # find() returns None when nothing matches, so every lookup is
        # checked before it is dereferenced.
        title_tag = soup.find('div', class_=title_class)
        date_tag = soup.find('span', class_=date_class)
        image_box = soup.find('div', class_=image_class)
        # Looked up once; used for both the plain text and the raw element.
        content_tag = soup.find('div', class_=content_class)

        image = image_box.img if image_box is not None else None

        all_articles_data.append({
            'Index': link_data['Index'],
            'Title': _stripped_text(title_tag),
            'Published Date': _stripped_text(date_tag),
            'Image URL': image.get('src') if image is not None else None,
            'Content': _stripped_text(content_tag),
            'HTML Content': content_tag,
            'Data Fetched': datetime.now().strftime('%d-%m-%Y'),
        })

    return all_articles_data

# Listing sections to crawl and how many paginated pages to read from each.
base_urls = ['https://neuroclastic.com/autism/']
num_pages = 5

# Entries gathered so far, plus the index handed to the next page's first entry.
all_data = []
current_index = 1

for base_url in base_urls:
    for page_number in range(1, num_pages + 1):
        url = '{}page/{}/'.format(base_url, page_number)
        page_data = scrape_neuroclastic(url, current_index)

        if not page_data:
            # Nothing came back for this page: keep the running index as is.
            continue

        all_data += page_data
        current_index = current_index + len(page_data)

# Table built from the collected listing entries.
df_posts = pd.DataFrame(all_data)

# Follow every listed link and tabulate the extracted article details.
df_articles = pd.DataFrame(
    scrape_articles(df_posts.to_dict(orient='records'))
)

# Plain-text dump of both tables.
print("Posts DataFrame:", df_posts, sep="\n")
print("\nArticles DataFrame:", df_articles, sep="\n")

Posts DataFrame:
     Index                                              Title  \
0        1  Celebrate the diversity of humankind – Embrace...   
1        2  Celebrate the diversity of humankind – Embrace...   
2        3  Celebrate the diversity of humankind – Embrace...   
3        4  Celebrating the infinitely diverse ways of bei...   
4        5  Life is relational and beyond human comprehension   
..     ...                                                ...   
120    121                       Healing from Autistic trauma   
121    122  What autistics mean when we say this world is ...   
122    123        Convergent and divergent cultural evolution   
123    124  Book Review: What I Mean When I Say I’m Autist...   
124    125  Resolution for Inclusion: Autistics need deep,...   

                                                  Link  
0    https://neuroclastic.com/celebrate-the-diversi...  
1    https://neuroclastic.com/celebrate-the-diversi...  
2    https://neuroclastic.com/c

In [2]:
# Persist the article table to disk; the DataFrame's row index is left out.
df_articles.to_csv(
    path_or_buf='neuroclastic.csv',
    index=False,
)