In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
from datetime import datetime

def scrape_page(url, start_index):
    html_text = requests.get(url)
    soup = BeautifulSoup(html_text.content, 'lxml')

    data = []

    for article_index, article in enumerate(soup.find_all('div', class_='list-item-content'), start=start_index):
        title = article.find('header', class_='entry-header').h2.a.text
        link = article.find('h2', class_='entry-title').a['href']

        data.append({
            'Index': article_index,
            'Title': title,
            'Link': link,
        })

    return data

def scrape_articles(links_data):
    all_articles_data = []

    for link_data in links_data:
        link = link_data['Link']
        html_text = requests.get(link)
        soup2 = BeautifulSoup(html_text.content, 'lxml')

        addart_category = soup2.find('div', class_='entry-header__eyebrow').text.strip() if soup2.find('div', class_='entry-header__eyebrow') else 'NaN'
        addart_category_link = soup2.find('div', class_='entry-header__eyebrow').a['href'] if soup2.find('div', class_='entry-header__eyebrow') and soup2.find('div', class_='entry-header__eyebrow').a else 'NaN'
        addart_title = soup2.find('h1', class_='entry-title').text.strip() if soup2.find('h1', class_='entry-title') else 'NaN'
        addart_date = soup2.find('span', class_='entry-header__updated-on').text.strip() if soup2.find('span', class_='entry-header__updated-on') else 'NaN'
        addart_imgurl = soup2.find('div', class_='entry-thumbnail').img['src'] if soup2.find('div', class_='entry-thumbnail') else 'NaN'
        addart_content = soup2.find('div', class_='entry-content').text.strip() if soup2.find('div', class_='entry-content') else 'NaN'
        addart_htmlcontent = soup2.find('div', class_='entry-content') if soup2.find('div', class_='entry-content') else 'NaN'

        article_data = {
            'Index': link_data['Index'],
            'Category': addart_category,
            'Category Link': addart_category_link,
            'Title': addart_title,
            'Link': link_data['Link'],
            'Date': addart_date,
            'Image URL': addart_imgurl,
            'Content': addart_content,
            'HTML Content': addart_htmlcontent,
            'Data Fetched': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        }

        all_articles_data.append(article_data)

    return all_articles_data

# List of base URLs
base_urls = [
    'https://www.additudemag.com/category/adhd-add/',
    'https://www.additudemag.com/category/adhd-add/adhd-in-children/',
    'https://www.additudemag.com/category/adhd-add/adhd-in-adults/',
    'https://www.additudemag.com/category/adhd-add/adhd-in-adults/add-women/',
    'https://www.additudemag.com/category/adhd-add/symptom-tests/adhd-symptom-tests/',
    'https://www.additudemag.com/category/adhd-add/related-conditions/anxiety/',
    'https://www.additudemag.com/category/adhd-add/related-conditions/autism-spectrum-disorder/',
    'https://www.additudemag.com/category/adhd-add/related-conditions/bipolar-disorder/',
    'https://www.additudemag.com/category/adhd-add/related-conditions/depression/',
    'https://www.additudemag.com/category/adhd-add/related-conditions/oppositional-defiant-disorder/',
    'https://www.additudemag.com/category/explore-adhd-treatments/',
    'https://www.additudemag.com/category/explore-adhd-treatments/medications/',
    'https://www.additudemag.com/category/explore-adhd-treatments/treatment-reviews/',
    'https://www.additudemag.com/category/explore-adhd-treatments/natural-treatments/',
    'https://www.additudemag.com/category/explore-adhd-treatments/treating-your-child/',
    'https://www.additudemag.com/category/adhd-news/',
    'https://www.additudemag.com/category/blog/',
    'https://www.additudemag.com/category/parenting-adhd-kids/',
    'https://www.additudemag.com/category/parenting-adhd-kids/behavior-discipline/',
    'https://www.additudemag.com/category/parenting-adhd-kids/positive-parenting/',
    'https://www.additudemag.com/category/parenting-adhd-kids/teens-young-adults/',
    'https://www.additudemag.com/category/parenting-adhd-kids/organizing-your-child/',
    'https://www.additudemag.com/category/parenting-adhd-kids/schedules-routines/',
    'https://www.additudemag.com/category/adhd-add/adhd-in-children/',
]

# the number of pages to scrape for each base URL
num_pages = 3  

# Scraping data from multiple pages for each base URL
all_data = []
current_index = 1  # Initializing the index outside the loop

for base_url in base_urls:
    for page_number in range(0, num_pages + 1):
        url = f'{base_url}page/{page_number}/'
        page_data = scrape_page(url, current_index)
        
        if page_data:
            all_data.extend(page_data)
            current_index += len(page_data)  # Increment the index appropriately

# Create a DataFrame from the scraped data
df_posts = pd.DataFrame(all_data)

# Scraping article details
df_articles = pd.DataFrame(scrape_articles(df_posts.to_dict(orient='records')))

# Print DataFrames
print("Posts DataFrame:")
print(df_posts)

print("\nArticles DataFrame:")
print(df_articles)

Posts DataFrame:
      Index                                              Title  \
0         1     What Is ADHD? Signs, Types, Causes, Treatments   
1         2  What Is Inattentive ADHD? Symptoms, Characteri...   
2         3                     What Is Executive Dysfunction?   
3         4     How ADHD Ignites Rejection Sensitive Dysphoria   
4         5      ADD vs. ADHD Symptoms: What's the Difference?   
...     ...                                                ...   
1894   1895  “Cognitive Disengagement Syndrome: A Distinct ...   
1895   1896               How the DSM-5 Fails People with ADHD   
1896   1897  ADHD Diagnosis Rates Lower Among Children of I...   
1897   1898        The Future of ADHD Research Looks Like This   
1898   1899                   Vyvanse Generics Approved by FDA   

                                                   Link  
0     https://www.additudemag.com/what-is-adhd-sympt...  
1     https://www.additudemag.com/add-inattentive-ad...  
2     https://ww

In [2]:
df_articles.to_csv('addidude.csv', index = False)