In [None]:
import requests
from bs4 import BeautifulSoup
import calendar
import csv

# Scrape the paginated blog listing under ``base_url`` and export one CSV row
# per post: title, image URL, publication date and likes count.
base_url = "https://rategain.com/blog/"

# Seconds to wait for a response; without a timeout, requests can block
# indefinitely on a stalled connection.
REQUEST_TIMEOUT_SECONDS = 30

session = requests.Session()
session.headers.update({
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36"
})

blog_data = []

# Lower-cased month names, used to check that a candidate date string
# actually mentions a month.
months = {month.lower() for month in calendar.month_name[1:]}

# Walk the listing pages until one fails to load or contains no posts.
page_num = 1
while True:
    url = f"{base_url}page/{page_num}/"
    try:
        response = session.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    except requests.RequestException as exc:
        # Treat network errors like a failed fetch so that the rows collected
        # so far are still written to the CSV below.
        print(f"Failed to fetch URL: {url} ({exc})")
        break

    if response.status_code != 200:
        print(f"Failed to fetch URL: {url}")
        break

    soup = BeautifulSoup(response.content, 'html.parser')
    blog_posts = soup.find_all('div', class_='wrap')  # One container per post

    if not blog_posts:
        print(f"No blog posts found on page {page_num}")
        break  # No more pages to scrape

    for post in blog_posts:
        # Image URL is read from the 'data-bg' attribute of the lazy-load
        # anchor.
        image_url = "NaN"  # Placeholder for missing image
        image_link = post.find('a', class_='rocket-lazyload')
        if image_link and 'data-bg' in image_link.attrs:
            image_url = image_link['data-bg']

        # Title is looked up in the nearest preceding 'content' div.
        title = "Title Not Found"
        blog_title = post.find_previous('div', class_='content')
        if blog_title:
            title_element = blog_title.find('h6')  # Assuming titles are in <h6> tags
            if title_element:
                title = title_element.text.strip()

        # Date is the first <span> after the clock icon, accepted only when
        # its text contains a month name.
        publication_date = "Date Not Found"
        blog_date = post.find('div', class_='bd-item')
        if blog_date:
            date_icon = blog_date.find('i', class_='material-design-icon-history-clock-button')
            if date_icon:
                next_element = date_icon.find_next('span')
                if next_element and any(month in next_element.text.lower() for month in months):
                    publication_date = next_element.text.strip()

        # Likes count. Reset for every post: previously a post without a
        # likes tag either raised NameError (first post) or silently reused
        # the previous post's count.
        likes_count = "Likes Not Found"
        likes_tag = post.find('a', class_='zilla-likes')
        if likes_tag:
            likes_span = likes_tag.find('span')
            if likes_span:
                span_text = likes_span.text.strip('"')
                likes_count = span_text if span_text else "no"
            else:
                likes_count = "no likes"

        blog_data.append({
            'Blog title': title,
            'Blog image URL': image_url,
            'Blog date': publication_date,
            'Blog likes count': likes_count,
        })

    page_num += 1

# Save data to a CSV file
csv_filename = 'blog_data.csv'
with open(csv_filename, mode='w', newline='', encoding='utf-8') as file:
    fieldnames = ['Blog title', 'Blog image URL', 'Blog date', 'Blog likes count']
    writer = csv.DictWriter(file, fieldnames=fieldnames)

    writer.writeheader()
    writer.writerows(blog_data)

print(f"Total blog posts found: {len(blog_data)}")
print(f"CSV file '{csv_filename}' has been created with the scraped data.")


No blog posts found on page 46
Total blog posts found: 402
CSV file 'blog_data.csv' has been created with the scraped data.
