<a href="https://colab.research.google.com/github/aanjalii2/Stock/blob/main/Headlines.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import requests
from bs4 import BeautifulSoup
import time
import random
import pandas as pd

def scrape_ratopati_headlines(num_headlines_needed=750, max_pages_to_check=90, max_retries=3):
    """Collect unique headlines from Ratopati's share-market category pages.

    Pages are requested one at a time: page 1 is the bare category URL and
    later pages append ``?page=N``. Scraping stops when enough headlines have
    been gathered, when the page limit is reached, or when a single page keeps
    failing.

    Args:
        num_headlines_needed: Stop once at least this many unique headlines
            have been collected. The result can contain more than this,
            because every fetched page is processed in full.
        max_pages_to_check: Highest page number that will be requested.
        max_retries: Number of times a failing page is retried before the
            scrape is abandoned and the headlines gathered so far are
            returned. Without this bound a permanently failing page (for
            example a 404 past the last page) would be retried forever.

    Returns:
        list[str]: Unique headline strings, in the order they were first seen.

    Only ``requests.exceptions.RequestException`` is handled here; any other
    exception propagates to the caller.
    """
    base_url = "https://www.ratopati.com/category/share-market"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
    }

    # A dict is used as an insertion-ordered set so that the returned list
    # (and the files written from it) has a stable, reproducible order.
    unique_headlines = {}
    page_num = 1
    failed_attempts = 0  # consecutive failures for the current page

    print("Starting to scrape Ratopati headlines...\n")

    while len(unique_headlines) < num_headlines_needed and page_num <= max_pages_to_check:
        url = base_url if page_num == 1 else f"{base_url}?page={page_num}"

        print(f"Scraping URL: {url} (Page {page_num})")

        # Only the network call is guarded; parsing errors should surface.
        try:
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            failed_attempts += 1
            print(f"Error fetching {url}: {e}")
            if failed_attempts > max_retries:
                print(f"Giving up on page {page_num} after {max_retries} retries.\n")
                break
            print("Retrying after a short delay...\n")
            time.sleep(5)
            continue

        failed_attempts = 0
        soup = BeautifulSoup(response.text, 'html.parser')

        # Headline text can live in <h2>/<h3> tags carrying one of these classes ...
        headlines_on_page = soup.find_all(['h2', 'h3'], class_=['title', 'news-title', 'post-title'])
        # ... or in article links, either directly or in a nested heading.
        article_links = soup.find_all('a', class_=['post-link', 'read-more-link'])

        for element in headlines_on_page:
            _collect_headline(element, unique_headlines)

        for link in article_links:
            _collect_headline(link, unique_headlines)
            child_headline = link.find(['h2', 'h3'])
            if child_headline:
                _collect_headline(child_headline, unique_headlines)

        print(f"→ Total collected so far: {len(unique_headlines)} headlines")

        page_num += 1
        # Randomised pause between successful page fetches.
        time.sleep(random.uniform(1, 3))

    return list(unique_headlines)


def _collect_headline(element, seen, min_length=10):
    """Record the stripped text of ``element`` in ``seen`` if it is longer than ``min_length``.

    ``seen`` is a dict used as an insertion-ordered set; a text that is
    already present keeps its original position.
    """
    text = element.get_text(strip=True)
    if text and len(text) > min_length:
        seen.setdefault(text, None)


if __name__ == "__main__":
    # Run the scraper with the targets used throughout this notebook.
    headlines = scrape_ratopati_headlines(num_headlines_needed=750, max_pages_to_check=90)

    # Report the total and preview the first ten results.
    print(f"\n✅ Scraping complete. Total unique headlines collected: {len(headlines)}")
    preview = headlines[:10]
    for position, text in enumerate(preview, start=1):
        print(f"{position}. {text}")

    # Plain-text export: one headline per line, UTF-8 encoded.
    with open("ratopati_headlines.txt", "w", encoding="utf-8") as text_file:
        text_file.writelines(f"{text}\n" for text in headlines)
    print("\n📁 Headlines saved to ratopati_headlines.txt")

    # CSV export with a single 'headline' column; written as utf-8-sig (UTF-8 with BOM).
    df = pd.DataFrame({'headline': headlines})
    df.to_csv("ratopati_headlines.csv", encoding="utf-8-sig", index=False)
    print("📁 Headlines saved to ratopati_headlines.csv")


Starting to scrape Ratopati headlines...

Scraping URL: https://www.ratopati.com/category/share-market (Page 1)
→ Total collected so far: 83 headlines
Scraping URL: https://www.ratopati.com/category/share-market?page=2 (Page 2)
→ Total collected so far: 123 headlines
Scraping URL: https://www.ratopati.com/category/share-market?page=3 (Page 3)
→ Total collected so far: 164 headlines
Scraping URL: https://www.ratopati.com/category/share-market?page=4 (Page 4)
→ Total collected so far: 204 headlines
Scraping URL: https://www.ratopati.com/category/share-market?page=5 (Page 5)
→ Total collected so far: 245 headlines
Scraping URL: https://www.ratopati.com/category/share-market?page=6 (Page 6)
→ Total collected so far: 286 headlines
Scraping URL: https://www.ratopati.com/category/share-market?page=7 (Page 7)
→ Total collected so far: 327 headlines
Scraping URL: https://www.ratopati.com/category/share-market?page=8 (Page 8)
→ Total collected so far: 368 headlines
Scraping URL: https://www.rato

In [4]:
# This cell imports google.colab, so it only runs inside a Google Colab runtime.
from google.colab import files
# Hand the CSV written by the scraping cell (ratopati_headlines.csv) to Colab's
# download helper so it can be saved to the local machine.
files.download("ratopati_headlines.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>