In [1]:
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from bs4 import BeautifulSoup
import json
from random import choice, randint
import time
import re
from urllib.parse import urljoin

# Pool of browser User-Agent strings; get_headers() picks one at random for
# each outgoing request.
# NOTE(review): the first entry ends in "Safari/537.3" while the third ends in
# "Safari/537.36" — possibly a truncated copy; confirm before relying on it.
user_agents = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3",
    "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:45.0) Gecko/20100101 Firefox/45.0",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/603.3.8 (KHTML, like Gecko) Version/10.1.2 Safari/603.3.8",
    "Mozilla/5.0 (iPad; CPU OS 10_3_3 like Mac OS X) AppleWebKit/603.3.8 (KHTML, like Gecko) Version/10.0 Mobile/14G60 Safari/602.1",
]

def get_headers():
    """Build request headers carrying a randomly chosen User-Agent.

    Returns:
        A dict with a single ``'User-Agent'`` key whose value is drawn at
        random from the module-level ``user_agents`` list.
    """
    picked_agent = choice(user_agents)
    return {'User-Agent': picked_agent}

def make_request(url, max_retries=5, timeout=30):
    """Fetch *url* with a GET request, retrying on transient HTTP failures.

    Args:
        url: URL to fetch.
        max_retries: Total retry budget passed to urllib3's ``Retry``.
        timeout: Seconds to wait for the server before giving up. Passed
            straight to ``Session.get``, so a ``(connect, read)`` tuple is
            accepted as well.

    Returns:
        The ``requests.Response`` when the request succeeds with a non-error
        status, or ``None`` when any ``RequestException`` (including a
        timeout or exhausted retries) occurs; the error is printed.
    """
    retry_strategy = Retry(
        total=max_retries,
        backoff_factor=1,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=["HEAD", "GET", "OPTIONS"],
    )
    adapter = HTTPAdapter(max_retries=retry_strategy)
    headers = get_headers()

    # The context manager closes the session's connection pool on exit, so a
    # long crawl does not accumulate open sockets (one session per call).
    with requests.Session() as session:
        session.mount('http://', adapter)
        session.mount('https://', adapter)
        try:
            # Without a timeout, requests waits forever on a stalled server.
            response = session.get(url, headers=headers, timeout=timeout)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"Error for URL {url}: {e}")
            return None
        # The body was fully read (no streaming), so the response stays
        # usable after the session is closed.
        return response

def load_scraped_links(filename="scraped_hrefs.json"):
    """Read the JSON file of already-scraped links.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        The decoded JSON content of *filename*, or an empty list when the
        file does not exist.
    """
    try:
        handle = open(filename, 'r', encoding='utf-8')
    except FileNotFoundError:
        # Nothing has been recorded yet.
        return []
    with handle:
        return json.load(handle)

def save_scraped_link(link, filename="scraped_hrefs.json"):
    """Record *link* in the scraped-links JSON file if it is not there yet.

    Args:
        link: The URL to record.
        filename: Path of the JSON file holding the list of links.

    The whole list is reloaded and, when *link* is new, rewritten in full.
    """
    known_links = load_scraped_links(filename)
    if link in known_links:
        # Already recorded; leave the file untouched.
        return

    known_links.append(link)
    with open(filename, 'w', encoding='utf-8') as out:
        json.dump(known_links, out, ensure_ascii=False, indent=4)

def preprocess_telugu_content(content):
    """Keep only the Telugu-script runs of *content*.

    Every maximal run of characters in the range matched by the pattern is
    extracted; everything else (Latin text, digits, punctuation, whitespace)
    is dropped and the runs are joined with single spaces.

    Args:
        content: Text to filter.

    Returns:
        The space-joined matched runs, or an empty string if there are none.
    """
    telugu_runs = (match.group(0) for match in re.finditer(r'[ఁ-౿]+', content))
    return ' '.join(telugu_runs)

def append_article(article, category):
    """Append *article* to the per-category JSON file.

    Args:
        article: Dict with at least a ``'content'`` key. It is modified in
            place: ``'content'`` is replaced by its Telugu-only form.
        category: Category name; the target file is
            ``"<category>_articles.json"``.

    The existing file (if any) is loaded, the article is appended, and the
    full list is written back.
    """
    target_path = f"{category}_articles.json"

    try:
        with open(target_path, 'r', encoding='utf-8') as existing:
            stored = json.load(existing)
    except FileNotFoundError:
        # First article for this category: start a fresh list.
        stored = []

    # Overwrite the raw content with the filtered Telugu text before storing.
    article['content'] = preprocess_telugu_content(article['content'])
    stored.append(article)

    with open(target_path, 'w', encoding='utf-8') as out:
        # ensure_ascii=False keeps Telugu text readable in the file.
        json.dump(stored, out, ensure_ascii=False, indent=4)

def get_content_with_newlines(content):
    """Normalise whitespace in *content* while keeping its line breaks.

    Runs of spaces, tabs and other non-newline whitespace are collapsed to a
    single space, and spaces directly next to a newline are removed. Newline
    characters themselves are preserved. (The previous pattern matched
    newlines too, so every line break was flattened into a space.)

    Args:
        content: Text to normalise.

    Returns:
        The normalised text.
    """
    # [^\S\n] matches any whitespace character except the newline itself.
    collapsed = re.sub(r'[^\S\n]+', ' ', content)
    # Drop the single space that may now sit on either side of a line break.
    return re.sub(r' ?\n ?', '\n', collapsed)

def get_selector(category):
    """Return the CSS selector used to locate article text for *category*.

    Args:
        category: Category name such as ``'national'`` or ``'sports'``.

    Returns:
        The selector string for a known category, or ``''`` for any other
        value.
    """
    # Every known category currently shares the same selector.
    caption_paragraph = 'figure > figcaption > p'
    known_categories = (
        'latest_news',
        'national',
        'politics',
        'mukhyaamshalu',
        'elections',
        'andhra_pradesh',
        'telangana',
        'sports',
        'editorial',
        'business',
    )
    lookup = dict.fromkeys(known_categories, caption_paragraph)
    return lookup.get(category, '')

def _extract_article_content(article_soup, content_selector):
    """Pull the article text out of a parsed article page.

    Tries *content_selector* first; if it is empty or matches nothing, falls
    back to joining the text of every ``<p>`` element with newlines. Returns
    ``'Content not found'`` when the page has no ``<p>`` elements either.
    """
    # An empty string is not a valid CSS selector, so only query when the
    # category actually has one configured.
    content_element = article_soup.select_one(content_selector) if content_selector else None
    if content_element:
        # Normalise whitespace in the extracted text.
        return get_content_with_newlines(content_element.text.strip())

    paragraphs = article_soup.select('p')
    if not paragraphs:
        return 'Content not found'
    return '\n'.join(elem.get_text(strip=True) for elem in paragraphs)


def scrape_page(url, category):
    """Scrape one category listing page and store every article it links to.

    For each article link found on the listing page, the article page is
    fetched, its text extracted, the link recorded via ``save_scraped_link``
    and the article stored via ``append_article``.

    Args:
        url: URL of the listing page.
        category: Category name; selects the content selector and the output
            file.

    Returns:
        ``True`` if at least one article was fetched and stored, ``False``
        otherwise (including when the listing page could not be retrieved).
    """
    response = make_request(url)
    if not response or response.status_code != 200:
        print(f"Failed to retrieve page: {url}")
        return False

    print(f"Successfully retrieved page: {url}")
    soup = BeautifulSoup(response.content, 'html.parser')
    article_blocks = soup.select('body > div.container-fluid > div > div.categoryPage_Wrapper > div.leftSidebar > div > div.category_content > div.category_content_listing > figure > figcaption > h3 > a')
    content_selector = get_selector(category)

    print(f"Found {len(article_blocks)} articles on page: {url}")

    articles_found = False
    for a_tag in article_blocks:
        href = a_tag.get('href')
        if not href:
            # An anchor without a target cannot be followed; skip it rather
            # than raising KeyError and aborting the whole page.
            continue
        full_href = urljoin(url, href)  # Handle relative and absolute URLs
        title = a_tag.text.strip()

        article_response = make_request(full_href)
        if not article_response or article_response.status_code != 200:
            print(f"Failed to retrieve article page: {full_href}")
            continue

        article_soup = BeautifulSoup(article_response.content, 'html.parser')
        content = _extract_article_content(article_soup, content_selector)

        print(f"Found article: {title} - {full_href}")
        article = {'title': title, 'href': full_href, 'content': content, 'category': category}
        save_scraped_link(full_href)
        append_article(article, category)
        articles_found = True

    return articles_found

def scrape_all_pages(base_url, category, max_pages=None):
    """Walk a category's paginated listing until a page yields no articles.

    Page 1 is *base_url* itself; later pages are ``<base_url>/page/<n>/``.
    A random 1-3 second pause is taken between pages.

    Args:
        base_url: URL of the category's first listing page.
        category: Category name passed through to ``scrape_page``.
        max_pages: Optional upper bound on the number of pages to visit.
            ``None`` (the default) keeps going until ``scrape_page`` reports
            that no article was stored, which never terminates if the site
            serves articles for every page number.
    """
    # Avoid "//page/N/" when the caller passes a URL with a trailing slash.
    paged_root = base_url.rstrip('/')
    page = 1
    while max_pages is None or page <= max_pages:
        url = base_url if page == 1 else f"{paged_root}/page/{page}/"
        print(f"Scraping page: {url}")
        articles_found = scrape_page(url, category)
        if not articles_found:
            print(f"No more articles found for {category} on page {page}.")
            break
        page += 1
        # Randomised delay between listing pages.
        time.sleep(randint(1, 3))

if __name__ == "__main__":
    # Each entry pairs a category's first listing page with the category name
    # used for selector lookup and for the output file name.
    categories = [
        {'url': 'https://www.andhrajyothy.com/national', 'category': 'national'},
        {'url': 'https://www.andhrajyothy.com/latest-news', 'category': 'latest_news'},
        {'url': 'https://www.andhrajyothy.com/elections', 'category': 'elections'},
        {'url': 'https://www.andhrajyothy.com/andhra-pradesh', 'category': 'andhra_pradesh'},
        {'url': 'https://www.andhrajyothy.com/telangana', 'category': 'telangana'},
        {'url': 'https://www.andhrajyothy.com/sports', 'category': 'sports'},
        {'url': 'https://www.andhrajyothy.com/editorial', 'category': 'editorial'},
        {'url': 'https://www.andhrajyothy.com/business', 'category': 'business'},
        {'url': 'https://www.andhrajyothy.com/politics', 'category': 'politics'},
        {'url': 'https://www.andhrajyothy.com/mukhyaamshalu', 'category': 'mukhyaamshalu'},  # New category
    ]

    # Scrape the categories one after another, in the order listed above.
    for entry in categories:
        category_name = entry['category']
        start_url = entry['url']
        print(f"Scraping category: {category_name} from {start_url}")
        scrape_all_pages(start_url, category_name)


Scraping category: national from https://www.andhrajyothy.com/national
Scraping page: https://www.andhrajyothy.com/national
Successfully retrieved page: https://www.andhrajyothy.com/national
Found 10 articles on page: https://www.andhrajyothy.com/national
Found article: Rahul Gandhi: మణిపూర్‌లో పర్యటించనున్న రాహుల్.. ఎప్పుడంటే..? - https://www.andhrajyothy.com/2024/national/rahul-gandhi-to-visit-manipur-on-july-8-avr-1278367.html
Found article: Building collapse: గుజరాత్‌లో మరో ఉపద్రవం, కుప్పకూలిన ఆరంతస్తుల భవనం - https://www.andhrajyothy.com/2024/national/six-storey-building-collapses-in-surat-several-feared-trapped-avr-1278360.html
Found article: Sunita Kejriwal: ఎంపీ తప్పుడు స్టే‌ట్‌మెంట్‌తోనే నా భర్త అరెస్టు.. వీడియో రిలీజ్ చేసిన సునీత కేజ్రీవాల్ - https://www.andhrajyothy.com/2024/national/ed-implicated-witness-son-to-extract-false-statement-says-sunita-kejriwal-avr-1278354.html
Found article: Encounter: జమ్మూకశ్మీర్‌లో ఎన్‌కౌంటర్: సైనికుడు మృతి - https://www.andhrajyothy.com/2024