In [None]:
import os
import re
from datetime import datetime, timedelta
from urllib.parse import urlparse

import requests

# Configuration
# The NewsAPI key is taken from the NEWS_API_KEY environment variable when it
# is set to a non-empty value.
# SECURITY: the literal fallback below is a credential committed to source. It
# is kept only so existing setups keep working; rotate the key and supply it
# through the environment instead.
NEWS_API_KEY = os.environ.get("NEWS_API_KEY") or "6c4b7400256e46379249144ecd9bcb5d"

# Base URLs keyed by a short source identifier. In the code visible in this
# file only the "newsapi" entry is looked up; the remaining entries are not
# referenced here.
SOURCES = {
    "newsapi": "https://newsapi.org/v2",
    "reuters": "https://www.reuters.com",
    "bbc": "https://www.bbc.com/news",
    "aljazeera": "https://www.aljazeera.com",
    "apnews": "https://apnews.com",
    "flipboard": "https://flipboard.com",
    "yahoo": "https://news.yahoo.com",
    "newsnow": "https://www.newsnow.co.uk",
    "allafrica": "https://allafrica.com",
    "gdelt": "https://api.gdeltproject.org/api/v2/doc/doc",
}

class NewsAggregator:
    """Fetch, de-duplicate and rank news articles.

    Articles are requested from NewsAPI first; when fewer than three come
    back, the Google News RSS search feed is queried as a fallback. Every
    article is represented as a dict with the keys ``title``, ``url``,
    ``source``, ``published`` (``YYYY-MM-DD`` or ``''``) and ``credibility``
    (an int score).
    """

    # Seconds to wait on a remote server before giving up, so one stalled
    # connection cannot block the caller forever.
    REQUEST_TIMEOUT = 10

    # Source names as they appear in API responses (lower-cased), mapped to
    # the names used by the credibility and priority tables below.
    _SOURCE_ALIASES = {
        'bbc news': 'BBC',
        'associated press': 'AP News',
        'al jazeera english': 'Al Jazeera',
    }

    def __init__(self):
        """Create the shared HTTP session and set its User-Agent header."""
        self.session = requests.Session()
        self.session.headers.update({'User-Agent': 'NewsAgencyBot/1.0'})

    def get_news(self, query, country=None, category=None, date_range=None):
        """Fetch, de-duplicate and rank articles for ``query``.

        Args:
            query: Free-text search string.
            country: Optional country name (see ``_country_code``).
            category: Optional category name used for the request and for
                source prioritisation.
            date_range: Optional 2-item sequence ``(start, end)``; each item
                is anything ``_format_date`` accepts.

        Returns:
            A list of at most five article dicts, best-ranked first.

        Errors from either backend are printed and otherwise ignored, so a
        failing source yields fewer results instead of an exception.
        """
        results = []

        # Try NewsAPI first (using the configured key)
        try:
            results.extend(self._fetch_newsapi(query, country, category, date_range))
        except Exception as e:
            print(f"NewsAPI error: {str(e)}")

        # Fall back to Google News when NewsAPI returned too little
        if len(results) < 3:
            try:
                results.extend(self._fetch_google_news(query, country))
            except Exception as e:
                print(f"Google News error: {str(e)}")

        # Remove duplicates and prioritize
        unique_results = self._deduplicate(results)
        prioritized = self._prioritize_sources(unique_results, category)

        return prioritized[:5]  # Return top 5 results

    def _fetch_newsapi(self, query, country, category, date_range):
        """Query the NewsAPI ``/everything`` endpoint and normalise the result.

        Returns:
            A list of article dicts. Entries without a title or URL are
            skipped because they cannot be displayed or de-duplicated.

        Raises:
            requests.HTTPError: if the server answers with an error status.
        """
        params = {
            'apiKey': NEWS_API_KEY,
            'q': query,
            'language': 'en',
            'sortBy': 'relevancy',
            'pageSize': 10,
        }

        # NOTE(review): the NewsAPI documentation lists `country` and
        # `category` for /top-headlines, not for /everything - confirm the
        # server honours (or at least tolerates) them on this endpoint.
        if country:
            params['country'] = self._country_code(country)
        if category:
            params['category'] = category.lower()
        if date_range:
            params['from'] = self._format_date(date_range[0])
            params['to'] = self._format_date(date_range[1])

        response = self.session.get(
            f"{SOURCES['newsapi']}/everything",
            params=params,
            timeout=self.REQUEST_TIMEOUT,
        )
        response.raise_for_status()

        articles = []
        for item in response.json().get('articles') or []:
            title = item.get('title')
            url = item.get('url')
            if not title or not url:
                continue
            # Any of these fields may be null in the JSON payload; guard
            # each one instead of letting a single bad entry abort the batch.
            source_name = (item.get('source') or {}).get('name') or 'Unknown'
            articles.append({
                'title': title,
                'url': url,
                'source': source_name,
                'published': (item.get('publishedAt') or '')[:10],
                'credibility': self._source_credibility(source_name),
            })
        return articles

    def _fetch_google_news(self, query, country):
        """Fetch the Google News RSS search feed for ``query``.

        Raises:
            requests.HTTPError: if the server answers with an error status.
        """
        # Upper-cased so a mapped code has the same form as the 'US' default.
        region = self._country_code(country).upper() if country else 'US'
        params = {
            'q': query,
            'hl': 'en',
            'gl': region,
            'ceid': f"{region}:en",
        }

        response = self.session.get(
            "https://news.google.com/rss/search",
            params=params,
            timeout=self.REQUEST_TIMEOUT,
        )
        # Do not try to parse an error page as a feed.
        response.raise_for_status()
        return self._parse_rss(response.text)

    def _parse_rss(self, rss_content):
        """Turn an RSS 2.0 document into a list of article dicts.

        Each ``<item>`` contributes its ``<title>``, ``<link>``, ``<source>``
        text and ``<pubDate>`` (converted to ``YYYY-MM-DD``). Items lacking a
        title or link are skipped. Empty or malformed input yields ``[]``.
        """
        # Local imports: only this method needs them.
        import xml.etree.ElementTree as ET
        from email.utils import parsedate_to_datetime

        if not rss_content:
            return []

        # NOTE: ElementTree is not hardened against maliciously crafted XML;
        # only feed it content from hosts you trust.
        try:
            root = ET.fromstring(rss_content)
        except ET.ParseError:
            return []

        articles = []
        for item in root.iter('item'):
            title = (item.findtext('title') or '').strip()
            url = (item.findtext('link') or '').strip()
            if not title or not url:
                continue

            source_name = (item.findtext('source') or '').strip() or 'Google News'

            published = ''
            raw_date = (item.findtext('pubDate') or '').strip()
            if raw_date:
                try:
                    published = parsedate_to_datetime(raw_date).strftime('%Y-%m-%d')
                except (TypeError, ValueError):
                    # Unparseable date: keep the article, leave the date blank.
                    published = ''

            articles.append({
                'title': title,
                'url': url,
                'source': source_name,
                'published': published,
                'credibility': self._source_credibility(source_name),
            })
        return articles

    def _deduplicate(self, articles):
        """Return ``articles`` without repeats, keeping first occurrences.

        Two articles count as the same when the first 30 characters of their
        titles and their source names are equal.
        """
        seen = set()
        unique = []
        for article in articles:
            # A missing/None title is treated as an empty string.
            key = ((article.get('title') or '')[:30], article.get('source'))
            if key not in seen:
                seen.add(key)
                unique.append(article)
        return unique

    def _prioritize_sources(self, articles, category):
        """Sort articles by preferred source for ``category``, then credibility.

        Sources listed for the category come first, in list order; all other
        sources follow. Ties are broken by descending ``credibility``. The
        sort is stable, so otherwise-equal articles keep their input order.
        """
        priority_map = {
            'politics': ['Reuters', 'BBC', 'AP News', 'The Guardian'],
            'business': ['Reuters', 'Bloomberg', 'Financial Times'],
            'technology': ['TechCrunch', 'Wired', 'Ars Technica'],
            'sports': ['ESPN', 'BBC Sport', 'Sky Sports'],
            'health': ['WHO', 'CDC', 'WebMD', 'Mayo Clinic'],
            'science': ['Nature', 'Science', 'New Scientist'],
            'entertainment': ['Variety', 'Hollywood Reporter', 'Entertainment Weekly'],
        }

        # Lower-cased for consistency with _fetch_newsapi; None means "no
        # preference".
        preferred = priority_map.get((category or '').lower(), [])
        rank = {name: position for position, name in enumerate(preferred)}
        unranked = 999  # sorts after every preferred source

        return sorted(
            articles,
            key=lambda a: (
                rank.get(self._canonical_source(a['source']), unranked),
                -a['credibility'],
            ),
        )

    def _canonical_source(self, source_name):
        """Map an API-reported source name to the name used in the tables."""
        name = (source_name or '').strip()
        return self._SOURCE_ALIASES.get(name.lower(), name)

    def _source_credibility(self, source_name):
        """Return the credibility score for a source; unknown sources get 3."""
        credibility_scores = {
            'Reuters': 5, 'BBC': 5, 'AP News': 5, 'Al Jazeera': 4,
            'The Guardian': 4, 'The New York Times': 4, 'The Washington Post': 4,
            'CNN': 3, 'Fox News': 2, 'Yahoo News': 3,
        }
        return credibility_scores.get(self._canonical_source(source_name), 3)

    def _country_code(self, country_name):
        """Convert a country name to a lower-case two-letter code.

        Matching ignores case and surrounding whitespace. Unknown or empty
        names fall back to ``'us'``.
        """
        country_map = {
            'united states': 'us', 'us': 'us', 'usa': 'us',
            'uk': 'gb', 'great britain': 'gb', 'britain': 'gb',
            'united kingdom': 'gb',
            'germany': 'de', 'france': 'fr', 'japan': 'jp',
            'india': 'in', 'china': 'cn', 'canada': 'ca',
            'australia': 'au', 'brazil': 'br', 'russia': 'ru',
        }
        if not country_name:
            return 'us'
        return country_map.get(country_name.strip().lower(), 'us')

    def _format_date(self, date_input):
        """Convert various date inputs to a ``YYYY-MM-DD`` string.

        Accepts ``date``/``datetime`` objects, the words ``today`` and
        ``yesterday``, any text containing ``last week`` (7 days ago), text
        starting with ``YYYY-MM-DD`` (returned as given), and English dates
        such as ``July 4, 2025`` or ``Jul 4, 2025``. Anything else falls back
        to today's date.
        """
        # Covers datetime as well as plain date objects.
        if hasattr(date_input, 'strftime'):
            return date_input.strftime('%Y-%m-%d')

        text = str(date_input).strip()
        lowered = text.lower()
        now = datetime.now()

        if lowered == 'today':
            return now.strftime('%Y-%m-%d')
        if lowered == 'yesterday':
            return (now - timedelta(days=1)).strftime('%Y-%m-%d')
        if 'last week' in lowered:
            return (now - timedelta(days=7)).strftime('%Y-%m-%d')
        if re.match(r'\d{4}-\d{2}-\d{2}', text):
            return text

        # Full and abbreviated month names.
        for fmt in ('%B %d, %Y', '%b %d, %Y'):
            try:
                return datetime.strptime(text, fmt).strftime('%Y-%m-%d')
            except ValueError:
                continue
        return now.strftime('%Y-%m-%d')

# Chatbot Interface
def _parse_user_query(user_input):
    """Split a free-text request into keyword arguments for ``get_news``.

    Returns:
        A dict with the keys ``query``, ``country``, ``category`` and
        ``date_range``. ``query`` is never empty for non-empty input: when no
        category is recognised (or nothing remains after removing it) the
        whole input is used.
    """
    parsed = {
        'query': '',
        'country': None,
        'category': None,
        'date_range': None,
    }
    lowered = user_input.lower()

    # Extract date
    date_match = re.search(
        r'(today|yesterday|last week|\d{4}-\d{2}-\d{2}|[A-Za-z]+ \d{1,2}, \d{4})',
        user_input,
        re.IGNORECASE,
    )
    if date_match:
        date_str = date_match.group(1)
        if date_str.lower() == 'last week':
            # "last week" means the span from a week ago until now, not the
            # single day seven days back.
            parsed['date_range'] = (date_str, 'today')
        else:
            parsed['date_range'] = (date_str, date_str)

    # Extract country
    country_match = re.search(r'from (\w+)', user_input, re.IGNORECASE)
    if country_match:
        word = country_match.group(1)
        # "from yesterday" / "from 2025-07-01" name a date, not a country.
        inside_date = (
            date_match is not None
            and date_match.start() <= country_match.start(1) < date_match.end()
        )
        if word.isalpha() and not inside_date:
            parsed['country'] = word

    # Extract category
    # NOTE(review): the matched category word is removed from the search
    # text, as before; 'vehicle' and 'politics' are presumably not categories
    # the news API itself recognises - confirm the intended handling.
    categories = ['politics', 'business', 'technology', 'sports', 'health', 'science', 'entertainment', 'vehicle']
    for cat in categories:
        # Whole-word match so e.g. "transports" does not select "sports".
        pattern = rf'\b{re.escape(cat)}\b'
        if re.search(pattern, lowered):
            parsed['category'] = cat
            parsed['query'] = ' '.join(re.sub(pattern, ' ', lowered).split())
            break

    if not parsed['query']:
        parsed['query'] = user_input

    return parsed


def news_chatbot():
    """Run an interactive prompt that answers news requests until exit.

    Reads requests from standard input, parses out country, category and
    date hints, fetches matching articles through ``NewsAggregator`` and
    prints them. Typing ``exit`` or ``quit``, or closing the input stream
    (EOF / Ctrl-C), ends the loop.
    """
    aggregator = NewsAggregator()

    print("🌍 Global News Aggregator - Type 'exit' to quit")
    print("Examples: 'politics news from Germany', 'natural disasters last week', 'vehicle industry news'")

    while True:
        try:
            user_input = input("\nWhat news would you like? ").strip()
        except (EOFError, KeyboardInterrupt):
            # Leave cleanly instead of dumping a traceback at the prompt.
            print("\nGoodbye!")
            break

        # Nothing typed: ask again rather than sending an empty search.
        if not user_input:
            continue

        if user_input.lower() in ['exit', 'quit']:
            print("Goodbye!")
            break

        # Parse user query and get news
        results = aggregator.get_news(**_parse_user_query(user_input))

        # Display results
        if not results:
            print("No news found matching your criteria. Try different keywords.")
            continue

        print(f"\n📰 Found {len(results)} articles:\n")
        for i, article in enumerate(results, 1):
            print(f"{i}. {article['title']}")
            print(f"   Source: {article['source']} {'⭐' * article['credibility']}")
            print(f"   Published: {article['published']}")
            print(f"   Link: {article['url']}\n")

# Run the chatbot only when this file is executed as a script; importing the
# module does not start the interactive loop.
if __name__ == "__main__":
    news_chatbot()

🌍 Global News Aggregator - Type 'exit' to quit
Examples: 'politics news from Germany', 'natural disasters last week', 'vehicle industry news'

What news would you like? indian political

📰 Found 5 articles:

1. As Sales Drop, Tesla Makes a Big Gamble on India
   Source: Gizmodo.com ⭐⭐⭐
   Published: 2025-07-14
   Link: https://gizmodo.com/as-sales-drop-tesla-makes-a-big-gamble-on-india-2000628824

2. Thousands pay tribute to veteran Indian communist leader
   Source: BBC News ⭐⭐⭐
   Published: 2025-07-23
   Link: https://www.bbc.com/news/articles/cx209zl0l8no

3. Report: Apple's India Manufacturing Dream in Jeopardy Over Exodus of Chinese Workers
   Source: MacRumors ⭐⭐⭐
   Published: 2025-07-02
   Link: https://www.macrumors.com/2025/07/02/apples-india-manufacturing-in-jeopardy/

4. Yemen to execute Indian nurse on death row - can she be saved?
   Source: BBC News ⭐⭐⭐
   Published: 2025-07-09
   Link: https://www.bbc.com/news/articles/crrqn0pk5l5o?xtor=AL-72-%5Bpartner%5D-%5Byahoo.nor