In [5]:
import re
import time
from typing import List, Dict
from urllib.parse import parse_qs, urlparse

import requests
from bs4 import BeautifulSoup
from newspaper import Article, Config

# Shared Newspaper3k settings, passed to every Article built in this notebook.
config = Config()
config.request_timeout = 10  # download timeout (seconds)
# Present a desktop-browser User-Agent string when fetching article pages.
config.browser_user_agent = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/89.0.4389.82 Safari/537.36"
)

def _unwrap_ddg_redirect(href: str) -> str:
    """Return the destination of a DuckDuckGo redirect link, else ``href`` as-is.

    DuckDuckGo's HTML results may wrap the real address as
    ``//duckduckgo.com/l/?uddg=<percent-encoded URL>``; the target is
    recovered (and percent-decoded) from the ``uddg`` query parameter.
    """
    try:
        parsed = urlparse(href)
        host = parsed.netloc.lower()
        if host and not host.endswith("duckduckgo.com"):
            return href  # already a direct link to another site
        targets = parse_qs(parsed.query).get("uddg")
    except ValueError:
        # Malformed href (e.g. a broken IPv6 literal): leave it untouched.
        return href
    return targets[0] if targets else href


def search_news(company_name: str, max_results: int = 10, timeout: float = 10.0) -> List[str]:
    """Search DuckDuckGo's HTML endpoint for ``"<company_name> news"``.

    Args:
        company_name: Company to search for; " news" is appended to the query.
        max_results: Maximum number of links to return (default 10).
        timeout: Seconds to wait for the search request before giving up.

    Returns:
        De-duplicated article URLs in the order the results appeared on the
        page, with links that still point at duckduckgo.com removed. An empty
        list is returned (and a message printed) if the request fails.
    """
    base_url = "https://duckduckgo.com/html/"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }
    params = {"q": f"{company_name} news", "kl": "us-en"}  # Search query

    try:
        # Without an explicit timeout, requests can block indefinitely.
        response = requests.get(base_url, headers=headers, params=params, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Search failed: {str(e)}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')

    # A dict keeps insertion order, so de-duplication preserves result ranking
    # (a set would make the returned subset arbitrary between runs).
    urls = {}
    for link in soup.select("a.result__a"):  # DuckDuckGo result links
        href = _unwrap_ddg_redirect(link.get("href", ""))
        match = re.search(r"(https?://[^\s\"']+)", href)
        if not match:
            continue
        url = match.group(1)
        if "duckduckgo.com" not in url:  # Skip anything still pointing at DuckDuckGo
            urls.setdefault(url, None)

    return list(urls)[:max(max_results, 0)]

def scrape_article(url: str) -> Dict:
    """Download and parse one news article with Newspaper3k.

    Args:
        url: Address of the article to fetch.

    Returns:
        A dict with keys ``title``, ``summary``, ``full_text``, ``url``,
        ``date`` (``YYYY-MM-DD`` string or ``None``), ``keywords`` and
        ``authors``. If download or parsing fails, the error is printed and
        an empty dict is returned. If only the NLP step fails, the article is
        still returned with an empty ``summary`` and ``keywords``.
    """
    try:
        article = Article(url, config=config)
        article.download()
        article.parse()

        # Summary/keyword extraction is an optional enrichment: a failure here
        # should not throw away an article that was fetched and parsed fine.
        try:
            article.nlp()
            summary, keywords = article.summary, article.keywords
        except Exception as nlp_error:
            print(f"NLP step failed for {url}: {str(nlp_error)}")
            summary, keywords = "", []

        publish_date = article.publish_date
        return {
            "title": article.title,
            "summary": summary,
            "full_text": article.text,
            "url": url,
            "date": publish_date.strftime("%Y-%m-%d") if publish_date else None,
            "keywords": keywords,
            "authors": article.authors
        }

    except Exception as e:
        # Deliberately best-effort: any failure just skips this article.
        print(f"Scraping failed for {url}: {str(e)}")
        return {}

def scrape_news(company_name: str, delay: float = 2.0) -> List[Dict]:
    """Search for news about a company and scrape each result.

    Args:
        company_name: Company to search for.
        delay: Seconds to pause between consecutive article requests
            (default 2). Use 0 to disable the pause.

    Returns:
        Up to 10 article dicts as produced by ``scrape_article``; results
        without a title (failed or empty scrapes) are dropped.
    """
    urls = search_news(company_name)
    articles = []

    for index, url in enumerate(urls):
        # Pause *between* requests: before every request except the first, so
        # failed scrapes are throttled too and no time is wasted after the last.
        if index and delay > 0:
            time.sleep(delay)

        article_data = scrape_article(url)
        if article_data.get("title"):  # Ensure valid articles
            articles.append(article_data)

    return articles[:10]  # Return up to 10 valid articles

# Example usage
if __name__ == "__main__":
    company = "Tesla"
    news_articles = scrape_news(company)

    # Print one block per scraped article, each followed by a blank line.
    for article in news_articles:
        print("Title: {title}\nSummary: {summary}\nURL: {url}\n".format_map(article))


Scraping failed for https://www.tesla.com/blog/: Article `download()` failed with HTTPSConnectionPool(host='www.tesla.com', port=443): Read timed out. (read timeout=10) on URL https://www.tesla.com/blog/
Scraping failed for https://www.nytimes.com/2025/03/16/business/elon-musk-trump-tesla-conservatives.html: Article `download()` failed with 403 Client Error: Forbidden for url: https://www.nytimes.com/2025/03/16/business/elon-musk-trump-tesla-conservatives.html on URL https://www.nytimes.com/2025/03/16/business/elon-musk-trump-tesla-conservatives.html
Title: Hundreds gather outside Tesla showrooms in backlash to Elon Musk’s role with DOGE
Summary: The demonstrations are part of the “Tesla Takedown” movement, a boycott that began on February 15 against Musk’s electric vehicle company.
More than 80 demonstrations are slated for Saturday and more than 70 are planned through the end of April, according to the Tesla Takedown website.
In the Boston suburb of Dedham, about 100 demonstrators ga

In [6]:
import json
import os
from textblob import TextBlob
import yake
from gtts import gTTS
from deep_translator import GoogleTranslator

def analyze_sentiment(text: str) -> str:
    """Classify ``text`` as "Positive", "Negative" or "Neutral".

    The label comes from TextBlob's polarity score: above 0.1 is positive,
    below -0.1 is negative, and anything in between (inclusive) is neutral.
    """
    polarity = TextBlob(text).sentiment.polarity

    if polarity > 0.1:
        return "Positive"
    if polarity < -0.1:
        return "Negative"
    return "Neutral"

def extract_topics(text: str, num_topics=3) -> list:
    """Return the top ``num_topics`` keywords of ``text`` as found by YAKE.

    The extractor is configured for English with ``n=1``; only the keyword
    (first element of each result entry) is kept, the rest is discarded.
    """
    extractor = yake.KeywordExtractor(lan="en", n=1, top=num_topics)

    topics = []
    for ranked in extractor.extract_keywords(text):
        topics.append(ranked[0])
    return topics

def summarize_articles(company: str, articles: list) -> dict:
    """Build the sentiment/topic report for a company's news articles.

    Each article dict is read through its ``title`` and ``summary`` keys
    (falling back to "No Title" / "No Summary"). The summary text is scored
    with ``analyze_sentiment`` and mined with ``extract_topics``. The
    per-article results are then compared, an overall sentiment statement is
    produced, and that statement is turned into Hindi audio.

    Note: ``generate_final_sentiment`` is not defined in the cells shown here
    and must already exist in the notebook's namespace.

    Returns:
        A dict with keys "Company", "Articles", "Comparative Sentiment Score",
        "Final Sentiment Analysis" and "Audio".
    """
    tally = {"Positive": 0, "Negative": 0, "Neutral": 0}
    digests = []

    for entry in articles:
        headline = entry.get("title", "No Title")
        blurb = entry.get("summary", "No Summary")
        mood = analyze_sentiment(blurb)
        themes = extract_topics(blurb)

        # Count this article towards its sentiment bucket.
        tally[mood] += 1

        digests.append(
            {"Title": headline, "Summary": blurb, "Sentiment": mood, "Topics": themes}
        )

    # Pairwise coverage comparison plus shared/per-article topics.
    coverage_differences, overlap = compare_articles(digests)

    # Overall verdict derived from the sentiment counts.
    verdict = generate_final_sentiment(tally)

    # Spoken (Hindi) version of the verdict; yields the audio file path.
    audio_path = generate_hindi_audio(verdict)

    comparative_score = {
        "Sentiment Distribution": tally,
        "Coverage Differences": coverage_differences,
        "Topic Overlap": overlap,
    }
    return {
        "Company": company,
        "Articles": digests,
        "Comparative Sentiment Score": comparative_score,
        "Final Sentiment Analysis": verdict,
        "Audio": f"[Play Hindi Speech] ({audio_path})",
    }

def compare_articles(articles: list) -> tuple:
    """Compare articles pairwise and summarize how their topics overlap.

    Args:
        articles: Dicts carrying "Title", "Sentiment" and "Topics" keys.

    Returns:
        ``(comparisons, topic_overlap)`` where ``comparisons`` holds one
        ``{"Comparison", "Impact"}`` dict per unordered pair of articles and
        ``topic_overlap`` holds:

        * "Common Topics": topics present in every article that has at least
          one topic, in the order they appear in the first such article;
        * "Unique Topics per Article": each title mapped to its topics with
          duplicates removed, in original order. Common topics are not
          filtered out, and articles sharing a title overwrite one another.
    """
    comparisons = []

    for i, art1 in enumerate(articles):
        for art2 in articles[i + 1:]:
            comparison_text = (f"Article '{art1['Title']}' focuses on {art1['Topics']}, "
                               f"while article '{art2['Title']}' discusses {art2['Topics']}.")
            if art1["Sentiment"] != art2["Sentiment"]:
                impact_text = (f"'{art1['Title']}' is {art1['Sentiment']} news, "
                               f"whereas '{art2['Title']}' is {art2['Sentiment']} news. "
                               "This may lead to mixed market reactions.")
            else:
                # Same sentiment on both sides: claiming a "mixed" reaction
                # would contradict the data being compared.
                impact_text = (f"Both '{art1['Title']}' and '{art2['Title']}' are "
                               f"{art1['Sentiment']} news. "
                               "This suggests consistent coverage.")
            comparisons.append({"Comparison": comparison_text, "Impact": impact_text})

    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # output is stable between runs (set iteration order is not).
    deduped = [list(dict.fromkeys(art["Topics"])) for art in articles]

    # Articles without topics are ignored when looking for shared topics.
    with_topics = [topics for topics in deduped if topics]
    if with_topics:
        others = [set(topics) for topics in with_topics[1:]]
        common_topics = [t for t in with_topics[0] if all(t in seen for seen in others)]
    else:
        common_topics = []

    topic_overlap = {
        "Common Topics": common_topics,
        "Unique Topics per Article": {
            art["Title"]: topics for art, topics in zip(articles, deduped)
        }
    }
    return comparisons, topic_overlap


def generate_hindi_audio(text: str, file_path: str = "sentiment_analysis_hindi.mp3") -> str:
    """Translate ``text`` to Hindi and save it as spoken audio.

    Args:
        text: Text to translate; the source language is auto-detected.
        file_path: Where to write the audio. Defaults to
            "sentiment_analysis_hindi.mp3" in the working directory; pass a
            different path to avoid overwriting an earlier result.

    Returns:
        The path the audio was saved to (``file_path``).

    Errors raised by the translator (deep_translator's GoogleTranslator) or
    by gTTS are not caught here and propagate to the caller.
    """
    # Translate to Hindi
    hindi_text = GoogleTranslator(source="auto", target="hi").translate(text)
    print("🔹 Translated Hindi Text:", hindi_text)  # Debugging

    # Convert to speech and write it to disk
    gTTS(text=hindi_text, lang="hi").save(file_path)

    return file_path


# Example usage
if __name__ == "__main__":
    # Hand-written sample input (swap in real articles from the scraper).
    articles = [
        dict(
            title="Tesla's New Model Breaks Sales Records",
            summary="Tesla's latest EV sees record sales in Q3...",
        ),
        dict(
            title="Regulatory Scrutiny on Tesla's Self-Driving Tech",
            summary="Regulators have raised concerns over Tesla’s self-driving software...",
        ),
    ]

    # Build the report, then pretty-print it as JSON.
    result = summarize_articles("Tesla", articles)
    print(json.dumps(result, indent=4))


🔹 Translated Hindi Text: समग्र भावना सकारात्मक है, जो कंपनी के विकास के बारे में आशावाद का संकेत देती है।
{
    "Company": "Tesla",
    "Articles": [
        {
            "Title": "Tesla's New Model Breaks Sales Records",
            "Summary": "Tesla's latest EV sees record sales in Q3...",
            "Sentiment": "Positive",
            "Topics": [
                "Tesla",
                "latest",
                "record"
            ]
        },
        {
            "Title": "Regulatory Scrutiny on Tesla's Self-Driving Tech",
            "Summary": "Regulators have raised concerns over Tesla\u2019s self-driving software...",
            "Sentiment": "Neutral",
            "Topics": [
                "Tesla",
                "Regulators",
                "software"
            ]
        }
    ],
    "Comparative Sentiment Score": {
        "Sentiment Distribution": {
            "Positive": 1,
            "Negative": 0,
            "Neutral": 1
        },
        "Coverage Differ