In [None]:
import requests
from bs4 import BeautifulSoup
import trafilatura
import json
from typing import Optional, Dict, Any, List, Dict
import logging
import anthropic
import os

# API credentials come from the environment; each name is None when its
# variable is not set.
cryptonews_api = os.getenv('CRYPTONEWS_API')
claude_api = os.getenv('CLAUDE_API')

In [None]:
# Request BTC articles from the CryptoNews API (query: 10 items, type=article,
# sorted by rank, last 3 days, first page).
url = f"https://cryptonews-api.com/api/v1?tickers=BTC&items=10&type=article&sortby=rank&days=3&page=1&token={cryptonews_api}"
# Without a timeout, requests waits indefinitely on an unresponsive server.
response = requests.get(url, timeout=30)
# Surface HTTP errors (bad token, rate limit, ...) here rather than as a
# confusing failure when the payload is used later.
response.raise_for_status()
data = response.json()

In [None]:
class ArticleExtractor:
    """Extract clean article content from various news websites.

    Extraction is attempted with trafilatura first. When trafilatura cannot
    download the page or produces no result, a BeautifulSoup heuristic based
    on common article markup patterns is used instead.
    """

    def __init__(self, timeout: float = 10.0):
        """Create the extractor.

        Args:
            timeout: Seconds to wait for the HTTP request issued by the
                BeautifulSoup fallback before giving up.
        """
        self.logger = logging.getLogger(__name__)
        logging.basicConfig(level=logging.INFO)
        self.timeout = timeout

    def extract_content(self, url: str) -> Optional[Dict[str, Any]]:
        """
        Extract article content from a given URL using multiple fallback methods.

        Args:
            url: The URL of the news article

        Returns:
            Dict containing extracted content with keys:
            - title: Article title
            - text: Main article text
            - author: Author name (if available)
            - date: Publication date (if available)
            - summary: Article summary/description (if available)
            Values that could not be determined are empty strings.
            None is returned when extraction fails with an error (the error
            is logged, not raised).
        """
        try:
            # First try using trafilatura as it's typically most reliable
            downloaded = trafilatura.fetch_url(url)
            if downloaded:
                result = trafilatura.extract(downloaded, include_comments=False,
                                             include_tables=False,
                                             output_format='json')
                if result:
                    content = json.loads(result)
                    # `or ''` also covers keys that are present but null in
                    # the JSON, so callers always receive strings.
                    return {
                        'title': content.get('title') or '',
                        'text': content.get('text') or '',
                        'author': content.get('author') or '',
                        'date': content.get('date') or '',
                        # NOTE(review): trafilatura's JSON output appears to
                        # expose the description as 'excerpt'; accept both.
                        'summary': (content.get('description')
                                    or content.get('excerpt')
                                    or ''),
                    }

            # Fallback to BeautifulSoup method
            return self._extract_with_beautifulsoup(url)

        except Exception as e:
            self.logger.error(f"Error extracting content from {url}: {str(e)}")
            return None

    @staticmethod
    def _first_value(candidates) -> str:
        """Return the first non-empty value among tag candidates.

        Each candidate is either None (pattern not found) or a tag-like
        object. The tag's ``content`` attribute is preferred (meta tags);
        otherwise its stripped text is used. Returns '' if nothing matches.
        """
        for candidate in candidates:
            if candidate is None:
                continue
            value = candidate.get('content', candidate.text.strip())
            if value:
                return value
        return ''

    def _extract_with_beautifulsoup(self, url: str) -> Optional[Dict[str, Any]]:
        """Fallback method using BeautifulSoup with common article patterns.

        Args:
            url: The URL of the news article

        Returns:
            Dict with the same keys as ``extract_content``, or None when the
            request or the parsing fails (the error is logged).
        """
        try:
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
            }
            # A timeout prevents one unresponsive site from blocking forever.
            response = requests.get(url, headers=headers, timeout=self.timeout)
            response.raise_for_status()

            soup = BeautifulSoup(response.text, 'html.parser')

            # Remove unwanted elements
            for element in soup.find_all(['script', 'style', 'nav', 'header', 'footer', 'iframe']):
                element.decompose()

            # Extract title (try multiple common patterns)
            title = self._first_value([
                soup.find('meta', property='og:title'),
                soup.find('meta', property='twitter:title'),
                # Also look for the same tag keyed by the "name" attribute.
                soup.find('meta', attrs={'name': 'twitter:title'}),
                soup.find('h1'),
                soup.find(class_=['article-title', 'entry-title', 'post-title']),
            ])

            # Extract main content (try multiple common patterns)
            content = ''
            content_candidates = [
                soup.find(class_=['article-content', 'entry-content', 'post-content', 'main-content']),
                soup.find('article'),
                soup.find(role='main'),
                soup.find(id=['content', 'main-content', 'article-content']),
            ]
            for container in content_candidates:
                if container is None:
                    continue
                paragraphs = (p.text.strip() for p in container.find_all('p'))
                content = ' '.join(text for text in paragraphs if text)
                if content:
                    break

            # Extract metadata.
            # The HTML attribute called "name" must be passed through `attrs`:
            # find()'s own first parameter is also called `name`, so writing
            # find('meta', name=...) raises TypeError.
            author = self._first_value([
                soup.find('meta', property='author'),
                soup.find('meta', attrs={'name': 'author'}),
                soup.find(class_=['author', 'article-author', 'entry-author']),
            ])

            # Extract date
            date = self._first_value([
                soup.find('meta', property='article:published_time'),
                soup.find('meta', property='article:modified_time'),
                soup.find(class_=['date', 'article-date', 'entry-date']),
            ])

            # Extract summary/description
            summary = self._first_value([
                soup.find('meta', property='og:description'),
                soup.find('meta', attrs={'name': 'description'}),
                soup.find(class_=['article-summary', 'entry-summary']),
            ])

            return {
                'title': title,
                'text': content,
                'author': author,
                'date': date,
                'summary': summary
            }

        except Exception as e:
            self.logger.error(f"Error in BeautifulSoup extraction for {url}: {str(e)}")
            return None

In [None]:
# Instantiate the article extractor; its constructor also sets up logging at
# INFO level.
extractor = ArticleExtractor()

In [None]:
# Attach the extracted body text to every article record returned by the API.
# When extraction fails (extract_content returns None) an empty string is
# stored instead.
for d in data['data']:
    content = extractor.extract_content(d['news_url'])
    d['main_content'] = '' if content is None else content['text']

# The enriched records, now carrying a 'main_content' field.
articles = data['data']

In [None]:
# Bare expression: as the last statement of a notebook cell this displays the
# enriched article list for inspection.
articles

In [None]:
def analyze_bitcoin_news(api_key: str,
                        news_articles: List[Dict],
                        system_prompt: str = """You are an AI journalist specializing in cryptocurrency news analysis. 
                        Analyze multiple Bitcoin news articles and create a comprehensive summary of the most important developments and trends.
                        Always cite your claims using the provided URLs.""",
                        model: str = "claude-3-opus-20240229",
                        max_tokens: int = 4096) -> Optional[List[Any]]:
    """
    Analyze multiple Bitcoin news articles and generate a comprehensive summary with citations

    Parameters:
    api_key (str): Your Anthropic API key
    news_articles (List[Dict]): List of dictionaries containing news articles.
        The keys 'title', 'source_name', 'date', 'news_url' and 'main_content'
        are read; a missing key is rendered as an empty string.
    system_prompt (str): System prompt for Claude
    model (str): Identifier of the Claude model to call.
        NOTE(review): the default is an older snapshot; confirm it is still
        available for your account.
    max_tokens (int): Upper bound on the number of tokens in the reply

    Returns:
    The `content` attribute of the API response (a list of content blocks;
    the analysis text is carried by the first block), or None if the API
    call raised an exception (the error is printed).
    """

    client = anthropic.Anthropic(api_key=api_key)

    # Format all articles into a single text. `.get` keeps one incomplete
    # record from aborting the whole analysis with a KeyError.
    formatted_articles = "\n\n===ARTICLE===\n\n".join(
        f"""Title: {article.get('title', '')}
        Source: {article.get('source_name', '')}
        Date: {article.get('date', '')}
        URL: {article.get('news_url', '')}
        
        Content:
        {article.get('main_content', '')}"""
        for article in news_articles
    )

    user_prompt = """Please analyze these Bitcoin news articles and create a comprehensive summary. Focus on:
    1. Major price movements and market trends
    2. Significant regulatory developments
    3. Notable institutional or corporate developments
    4. Key market sentiment indicators
    
    Requirements:
    - Include a reference list with the URLs at the end. This is very important.
    - Write in a direct journalistic style
    - Organize the information thematically
    - Highlight any emerging trends or patterns across multiple articles
    - Note any conflicting information or perspectives
    - Conclude with the most important takeaways
    
    Here are the articles to analyze:
    
    """ + formatted_articles

    try:
        # Make API call to Claude
        message = client.messages.create(
            model=model,
            max_tokens=max_tokens,
            system=system_prompt,
            messages=[
                {
                    "role": "user",
                    "content": user_prompt
                }
            ]
        )

        # Return the content blocks unchanged: existing callers index into
        # this list themselves.
        return message.content

    except Exception as e:
        print(f"Error processing articles: {str(e)}")
        return None

In [None]:
# Process the articles: send them to Claude for analysis. `result` holds
# whatever analyze_bitcoin_news returns: the response content on success,
# or None if the API call failed.
result = analyze_bitcoin_news(claude_api, articles)

In [None]:
# analyze_bitcoin_news returns None when the API call fails, so guard before
# indexing. On success, print the text of the first content block.
if result:
    print(result[0].model_dump()['text'])
else:
    print("No analysis available: analyze_bitcoin_news returned no content.")