In [None]:
# Cell 1: Setup and Import
import os
import json
import requests
from datetime import datetime, timedelta, timezone
from collections import defaultdict, Counter
import pandas as pd
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

NEWSAPI_AI_KEY = os.getenv("NEWSAPI_AI_KEY")
NEWSAPI_AI_BASE = "https://newsapi.ai/api/v1/article/getArticlesForTopicPage"



print(f"NewsAPI.ai Key loaded: {'✅' if NEWSAPI_AI_KEY else '❌'}")

if not NEWSAPI_AI_KEY:
    print("\nTo get NewsAPI.ai key:")
    print("1. Go to https://newsapi.ai/")
    print("2. Sign up for free account (10,000 articles/month)")
    print("3. Add to .env: NEWSAPI_AI_KEY=your_key_here")

NewsAPI.ai Key loaded: ✅


In [27]:
# Cell 2: Core Functions for Topic Retrieval
def get_articles_from_topic(topic_uri, topic_name="Topic", max_articles=100):
    """
    Retrieve articles from a specific NewsAPI.ai topic
    
    Args:
        topic_uri: The URI/ID of your topic from NewsAPI.ai
        topic_name: Friendly name for logging
        max_articles: Maximum articles to retrieve
    """
    
    if topic_uri.startswith("YOUR_"):
        print(f"⚠️  Please update {topic_name} topic URI in configuration")
        return []
    
    params = {

        "apiKey": NEWSAPI_AI_KEY,
        "uri": topic_uri,
        "infoArticleBodyLen": -1,
        "resultType": "articles",
        "articlesSortBy": "fq",
        }
    #     # Language
    #     "lang": "eng",
        
    #     # Include all relevant fields
    #     "includeArticleTitle": True,
    #     "includeArticleBody": True,
    #     "includeArticleBasicInfo": True,
    #     "includeArticleCategories": True,
    #     "includeSourceName": True,
    #     "includeSourceLocation": True,
    #     "includeArticleImage": True,
        
    #     # Sorting and filtering
    #     "articlesSortBy": "date",  # or "rel" for relevance
    #     "articlesSortByAsc": False,  # Newest first
    #     "isDuplicateFilter": "skipDuplicates",
        
    #     # Result settings
    #     "articlesCount": max_articles,
    #     "resultType": "articles"
    # }
    
    print(f"\n🔍 Fetching articles from topic: {topic_name}")
    print(f"   Topic URI: {topic_uri}")
    
    try:
        response = requests.get(NEWSAPI_AI_BASE, params=params, timeout=30)
        
        if response.status_code != 200:
            print(f"❌ Error {response.status_code}: {response.text[:200]}")
            return []
        
        data = response.json()
        articles = data.get('articles', {}).get('results', [])
        total = data.get('articles', {}).get('totalResults', 0)
        
        print(f"✅ Retrieved {len(articles)} articles (Total available: {total:,})")
        
        return articles
        
    except Exception as e:
        print(f"❌ Exception: {e}")
        return []

def analyze_articles(articles, topic_name="Topic"):
    """Analyze a set of articles for quality and relevance"""
    
    if not articles:
        print(f"No articles to analyze for {topic_name}")
        return None
    
    analysis = {
        'topic': topic_name,
        'total_articles': len(articles),
        'sources': Counter(),
        'dates': Counter(),
        'categories': Counter(),
        'relevance_scores': [],
        'similarity_scores': [],
        'body_lengths': [],
        'has_image': 0
    }
    
    for article in articles:
        # Source analysis
        source = article.get('source', {}).get('title', 'Unknown')
        analysis['sources'][source] += 1
        
        # Date analysis
        date_str = article.get('date', '')
        if date_str:
            analysis['dates'][date_str] += 1
        
        # Category analysis
        categories = article.get('categories', [])
        for cat in categories:
            if cat.get('label'):
                analysis['categories'][cat['label']] += 1
        
        # Score analysis
        if 'relevance' in article:
            analysis['relevance_scores'].append(article['relevance'])
        if 'sim' in article:
            analysis['similarity_scores'].append(article['sim'])
        
        # Content analysis
        body = article.get('body', '')
        if body:
            analysis['body_lengths'].append(len(body))
        
        if article.get('image'):
            analysis['has_image'] += 1
    
    return analysis

def print_analysis(analysis):
    """Pretty print the analysis results"""
    
    if not analysis:
        return
    
    print(f"\n📊 Analysis for {analysis['topic']}")
    print("="*60)
    
    print(f"Total articles: {analysis['total_articles']}")
    print(f"Articles with images: {analysis['has_image']}")
    
    # Source distribution
    print(f"\nTop 10 Sources:")
    for source, count in analysis['sources'].most_common(10):
        print(f"  {source}: {count}")
    
    # Date distribution
    print(f"\nArticles by date:")
    for date, count in sorted(analysis['dates'].items(), reverse=True)[:7]:
        print(f"  {date}: {count}")
    
    # Category distribution
    if analysis['categories']:
        print(f"\nTop 10 Categories:")
        for cat, count in analysis['categories'].most_common(10):
            print(f"  {cat}: {count}")
    
    # Score statistics
    if analysis['relevance_scores']:
        scores = analysis['relevance_scores']
        print(f"\nRelevance scores:")
        print(f"  Min: {min(scores):.1f}, Max: {max(scores):.1f}, Avg: {sum(scores)/len(scores):.1f}")
    
    if analysis['similarity_scores']:
        scores = analysis['similarity_scores']
        print(f"\nSimilarity scores:")
        print(f"  Min: {min(scores):.1f}, Max: {max(scores):.1f}, Avg: {sum(scores)/len(scores):.1f}")
    
    # Content statistics
    if analysis['body_lengths']:
        lengths = analysis['body_lengths']
        print(f"\nArticle body lengths:")
        print(f"  Min: {min(lengths):,} chars")
        print(f"  Max: {max(lengths):,} chars")
        print(f"  Avg: {sum(lengths)//len(lengths):,} chars")

In [28]:
# Cell 3: Test Individual Topics
# Test each topic individually

# topics = [
#     (DEFENSE_TECH_TOPIC, "Defense Technology"),
#     (DARPA_CONTRACTS_TOPIC, "DARPA Contracts"),
#     (AUTONOMOUS_SYSTEMS_TOPIC, "Autonomous Systems"),
#     (MAINTENANCE_TECH_TOPIC, "Maintenance Technology")
# ]

topics = [
    ("3d9d3ac4-4ee8-479a-bb16-040d8f7f13bd", "Defense Technology"),]

all_topic_articles = {}

for topic_uri, topic_name in topics:
    articles = get_articles_from_topic(topic_uri, topic_name, max_articles=50)
    
    if articles:
        all_topic_articles[topic_name] = articles
        analysis = analyze_articles(articles, topic_name)
        print_analysis(analysis)
    else:
        print(f"\n⚠️  No articles retrieved for {topic_name}")

print(f"\n\n📈 Summary:")
print("="*60)
for topic_name, articles in all_topic_articles.items():
    print(f"{topic_name}: {len(articles)} articles")


🔍 Fetching articles from topic: Defense Technology
   Topic URI: 3d9d3ac4-4ee8-479a-bb16-040d8f7f13bd
✅ Retrieved 100 articles (Total available: 5,791)

📊 Analysis for Defense Technology
Total articles: 100
Articles with images: 94

Top 10 Sources:
  Army Recognition: 9
  Defense News: 6
  indiandefensenews.in: 6
  Market Screener: 4
  Aviation Week: 3
  Visegrád Post: 3
  Manufacturing.net: 2
  machinist.in: 2
  NewsDrum: 2
  SpaceWar: 2

Articles by date:
  2025-08-06: 54
  2025-08-05: 29
  2025-08-04: 8
  2025-08-03: 1
  2025-08-02: 3
  2025-08-01: 5

Relevance scores:
  Min: 80.0, Max: 130.0, Avg: 89.3

Similarity scores:
  Min: 0.0, Max: 0.9, Avg: 0.3

Article body lengths:
  Min: 394 chars
  Max: 9,553 chars
  Avg: 3,150 chars


📈 Summary:
Defense Technology: 100 articles


In [29]:
# Cell 4: Display Sample Articles from Each Topic
def display_sample_articles(articles, topic_name, num_samples=3):
    """Display sample articles from a topic"""
    
    print(f"\n📰 Sample Articles from {topic_name}")
    print("-"*60)
    
    for i, article in enumerate(articles[:num_samples], 1):
        print(f"\n{i}. {article.get('title', 'No title')}")
        print(f"   Source: {article.get('source', {}).get('title', 'Unknown')}")
        print(f"   Date: {article.get('dateTime', 'Unknown')}")
        print(f"   URL: {article.get('url', 'No URL')}")
        
        # Relevance and similarity
        if 'relevance' in article:
            print(f"   Relevance: {article['relevance']}")
        if 'sim' in article:
            print(f"   Similarity: {article['sim']}")
        
        # Categories
        categories = article.get('categories', [])
        if categories:
            cat_labels = [cat.get('label', '') for cat in categories if cat.get('label')][:3]
            if cat_labels:
                print(f"   Categories: {', '.join(cat_labels)}")
        
        # Body preview
        body = article.get('body', '')
        if body:
            preview = body[:200] + "..." if len(body) > 200 else body
            print(f"   Preview: {preview}")

# Display samples from each topic
for topic_name, articles in all_topic_articles.items():
    display_sample_articles(articles, topic_name, num_samples=3)


📰 Sample Articles from Defense Technology
------------------------------------------------------------

1. Raven Space Systems Chooses Colorado for New Headquarters, Manufacturing Facility
   Source: Manufacturing.net
   Date: 2025-08-05T15:15:15Z
   URL: https://www.manufacturing.net/additive-manufacturing/news/22947186/raven-space-systems-chooses-colorado-for-new-headquarters-manufacturing-facility
   Relevance: 130
   Similarity: 0
   Preview: BROOMFIELD - Raven Space Systems, a 3D printing company that specializes in aerospace-grade composites, selected Broomfield, Colorado, for its new headquarters and manufacturing facility.

The company...

2. US Army readies to release new missile defense strategy soon
   Source: Defense News
   Date: 2025-08-05T23:41:15Z
   URL: https://www.defensenews.com/land/2025/08/05/us-army-readies-to-release-new-missile-defense-strategy-soon/
   Relevance: 110
   Similarity: 0
   Preview: HUNTSVILLE, Ala. -- The U.S. Army is about three months away fro

In [30]:
# Cell 5: Cross-Topic Analysis
def cross_topic_analysis(all_articles_dict):
    """Compare articles across different topics"""
    
    print("\n🔄 Cross-Topic Analysis")
    print("="*60)
    
    # Find overlapping sources
    all_sources = {}
    for topic_name, articles in all_articles_dict.items():
        sources = set()
        for article in articles:
            source = article.get('source', {}).get('title', 'Unknown')
            sources.add(source)
        all_sources[topic_name] = sources
    
    print("\n📡 Source Coverage by Topic:")
    for topic_name, sources in all_sources.items():
        print(f"{topic_name}: {len(sources)} unique sources")
    
    # Find sources that appear in multiple topics
    if len(all_sources) > 1:
        print("\n🔗 Sources appearing in multiple topics:")
        source_topics = defaultdict(list)
        for topic_name, sources in all_sources.items():
            for source in sources:
                source_topics[source].append(topic_name)
        
        multi_topic_sources = {s: topics for s, topics in source_topics.items() if len(topics) > 1}
        for source, topics in sorted(multi_topic_sources.items(), key=lambda x: len(x[1]), reverse=True)[:10]:
            print(f"  {source}: {', '.join(topics)}")
    
    # Date coverage analysis
    print("\n📅 Date Coverage:")
    for topic_name, articles in all_articles_dict.items():
        dates = set()
        for article in articles:
            date = article.get('date', '')
            if date:
                dates.add(date)
        if dates:
            print(f"{topic_name}: {min(dates)} to {max(dates)} ({len(dates)} days)")
    
    # Find potentially duplicate articles across topics
    print("\n🔍 Checking for duplicate articles across topics...")
    all_urls = defaultdict(list)
    for topic_name, articles in all_articles_dict.items():
        for article in articles:
            url = article.get('url', '')
            if url:
                all_urls[url].append(topic_name)
    
    duplicates = {url: topics for url, topics in all_urls.items() if len(topics) > 1}
    if duplicates:
        print(f"Found {len(duplicates)} articles appearing in multiple topics")
        for url, topics in list(duplicates.items())[:5]:
            print(f"  Article in: {', '.join(topics)}")
    else:
        print("No duplicate articles found across topics")

# Run cross-topic analysis
if all_topic_articles:
    cross_topic_analysis(all_topic_articles)


🔄 Cross-Topic Analysis

📡 Source Coverage by Topic:
Defense Technology: 65 unique sources

📅 Date Coverage:
Defense Technology: 2025-08-01 to 2025-08-06 (6 days)

🔍 Checking for duplicate articles across topics...
No duplicate articles found across topics


In [31]:
# Cell 6: Export Results for Pipeline Testing
def export_for_pipeline(articles, topic_name):
    """Export articles in a format suitable for your NiFi/Airflow pipeline"""
    
    export_data = []
    
    for article in articles:
        # Transform to your pipeline's expected format
        doc_id = article.get('uri', '').replace('/', '_') or f"newsapi_ai_{hash(article.get('url', ''))}"
        
        export_item = {
            'doc_id': doc_id,
            'title': article.get('title', ''),
            'content': article.get('body', ''),
            'description': article.get('body', '')[:500] if article.get('body') else '',
            'link': article.get('url', ''),
            'published_at': article.get('dateTime', ''),
            'source': article.get('source', {}).get('title', 'Unknown'),
            'source_type': 'news_article',
            'topic': topic_name,
            'relevance_score': article.get('relevance', 0),
            'similarity_score': article.get('sim', 0),
            'categories': [cat.get('label', '') for cat in article.get('categories', []) if cat.get('label')],
            'ingestion_timestamp': datetime.utcnow().isoformat()
        }
        
        export_data.append(export_item)
    
    return export_data

# Export all topics
all_exports = {}
for topic_name, articles in all_topic_articles.items():
    export_data = export_for_pipeline(articles, topic_name)
    all_exports[topic_name] = export_data
    
    # Save to JSON file
    filename = f"newsapi_ai_{topic_name.lower().replace(' ', '_')}_{datetime.now().strftime('%Y%m%d')}.json"
    with open(filename, 'w') as f:
        json.dump(export_data, f, indent=2)
    
    print(f"✅ Exported {len(export_data)} articles to {filename}")

# Create a combined export
if all_exports:
    combined = []
    for topic_name, data in all_exports.items():
        combined.extend(data)
    
    combined_filename = f"newsapi_ai_all_topics_{datetime.now().strftime('%Y%m%d')}.json"
    with open(combined_filename, 'w') as f:
        json.dump(combined, f, indent=2)
    
    print(f"\n✅ Combined export: {len(combined)} articles to {combined_filename}")

✅ Exported 100 articles to newsapi_ai_defense_technology_20250806.json

✅ Combined export: 100 articles to newsapi_ai_all_topics_20250806.json


In [44]:
# Cell 4: Analyze Response Structure
if defense_articles:
    print("\n📋 Article Structure Analysis")
    print("=" * 50)
    
    # Look at first article structure
    first_article = defense_articles[0]
    print("Available fields in article:")
    for key in first_article.keys():
        value_type = type(first_article[key]).__name__
        print(f"  - {key}: {value_type}")
    
    # Check nested structures
    if 'source' in first_article:
        print("\nSource fields:")
        for key in first_article['source'].keys():
            print(f"  - source.{key}: {type(first_article['source'][key]).__name__}")



📋 Article Structure Analysis
Available fields in article:
  - uri: str
  - lang: str
  - isDuplicate: bool
  - date: str
  - time: str
  - dateTime: str
  - dateTimePub: str
  - dataType: str
  - sim: int
  - url: str
  - title: str
  - body: str
  - source: dict
  - authors: list
  - image: str
  - eventUri: NoneType
  - sentiment: float
  - wgt: int
  - relevance: int

Source fields:
  - source.uri: str
  - source.dataType: str
  - source.title: str


In [45]:
# Cell 5: Content Quality Analysis
print("\n📊 Content Quality Analysis")
print("=" * 50)

content_stats = {
    'has_body': 0,
    'has_title': 0,
    'has_image': 0,
    'body_lengths': [],
    'title_lengths': []
}

for article in defense_articles:
    if article.get('title'):
        content_stats['has_title'] += 1
        content_stats['title_lengths'].append(len(article['title']))
    
    if article.get('body'):
        content_stats['has_body'] += 1
        content_stats['body_lengths'].append(len(article['body']))
    
    if article.get('image'):
        content_stats['has_image'] += 1

print(f"Articles with title: {content_stats['has_title']}/{len(defense_articles)}")
print(f"Articles with body: {content_stats['has_body']}/{len(defense_articles)}")
print(f"Articles with image: {content_stats['has_image']}/{len(defense_articles)}")

if content_stats['body_lengths']:
    print(f"\nBody length stats:")
    print(f"  Min: {min(content_stats['body_lengths'])} chars")
    print(f"  Max: {max(content_stats['body_lengths'])} chars")
    print(f"  Avg: {sum(content_stats['body_lengths']) // len(content_stats['body_lengths'])} chars")



📊 Content Quality Analysis
Articles with title: 100/100
Articles with body: 100/100
Articles with image: 100/100

Body length stats:
  Min: 57 chars
  Max: 17535 chars
  Avg: 2549 chars


In [46]:
# Cell 6: Source Analysis
print("\n📰 Source Analysis")
print("=" * 50)

sources = Counter()
source_countries = Counter()

for article in defense_articles:
    source = article.get('source', {})
    source_name = source.get('title', 'Unknown')
    sources[source_name] += 1
    
    # Check if location info is available
    if 'location' in source:
        country = source.get('location', {}).get('country', {}).get('label', {}).get('eng', 'Unknown')
        source_countries[country] += 1

print(f"Total unique sources: {len(sources)}")
print("\nTop 15 sources:")
for source, count in sources.most_common(15):
    print(f"  {source}: {count} articles")

if source_countries:
    print(f"\nArticles by country:")
    for country, count in source_countries.most_common(10):
        print(f"  {country}: {count} articles")



📰 Source Analysis
Total unique sources: 71

Top 15 sources:
  The Manila times: 4 articles
  Sports Illustrated: 3 articles
  Weston Mercury: 3 articles
  Colorado Springs Gazette: 3 articles
  WGXA: 3 articles
  Daily Voice: 3 articles
  Daily Mail Online: 3 articles
  odessa-journal.com: 3 articles
  Daily Times: 2 articles
  Kalkine Media: 2 articles
  The Boston Globe: 2 articles
  WXXV 25: 2 articles
  Yorkregion.com: 2 articles
  WXLV: 2 articles
  GhanaWeb: 2 articles


In [47]:
# Cell 7: Date Distribution
print("\n📅 Date Distribution")
print("=" * 50)

date_distribution = Counter()

for article in defense_articles:
    date_str = article.get('dateTime', '')
    if date_str:
        try:
            date = datetime.fromisoformat(date_str.replace('Z', '+00:00'))
            date_key = date.strftime('%Y-%m-%d')
            date_distribution[date_key] += 1
        except:
            pass

for date, count in sorted(date_distribution.items(), reverse=True):
    print(f"{date}: {count} articles")


📅 Date Distribution
2025-08-05: 100 articles


In [48]:
# Cell 8: Display Sample Articles
print("\n📄 Sample Articles")
print("=" * 50)

for i, article in enumerate(defense_articles[:5], 1):
    print(f"\n{i}. {article.get('title', 'No title')}")
    print(f"   Source: {article.get('source', {}).get('title', 'Unknown')}")
    print(f"   Date: {article.get('dateTime', 'Unknown')}")
    print(f"   URL: {article.get('url', 'No URL')}")
    
    body = article.get('body', '')
    if body:
        preview = body[:200] + "..." if len(body) > 200 else body
        print(f"   Preview: {preview}")


📄 Sample Articles

1. 'Clearly that's his opinion': Mike Johnson swatted down by Trump admin
   Source: Raw Story
   Date: 2025-08-05T20:10:16Z
   URL: https://www.rawstory.com/mike-johnson-israel/
   Preview: House Speaker Mike Johnson (R-LA) did not receive resounding support from the Trump administration over this week's comments regarding Israel and Gaza.

During Tuesday's Pentagon news briefing, a repo...

2. England face searching Ashes questions after India series thriller - Daily Times
   Source: Daily Times
   Date: 2025-08-05T20:10:10Z
   URL: https://dailytimes.com.pk/1348832/england-face-searching-ashes-questions-after-india-series-thriller/
   Preview: Their next major red-ball assignment is a five-match Ashes series away to arch-rivals Australia -- where England have gone 15 Tests without a win -- starting in November.

Below AFP Sport looks at som...

3. Sixers Big Man Named to National Team Roster
   Source: Sports Illustrated
   Date: 2025-08-05T20:10:03Z
   URL: http