In [37]:
# Cell 1: Setup and Import
import os
import json
import requests
from datetime import datetime, timedelta, timezone
from collections import defaultdict, Counter
import pandas as pd
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

newsapi_ai_key = os.getenv("NEWSAPI_AI_KEY")
print(f"NewsAPI.ai Key loaded: {'✅' if newsapi_ai_key else '❌'}")

if not newsapi_ai_key:
    print("\nTo get NewsAPI.ai key:")
    print("1. Go to https://newsapi.ai/")
    print("2. Sign up for free account (10,000 articles/month)")
    print("3. Add to .env: NEWSAPI_AI_KEY=your_key_here")

NewsAPI.ai Key loaded: ✅


In [41]:
# Cell 2: Test Basic Connection
# Correct endpoint URL
newsapi_ai_base = "https://newsapi.ai/api/v1/article/getArticles"

# Simple test query
test_params = {
    "apiKey": newsapi_ai_key,
    "q": "automated maintenance, repair, and inspection technologies",
    "lang": "eng",
    "articlesCount": 5,
    "includeArticleTitle": True,
    "resultType": "articles"
}

print("Testing NewsAPI.ai connection...")
try:
    response = requests.get(newsapi_ai_base, params=test_params, timeout=30)
    print(f"Status Code: {response.status_code}")
    
    if response.status_code == 200:
        data = response.json()
        print("✅ Connection successful!")
        print(f"Response keys: {list(data.keys())}")
        
        if 'articles' in data:
            total = data['articles'].get('totalResults', 0)
            articles = data['articles'].get('results', [])
            print(f"\nTotal articles available: {total:,}")
            print(f"Articles returned: {len(articles)}")
    else:
        print(f"❌ Error: {response.text}")
        
except Exception as e:
    print(f"❌ Connection error: {e}")

Testing NewsAPI.ai connection...
Status Code: 200
✅ Connection successful!
Response keys: ['articles']

Total articles available: 4,533,439
Articles returned: 5


In [42]:
# Cell 3: Defense Industry Query
# Test with defense-specific query
defense_params = {
    "apiKey": newsapi_ai_key,
    "q": "automated maintenance, repair, and inspection technologies",
    "lang": "eng",
    "dateFrom": (datetime.now(timezone.utc) - timedelta(days=5)).strftime('%Y-%m-%d'),
    "dateTo": datetime.now(timezone.utc).strftime('%Y-%m-%d'),
    "includeArticleTitle": True,
    "includeArticleBody": True,
    "includeSourceName": True,
    "includeArticleImage": True,
    "articlesSortBy": "date",  # or "rel" for relevance
    "articlesCount": 100,
    "resultType": "articles"
}

print(f"\n{'='*60}")
print("DEFENSE INDUSTRY SEARCH")
print(f"{'='*60}")
print(f"Query: {defense_params['q']}")
print(f"Date range: {defense_params['dateFrom']} to {defense_params['dateTo']}")

response = requests.get(newsapi_ai_base, params=defense_params, timeout=30)

if response.status_code == 200:
    defense_data = response.json()
    defense_articles = defense_data.get('articles', {}).get('results', [])
    total_available = defense_data.get('articles', {}).get('totalResults', 0)
    
    print(f"\nResults:")
    print(f"  Total available: {total_available:,}")
    print(f"  Retrieved: {len(defense_articles)}")
else:
    print(f"Error: {response.status_code} - {response.text}")
    defense_articles = []



DEFENSE INDUSTRY SEARCH
Query: automated maintenance, repair, and inspection technologies
Date range: 2025-07-31 to 2025-08-05

Results:
  Total available: 4,526,292
  Retrieved: 100


In [51]:
# Cell 3a: Debug Date Filtering
print("🔍 Testing Date Filter Behavior")
print("=" * 60)

# Test 1: Very narrow date range (just today)
today = datetime.now(timezone.utc).strftime('%Y-%m-%d')

narrow_params = {
    "apiKey": newsapi_ai_key,
    "q": "automated maintenance",
    "lang": "eng",
    "dateFrom": today,
    "dateTo": today,
    "articlesCount": 10,
    "articlesSortBy": "date",
    "resultType": "articles"
}

response = requests.get(newsapi_ai_base, params=narrow_params)
if response.status_code == 200:
    data = response.json()
    total = data.get('articles', {}).get('totalResults', 0)
    articles = data.get('articles', {}).get('results', [])
    
    print(f"Test 1 - Just today ({today}):")
    print(f"  Total available: {total:,}")
    print(f"  Articles retrieved: {len(articles)}")
    
    # Check actual dates of articles
    if articles:
        print("\n  Actual article dates:")
        for i, article in enumerate(articles[:5]):
            article_date = article.get('dateTime', 'No date')
            print(f"    {i+1}. {article_date}")

# Test 2: Check without date filter
print("\n" + "-" * 60)
no_date_params = {
    "apiKey": newsapi_ai_key,
    "q": "automated maintenance",
    "lang": "eng",
    "articlesCount": 10,
    "resultType": "articles"
}

response = requests.get(newsapi_ai_base, params=no_date_params)
if response.status_code == 200:
    data = response.json()
    total = data.get('articles', {}).get('totalResults', 0)
    
    print(f"Test 2 - No date filter:")
    print(f"  Total available: {total:,}")

# Test 3: Check the actual URL being sent
print("\n" + "-" * 60)
print("Test 3 - Actual request URL:")
test_url = newsapi_ai_base + "?" + "&".join([f"{k}={v}" for k, v in narrow_params.items() if k != 'apiKey'])
print(f"  {test_url}&apiKey=***")

🔍 Testing Date Filter Behavior
Test 1 - Just today (2025-08-05):
  Total available: 4,532,994
  Articles retrieved: 10

  Actual article dates:
    1. 2025-08-05T19:16:06Z
    2. 2025-08-05T19:16:04Z
    3. 2025-08-05T19:15:56Z
    4. 2025-08-05T19:15:55Z
    5. 2025-08-05T19:15:54Z

------------------------------------------------------------
Test 2 - No date filter:
  Total available: 4,664,333

------------------------------------------------------------
Test 3 - Actual request URL:
  https://newsapi.ai/api/v1/article/getArticles?q=automated maintenance&lang=eng&dateFrom=2025-08-05&dateTo=2025-08-05&articlesCount=10&articlesSortBy=date&resultType=articles&apiKey=***


In [44]:
# Cell 4: Analyze Response Structure
if defense_articles:
    print("\n📋 Article Structure Analysis")
    print("=" * 50)
    
    # Look at first article structure
    first_article = defense_articles[0]
    print("Available fields in article:")
    for key in first_article.keys():
        value_type = type(first_article[key]).__name__
        print(f"  - {key}: {value_type}")
    
    # Check nested structures
    if 'source' in first_article:
        print("\nSource fields:")
        for key in first_article['source'].keys():
            print(f"  - source.{key}: {type(first_article['source'][key]).__name__}")



📋 Article Structure Analysis
Available fields in article:
  - uri: str
  - lang: str
  - isDuplicate: bool
  - date: str
  - time: str
  - dateTime: str
  - dateTimePub: str
  - dataType: str
  - sim: int
  - url: str
  - title: str
  - body: str
  - source: dict
  - authors: list
  - image: str
  - eventUri: NoneType
  - sentiment: float
  - wgt: int
  - relevance: int

Source fields:
  - source.uri: str
  - source.dataType: str
  - source.title: str


In [45]:
# Cell 5: Content Quality Analysis
print("\n📊 Content Quality Analysis")
print("=" * 50)

content_stats = {
    'has_body': 0,
    'has_title': 0,
    'has_image': 0,
    'body_lengths': [],
    'title_lengths': []
}

for article in defense_articles:
    if article.get('title'):
        content_stats['has_title'] += 1
        content_stats['title_lengths'].append(len(article['title']))
    
    if article.get('body'):
        content_stats['has_body'] += 1
        content_stats['body_lengths'].append(len(article['body']))
    
    if article.get('image'):
        content_stats['has_image'] += 1

print(f"Articles with title: {content_stats['has_title']}/{len(defense_articles)}")
print(f"Articles with body: {content_stats['has_body']}/{len(defense_articles)}")
print(f"Articles with image: {content_stats['has_image']}/{len(defense_articles)}")

if content_stats['body_lengths']:
    print(f"\nBody length stats:")
    print(f"  Min: {min(content_stats['body_lengths'])} chars")
    print(f"  Max: {max(content_stats['body_lengths'])} chars")
    print(f"  Avg: {sum(content_stats['body_lengths']) // len(content_stats['body_lengths'])} chars")



📊 Content Quality Analysis
Articles with title: 100/100
Articles with body: 100/100
Articles with image: 100/100

Body length stats:
  Min: 57 chars
  Max: 17535 chars
  Avg: 2549 chars


In [46]:
# Cell 6: Source Analysis
print("\n📰 Source Analysis")
print("=" * 50)

sources = Counter()
source_countries = Counter()

for article in defense_articles:
    source = article.get('source', {})
    source_name = source.get('title', 'Unknown')
    sources[source_name] += 1
    
    # Check if location info is available
    if 'location' in source:
        country = source.get('location', {}).get('country', {}).get('label', {}).get('eng', 'Unknown')
        source_countries[country] += 1

print(f"Total unique sources: {len(sources)}")
print("\nTop 15 sources:")
for source, count in sources.most_common(15):
    print(f"  {source}: {count} articles")

if source_countries:
    print(f"\nArticles by country:")
    for country, count in source_countries.most_common(10):
        print(f"  {country}: {count} articles")



📰 Source Analysis
Total unique sources: 71

Top 15 sources:
  The Manila times: 4 articles
  Sports Illustrated: 3 articles
  Weston Mercury: 3 articles
  Colorado Springs Gazette: 3 articles
  WGXA: 3 articles
  Daily Voice: 3 articles
  Daily Mail Online: 3 articles
  odessa-journal.com: 3 articles
  Daily Times: 2 articles
  Kalkine Media: 2 articles
  The Boston Globe: 2 articles
  WXXV 25: 2 articles
  Yorkregion.com: 2 articles
  WXLV: 2 articles
  GhanaWeb: 2 articles


In [47]:
# Cell 7: Date Distribution
print("\n📅 Date Distribution")
print("=" * 50)

date_distribution = Counter()

for article in defense_articles:
    date_str = article.get('dateTime', '')
    if date_str:
        try:
            date = datetime.fromisoformat(date_str.replace('Z', '+00:00'))
            date_key = date.strftime('%Y-%m-%d')
            date_distribution[date_key] += 1
        except:
            pass

for date, count in sorted(date_distribution.items(), reverse=True):
    print(f"{date}: {count} articles")


📅 Date Distribution
2025-08-05: 100 articles


In [48]:
# Cell 8: Display Sample Articles
print("\n📄 Sample Articles")
print("=" * 50)

for i, article in enumerate(defense_articles[:5], 1):
    print(f"\n{i}. {article.get('title', 'No title')}")
    print(f"   Source: {article.get('source', {}).get('title', 'Unknown')}")
    print(f"   Date: {article.get('dateTime', 'Unknown')}")
    print(f"   URL: {article.get('url', 'No URL')}")
    
    body = article.get('body', '')
    if body:
        preview = body[:200] + "..." if len(body) > 200 else body
        print(f"   Preview: {preview}")


📄 Sample Articles

1. 'Clearly that's his opinion': Mike Johnson swatted down by Trump admin
   Source: Raw Story
   Date: 2025-08-05T20:10:16Z
   URL: https://www.rawstory.com/mike-johnson-israel/
   Preview: House Speaker Mike Johnson (R-LA) did not receive resounding support from the Trump administration over this week's comments regarding Israel and Gaza.

During Tuesday's Pentagon news briefing, a repo...

2. England face searching Ashes questions after India series thriller - Daily Times
   Source: Daily Times
   Date: 2025-08-05T20:10:10Z
   URL: https://dailytimes.com.pk/1348832/england-face-searching-ashes-questions-after-india-series-thriller/
   Preview: Their next major red-ball assignment is a five-match Ashes series away to arch-rivals Australia -- where England have gone 15 Tests without a win -- starting in November.

Below AFP Sport looks at som...

3. Sixers Big Man Named to National Team Roster
   Source: Sports Illustrated
   Date: 2025-08-05T20:10:03Z
   URL: http