In [12]:
import requests
import json
import os
from datetime import datetime

In [13]:
# Initialise News API
NEWS_API_KEY = ""
NEWS_API_URL = "https://newsapi.org/v2/everything"


In [14]:
def fetch_news():
    """
    Fetch news articles from NewsAPI
    """
    params = {
      "apiKey": NEWS_API_KEY,
      "q": "AI",
      "language": "en",
      "from": "2024-09-20",
      "to": "2024-09-21"
      }

    response = requests.get(NEWS_API_URL, params=params)
    all_articles = response.json().get("articles", [])
    print(f"Articles {len(all_articles)}")
    return all_articles

In [15]:
all_articles = fetch_news()

Articles 0


In [19]:
params = {
        "apiKey": NEWS_API_KEY,
        "q": "AI",
        "language": "en",
        "from": "2024-09-23",
        "to": "2024-09-24"
        
    }

In [20]:
response = requests.get(NEWS_API_URL, params=params)

In [22]:
response.text

'{'

In [23]:
all_articles = response.json().get("articles", [])

In [24]:
all_articles

[{'source': {'id': 'wired', 'name': 'Wired'},
  'author': 'Reece Rogers',
  'title': 'Generative AI Hype Feels Inescapable. Tackle It Head On With Education',
  'description': 'In their book AI Snake Oil, two Princeton researchers pinpoint the culprits of the AI hype cycle and advocate for a more critical, holistic understanding of artificial intelligence.',
  'url': 'https://www.wired.com/story/artificial-intelligence-hype-ai-snake-oil/',
  'urlToImage': 'https://media.wired.com/photos/66f1dd5ef403832d553d6bce/191:100/w_1280,c_limit/ai-snake-oil.jpg',
  'publishedAt': '2024-09-24T09:00:00Z',
  'content': 'Arvind Narayanan, a computer science professor at Princeton University, is best known for calling out the hype surrounding artificial intelligence in his Substack, AI Snake Oil, written with PhD cand… [+3987 chars]'},
 {'source': {'id': None, 'name': '[Removed]'},
  'author': None,
  'title': '[Removed]',
  'description': '[Removed]',
  'url': 'https://removed.com',
  'urlToImage': N

In [44]:

fetch_news()

Articles 100


[{'source': {'id': None, 'name': 'Gizmodo.com'},
  'author': 'Todd Feathers',
  'title': 'Microsoft Deal Will Bring Nuclear Power Back to Three Mile Island',
  'description': 'As AI ramps up carbon emissions, Microsoft is going nuclear.',
  'url': 'https://gizmodo.com/microsoft-deal-will-bring-nuclear-power-back-to-three-mile-island-2000501273',
  'urlToImage': 'https://gizmodo.com/app/uploads/2024/09/three-mile-microsoft-constellation-nuclear.jpg',
  'publishedAt': '2024-09-20T14:25:56Z',
  'content': 'A nuclear power plant at Pennsylvania’s Three Mile Island will be recommissioned thanks to a power purchase deal with Microsoft aimed at offsetting the tech giant’s carbon emissions.\r\nThree Mile Isla… [+2298 chars]'},
 {'source': {'id': 'wired', 'name': 'Wired'},
  'author': 'Morgan Meaker',
  'title': 'Xavier Niel, a Driving Force of French AI, Is Now Shaping TikTok',
  'description': 'The TikTok owner’s newest board member revels in challenging the establishment.',
  'url': 'https:/

In [None]:
import requests
import json
from google.cloud import bigquery
import os
from datetime import datetime


# Initialise News API
NEWS_API_KEY = os.getenv("NEWS_API_KEY")
if not NEWS_API_KEY:
    raise ValueError("News API Key is missing from environment variables")
NEWS_API_URL = "https://newsapi.org/v2/everything"


# Initialize BigQuery client
client = bigquery.Client(project="news-bias-detection-439208")

# BigQuery table details
dataset_id = "news_data"
table_id = "articles"

def fetch_news():
    """
    Fetch news articles from NewsAPI
    """
    params = {
      "apiKey": NEWS_API_KEY,
      "q": "AI",
      "language": "en",
      "from": "2024-09-20",
      "to": "2024-09-21"
      }

    response = requests.get(NEWS_API_URL, params=params)
    all_articles = response.json().get("articles", [])
    print(f"Articles {len(all_articles)}")
    return all_articles

def insert_into_bigquery(rows):
    """
    Insert rows of articles into BigQuery
    """
    dataset_ref = client.dataset(dataset_id)
    table_ref = dataset_ref.table(table_id)
    table = client.get_table(table_ref)  # API call

    # Insert rows into BigQuery
    errors = client.insert_rows_json(table, rows)
    if errors:
        print(f"Errors: {errors}")
    else:
        print(f"{len(rows)} rows inserted into BigQuery")

def transform_article(article):
    try:
        published_at = article.get("publishedAt")
        if published_at:
            published_at = datetime.strptime(published_at, "%Y-%m-%dT%H:%M:%SZ")
        else:
            published_at = None
    except Exception as e:
        print(f"Error processing article: {article}")
        published_at = None

    return {
        "source": article["source"]["name"],
        "author": article.get("author"),
        "title": article["title"],
        "description": article.get("description"),
        "url": article["url"],
        "published_at": published_at,
        "content": article.get("content")
    }




def newsapi_to_bigquery(request):
    """
    Cloud Function Entry Point: Fetches news from NewsAPI and inserts into BigQuery
    """
    try:
        # Fetch news articles
        articles = fetch_news()
        print("fetched news")

        # Transform articles for BigQuery
        rows = [transform_article(article) for article in articles]
        print("transformed rows")
        # Insert data into BigQuery
        insert_into_bigquery(rows)
        print("inserted into bq")

        return "Data ingestion complete", 200
    except Exception as e:
        print(f"Error: {str(e)}")
        return f"Error: {str(e)}", 500
