In [33]:
import requests
import json
from google.cloud import bigquery
import os
from datetime import datetime, timedelta

In [34]:
# Initialise News API
NEWS_API_KEY = ""
NEWS_API_URL = "https://newsapi.org/v2/everything"


In [46]:
# Initialize BigQuery client
client = bigquery.Client(project="news-bias-detection-439208")

# BigQuery table details
dataset_id = "news_data"
table_id = "articles"

def fetch_news():
    """
    Fetch news articles from NewsAPI
    """
    # Calculate date range
    end_date = datetime.now().date()
    start_date = end_date - timedelta(days=30)

    params = {
        "apiKey": NEWS_API_KEY,
        "q": "AI",
        "language": "en",
        "from": start_date.isoformat(),
        "to": end_date.isoformat(),
    }

    response = requests.get(NEWS_API_URL, params=params)
    all_articles = response.json().get("articles", [])
    print(f"Articles {len(all_articles)}")
    return all_articles

In [47]:
all_articles = fetch_news()

Articles 100


In [48]:
def insert_new_rows_into_bigquery(rows):
    """
    Insert only new rows of articles into BigQuery
    """
    dataset_ref = client.dataset(dataset_id)
    table_ref = dataset_ref.table(table_id)
    table = client.get_table(table_ref)  # API call

    # Prepare the insert query
    insert_query = f"""
    INSERT INTO `{table.project}.{table.dataset_id}.{table.table_id}`
    (source, author, title, description, url, published_at, content)
    SELECT 
        R.source,
        R.author,
        R.title,
        R.description,
        R.url,
        R.published_at,
        R.content
    FROM UNNEST(@rows) as R
    WHERE R.url NOT IN (
        SELECT url FROM `{table.project}.{table.dataset_id}.{table.table_id}`
    )
    """

    job_config = bigquery.QueryJobConfig(
        query_parameters=[
            bigquery.ArrayQueryParameter("rows", "STRUCT<source STRING, author STRING, title STRING, description STRING, url STRING, published_at STRING, content STRING>", rows)
        ]
    )

    # Run the query
    query_job = client.query(insert_query, job_config=job_config)
    query_job.result()  # Wait for the job to complete

    print(f"{query_job.num_affected_rows} new rows inserted into BigQuery")

def transform_article(article):
    """
    Transform the article into the desired format for BigQuery
    """
    return {
        "source": article["source"]["name"],  # Extract source name
        "author": article.get("author"),  # Author might be missing, so .get() is used
        "title": article["title"],  # Article title is mandatory
        "description": article.get("description"),  # Optional
        "url": article["url"],  # Mandatory
        "published_at": article["publishedAt"],  # Keep this as a string in ISO8601 format
        "content": article.get("content")  # Optional content field
    }

In [49]:
# Fetch news articles
articles = fetch_news()
print("fetched news")

# Transform articles for BigQuery
rows = [transform_article(article) for article in articles]
print("transformed rows")

Articles 100
fetched news
transformed rows


In [41]:
len(rows)

100

In [51]:
rows

[{'source': 'Yahoo Entertainment',
  'author': 'Jeremy Gan',
  'title': 'ByteDance will reportedly use Huawei chips to train a new AI model',
  'description': 'As first reported by Reuters, ByteDance, the Chinese parent company of TikTok, is planning to train and develop an AI model\r\n using chips from fellow Chinese company Huawei. Three anonymous sources approached Reuters with this information; a fourth source cou…',
  'url': 'https://consent.yahoo.com/v2/collectConsent?sessionId=1_cc-session_696f4cdb-5436-43e1-b3cd-6cdaee93cbe7',
  'published_at': '2024-09-30T15:48:46Z',
  'content': "If you click 'Accept all', we and our partners, including 240 who are part of the IAB Transparency &amp; Consent Framework, will also store and/or access information on a device (in other words, use … [+678 chars]"},
 {'source': 'Yahoo Entertainment',
  'author': 'Will Shanklin',
  'title': 'Google stuffs more AI into search',
  'description': 'Google is adding more AI to search. On Thursday, the com

In [50]:
# Insert new data into BigQuery
insert_new_rows_into_bigquery(rows)
print("inserted new rows into bq")

BadRequest: 400 POST https://bigquery.googleapis.com/bigquery/v2/projects/news-bias-detection-439208/jobs?prettyPrint=false: Invalid value for type: STRUCT<source STRING, author STRING, title STRING, description STRING, url STRING, published_at STRING, content STRING> is not a valid value

Location: None
Job ID: 9fcda5f1-d1e0-4b34-8ea5-bc2da5d20aa0
