# NEWS API Ingestion (Dynamic)

### Pre Run Activity
- Make sure to tap the 3 dot icon on top left and enable the necessary 'External Access' integrations for the notebook.
- Update 'config.json' with the required settings (run definitions, API keys and Snowflake connection details) before starting the run.

## Runtime Config Set-Up

In [None]:
# Path of the JSON config file that is loaded at notebook start-up.
GLOBAL_CONFIG_PATH = "config.json"

### Importing Libraries

In [None]:
import requests
import json
import pandas as pd
from datetime import date, timedelta, datetime
import math
import warnings
from bs4 import BeautifulSoup
import time
import random
import snowflake.connector
from urllib.parse import urlparse
import hashlib
from snowflake.connector.pandas_tools import write_pandas

### Auto Run-Name Initialization Code

In [None]:
def get_config(CONFIG_PATH="config.json"):
    """
    Load the notebook configuration from a JSON file.

    Args:
        CONFIG_PATH: Path of the JSON config file. Defaults to "config.json".

    Returns:
        The parsed JSON document.

    Raises:
        FileNotFoundError: If the file does not exist.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Read explicitly as UTF-8 so non-ASCII values (e.g. search terms) do not
    # depend on the platform's default encoding.
    with open(CONFIG_PATH, encoding="utf-8") as f:
        return json.load(f)
# Load the config from the notebook-wide path; `config` is reused by later cells.
config = get_config(GLOBAL_CONFIG_PATH) #DEBUG

# Name of the run to execute, taken from the 'run_auto_pickup' entry of the config.
GLOBAL_RUN_NAME = config['run_auto_pickup']

### Testing Endpoint

In [None]:
# Smoke-test the endpoint. The API key is read from the loaded config instead of
# being hard-coded in the notebook, and a timeout keeps a stalled connection
# from hanging the cell.
resp = requests.get(
    'https://newsapi.org/v2/everything',
    params={
        'q': 'apple',
        'to': '2025-09-10',
        'from': '2025-09-10',
        'apiKey': config["news_api_key_list"][config["news_api_key_auto_pickup"]],
    },
    timeout=30,
)
len(resp.json())

## Main Code

In [None]:
def get_config_run(run_name):
    """
    Return the run definition whose 'run_name' equals `run_name`.

    Args:
        run_name: Name to look up among the entries of config['runs'].

    Returns:
        The first matching entry of config['runs'], or None when no entry matches.
    """
    # Read from GLOBAL_CONFIG_PATH rather than the function default so that a
    # changed config path is honoured here as well.
    config = get_config(GLOBAL_CONFIG_PATH)
    return next((r for r in config['runs'] if r['run_name'] == run_name), None)

def get_config_snowflake():
    """
    Return the 'snowflake' section of the notebook configuration.

    Returns:
        The value stored under config["snowflake"].

    Raises:
        KeyError: If the config has no "snowflake" entry.
    """
    # Read from GLOBAL_CONFIG_PATH rather than the function default so that a
    # changed config path is honoured here as well.
    return get_config(GLOBAL_CONFIG_PATH)["snowflake"]

def get_snowlfake_conn(schema_name):
    """
    Open a Snowflake connection using the credentials from the config.

    Args:
        schema_name: Schema the session should use.

    Returns:
        The connection object returned by snowflake.connector.connect.
    """
    settings = get_config_snowflake()
    # Connection-level settings come from the config; only the schema varies per caller.
    connect_kwargs = {
        key: settings[key]
        for key in ("user", "password", "account", "warehouse", "database")
    }
    connect_kwargs["schema"] = schema_name
    return snowflake.connector.connect(**connect_kwargs)
    
config = get_config(GLOBAL_CONFIG_PATH) #DEBUG
config_run = get_config_run(GLOBAL_RUN_NAME)
config_snowflake = get_config_snowflake()
# Debug output is limited to non-secret data: the full config contains the API
# keys and the Snowflake password, which must not end up in saved cell output.
print(
    sorted(config),
    config_run,
    {key: ("***" if key == "password" else value) for key, value in config_snowflake.items()},
)

In [None]:
def create_url_string(config_run, config):
    """
    Build the News API request URL for one configured run.

    The date window ends `till_days` days from today (a negative value moves
    the end into the past) and starts `from_days_ago` days before that end.

    Args:
        config_run: Run definition providing "endpoint", "q", "language",
            "sortBy", "till_days" and "from_days_ago".
        config: Global config providing "news_api_key_list" and
            "news_api_key_auto_pickup" (the index/key of the API key to use).

    Returns:
        str: The full URL, query string included, without a `page` parameter.
    """
    from urllib.parse import quote  # local import: only urlparse is imported at file level

    calc_to = date.today() + timedelta(days=config_run["till_days"]) #If negative then the range will reduce.
    calc_from = calc_to - timedelta(days=config_run["from_days_ago"])

    query_params = [
        ("q", config_run["q"]),
        ("language", config_run["language"]),
        ("to", calc_to.isoformat()),
        ("from", calc_from.isoformat()),
        ("sortBy", config_run["sortBy"]),
        ("apiKey", config["news_api_key_list"][config["news_api_key_auto_pickup"]]),
    ]
    # Percent-encode each value instead of deleting whitespace from the whole
    # URL, which silently turned a query such as "apple inc" into "appleinc".
    # '%' and '+' are left untouched so values that are already URL-encoded in
    # the config keep their meaning.
    query_string = "&".join(
        f"{name}={quote(str(value), safe='%+')}" for name, value in query_params
    )
    return f"{config_run['endpoint']}?{query_string}"

config_run = get_config_run(GLOBAL_RUN_NAME)
url_string = create_url_string(config_run, config)
# The URL ends with the API key; mask it so the secret is not stored in the cell output.
print(url_string.split("apiKey=")[0] + "apiKey=***")

In [None]:
def fetch_news(url_string):
    """
    Fetch all articles for a prepared request URL, following pagination.

    Page 1 is requested first. When `totalResults` exceeds one page (100
    articles) the remaining pages are requested one by one and their
    articles appended.

    Args:
        url_string: Full request URL (query string included) without a
            `page` parameter.

    Returns:
        list: Article dicts collected across all pages.

    Raises:
        Exception: If the API reports an error for the first page, or if the
            query matched no articles.
        requests.exceptions.RequestException: On network failure or timeout.
    """
    page_size = 100  # page size assumed by the pagination arithmetic below
    request_timeout = 30  # seconds; without it a stalled connection hangs the run

    response = requests.get(url_string, params={"page": 1}, timeout=request_timeout)
    data = response.json()

    # Surface API-side failures (bad key, rate limit, ...) with their own message
    # instead of reporting them as "no articles".
    if data.get("status") == "error":
        raise Exception(f"Exception: News API error ({data.get('code')}): {data.get('message')}")

    # `or` guards against explicit nulls in the payload.
    total_results = data.get("totalResults") or 0
    articles_data = data.get("articles") or []

    print(f"Total Results: {total_results}")
    if total_results == 0:
        raise Exception("Exception: No articles captured in current run.")

    if total_results > page_size:
        total_pages = math.ceil(total_results / page_size)
        for page in range(2, total_pages + 1):
            print(f'Going through page {page}/{total_pages}')
            response = requests.get(url_string, params={"page": page}, timeout=request_timeout)
            data = response.json()

            # Later pages stay best-effort: report the reason and keep what we have.
            if data.get("status") == "error":
                warnings.warn(f'Page {page} skipped, News API error ({data.get("code")}): {data.get("message")}')
                continue

            page_articles = data.get("articles")
            if page_articles is not None: # Avoids Nonetype Object error
                articles_data.extend(page_articles)

    if len(articles_data) != total_results:
        warnings.warn(f'Article Count Mismatch: {len(articles_data)}|{total_results}')

    print(f'No. of Articles: {len(articles_data)}')
    return articles_data

# The URL ends with the API key; mask it so the secret is not stored in the cell output.
print(url_string.split("apiKey=")[0] + "apiKey=***")
articles_data = fetch_news(url_string)
print(articles_data)

In [None]:
def articles_to_df(articles_data):
    """
    Flatten raw article dicts into a DataFrame.

    Nested keys are flattened with "_" as separator, three camelCase columns
    are renamed to snake_case, a lower-cased `url_domain` column is derived
    from `url`, and `published_at_utc` is parsed as UTC and stored without
    timezone information.

    Args:
        articles_data: List of article dicts.

    Returns:
        pandas.DataFrame: One row per article.
    """
    def _netloc_or_none(link: str) -> str:
        """Return the lower-cased network location of `link`, or None if it is empty/missing."""
        if link and not pd.isna(link):
            return urlparse(link).netloc.lower()
        return None

    renames = {
        "urlToImage": "url_to_image",
        "publishedAt": "published_at_utc",
        "content": "content_truncated",
    }

    frame = pd.json_normalize(articles_data, sep="_").rename(columns=renames)
    frame["url_domain"] = frame["url"].apply(_netloc_or_none)

    # Parse as UTC first, then drop the tz marker to get naive UTC timestamps.
    utc_times = pd.to_datetime(frame["published_at_utc"], utc=True)
    frame["published_at_utc"] = utc_times.dt.tz_localize(None)
    return frame

# Build the article DataFrame from the fetched payload and preview the first rows.
df = articles_to_df(articles_data)
df.head(3)

In [None]:
def test_ensure_network_rule_for_domain(
    df, 
    nr_name = "news_domains_nr", 
    integration_name = "news_domains_integration",
    schema = "UTILS"):
    """
    Merge the domains found in `df` into one Snowflake network rule and
    re-attach that rule to the external access integration.

    The current value list is read with DESCRIBE NETWORK RULE, combined with
    the unique values of df['url_domain'], written back with
    CREATE OR REPLACE NETWORK RULE, and the integration is then altered to
    allow that rule.

    Args:
        df: DataFrame with a 'url_domain' column.
        nr_name: Name of the network rule to read and replace.
        integration_name: Name of the external access integration to update.
        schema: Schema used for the connection and for the DESCRIBE lookup.
    """
    conn = get_snowlfake_conn(schema_name=schema)
    cur = conn.cursor() #Start
    try:
        cur.execute(f"DESCRIBE NETWORK RULE SIGNAL_EXTRACTION_DB.{schema}.{nr_name}") #Hard Coded and saved in utils schema.
        desc_rows = cur.fetchall()
        desc_list = list(desc_rows[0]) if desc_rows else []

        # The value-list cell is located heuristically: the last cell of the
        # first row that contains ".com". Default to an empty string so a rule
        # without such a cell no longer fails with an unbound variable.
        value_list_str_existing = ""
        for val in desc_list:
            if ".com" in str(val).lower():
                value_list_str_existing = str(val).lower()
        print(value_list_str_existing)

        value_list_existing = [
            d.strip().strip("'").lower()
            for d in value_list_str_existing.split(",")
            if d.strip().strip("'")
        ]
        # Rows without a URL carry no domain; drop missing/empty values before lower-casing.
        value_list_latest = [d.lower() for d in df['url_domain'].dropna().unique() if d]
        # Sorted so the generated SQL is deterministic between runs.
        value_list_optimised = sorted(set(value_list_existing + value_list_latest))
        # Domains originate from external URLs: double any single quote so a
        # value cannot terminate the SQL string literal.
        value_list_str_optimised = ", ".join(
            "'" + d.replace("'", "''") + "'" for d in value_list_optimised
        )
        print(f'Existing Value List ({len(value_list_existing)}): {value_list_existing}')
        print(f'Latest Value List ({len(value_list_latest)}): {value_list_latest}')
        print(f'Optimised Value List ({len(value_list_optimised)}): {value_list_optimised}')
        print(f'NR String: {value_list_str_optimised}')

        # Network Rule SQL
        create_sql = f"""
    CREATE OR REPLACE NETWORK RULE {nr_name}
        TYPE = HOST_PORT
        MODE = EGRESS
        VALUE_LIST = ({value_list_str_optimised});
    """
        cur.execute(create_sql)

        # Re-attach the rule to the integration. ALTER is used here; both ALTER and
        # CREATE OR REPLACE EXTERNAL ACCESS INTEGRATION were observed to have
        # caching issues on re-attaching.
        alter_sql = f"""
    ALTER EXTERNAL ACCESS INTEGRATION {integration_name}
        SET ALLOWED_NETWORK_RULES = ({nr_name})
        ENABLED = TRUE;
    """
        cur.execute(alter_sql)
    finally:
        # Release the cursor and the connection even when a statement fails.
        cur.close() #End
        conn.close()

    print(f"Network rule ({nr_name}) updated in Integration ({integration_name})")

# Sync the network rule with the domains of the articles in `df`, using the default rule/integration names.
test_ensure_network_rule_for_domain(df)
    

In [None]:
# ERROR: Too many network rules.

# def ensure_network_rules_for_domains(df, 
#                                      base_nr_name="news_domains_nr", 
#                                      integration_name="news_domains_integration",
#                                      schema="UTILS",
#                                      max_domains_per_rule=50):
#     """
#     Fully rerunnable, fault-tolerant function to manage Snowflake network rules.
#     - Checks all rules in the schema to collect existing domains.
#     - Deduplicates with new domains from df.
#     - Splits into multiple rules if total domains exceed max_domains_per_rule.
#     - Attaches all rules (fully qualified) back to the integration.
#     """
#     conn = get_snowlfake_conn(schema_name=schema)
#     cur = conn.cursor()

#     # 1️⃣ Fetch all network rules in the schema
#     try:
#         cur.execute(f"SHOW NETWORK RULES IN SCHEMA {schema}")
#         rules_info = cur.fetchall()
#         existing_rule_names = [row[1] for row in rules_info]  # rule name in 2nd column
#         print(f"Existing rules in schema {schema}: {existing_rule_names}")
#     except Exception as e:
#         print(f"Error fetching rules in schema {schema}: {e}")
#         existing_rule_names = []

#     # 2️⃣ Collect all existing domains from all rules
#     existing_domains = set()
#     for nr_name in existing_rule_names:
#         try:
#             cur.execute(f"DESCRIBE NETWORK RULE {schema}.{nr_name}")
#             desc_rows = cur.fetchall()
#             desc_list = list(desc_rows[0])
#             for val in desc_list:
#                 if ".com" in str(val).lower():
#                     domains = [d.strip().strip("'").lower() for d in val.split(",")]
#                     existing_domains.update(domains)
#         except Exception as e:
#             print(f"Warning: could not read rule {nr_name}: {e}")

#     print(f"Domains already covered: {len(existing_domains)}")

#     # 3️⃣ Collect latest unique domains from df
#     latest_domains = set([i.lower() for i in df['url_domain'].unique()])

#     # 4️⃣ Merge and deduplicate
#     all_domains = list(existing_domains.union(latest_domains))
#     print(f"Total domains to cover after merge: {len(all_domains)}")

#     # 5️⃣ Split into chunks and prepare rules
#     rules_sql_list = []
#     rule_names = []
#     num_chunks = (len(all_domains) + max_domains_per_rule - 1) // max_domains_per_rule
#     for i in range(num_chunks):
#         chunk = all_domains[i*max_domains_per_rule : (i+1)*max_domains_per_rule]
#         nr_name = f"{base_nr_name}_{i+1}" if num_chunks > 1 else base_nr_name
#         value_list_str = ", ".join([f"'{d}'" for d in chunk])
#         create_sql = f"""
#         CREATE OR REPLACE NETWORK RULE {schema}.{nr_name}
#             TYPE = HOST_PORT
#             MODE = EGRESS
#             VALUE_LIST = ({value_list_str});
#         """
#         rules_sql_list.append(create_sql)
#         rule_names.append(f"{schema}.{nr_name}")  # fully qualified for integration

#     # 6️⃣ Execute creation / update of rules
#     for sql in rules_sql_list:
#         try:
#             cur.execute(sql)
#             print(f"Network rule executed: {sql.splitlines()[1].strip()}")
#         except Exception as e:
#             print(f"Error creating/updating network rule: {e}")

#     # 7️⃣ Attach all rules to integration
#     allowed_rules_str = ", ".join(rule_names)
#     alter_sql = f"""
#     ALTER EXTERNAL ACCESS INTEGRATION {integration_name}
#         SET ALLOWED_NETWORK_RULES = ({allowed_rules_str})
#         ENABLED = TRUE;
#     """
#     try:
#         cur.execute(alter_sql)
#         print(f"Integration ({integration_name}) updated with rules: {allowed_rules_str}")
#     except Exception as e:
#         print(f"Error updating integration: {e}")

#     cur.close()
#     print("Network rule update completed successfully.")

# ensure_network_rules_for_domains(df)


In [None]:
def scrape_url_helper(url):
    """
    Download `url` and return the concatenated text of its <p> elements.

    The HTTP status code is not checked; whatever body is returned gets parsed.

    Args:
        url: Address of the page to scrape.

    Returns:
        tuple: (content, len(content)) on success, (None, -2) when the request
        times out, and (None, -1) for any other failure (e.g. a broken link or
        a domain that is not reachable).
    """
    try:
        page = requests.get(url, timeout=10)
        parsed = BeautifulSoup(page.text, 'html.parser')

        # Whitespace-stripped text of every paragraph, joined by single spaces.
        content = " ".join(tag.get_text(strip=True) for tag in parsed.find_all('p'))
        print(f'{url} - content')
        return content, len(content)
    except requests.exceptions.Timeout as e:
        # Timeouts get their own size code so they can be told apart later.
        print(f"TIMEOUT for url {url}: {e}")
        return None, -2
    except Exception as e:
        # Everything else is deliberately swallowed: one bad link must not stop the run.
        print(f"ERROR for url {url}: {type(e).__name__} — {e}")
        return None, -1

def scrape_url(df, chunk_size, delay_between_chunks):
    """
    Scrape the full text of every article URL in `df`.

    The columns 'content_full' and 'content_size' are (re)initialised on `df`
    itself and filled row by row. Rows without a URL keep the initial values
    (None / 0).

    Args:
        df: DataFrame with a 'url' column. It is modified in place.
        chunk_size: Number of rows to process between two pauses.
        delay_between_chunks: Base pause in seconds after each chunk; a random
            0.5-1 s jitter is added on top.

    Returns:
        pandas.DataFrame: The same `df` object with both columns filled.

    Raises:
        ValueError: If `chunk_size` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    df['content_full'] = None
    df['content_size'] = 0
    df_len = len(df)

    # Progress and chunk boundaries are based on the row position, so they are
    # correct for any index (not only a default 0..n-1 integer index).
    for position, (idx, row) in enumerate(df.iterrows(), start=1):
        url = row.get('url')
        if not url or pd.isna(url):
            continue
        content_full, content_size = scrape_url_helper(url)
        print(f'{position}/{df_len} | Scraped url({content_size}): {url}')
        df.at[idx, 'content_full'] = content_full
        df.at[idx, 'content_size'] = content_size
        # Pause after each completed chunk, but not after the very last row.
        if position % chunk_size == 0 and position < df_len:
            pause = delay_between_chunks + random.uniform(0.5, 1)
            print(f'{position}/{df_len} | Sleeping for {pause}s')
            time.sleep(pause)
    return df
    

# Scrape the full text for every article and preview the first 20 rows.
df = scrape_url(df, chunk_size=10, delay_between_chunks=3)
df.head(20)

## Pushing to RAW Schema

In [None]:
def preprocess_dataframe(df):
    """
    Add the primary-key hash and the ingestion timestamp, then select the
    columns that are loaded into the RAW table.

    `article_id` is the SHA-1 hex digest of "<url>_<published_at_utc>".
    `ingested_at` is the current UTC time without timezone information.

    Args:
        df: Article DataFrame containing every column listed in
            `expected_cols` except `article_id` and `ingested_at`.

    Returns:
        pandas.DataFrame: A new frame with exactly `expected_cols`, in that
        order. The input frame is not modified.

    Raises:
        KeyError: If one of the expected columns is missing.
    """
    df = df.copy()
    df["article_id"] = df.apply(
        lambda row: hashlib.sha1(f"{row['url']}_{row['published_at_utc']}".encode()).hexdigest(),
        axis=1
    )
    # "ingested_at" is re-written when pushing to prod raw using SQL
    df["ingested_at"] = pd.Timestamp.now(tz="UTC").tz_localize(None)
    # The run config is intentionally not loaded here: nothing in this function
    # uses it, and loading it meant an extra config file read on every call.
    expected_cols = [
        "article_id","author","title","description","url","url_to_image",
        "published_at_utc","content_truncated","source_id","source_name",
        "url_domain","content_full","content_size","ingested_at"
    ]
    return df[expected_cols]

# Add article_id / ingested_at, keep only the expected columns, and preview the result.
df = preprocess_dataframe(df)
df.head(3)

In [None]:
# Stage the DataFrame in a table in the RAW schema so the SQL cells below can read it.
def create_temp_table_from_df(df, temp_table="NEWS_ARTICLES_TEMP"):
    """
    Load `df` into a staging table in the RAW schema, replacing any previous copy.

    NOTE(review): despite its name the table is not session scoped. This
    function closes its own connection before returning and later cells still
    query the table, so it has to persist. `write_pandas` is called without a
    `table_type`, which per the connector documentation creates a regular
    table - confirm, and drop the table after the merge if it should not stay.

    Args:
        df: DataFrame to upload.
        temp_table: Name of the staging table.

    Raises:
        RuntimeError: If `write_pandas` reports that the load did not succeed.
    """

    conn = get_snowlfake_conn(schema_name="RAW")
    cur = conn.cursor()

    try:
        # Drop old staging table if exists
        cur.execute(f"DROP TABLE IF EXISTS {temp_table}")

        # Write dataframe to the staging table
        success, _, nrows, _ = write_pandas(
            conn, 
            df, 
            table_name=temp_table, 
            auto_create_table=True, 
            overwrite=True, 
            quote_identifiers=False,
            use_logical_type=True
        )
        # Do not report success when the connector says the load failed.
        if not success:
            raise RuntimeError(f"write_pandas failed to load data into {temp_table}")

        print(f"[INFO] Temp table created: {temp_table}, Rows inserted: {nrows}")
    
    finally:
        cur.close()
        conn.close()
    
# Upload the preprocessed articles to the staging table read by the SQL cells below.
create_temp_table_from_df(df, temp_table="NEWS_ARTICLES_TEMP")

In [None]:
-- Preview a few staged rows before they are merged into the target table.
SELECT * FROM NEWS_ARTICLES_TEMP LIMIT 3;

In [None]:
-- Target table for ingested articles; created only on the first run.
-- The columns mirror the DataFrame produced by preprocess_dataframe.
CREATE TABLE IF NOT EXISTS SIGNAL_EXTRACTION_DB.RAW.NEWS_ARTICLES (
    -- SHA-1 hex digest of "<url>_<published_at_utc>", computed in the notebook.
    article_id STRING PRIMARY KEY,
    author STRING,
    title STRING,
    description STRING,
    url STRING,
    url_to_image STRING,
    published_at_utc TIMESTAMP_NTZ,
    -- Content snippet as delivered by the API (renamed from "content").
    content_truncated STRING,
    source_id STRING,
    source_name STRING,
    url_domain STRING,
    -- Text scraped from the article page; content_size is its length,
    -- or -1 / -2 when scraping failed / timed out.
    content_full STRING,
    content_size NUMBER,
    ingested_at TIMESTAMP_NTZ DEFAULT CURRENT_TIMESTAMP
);

In [None]:
-- Upsert the staged articles into the target table.
-- Rows are still updated when the article_id already exists, in case the latest
-- ingestion has updated fields or values.
-- The source is de-duplicated on article_id first: the same article can appear
-- more than once in the staging table, and several source rows matching one
-- target row make the MERGE non-deterministic (or fail). When duplicates exist,
-- the row with the largest content_size is kept.

MERGE INTO SIGNAL_EXTRACTION_DB.RAW.NEWS_ARTICLES AS target
USING (
    SELECT *
    FROM NEWS_ARTICLES_TEMP
    QUALIFY ROW_NUMBER() OVER (
        PARTITION BY article_id
        ORDER BY content_size DESC NULLS LAST
    ) = 1
) AS source
ON target.article_id = source.article_id
WHEN MATCHED THEN UPDATE SET
    -- target.entity_name       = source.entity_name,
    target.author            = source.author,
    target.title             = source.title,
    target.description       = source.description,
    target.url               = source.url,
    target.url_to_image      = source.url_to_image,
    target.published_at_utc  = source.published_at_utc,
    target.content_truncated = source.content_truncated,
    target.source_id         = source.source_id,
    target.source_name       = source.source_name,
    target.url_domain        = source.url_domain,
    target.content_full      = source.content_full,
    target.content_size      = source.content_size,
    -- ingested_at reflects the time of this merge, not the notebook timestamp.
    target.ingested_at       = CURRENT_TIMESTAMP
WHEN NOT MATCHED THEN
    INSERT (
        -- entity_name,
        article_id, author, title, description, url,
        url_to_image,published_at_utc, content_truncated,
        source_id, source_name, url_domain, 
        content_full, content_size, ingested_at
    )
    VALUES (
        -- source.entity_name,
        source.article_id, source.author, source.title, 
        source.description, source.url,
        source.url_to_image, source.published_at_utc,
        source.content_truncated, source.source_id,
        source.source_name, source.url_domain,
        source.content_full, source.content_size, CURRENT_TIMESTAMP
    );


In [None]:
-- Spot-check a few rows of the target table after the merge.
SELECT * FROM SIGNAL_EXTRACTION_DB.RAW.NEWS_ARTICLES LIMIT 3;

## Future Development

In [None]:
-- TODO (future): map all article_ids to different entities, such as dependent market news and others.

-- insert_sql = """
-- INSERT INTO RAW.ARTICLE_ENTITY_MAP (article_id, q_value, entity_name)
-- SELECT s.article_id, s.q_value, s.entity_name
-- FROM NEWS_ARTICLES_TEMP s
-- WHERE NOT EXISTS (
--     SELECT 1
--     FROM RAW.ARTICLE_ENTITY_MAP m
--     WHERE m.article_id  = s.article_id
--       AND m.q_value     = s.q_value
--       AND m.entity_name = s.entity_name
-- );
-- """