# NEWS API Ingestion (Dynamic)

## Runtime Config Set-Up

In [None]:
# Runtime settings for this notebook.
# NOTE(review): load_config() further down repeats these same two values as its
# own parameter defaults, and the visible code calls it without arguments, so
# editing these constants alone does not change which config/run is loaded.
CONFIG_PATH = "config.json"
RUN_NAME = "PalantirTestRun" #<TODO: Make Dynamic>

### Importing Libraries

In [None]:
import requests
import json
import pandas as pd
from datetime import date, timedelta, datetime
import math
import warnings
from bs4 import BeautifulSoup
import time
import snowflake.connector
from urllib.parse import urlparse

### Testing Endpoint

In [None]:
# Smoke-test the NewsAPI "everything" endpoint with a fixed one-day query.
# The API key is read from the config file instead of being hard-coded, so the
# secret does not live in the notebook source. (A key that was previously
# committed in this cell should be treated as leaked and rotated.)
with open(CONFIG_PATH, encoding="utf-8") as _config_file:
    _news_api_key = json.load(_config_file)["news_api_key"]

resp = requests.get(
    "https://newsapi.org/v2/everything",
    params={
        "q": "apple",
        "to": "2025-09-10",
        "from": "2025-09-10",
        "apiKey": _news_api_key,
    },
    timeout=30,  # never hang the notebook on an unresponsive endpoint
)
# Length of the decoded JSON payload (for a dict: the number of top-level keys).
len(resp.json())

## Main Code

In [None]:
def load_config(CONFIG_PATH="config.json", RUN_NAME="PalantirTestRun"):
    """Load the JSON config file and select the settings of one run.

    Args:
        CONFIG_PATH: Path of the JSON configuration file.
        RUN_NAME: Value compared against the ``run_name`` field of each entry
            in the config's top-level ``runs`` list.

    Returns:
        A ``(config, config_run)`` tuple: the whole parsed config and the first
        entry of ``config['runs']`` whose ``run_name`` equals ``RUN_NAME``.
        ``config_run`` is ``None`` (and a warning is emitted) when no entry
        matches.

    Raises:
        OSError: The file cannot be opened.
        json.JSONDecodeError: The file is not valid JSON.
        KeyError: The config has no ``runs`` list or an entry has no
            ``run_name``.
    """
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(CONFIG_PATH, encoding="utf-8") as f:
        config = json.load(f)

    config_run = next((r for r in config['runs'] if r['run_name'] == RUN_NAME), None)
    if config_run is None:
        # Keep returning None for backward compatibility, but make the miss
        # visible instead of failing later with an opaque TypeError.
        warnings.warn(f"No run named {RUN_NAME!r} found in {CONFIG_PATH!r}")
    return config, config_run

# Load the config for the run selected in the runtime-config cell (previously
# the defaults of load_config() were used and CONFIG_PATH / RUN_NAME ignored).
config, config_run = load_config(CONFIG_PATH, RUN_NAME) #DEBUG
# Show only the top-level section names and the run settings: the full config
# holds the API key and the Snowflake password and must not be printed.
print(sorted(config), config_run)

In [None]:
def create_url_string(config_run, config):
    """Build the request URL for one configured run.

    The date window ends today and starts ``from_days_ago`` days earlier.

    Args:
        config_run: Run settings; reads the keys ``endpoint``, ``q``,
            ``language``, ``from_days_ago`` and ``sortBy``.
        config: Full config; reads the key ``news_api_key``.

    Returns:
        The endpoint followed by a URL-encoded query string with the
        parameters ``q``, ``language``, ``to``, ``from``, ``sortBy`` and
        ``apiKey`` (in that order).

    Raises:
        KeyError: A required key is missing from ``config_run`` or ``config``.
    """
    # Local import: the file only imports ``urlparse`` from urllib.parse.
    from urllib.parse import urlencode

    calc_to = date.today()
    calc_from = calc_to - timedelta(days=config_run["from_days_ago"])

    # urlencode escapes each value properly. The previous approach removed
    # every space from the assembled string, which silently turned a query
    # such as "palantir stock" into "palantirstock".
    query = urlencode({
        "q": config_run["q"],
        "language": config_run["language"],
        "to": calc_to.isoformat(),
        "from": calc_from.isoformat(),
        "sortBy": config_run["sortBy"],
        "apiKey": config["news_api_key"],
    })
    endpoint = str(config_run["endpoint"]).strip()
    return f"{endpoint}?{query}"

url_string = create_url_string(config_run, config)
# Mask the API key before printing so the secret is not stored in the
# notebook output.
print(url_string.replace(str(config["news_api_key"]), "***"))

In [None]:
def fetch_news(url_string, page_size=100, timeout=30, session=None):
    """Fetch every page of results for a prepared news-API URL.

    Page 1 is requested first; its ``totalResults`` field decides how many
    further pages are requested.

    Args:
        url_string: Full request URL including the query string; ``page`` and
            ``pageSize`` are appended as extra query parameters.
        page_size: Articles requested per page, also used to compute the
            number of pages. The default of 100 matches the value previously
            hard-coded here.
        timeout: Seconds to wait for each HTTP response.
        session: Optional object with a ``requests``-compatible ``get`` method
            (for example a ``requests.Session`` for connection reuse). The
            ``requests`` module is used when omitted.

    Returns:
        List of article dicts collected from all pages that returned articles
        (empty when the first page reports no results).
    """
    http = session if session is not None else requests

    def get_page(page):
        # One request per page; a timeout keeps a stalled server from hanging the run.
        response = http.get(
            url_string,
            params={"page": page, "pageSize": page_size},
            timeout=timeout,
        )
        return response.json()

    data = get_page(1)
    if data.get("status") == "error":
        # Surface API-level failures (bad key, rate limit, ...) instead of
        # silently returning an empty list.
        warnings.warn(f'News API error on page 1: {data.get("code")} - {data.get("message")}')

    total_results = data.get("totalResults") or 0
    articles_data = data.get("articles") or []
    # print(f'total_results: {total_results} | articles_data: {articles_data}')

    if total_results == 0:
        return articles_data

    total_pages = math.ceil(total_results / page_size)
    for page in range(2, total_pages + 1):
        print(f'Going through page {page}/{total_pages}')
        data = get_page(page)

        page_articles = data.get("articles")
        if not page_articles:
            # An error payload has no "articles" key; extending with None used
            # to raise TypeError and discard everything fetched so far.
            warnings.warn(
                f'Stopping at page {page}/{total_pages}: '
                f'{data.get("code")} - {data.get("message")}'
            )
            break
        articles_data.extend(page_articles)

    if len(articles_data) != total_results:
        warnings.warn(f'Article Count Mismatch: {len(articles_data)}|{total_results}')

    print(f'No. of Articles: {len(articles_data)}')
    return articles_data

# Mask the API key before printing so the secret is not stored in the
# notebook output.
print(url_string.replace(str(config["news_api_key"]), "***"))
articles_data = fetch_news(url_string)
print(articles_data)

In [None]:
def articles_to_df(articles_data):
    """Flatten a list of article dicts into a DataFrame.

    Nested dicts are flattened with ``_`` as separator, three camelCase
    columns are renamed to snake_case, a ``url_domain`` column is derived from
    ``url``, and ``published_at`` is parsed into naive UTC timestamps.

    Args:
        articles_data: List of article dicts; ``None`` or an empty list yields
            an empty frame.

    Returns:
        DataFrame that always contains at least the columns ``url``,
        ``published_at`` and ``url_domain``.
    """
    def get_domain(url: str) -> str:
        """Extract domain from URL (finance.yahoo.com, etc.)."""
        if not url or pd.isna(url):
            return None
        parsed = urlparse(url)
        return parsed.netloc.lower()

    # Flatten nested dicts, joining nested keys with "_".
    df = pd.json_normalize(articles_data or [], sep="_")
    column_rename_mapping = {
        "urlToImage": "url_to_image",
        "publishedAt": "published_at",
        "content": "content_truncated"
    }
    df = df.rename(columns=column_rename_mapping)

    # With no articles (or records lacking these fields) the columns do not
    # exist and the lookups below used to raise KeyError.
    for required in ("url", "published_at"):
        if required not in df.columns:
            df[required] = None

    df["url_domain"] = df["url"].apply(get_domain)
    # Parse as UTC, then drop the timezone to store naive UTC timestamps.
    df["published_at"] = pd.to_datetime(df["published_at"], utc=True).dt.tz_localize(None)
    return df

# Convert the fetched articles into a flat DataFrame.
df = articles_to_df(articles_data)
# Trailing expression: shows the first three rows as the cell output.
df.head(3)

In [None]:
def _parse_domain_list(raw):
    """Split a comma-separated value-list string into clean, lower-cased entries."""
    if not raw:
        return []
    cleaned = (part.strip().strip("'").strip().lower() for part in str(raw).split(","))
    return [entry for entry in cleaned if entry]


def _existing_value_list(desc_rows, description):
    """Return the value-list string from the DESCRIBE output, or '' if absent."""
    if not desc_rows:
        return ""
    row = desc_rows[0]

    # Preferred: locate the column by name via the DB-API cursor description.
    column_names = [str(col[0]).lower() for col in (description or [])]
    if "value_list" in column_names:
        return str(row[column_names.index("value_list")] or "")

    # Fallback (previous behaviour): take the last field containing ".com".
    found = ""
    for val in row:
        if ".com" in str(val).lower():
            found = str(val)
    return found


def ensure_network_rule_for_domain(
    df, 
    nr_name = "news_domains_nr", 
    integration_name = "news_domains_integration"):
    """
    Merge the domains in ``df['url_domain']`` into a Snowflake network rule
    and attach that rule to the external access integration.

    The rule's current value list is read with DESCRIBE NETWORK RULE, merged
    with the domains found in ``df``, and the rule is then re-created with the
    merged list.

    Args:
        df: DataFrame with a ``url_domain`` column; missing values are ignored.
        nr_name: Name of the network rule to replace. Interpolated into SQL as
            an identifier, so it must come from trusted code, not user input.
        integration_name: Name of the external access integration to update
            (same trust requirement as ``nr_name``).

    Returns:
        None. Progress is reported with ``print``.

    Raises:
        KeyError: ``df`` has no ``url_domain`` column or the config lacks a
            ``snowflake`` section / credential key.
        Exception: Whatever the Snowflake connector raises for connection or
            SQL errors (for example when the rule does not exist yet).
    """
    # Local import: ``re`` is not among the file-level imports.
    import re

    # Host name with optional port; anything else is rejected before it is
    # interpolated into SQL, because the domains come from fetched article URLs.
    host_pattern = re.compile(r"^[a-z0-9]([a-z0-9.-]*[a-z0-9])?(:\d{1,5})?$")

    value_list_latest = []
    for raw_domain in df['url_domain'].dropna().unique():
        domain = str(raw_domain).strip().lower()
        if not domain:
            continue
        if not host_pattern.match(domain):
            warnings.warn(f"Skipping invalid domain for network rule: {domain!r}")
            continue
        value_list_latest.append(domain)

    config, _ = load_config()
    config_snowflake = config["snowflake"]
    conn = snowflake.connector.connect(
        user=config_snowflake["user"],
        password=config_snowflake["password"],
        account=config_snowflake["account"],
        warehouse=config_snowflake["warehouse"],
        database=config_snowflake["database"],
        schema="UTILS" # SQL Specific
    )
    try:
        cur = conn.cursor() #Start
        try:
            cur.execute(f"DESCRIBE NETWORK RULE {nr_name}")
            desc_rows = cur.fetchall()
            value_list_str_existing = _existing_value_list(desc_rows, cur.description).lower()
            print(value_list_str_existing)

            value_list_existing = _parse_domain_list(value_list_str_existing)
            # Sorted so the generated SQL is deterministic between runs.
            value_list_optimised = sorted(set(value_list_existing) | set(value_list_latest))
            value_list_str_optimised = ", ".join([f"'{d}'" for d in value_list_optimised])
            print(f'Existing Value List: {value_list_existing}')
            print(f'Latest Value List: {value_list_latest}')
            print(f'Optimised Value List: {value_list_optimised}')
            print(f'NR String: {value_list_str_optimised}')

            if not value_list_optimised:
                # Nothing to allow: leave the existing rule untouched rather
                # than replacing it with an empty value list.
                print(f"No domains to set; network rule ({nr_name}) left unchanged")
                return

            # Network Rule SQL
            create_sql = f"""
    CREATE OR REPLACE NETWORK RULE {nr_name}
        TYPE = HOST_PORT
        MODE = EGRESS
        VALUE_LIST = ({value_list_str_optimised});
    """
            cur.execute(create_sql)

            # Re-attach the rule to the integration
            alter_sql = f"""
    ALTER EXTERNAL ACCESS INTEGRATION {integration_name}
        SET ALLOWED_NETWORK_RULES = ({nr_name})
        ENABLED = TRUE;
    """
            cur.execute(alter_sql)
        finally:
            cur.close() #End
    finally:
        # The connection was previously never closed.
        conn.close()

    print(f"Network rule ({nr_name}) updated in Integration ({integration_name})")

# Update the Snowflake network rule with the domains found in df (uses the
# default rule and integration names).
ensure_network_rule_for_domain(df)
    

In [None]:
def scrape_url_helper(url):
    """Download ``url`` and return the text of all its ``<p>`` elements.

    Returns:
        ``(content, len(content))`` on success, where ``content`` is the
        paragraph texts joined by single spaces; ``(None, -2)`` when the
        request times out (10 s limit); ``(None, -1)`` on any other error.
    """
    try:
        page = requests.get(url, timeout=10)
        # The HTTP status is intentionally not checked: whatever body comes
        # back is parsed.
        parsed_html = BeautifulSoup(page.text, 'html.parser')

        # get_text(strip=True) trims whitespace inside each paragraph.
        content = " ".join(
            tag.get_text(strip=True) for tag in parsed_html.find_all('p')
        )
        size = len(content)

        print(f'{url} - content')
        return content, size
    except requests.exceptions.Timeout as timeout_err:
        # Timeouts get their own code so they can be told apart later.
        print(f"TIMEOUT for url {url}: {timeout_err}")
        return None, -2
    except Exception as err:
        # Any other failure (broken link, paywall, etc.). Author's note: -1 is
        # meant to indicate that the domain network rule was not created.
        print(f"ERROR for url {url}: {type(err).__name__} — {err}")
        return None, -1

def scrape_url(df, chunk_size=30, delay_between_chunks=3):
    """Scrape the page behind every ``url`` in ``df`` and store the result.

    Adds (or resets) two columns on ``df`` in place: ``content_full`` with the
    scraped text and ``content_size`` with the size/status code returned by
    ``scrape_url_helper``. Rows without a URL keep ``None`` and ``0``.

    Args:
        df: DataFrame with a ``url`` column. Index labels are assumed unique,
            since results are written back by label.
        chunk_size: Pause after every ``chunk_size`` rows; values <= 0 disable
            the pause.
        delay_between_chunks: Length of each pause in seconds.

    Returns:
        The same ``df`` object, modified in place.
    """
    df['content_full'] = None
    df['content_size'] = 0
    df_len = len(df)

    if 'url' not in df.columns:
        # No URLs to scrape; every row keeps the defaults set above.
        return df

    # Count rows positionally: index labels need not be 0..n-1 (or even
    # integers) once the frame has been filtered or re-indexed.
    for position, (idx, url) in enumerate(df['url'].items(), start=1):
        if pd.isna(url) or not url:
            continue
        content_full, content_size = scrape_url_helper(url)
        print(f'{position}/{df_len} | Scraped url({content_size}): {url}')
        df.at[idx, 'content_full'] = content_full
        df.at[idx, 'content_size'] = content_size
        # Pause after each completed chunk (not on the very first row), and
        # not after the final row where there is nothing left to throttle.
        if chunk_size > 0 and position % chunk_size == 0 and position < df_len:
            print(f'{position}/{df_len} | Sleeping for {delay_between_chunks}s')
            time.sleep(delay_between_chunks)
    return df
    

# Scrape every article URL; adds content_full / content_size columns to df.
df = scrape_url(df)
# Trailing expression: shows the first 20 rows as the cell output.
df.head(20)

### Updating Schema View

In [None]:
# Rows whose scrape timed out (scrape_url_helper returns -2 as the size on timeout).
df[df["content_size"] == -2]