# Stage04

In [8]:
import os, pathlib, requests
import datetime as dt
import pandas as pd
from dotenv import load_dotenv
from bs4 import BeautifulSoup as bs
# Load variables from a .env file into the process environment.
load_dotenv()
# Treat a blank or whitespace-only entry the same as a missing key, so the
# later `key is not None` check does not mistake an empty value for a usable key.
key = (os.getenv("ALPHAVANTAGE_API_KEY") or "").strip() or None
# Report only whether a key is present; the key itself is never printed.
print("ALPHAVANTAGE_API_KEY Loaded:", key is not None)
# Directory (relative to the notebook's working directory) for raw pulls.
RAW = pathlib.Path('../data/raw')

ALPHAVANTAGE_API_KEY Loaded: True


In [9]:
def ts():
    """Return the current local time as a compact ``YYYYMMDD-HHMMSS`` string."""
    now = dt.datetime.now()
    return f"{now:%Y%m%d-%H%M%S}"

def save_csv(df: pd.DataFrame, prefix: str, **meta):
    """Write ``df`` to a timestamped CSV under ``RAW`` and return its path.

    The file name is ``<prefix>_<k1>-<v1>_<k2>-<v2>..._<timestamp>.csv``,
    with the ``meta`` pairs in the order they were passed. When no ``meta``
    is given the name is simply ``<prefix>_<timestamp>.csv``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to persist; written without its index.
    prefix : str
        Leading component of the file name.
    **meta
        Extra key/value tags embedded in the file name.

    Returns
    -------
    pathlib.Path
        Location of the written file.
    """
    # to_csv does not create parent directories, so make sure RAW exists
    # before writing (e.g. on a fresh checkout).
    RAW.mkdir(parents=True, exist_ok=True)
    # Join only the parts that exist so an empty `meta` does not leave a
    # doubled underscore in the file name.
    parts = [str(prefix), *(f"{k}-{v}" for k, v in meta.items()), ts()]
    path = RAW / ('_'.join(parts) + '.csv')
    df.to_csv(path, index=False)
    print('Saved', path)
    return path

def validate(df: pd.DataFrame, required):
    """Summarise basic sanity checks for ``df``.

    Returns a dict with three entries:
    ``missing``  - names from ``required`` that are not columns of ``df``
                   (kept in the order given),
    ``shape``    - ``df.shape``,
    ``na_total`` - total count of missing cells across the whole frame.
    """
    present = df.columns
    absent = []
    for col in required:
        if col not in present:
            absent.append(col)
    null_count = int(df.isna().sum().sum())
    return {'missing': absent, 'shape': df.shape, 'na_total': null_count}

## Step 1 API Pull

In [16]:
# Pull daily closing prices for SYMBOL: Alpha Vantage when an API key is
# available, otherwise yfinance. Either way the result is `df_api` with
# `date` and `close` columns.
SYMBOL = "META"
use_alpha = key is not None
if use_alpha:
    url = 'https://www.alphavantage.co/query'
    params = {
        'function': 'TIME_SERIES_DAILY',
        'symbol': SYMBOL,
        'outputsize': 'compact',
        'apikey': key
    }
    r = requests.get(url, params=params, timeout=30)
    r.raise_for_status()
    js = r.json()
    # Use a dedicated name for the payload key: rebinding `key` here would
    # overwrite the API key, so re-running this cell would send the series
    # name as `apikey`.
    series_key = next((k for k in js if "Time Series" in k), None)
    if series_key is None:
        # A 200 response without a "Time Series" entry is not usable price
        # data; show which keys came back so the cause is visible.
        raise RuntimeError(f"Unexpected response keys: {list(js.keys())}")
    # The payload maps date -> field dict, so transpose to get one row per
    # date, then keep only the close and convert both columns from strings.
    df_api = (
        pd.DataFrame(js[series_key]).T
        .reset_index()
        .rename(columns={'index': 'date', '4. close': 'close'})
        [['date', 'close']]
        .assign(
            date=lambda d: pd.to_datetime(d['date']),
            close=lambda d: pd.to_numeric(d['close']),
        )
    )
else:
    # Imported lazily: yfinance is only needed when no API key is configured.
    import yfinance as yf
    # NOTE(review): assumes yf.download returns flat 'Date'/'Close' column
    # labels for a single ticker - confirm against the installed version.
    df_api = (
        yf.download(SYMBOL, period='6mo', interval='1d')[['Close']]
        .reset_index()
        .rename(columns={'Date': 'date', 'Close': 'close'})
    )

validate(df_api, ['date', 'close'])

{'missing': [], 'shape': (100, 2), 'na_total': 0}

## Step 2 Scrape a small table

In [None]:
# Scrape one table from Wikipedia into `df_scrape`; on any failure fall back
# to a tiny inline demo table so the rest of the notebook can still run.
scrape_url = 'https://en.wikipedia.org/wiki/SSE_50_Index'
headers = {'User-Agent':'AFE-Homework/1.0'}


def _table_to_frame(table) -> pd.DataFrame:
    """Turn parsed HTML containing <tr> rows into a DataFrame of strings.

    The first non-empty row supplies the column names; every later non-empty
    row becomes a data row. Raises ValueError if there are no non-empty rows.
    """
    rows = [[c.get_text(strip=True) for c in tr.find_all(['th','td'])] for tr in table.find_all('tr')]
    header, *data = [r for r in rows if r]
    return pd.DataFrame(data, columns=header)


try:
    resp = requests.get(scrape_url, headers=headers, timeout=30)
    resp.raise_for_status()
    soup = bs(resp.text, 'html.parser')
    table = soup.find('table', {'id': 'constituents'})  # scrape a specific table
    if table is None:
        # Fail with a readable reason instead of an AttributeError on None.
        raise LookupError("no <table id='constituents'> found on page")
    df_scrape = _table_to_frame(table)
except Exception as e:
    # Deliberate best-effort: any network or parsing problem is reported and
    # replaced by the inline demo table.
    print('Scrape failed, using inline demo table:', e)
    html = '<table><tr><th>Ticker</th><th>Price</th></tr><tr><td>AAA</td><td>101.2</td></tr></table>'
    soup = bs(html, 'html.parser')
    df_scrape = _table_to_frame(soup)

# Scraped cells are text; convert Price to numbers when that column exists,
# turning unparseable values into NaN.
if 'Price' in df_scrape.columns:
    df_scrape['Price'] = pd.to_numeric(df_scrape['Price'], errors='coerce')
# The required list is the frame's own columns, so `missing` is always empty
# here; the useful parts of this check are `shape` and `na_total`.
v_scrape = validate(df_scrape, list(df_scrape.columns))
v_scrape

{'missing': [], 'shape': (50, 2), 'na_total': 0}

In [36]:
# Tag the file with whichever provider actually supplied the prices.
api_source = 'alphavantage' if use_alpha else 'yfinance'
save_csv(df=df_api, prefix='api', source=api_source, symbol=SYMBOL)

Saved ../data/raw/api_source-alphavantage_symbol-META_20250818-002545.csv


PosixPath('../data/raw/api_source-alphavantage_symbol-META_20250818-002545.csv')

In [37]:
# NOTE(review): the file is tagged site-wikipedia even when the scrape cell
# fell back to its inline demo table - check that cell's output before
# trusting the saved data.
save_csv(df=df_scrape, prefix='scrape', site='wikipedia')

Saved ../data/raw/scrape_site-wikipedia_20250818-002723.csv


PosixPath('../data/raw/scrape_site-wikipedia_20250818-002723.csv')

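Assumptions & risks:
* API key exposure risk: the Alpha Vantage key is read from a local `.env` file; keep that file out of version control and never print the key itself.
* Anti-scraping risk: Wikipedia may block or throttle the request, or change the page so the `constituents` table is no longer found; in that case the scrape falls back to an inline demo table.
* Network reliability: both the API pull and the scrape need live network access; a failed API request raises an error, while a failed scrape falls back to the demo table.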