In [10]:
import os, json, time, datetime as dt, csv, pathlib
from typing import Dict, List
import requests
import pandas as pd
import bs4 as BeautifulSoup
from dotenv import load_dotenv
import yfinance as yf

In [11]:
# Folder that receives every raw download; created up front (including any
# missing parents) so later writes cannot fail on a missing directory.
DATA_PATH = pathlib.Path('../data/raw')
DATA_PATH.mkdir(exist_ok=True, parents=True)

# load_dotenv() runs before the key lookup so that a key kept in a local .env
# file (if one exists) is visible in the environment.
load_dotenv()
API_Key = os.environ.get('ALPHAVANTAGE_API_KEY')

# Report only whether a key was found -- the key itself is never printed.
print("Your key is Imported :", True if API_Key else False)

Your key is Imported : True


In [12]:
def time_stamp():
    """Return the current time as a ``MMDDYYYY-HHMMSS`` string.

    The value comes from ``datetime.now()`` with no timezone argument.
    Note that the month-first layout means the strings do not sort
    chronologically across different years.
    """
    now = dt.datetime.now()
    return format(now, '%m%d%Y-%H%M%S')

In [13]:
def save_csv_file(df:pd.DataFrame,prefix:str,meta:Dict[str,str]):
    """Build a timestamped CSV file name: ``<prefix>_<k-v>_<k-v>..._<timestamp>.csv``.

    Despite its name, this function writes nothing to disk. It only returns
    the file name; the caller is responsible for calling ``df.to_csv(...)``.

    Parameters
    ----------
    df : pd.DataFrame
        Not used when building the name; kept so existing calls keep working.
    prefix : str
        Leading part of the file name (e.g. ``"api"`` or ``"scrape"``).
    meta : Dict[str, str]
        Key/value tags embedded in the name in insertion order. May be empty
        or ``None``, in which case the tag section is omitted entirely.

    Returns
    -------
    str
        The file name (no directory component).
    """
    parts = [prefix]
    for key, value in (meta or {}).items():
        tag = f"{key}-{value}"
        # A path separator inside a tag (e.g. symbol "BRK/B") would turn the
        # file name into a nested path that does not exist, so neutralise it.
        parts.append(tag.replace('/', '-').replace('\\', '-'))
    parts.append(time_stamp())
    # Joining the parts avoids the "prefix__timestamp" double underscore that
    # an empty meta used to produce.
    return '_'.join(parts) + '.csv'

In [14]:
def validate(df:pd.DataFrame, required_columns):
    """Summarise basic data-quality facts about ``df``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to inspect.
    required_columns : iterable
        Column labels that are expected to be present in ``df``.

    Returns
    -------
    dict
        ``"Missing columns"``: required labels not found in ``df.columns``
        (in the order given), ``"Dataframe_Shape"``: ``df.shape``, and
        ``"Na_total"``: total count of missing cells as a plain ``int``.
    """
    absent = []
    for name in required_columns:
        if name not in df.columns:
            absent.append(name)

    # Per-column NA counts first, then collapsed to one total for the frame.
    na_per_column = df.isna().sum()
    na_count = int(na_per_column.sum())

    report = {"Missing columns": absent}
    report['Dataframe_Shape'] = df.shape
    report['Na_total'] = na_count
    return report

In [15]:
# Download monthly price history for SYMBOL and save it as a raw CSV.
# Alpha Vantage is used when an API key is configured, otherwise yfinance.
# Both branches end with the same layout: a 'date' column plus lower-case
# price columns, so validation and the saved CSV do not depend on the source.
SYMBOL = 'TSLA'
USE_ALPHA = bool(API_Key)  # API_Key is loaded from the environment in the config cell
if USE_ALPHA:
    url = 'https://www.alphavantage.co/query'
    params = {'function':'TIME_SERIES_MONTHLY','symbol':SYMBOL,'apikey':API_Key}
    r = requests.get(url, params=params, timeout=30)
    r.raise_for_status()
    js = r.json()
    ts_key = [k for k in js if "Time Series" in k]
    if not ts_key:
        # A response without a "Time Series" entry (e.g. an error or rate-limit
        # notice) used to die with a bare IndexError on ts_key[0]. Only the
        # payload keys are reported; the message text is deliberately not
        # echoed in case it contains account details.
        raise RuntimeError(
            f"Alpha Vantage returned no time series for {SYMBOL}; payload keys: {list(js)}"
        )
    time_series = js[ts_key[0]]
    df_api = pd.DataFrame.from_dict(time_series, orient="index")
    # Column labels look like "1. open"; keep only the part after the number.
    df_api.columns = [col.split(". ")[1] if ". " in col else col for col in df_api.columns]
    # The JSON values are strings. Convert them so that unparsable entries
    # become NaN and are counted by validate() instead of passing silently.
    df_api = df_api.apply(pd.to_numeric, errors="coerce")
    df_api.index = pd.to_datetime(df_api.index)
    df_api = df_api.reset_index().rename(columns={'index': 'date'})
else:
    df_api = yf.download(SYMBOL, period='max', interval='1mo')
    # Depending on the yfinance version the columns may be a (field, ticker)
    # MultiIndex even for a single symbol; keep only the field level.
    if isinstance(df_api.columns, pd.MultiIndex):
        df_api.columns = df_api.columns.get_level_values(0)
    # The dates live in the index. Move them into a 'date' column, otherwise
    # to_csv(index=False) below would drop them.
    df_api = df_api.rename_axis('date').reset_index()
    # "Adj Close" -> "adj_close", "Open" -> "open", ... to match the Alpha branch.
    df_api.columns = [str(col).lower().replace(' ', '_') for col in df_api.columns]

validate_api = validate(df_api, ['date', 'open', 'high', 'low', 'close'])
print(validate_api)
fname = save_csv_file(df_api, "api", {"source": "alpha" if USE_ALPHA else "yfinance", "symbol": SYMBOL})
out_path = DATA_PATH / fname
df_api.to_csv(out_path, index=False)
print("Saved:", out_path)



{'Missing columns': [], 'Dataframe_Shape': (182, 6), 'Na_total': 0}
Saved: ../data/raw/api_source-alpha_symbol-TSLA_08202025-195121.csv


In [16]:
from bs4 import BeautifulSoup


In [17]:
# Scrape the first HTML table from SCRAPE_URL into a DataFrame and save it as
# a raw CSV. If anything goes wrong (network error, HTTP error, no table,
# ragged rows) a tiny inline table is parsed instead so the cell still runs.
SCRAPE_URL = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"  # replace with permitted page
headers = {"User-Agent": "AFE-Course-Notebook/1.0"}


def _table_rows_to_frame(node):
    """Convert every non-empty ``<tr>`` found under ``node`` into a DataFrame.

    ``node`` is any BeautifulSoup object exposing ``find_all`` (a ``<table>``
    tag or a whole parsed document). The first non-empty row is used as the
    header; the remaining rows become the data.

    Raises
    ------
    ValueError
        If no non-empty row is found, or (from pandas) if the data rows do
        not match the header width.
    """
    rows = []
    for tr in node.find_all('tr'):
        cells = [cell.get_text(strip=True) for cell in tr.find_all(['td', 'th'])]
        if cells:
            rows.append(cells)
    if not rows:
        raise ValueError("no table rows found")
    # assume first row is header
    header, *data = rows
    return pd.DataFrame(data, columns=header)


try:
    resp = requests.get(SCRAPE_URL, headers=headers, timeout=30)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, 'html.parser')
    table = soup.find('table')
    if table is None:
        # Without this check the failure would surface as an opaque
        # "'NoneType' object has no attribute 'find_all'".
        raise ValueError(f"no <table> element found at {SCRAPE_URL}")
    df_scrape = _table_rows_to_frame(table)
except Exception as e:
    # Deliberately broad: this is a best-effort demo, and any failure of the
    # live scrape should fall back to the inline sample instead of stopping.
    print("Scrape failed (demoing with inline HTML).", e)
    html = """
    <table>
      <tr><th>Ticker</th><th>Price</th></tr>
      <tr><td>AAA</td><td>101.2</td></tr>
      <tr><td>BBB</td><td>98.7</td></tr>
    </table>
    """
    soup = BeautifulSoup(html, 'html.parser')
    df_scrape = _table_rows_to_frame(soup)

# Scraped cells are text; make the price column numeric when it exists
# (unparsable entries become NaN).
if 'Price' in df_scrape.columns:
    df_scrape['Price'] = pd.to_numeric(df_scrape['Price'], errors='coerce')

# NOTE(review): the meta tags say site=example / table=markets even when the
# live SCRAPE_URL page was scraped successfully. They are kept as-is so that
# existing file names do not change -- confirm whether they should reflect
# the real source.
fname2 = save_csv_file(df_scrape,prefix="scrape", meta={"site": "example", "table": "markets"})
out_path2 = DATA_PATH / fname2
df_scrape.to_csv(out_path2, index=False)
print("Saved:", out_path2)

Saved: ../data/raw/scrape_site-example_table-markets_08202025-195121.csv
