In [1]:

from pathlib import Path
from datetime import datetime
import pandas as pd

try:
    import yfinance as yf
except ImportError:
    raise SystemExit("yfinance not found. Install with:  pip install yfinance pandas")

# Output locations, relative to the notebook's working directory.
RAW_DIR = Path("data/raw")
PROCESSED_DIR = Path("data/processed")

# Create both folders (and any missing parents); re-running the cell is harmless.
for _folder in (RAW_DIR, PROCESSED_DIR):
    _folder.mkdir(parents=True, exist_ok=True)

print("Folders ready:", RAW_DIR.resolve(), PROCESSED_DIR.resolve())

Folders ready: C:\Users\arnav\Homework 4\data\raw C:\Users\arnav\Homework 4\data\processed


In [2]:
# --- Download configuration: edit these values, then re-run the notebook ---
TICKER = "TSLA"

# Range selection. With USE_PERIOD = False the explicit START_DATE/END_DATE
# window is used; with USE_PERIOD = True the relative PERIOD string is used.
USE_PERIOD = False
PERIOD = "10y"                     # only read when USE_PERIOD is True (e.g. "1y", "5y", "max")
START_DATE = "2015-01-01"
# NOTE(review): END_DATE is "today", so both the data and the file names change
# from day to day; yfinance documents `end` as exclusive -- confirm if the last
# bar matters.
END_DATE = datetime.today().strftime("%Y-%m-%d")
INTERVAL = "1d"

# Fail here rather than producing a "period-NA" file name and a later fetch error.
if USE_PERIOD and not PERIOD:
    raise ValueError("USE_PERIOD is True but PERIOD is empty; set PERIOD (e.g. '1y').")

# The range tag is embedded in the output file names so runs do not overwrite each other.
if USE_PERIOD:
    range_tag = f"period-{PERIOD}"
else:
    range_tag = f"{START_DATE}_{END_DATE}"
BASE_NAME = f"{TICKER}_{range_tag}_{INTERVAL}"

RAW_PATH = RAW_DIR / f"{BASE_NAME}.csv"
PROC_PATH = PROCESSED_DIR / f"{BASE_NAME}_clean.csv"

print("🔧 Plan")
print(f"- Ticker:     {TICKER}")
print(f"- Interval:   {INTERVAL}")
print(f"- Range tag:  {range_tag}")
print(f"- Raw out:    {RAW_PATH}")
print(f"- Processed:  {PROC_PATH}")

🔧 Plan
- Ticker:     TSLA
- Interval:   1d
- Range tag:  2015-01-01_2025-08-21
- Raw out:    data\raw\TSLA_2015-01-01_2025-08-21_1d.csv
- Processed:  data\processed\TSLA_2015-01-01_2025-08-21_1d_clean.csv


In [3]:
def fetch_yahoo_history(
    ticker_symbol: str,
    use_period: bool = False,
    start: str | None = None,
    end: str | None = None,
    period: str | None = None,
    interval: str = "1d",
    include_actions: bool = True,
) -> pd.DataFrame:
    
    ticker = yf.Ticker(ticker_symbol)
    kw = dict(interval=interval, actions=include_actions, auto_adjust=False, raise_errors=True, prepost=False)

    if use_period:
        if not period:
            raise ValueError("When use_period=True, you must provide 'period'.")
        df = ticker.history(period=period, **kw)
    else:
        if not start:
            raise ValueError("Provide 'start' when use_period=False.")
        df = ticker.history(start=start, end=end, **kw)

    if df.empty:
        raise RuntimeError("Empty dataframe returned (check interval/range combo or Yahoo throttling).")

    ordered = ["Open","High","Low","Close","Adj Close","Volume","Dividends","Stock Splits"]
    df = df[[c for c in ordered if c in df.columns]].copy()
    df.index.name = "Datetime"
    return df

In [4]:
# Download the history using the values from the configuration cell.
try:
    df_raw = fetch_yahoo_history(
        ticker_symbol=TICKER,
        use_period=USE_PERIOD,
        start=START_DATE if not USE_PERIOD else None,
        end=END_DATE if not USE_PERIOD else None,
        # PERIOD may not be defined in the config cell; look it up defensively
        # so that fetch_yahoo_history reports the missing value itself.
        period=locals().get("PERIOD") if USE_PERIOD else None,
        interval=INTERVAL,
    )
except Exception as e:
    # Stop the notebook with a short message, keeping the original error as the cause.
    raise SystemExit(f"[ERROR] Failed to fetch data: {e}") from e

# Build the frame that is written to disk: the timestamp becomes a plain
# string column so the CSV does not depend on the index.
df_raw_out = df_raw.copy()
if isinstance(df_raw_out.index, pd.DatetimeIndex):
    # tz_convert(None) converts tz-aware stamps to UTC and drops the tz info,
    # which is why daily bars show up at 05:00/04:00 rather than midnight.
    idx = df_raw_out.index.tz_convert(None) if df_raw_out.index.tz is not None else df_raw_out.index
    df_raw_out.insert(0, "Datetime", idx.strftime("%Y-%m-%d %H:%M:%S"))
    df_raw_out = df_raw_out.reset_index(drop=True)
else:
    # to_csv(index=False) below would silently discard a non-datetime index,
    # so move it into a regular column instead of losing it.
    df_raw_out = df_raw_out.reset_index()

df_raw_out.to_csv(RAW_PATH, index=False)
print(f" Saved RAW to {RAW_PATH} with shape {df_raw_out.shape}")
df_raw_out.head()

 Saved RAW to data\raw\TSLA_2015-01-01_2025-08-21_1d.csv with shape (2674, 9)


Unnamed: 0,Datetime,Open,High,Low,Close,Adj Close,Volume,Dividends,Stock Splits
0,2015-01-02 05:00:00,14.858,14.883333,14.217333,14.620667,14.620667,71466000,0.0,0.0
1,2015-01-05 05:00:00,14.303333,14.433333,13.810667,14.006,14.006,80527500,0.0,0.0
2,2015-01-06 05:00:00,14.004,14.28,13.614,14.085333,14.085333,93928500,0.0,0.0
3,2015-01-07 05:00:00,14.223333,14.318667,13.985333,14.063333,14.063333,44526000,0.0,0.0
4,2015-01-08 05:00:00,14.187333,14.253333,14.000667,14.041333,14.041333,51637500,0.0,0.0


In [5]:
# Column layout of the processed file; columns missing from the data are skipped.
cols_order = ["Datetime", "Open", "High", "Low", "Close", "Adj Close", "Volume", "Dividends", "Stock Splits"]

# Derive the processed frame from the raw CSV view without modifying it:
# parse the timestamps (unparseable values become NaT and are dropped),
# sort chronologically, then apply the column layout above.
df_proc = (
    df_raw_out
    .assign(Datetime=lambda frame: pd.to_datetime(frame["Datetime"], errors="coerce"))
    .dropna(subset=["Datetime"])
    .sort_values("Datetime")
    .pipe(lambda frame: frame[[col for col in cols_order if col in frame.columns]])
)

df_proc.to_csv(PROC_PATH, index=False)
print(f"Saved PROCESSED to {PROC_PATH} with shape {df_proc.shape}")
df_proc.head()

Saved PROCESSED to data\processed\TSLA_2015-01-01_2025-08-21_1d_clean.csv with shape (2674, 9)


Unnamed: 0,Datetime,Open,High,Low,Close,Adj Close,Volume,Dividends,Stock Splits
0,2015-01-02 05:00:00,14.858,14.883333,14.217333,14.620667,14.620667,71466000,0.0,0.0
1,2015-01-05 05:00:00,14.303333,14.433333,13.810667,14.006,14.006,80527500,0.0,0.0
2,2015-01-06 05:00:00,14.004,14.28,13.614,14.085333,14.085333,93928500,0.0,0.0
3,2015-01-07 05:00:00,14.223333,14.318667,13.985333,14.063333,14.063333,44526000,0.0,0.0
4,2015-01-08 05:00:00,14.187333,14.253333,14.000667,14.041333,14.041333,51637500,0.0,0.0


In [6]:
def summarize(df: pd.DataFrame, name: str = "Data"):
    """Print a short profile of ``df`` under a heading built from ``name``.

    Prints the shape, the min/max of the ``Datetime`` column when that column
    exists, and the per-column fraction of missing values (largest first).
    If the frame has numeric columns, their ``describe()`` table is shown
    with ``display``. Returns None.
    """
    print(f"\n——— {name} Summary ———")
    print("Rows, Cols:", df.shape)

    if "Datetime" in df.columns:
        stamps = df["Datetime"]
        print("Date range:", stamps.min(), "→", stamps.max())

    # Mean of the boolean NA mask gives the share of missing values per column.
    null_share = df.isna().mean().sort_values(ascending=False)
    print("\nMissingness (fraction):")
    print(null_share.to_string())

    numeric_cols = list(df.select_dtypes(include="number").columns)
    if not numeric_cols:
        return

    print("\nNumeric describe:")
    display(df[numeric_cols].describe())

# Print the same summary for the raw CSV view and for the processed frame.
for _frame, _label in ((df_raw_out, "RAW CSV View"), (df_proc, "PROCESSED")):
    summarize(_frame, _label)


——— RAW CSV View Summary ———
Rows, Cols: (2674, 9)
Date range: 2015-01-02 05:00:00 → 2025-08-20 04:00:00

Missingness (fraction):
Datetime        0.0
Open            0.0
High            0.0
Low             0.0
Close           0.0
Adj Close       0.0
Volume          0.0
Dividends       0.0
Stock Splits    0.0

Numeric describe:


Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,Dividends,Stock Splits
count,2674.0,2674.0,2674.0,2674.0,2674.0,2674.0,2674.0,2674.0
mean,127.64361,130.489842,124.602415,127.606686,127.606686,111894500.0,0.0,0.002992
std,121.320284,124.060564,118.292278,121.199478,121.199478,72439840.0,0.0,0.112742
min,9.488,10.331333,9.403333,9.578,9.578,10620000.0,0.0,0.0
25%,17.590333,17.836833,17.258667,17.589333,17.589333,67655400.0,0.0,0.0
50%,52.809334,54.377001,51.211668,53.264,53.264,93256050.0,0.0,0.0
75%,234.155003,239.212505,229.322495,235.042507,235.042507,129386000.0,0.0,0.0
max,475.899994,488.540009,457.51001,479.859985,479.859985,914082000.0,0.0,5.0



——— PROCESSED Summary ———
Rows, Cols: (2674, 9)
Date range: 2015-01-02 05:00:00 → 2025-08-20 04:00:00

Missingness (fraction):
Datetime        0.0
Open            0.0
High            0.0
Low             0.0
Close           0.0
Adj Close       0.0
Volume          0.0
Dividends       0.0
Stock Splits    0.0

Numeric describe:


Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,Dividends,Stock Splits
count,2674.0,2674.0,2674.0,2674.0,2674.0,2674.0,2674.0,2674.0
mean,127.64361,130.489842,124.602415,127.606686,127.606686,111894500.0,0.0,0.002992
std,121.320284,124.060564,118.292278,121.199478,121.199478,72439840.0,0.0,0.112742
min,9.488,10.331333,9.403333,9.578,9.578,10620000.0,0.0,0.0
25%,17.590333,17.836833,17.258667,17.589333,17.589333,67655400.0,0.0,0.0
50%,52.809334,54.377001,51.211668,53.264,53.264,93256050.0,0.0,0.0
75%,234.155003,239.212505,229.322495,235.042507,235.042507,129386000.0,0.0,0.0
max,475.899994,488.540009,457.51001,479.859985,479.859985,914082000.0,0.0,5.0
