### Data Loader

In [1]:
import pandas as pd
import numpy as np
import yfinance as yf
import os
import json

import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [2]:
def load_master_transaction_log(log_path):
    """Read the master transaction log CSV and return it indexed by ``Date``.

    Args:
        log_path: Path to the CSV file; it must contain a ``Date`` column.

    Returns:
        DataFrame with the parsed ``Date`` column as its index.

    Raises:
        FileNotFoundError: If nothing exists at ``log_path``.
    """
    print("--- Loading Master Transaction Log ---")

    if os.path.exists(log_path):
        transactions = pd.read_csv(log_path, parse_dates=["Date"])
        print(f"Loaded {os.path.basename(log_path)}")
        return transactions.set_index('Date')

    raise FileNotFoundError(
        f"Master transaction log not found at '{log_path}'. "
    )

**TODO:** Look into the numerical stability of the dividend aggregation.

In [3]:
# Load the transaction history (indexed by Date); the cells below read it as `master_log`.
master_log = load_master_transaction_log('data/master_transaction_log.csv')
# Bare last expression so the notebook renders the frame.
master_log

--- Loading Master Transaction Log ---
Loaded master_transaction_log.csv


Unnamed: 0_level_0,Type,Symbol,Quantity,Price,Amount,Commission,Currency,Description,Source
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2023-02-16,Net Deposit,,,,271.57,,USD,ID: 37fdafdc-d707-42fa-ba74-98b30cf9ab2a - DT2...,Brokerage Statement
2023-02-17,buy,VOO,0.218967,372.070000,-81.47,,USD,Trade Entry,Brokerage Statement
2023-02-17,buy,TSLA,0.202666,201.020000,-40.74,,USD,Trade Entry,Brokerage Statement
2023-02-17,buy,PERI,0.407270,33.340000,-13.58,,USD,Trade Entry,Brokerage Statement
2023-03-29,Net Dividend,VOO,,,0.24,,USD,"Cash DIV @ 1.4874, Pos QTY: 0.218966527, Rec D...",Brokerage Statement
...,...,...,...,...,...,...,...,...,...
2025-07-24,buy,BTC-USD,0.000162,120615.384615,-19.60,,USD,,Crypto CSV
2025-07-24,buy,BRK-B,0.124000,481.935484,-59.76,1.0,USD,Purchase of BRK-B,Interim CSV
2025-07-24,buy,VOOG,0.244500,408.997955,-100.00,1.0,USD,Purchase of VOOG,Interim CSV
2025-07-24,buy,APO,0.261400,153.022188,-40.00,1.0,USD,Purchase of APO,Interim CSV


### Caching

In [4]:
def yfinance_hist(
    ticker_symbol, start_date, end_date, last_market_day, cache_dir="data/yf_cache"
):
    """Return daily price history for ``ticker_symbol`` backed by a per-ticker CSV cache.

    The file ``<cache_dir>/<ticker_symbol>.csv`` is returned as-is when its latest
    row is on or after ``last_market_day``. Otherwise the history is downloaded
    again through yfinance, stripped of its timezone and written back to the cache.

    Args:
        ticker_symbol: Symbol passed to ``yf.Ticker`` (also the cache file stem).
        start_date: First day to download.
        end_date: Last day to download, inclusive. ``None`` is handed to yfinance
            unchanged.
        last_market_day: Freshness threshold compared against the newest cached date.
        cache_dir: Directory holding the CSV cache; created when missing.

    Returns:
        DataFrame indexed by tz-naive dates. When the download comes back empty,
        the stale cache is returned if one exists, otherwise the empty frame.

    Note:
        A cache hit returns the whole cached file regardless of ``start_date`` and
        ``end_date``.
        NOTE(review): ``history`` is called with yfinance's default price
        adjustment. Confirm that is intended, because the benchmark cell also
        adds dividends explicitly.
    """
    os.makedirs(cache_dir, exist_ok=True)
    cache_file = os.path.join(cache_dir, f"{ticker_symbol}.csv")

    cached_data = None
    if os.path.exists(cache_file):
        cached_data = pd.read_csv(cache_file, index_col="Date", parse_dates=True)
        if (
            not cached_data.empty
            and cached_data.index.max().normalize() >= last_market_day
        ):
            return cached_data

    # yfinance treats `end` as exclusive. Request one extra day so the bar for
    # end_date itself is downloaded; otherwise the cache can never reach a
    # last_market_day equal to end_date and is refetched on every call.
    fetch_end = end_date
    if end_date is not None:
        fetch_end = pd.Timestamp(end_date) + pd.Timedelta(days=1)

    ticker = yf.Ticker(ticker_symbol)
    hist = ticker.history(start=start_date, end=fetch_end)

    if not hist.empty:
        # Store tz-naive dates so the cache round-trips through CSV cleanly.
        hist.index = hist.index.tz_localize(None)
        hist.to_csv(cache_file)
        return hist

    # Refresh failed or returned nothing (e.g. delisted symbol, network issue):
    # stale prices are more useful to callers than an empty frame.
    if cached_data is not None and not cached_data.empty:
        return cached_data

    return hist


def ticker_info(
    ticker_symbol, cache_file="data/yf_cache/ticker_info_cache.json"
):
    """Return the yfinance ``info`` mapping for ``ticker_symbol``, cached in one JSON file.

    Args:
        ticker_symbol: Symbol passed to ``yf.Ticker``; also the key inside the cache.
        cache_file: JSON file mapping symbols to their ``info`` dictionaries.

    Returns:
        The cached entry when present; otherwise the freshly fetched ``info``,
        which is also written back to ``cache_file``.
    """
    if os.path.exists(cache_file):
        with open(cache_file, "r") as f:
            info_cache = json.load(f)
    else:
        info_cache = {}

    if ticker_symbol in info_cache:
        return info_cache[ticker_symbol]

    info = yf.Ticker(ticker_symbol).info
    info_cache[ticker_symbol] = info

    # dirname is "" for a bare file name, and os.makedirs("") raises, so only
    # create the parent directory when the path actually has one.
    cache_parent = os.path.dirname(cache_file)
    if cache_parent:
        os.makedirs(cache_parent, exist_ok=True)

    with open(cache_file, "w") as f:
        json.dump(info_cache, f, indent=4)

    return info

### Main - Variables

In [5]:
# Simulation window: first logged transaction through yesterday, one entry per calendar day.
start_date = master_log.index.min()
end_date = pd.Timestamp.today().normalize() - pd.Timedelta(days=1)
date_range = pd.Series(pd.date_range(start=start_date, end=end_date, freq='D'), name='Date')

benchmark_index = 'VOO'
TAX_RATE = 0.30  # fraction withheld from benchmark dividends

# Most recent trading sessions of the benchmark ticker, as tz-naive dates.
recent_sessions = (
    yf.Ticker(benchmark_index).history(period="5d").index.tz_localize(None).normalize()
)

# Cache freshness threshold for yfinance_hist. The 5-day window can include
# today's session, but downloads stop at end_date (yesterday). Using today's
# date would make every cached file look stale and trigger a re-download on
# each call, so the threshold is capped at the last session inside the window.
sessions_in_window = recent_sessions[recent_sessions <= end_date]
last_market_day = (
    sessions_in_window.max() if len(sessions_in_window) else recent_sessions.max()
)

### Benchmark

In [6]:
def buy_order(cash_to_invest, FLAT_FEE=1.0, RATE=0.0025):
    """Split a cash budget into the amount invested and the commission paid.

    The commission is the larger of ``FLAT_FEE`` and ``RATE`` times the
    invested amount, taken out of ``cash_to_invest``.

    Args:
        cash_to_invest: Total cash available, commission included.
        FLAT_FEE: Minimum commission per order.
        RATE: Proportional commission on the invested amount.

    Returns:
        ``(net_investment, commission)``; ``(0.0, 0.0)`` when the budget does
        not exceed ``FLAT_FEE``.
    """
    # A budget that cannot even cover the minimum fee buys nothing.
    if cash_to_invest <= FLAT_FEE:
        return 0.0, 0.0

    # Amount invested if the proportional fee applies: invested * (1 + RATE) == budget.
    invested_at_rate = cash_to_invest / (1 + RATE)
    if invested_at_rate >= FLAT_FEE / RATE:
        return invested_at_rate, cash_to_invest - invested_at_rate

    # Small orders pay the flat minimum instead.
    return cash_to_invest - FLAT_FEE, FLAT_FEE

def sell_order(cash_needed, FLAT_FEE=1.0, RATE=0.0025):
    """Compute the gross sale required to net ``cash_needed`` after commission.

    The commission is ``RATE`` times the gross sale for large orders and
    ``FLAT_FEE`` for small ones.

    Args:
        cash_needed: Cash that must remain after the commission is paid.
        FLAT_FEE: Minimum commission per order.
        RATE: Proportional commission on the gross sale.

    Returns:
        ``(gross_sale, commission)``; ``(0.0, 0.0)`` when no cash is needed.
    """
    if cash_needed <= 0:
        return 0.0, 0.0

    # Above this net amount the proportional fee applies instead of the flat fee.
    flat_fee_ceiling = (FLAT_FEE / RATE) - FLAT_FEE
    if cash_needed > flat_fee_ceiling:
        gross = cash_needed / (1 - RATE)
        return gross, gross - cash_needed

    return cash_needed + FLAT_FEE, FLAT_FEE

In [7]:
# Benchmark simulation: mirror every net deposit / withdrawal from the
# transaction log into a single position in `benchmark_index`, traded at the
# daily open and charged through buy_order / sell_order.
yfinance_history = yfinance_hist(benchmark_index, start_date, end_date, last_market_day)
yfinance_history['Market'] = 'Open'  # every row delivered by the price feed is a trading session

# Expand to one row per calendar day; days absent from the feed are closed-market days.
benchmark = yfinance_history.reindex(date_range)
fill_cols = [col for col in ['Open', 'High', 'Low', 'Close', 'Volume'] if col in benchmark.columns]
benchmark[fill_cols] = benchmark[fill_cols].ffill()  # carry the last quote across closed days
fill_values = {'Market': 'Closed', 'Dividends': 0, 'Stock Splits': 0, 'Capital Gains': 0}
fill_values = {k: v for k, v in fill_values.items() if k in benchmark.columns}
benchmark = benchmark.fillna(value=fill_values)

# Several deposits can share a date. Collapse them to one net amount per day so
# the index-aligned column assignment works on a unique index. min_count=1
# keeps a day whose amounts are all missing as NaN instead of a 0 deposit.
net_deposits = master_log.loc[master_log['Type'] == 'Net Deposit', 'Amount']
benchmark['Net Deposit'] = net_deposits.groupby(level=0).sum(min_count=1)

# Per-day simulation state.
benchmark['Shares'] = 0.0
benchmark['DividendCash'] = 0.0
benchmark['TradeCash'] = 0.0
benchmark['Commission'] = 0.0
benchmark['Portfolio Value'] = 0.0
benchmark['Total Value'] = 0.0
benchmark['Trade Trigger'] = 'None'
benchmark['NetDividend'] = 0.0

# The loop below starts at row 1, so a deposit on the very first row has to be
# seeded here. A first deposit on any later row is booked by the loop itself;
# seeding it as well would count it twice.
initial_deposit_index = benchmark['Net Deposit'].first_valid_index()
if len(benchmark) > 0 and initial_deposit_index == benchmark.index[0]:
    benchmark.loc[initial_deposit_index, 'TradeCash'] = benchmark.loc[initial_deposit_index, 'Net Deposit']
    benchmark.loc[initial_deposit_index, 'Trade Trigger'] = 'Buy'
    # No shares are held yet, so the account is worth exactly its cash.
    benchmark.loc[initial_deposit_index, 'Total Value'] = benchmark.loc[initial_deposit_index, 'TradeCash']

for i in range(1, len(benchmark)):
    today = benchmark.index[i]
    yesterday = benchmark.index[i-1]

    # Carry yesterday's position, cash balances and pending trigger forward.
    benchmark.loc[today, 'Shares'] = benchmark.loc[yesterday, 'Shares']
    benchmark.loc[today, 'DividendCash'] += benchmark.loc[yesterday, 'DividendCash']
    benchmark.loc[today, 'TradeCash'] += benchmark.loc[yesterday, 'TradeCash']
    benchmark.loc[today, 'Trade Trigger'] = benchmark.loc[yesterday, 'Trade Trigger']

    # Dividends are paid on the shares held the day before, net of TAX_RATE.
    if benchmark.loc[today, 'Dividends'] > 0 and benchmark.loc[yesterday, 'Shares'] > 0:
        net_dividend = benchmark.loc[today, 'Dividends'] * benchmark.loc[yesterday, 'Shares'] * (1 - TAX_RATE)
        benchmark.loc[today, "NetDividend"] = net_dividend
        benchmark.loc[today, 'DividendCash'] += net_dividend

    # Book today's deposit (+) or withdrawal (-) and remember which trade it calls for.
    if pd.notna(benchmark.loc[today, 'Net Deposit']):
        deposit_amount = benchmark.loc[today, 'Net Deposit']
        benchmark.loc[today, 'TradeCash'] += deposit_amount
        if deposit_amount > 0:
            benchmark.loc[today, 'Trade Trigger'] = 'Buy'
        elif deposit_amount < 0:
            benchmark.loc[today, 'Trade Trigger'] = 'Sell'

    # Pending trades execute at the open of the next trading session.
    if benchmark.loc[today, 'Market'] == 'Open' and benchmark.loc[today, 'Trade Trigger'] != 'None':
        trigger_type = benchmark.loc[today, 'Trade Trigger']
        open_price = benchmark.loc[today, 'Open']

        if trigger_type == 'Buy':
            # Reinvest accumulated dividends together with the deposited cash.
            cash_to_invest = benchmark.loc[today, 'TradeCash'] + benchmark.loc[today, 'DividendCash']
            if cash_to_invest > 1.0:
                net_investment, commission = buy_order(cash_to_invest)
                shares_bought = net_investment / open_price
                benchmark.loc[today, 'Shares'] += shares_bought
                benchmark.loc[today, 'Commission'] = commission
                benchmark.loc[today, 'TradeCash'] = 0.0
                benchmark.loc[today, 'DividendCash'] = 0.0

            benchmark.loc[today, 'Trade Trigger'] = 'None'

        elif trigger_type == 'Sell':
            # The withdrawal was already booked into TradeCash, so the amount
            # still to fund is its negative balance. Today's 'Net Deposit' is
            # NaN when the withdrawal was logged on a closed-market day and the
            # sale executes on a later session, so it cannot be used here.
            cash_needed = max(0.0, -benchmark.loc[today, 'TradeCash'])

            # Spend dividend cash first, and credit it to TradeCash so the
            # transfer offsets the withdrawal instead of vanishing.
            cash_from_dividends = min(cash_needed, benchmark.loc[today, 'DividendCash'])
            benchmark.loc[today, 'DividendCash'] -= cash_from_dividends
            benchmark.loc[today, 'TradeCash'] += cash_from_dividends

            cash_needed_from_sale = cash_needed - cash_from_dividends

            if cash_needed_from_sale > 0:
                gross_sale, commission = sell_order(cash_needed_from_sale)
                shares_to_sell = gross_sale / open_price
                # Never sell more than is held.
                shares_sold = min(shares_to_sell, benchmark.loc[today, 'Shares'])
                cash_raised = (shares_sold * open_price) - commission

                benchmark.loc[today, 'TradeCash'] += cash_raised
                benchmark.loc[today, 'Shares'] -= shares_sold
                benchmark.loc[today, 'Commission'] = commission

            benchmark.loc[today, 'Trade Trigger'] = 'None'

    # Mark to market at the close.
    total_cash = benchmark.loc[today, 'TradeCash'] + benchmark.loc[today, 'DividendCash']
    benchmark.loc[today, 'Portfolio Value'] = benchmark.loc[today, 'Shares'] * benchmark.loc[today, 'Close']
    benchmark.loc[today, 'Total Value'] = benchmark.loc[today, 'Portfolio Value'] + total_cash

### Portfolio Analysis

In [8]:
def cumulative_split_factors(split_series: pd.Series) -> pd.Series:
    """
    Compute cumulative split factors for retroactive holding adjustment.
    A split on day t affects only days < t.

    Args:
        split_series: Per-day split ratios in chronological order. ``0`` and
            missing values both mean "no split on that day".

    Returns:
        Series on the same index whose value on day t is the product of all
        split ratios on days after t (``1.0`` on the last day).
    """
    # Missing entries are "no split" (factor 1). Forward-filling the reversed
    # series would instead copy a later split into the gap and count it twice.
    factors = split_series.fillna(1).replace(0, 1)
    # Reverse cumulative product covers day t and later; shifting by one row
    # drops day t's own split so it only applies to earlier days.
    cumulative = factors[::-1].cumprod()[::-1].shift(-1)
    return cumulative.fillna(1.0)

In [9]:
# Every symbol that appears in the transaction log, in sorted order.
symbols = np.sort(master_log['Symbol'].dropna().unique())

# One date x symbol frame per quantity tracked; all start at zero.
holdings = {}

for name in ["trade", "holding", "stock split", "adj holding", "price", "value"]:
    holdings[name] = pd.DataFrame(
        0.0, index=date_range, columns=symbols, dtype="float64"
    )

# Net quantity traded per day and symbol.
share_trades = master_log[master_log['Type'].isin(['buy', 'sell', 'Merger/Acquisition'])]
holdings['trade'].update(
    share_trades.groupby(['Date', 'Symbol'])['Quantity'].sum().unstack(fill_value=0)
)

# Split ratio per day and symbol as reported by the price feed (0 = no split).
raw_splits = pd.DataFrame(index=date_range, columns=symbols)
for symbol in symbols:
    hist = yfinance_hist(symbol, start_date, end_date, last_market_day)
    raw_splits[symbol] = hist["Stock Splits"]

raw_splits = raw_splits.fillna(0.0)

# Roll the position forward one day at a time for all symbols at once:
#     holding[t] = holding[t-1] * split_ratio[t] + trade[t]
# The arithmetic matches a per-symbol scalar loop but needs one vectorised
# row update per day instead of a pandas scalar lookup per (day, symbol).
split_ratio = raw_splits.to_numpy(dtype="float64")
split_ratio = np.where(split_ratio == 0, 1.0, split_ratio)
trade_qty = holdings["trade"].to_numpy(dtype="float64")

# Share counts below this magnitude are floating-point residue of a fully
# closed position (the saved output shows e.g. 6.9e-18) and are snapped to 0.
HOLDING_RESIDUE_TOL = 1e-12

held = np.zeros_like(trade_qty)
carried = np.zeros(trade_qty.shape[1])
for day in range(trade_qty.shape[0]):
    carried = carried * split_ratio[day] + trade_qty[day]
    carried[np.abs(carried) < HOLDING_RESIDUE_TOL] = 0.0
    held[day] = carried

holdings["holding"] = pd.DataFrame(
    held, index=holdings["trade"].index, columns=holdings["trade"].columns
)

In [10]:
# Symbol -> display name table. `.get` keeps a ticker without a 'longName'
# entry from aborting the whole cell, matching the lookups inside the loop below.
symbols_df = pd.DataFrame({
    'Symbol': symbols,
    'Name': [ticker_info(name).get('longName') for name in symbols]
})

holdings["Ticker Info"] = {}

calendar = holdings["price"].index

for symbol in symbols:
    try:
        # The benchmark ticker reuses the frame that is already loaded and filled.
        if symbol == benchmark_index:
            hist = benchmark
        else:
            hist = yfinance_hist(symbol, start_date, end_date, last_market_day)

        # Align quotes to the calendar and carry the last close across gaps.
        holdings["price"][symbol] = hist["Close"].reindex(calendar).ffill()

        # Turn per-day split ratios into "all later splits" factors so that
        # holdings are expressed in the share units the price series uses.
        daily_splits = hist["Stock Splits"].reindex(calendar).fillna(0)
        holdings["stock split"][symbol] = cumulative_split_factors(daily_splits)
        holdings["adj holding"][symbol] = holdings["holding"][symbol] * holdings["stock split"][symbol]

        info = ticker_info(symbol)
        holdings["Ticker Info"][symbol] = {}
        holdings["Ticker Info"][symbol]["Industry"] = info.get("industryDisp", "N/A")
        holdings["Ticker Info"][symbol]["Sector"] = info.get("sectorDisp", "N/A")

        symbols_df.loc[symbols_df["Symbol"] == symbol, "yf_symbol"] = info.get("symbol")
        symbols_df.loc[symbols_df["Symbol"] == symbol, "yf_name"] = info.get("longName")

    except Exception as e:
        # Best effort: report the symbol and continue with the remaining ones.
        print(f"Error fetching data for {symbol}: {e}")

# Market value per symbol and in total for every calendar day.
holdings["value"] = holdings["adj holding"] * holdings["price"]
holdings["Portfolio Value"] = pd.DataFrame(index=date_range)
holdings["Portfolio Value"]["Value"] = holdings["value"].sum(axis=1)

In [11]:
# Split-adjusted share count per symbol on the last day of the date range.
# NOTE(review): values around 1e-18 in the saved output are presumably
# floating-point residue of fully closed positions — confirm.
holdings['adj holding'].iloc[-1]

ADBE    6.938894e-18
AMZN    1.499798e-01
APO     5.347289e-01
ARKK    0.000000e+00
ARM     8.082593e-02
            ...     
VOOG    7.229965e-01
VXUS    7.168100e-01
XLF     1.309464e+00
XLK     1.223873e+00
XLV     2.891891e-01
Name: 2025-08-01 00:00:00, Length: 62, dtype: float64

### Visualization

In [12]:
# Month-end totals that feed the two charts below.
is_deposit = master_log['Type'] == "Net Deposit"
is_income = master_log['Type'].isin(['Net Dividend', 'Credit/Margin Interest'])

monthly_deposits = master_log.loc[is_deposit, 'Amount'].resample("ME").sum()
monthly_income = master_log.loc[is_income, 'Amount'].resample("ME").sum()
monthly_benchmark_income = benchmark["NetDividend"].resample("ME").sum()

In [13]:
# Portfolio vs. benchmark value on the left axis, monthly cash flows on the right.
fig = make_subplots(specs=[[{"secondary_y": True}]])

portfolio_value = holdings["Portfolio Value"]

# (x, y, legend label, line style) for each curve on the primary axis, in draw order.
value_lines = [
    (
        portfolio_value.index,
        portfolio_value["Value"],
        "Personal Portfolio",
        dict(color="green", width=2),
    ),
    (
        benchmark.index,
        benchmark["Portfolio Value"],
        "S&P500 Benchmark",
        dict(color="red", width=2),
    ),
    (
        benchmark.index,
        benchmark["Net Deposit"].fillna(0).cumsum(),
        "Cumulative Net Deposits",
        dict(color="darkgrey", width=1),
    ),
]

for x_values, y_values, label, line_style in value_lines:
    fig.add_trace(
        go.Scatter(x=x_values, y=y_values, mode="lines", name=label, line=line_style),
        secondary_y=False,
    )

# Monthly deposits / withdrawals as translucent bars on the secondary axis.
deposit_bars = go.Bar(
    x=monthly_deposits.index,
    y=monthly_deposits,
    name="Deposits / Withdrawals",
    marker_color="royalblue",
    opacity=0.3,
)
fig.add_trace(deposit_bars, secondary_y=True)

# Quick-zoom buttons shown above the plot.
range_buttons = [
    dict(count=1, label="1m", step="month", stepmode="backward"),
    dict(count=6, label="6m", step="month", stepmode="backward"),
    dict(count=1, label="YTD", step="year", stepmode="todate"),
    dict(count=1, label="1y", step="year", stepmode="backward"),
    dict(step="all"),
]

fig.update_layout(
    title_text="<b>Portfolio Performance vs. S&P500 Benchmark</b>",
    template="plotly_white",
    barmode="relative",
    legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
    xaxis=dict(
        title="Date",
        rangeselector=dict(buttons=range_buttons),
        rangeslider=dict(visible=True),
        type="date",
    ),
)

fig.update_yaxes(title_text="<b>Portfolio Value ($)</b>", secondary_y=False)
fig.update_yaxes(
    title_text="<b>Monthly Deposit / Withdrawals ($)</b>",
    secondary_y=True,
    showgrid=False,
    layer="below traces",
)

fig.show()

In [14]:
# Grouped bars: monthly income of the real portfolio next to the benchmark's net dividends.
fig_income_comp = go.Figure()

# (monthly series, legend label, bar colour) in draw order.
income_bars = [
    (monthly_income, "My Portfolio Income", "mediumseagreen"),
    (monthly_benchmark_income, "VOO Benchmark Income", "grey"),
]

for monthly_series, label, bar_color in income_bars:
    fig_income_comp.add_trace(
        go.Bar(
            x=monthly_series.index,
            y=monthly_series,
            name=label,
            marker_color=bar_color,
        )
    )

fig_income_comp.update_layout(
    title_text="<b>Monthly Income Comparison: My Portfolio vs. VOO</b>",
    template="plotly_white",
    barmode="group",
    xaxis_title="Date",
    yaxis_title="Net Income ($)",
    legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
)

fig_income_comp.show()

In [15]:
# Industry / sector lookup as a table: one column per symbol, rows "Industry" and "Sector".
pd.DataFrame(holdings["Ticker Info"])

Unnamed: 0,ADBE,AMZN,APO,ARKK,ARM,BA,BKNG,BRK-B,BRW,BTC-USD,...,UAE,USRT,VBK,VHT,VOO,VOOG,VXUS,XLF,XLK,XLV
Industry,Software - Application,Internet Retail,Asset Management,,Semiconductors,Aerospace & Defense,Travel Services,Insurance - Diversified,Asset Management,,...,,,,,,,,,,
Sector,Technology,Consumer Cyclical,Financial Services,,Technology,Industrials,Consumer Cyclical,Financial Services,Financial Services,,...,,,,,,,,,,
