# Uncorrelated Basket Finder (SQLite + Jupyter) v2
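This notebook calls the `uncorr_basket` package modules to build a SQLite database, ingest prices and metadata, compute return correlations, and search for a low-correlation basket. With the default configuration below it runs offline, so price data must already be available in the local cache or database.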


In [1]:
import sys, os
sys.path.append('.')

from uncorr_basket.config import Config
from uncorr_basket.pipeline import run_pipeline
from uncorr_basket.logging_utils import log
import pandas as pd


In [2]:
# --- Configuration ---
# All tunable settings live here; keyword arguments are grouped by purpose.
# NOTE(review): with offline_mode=True and allow_network=False the pipeline
# only has local cache/DB data to work with -- confirm the cache is populated
# before running the pipeline cell.
cfg = Config(
    # Data source / storage
    db_path='assets.sqlite',
    offline_mode=True,
    allow_network=False,
    # Return history window
    lookback_days=252,
    # Basket search
    basket_size=30,
    candidate_pool_size=120,  # a larger pool widens the search (more symbols to fetch when network is allowed)
    n_seeds=25,
    n_iterations=2500,
    random_seed=42,
    # Soft liquidity / size floors (USD)
    soft_min_market_cap_usd=100e6,
    soft_min_dollar_volume_usd=1e6,
    # Penalty weights
    lambda_market_cap=0.10,
    lambda_dollar_vol=0.10,
    lambda_sector_concentration=0.05,
)
# Keep whatever ensure_paths() returns as the working config object.
cfg = cfg.ensure_paths()
cfg


Config(db_path='assets.sqlite', data_dir='data_cache', cache_dir='data_cache', offline_mode=True, allow_network=False, include_etfs=True, include_equities=True, exclude_leveraged_inverse_etfs=True, soft_min_market_cap_usd=100000000.0, soft_min_dollar_volume_usd=1000000.0, lookback_days=252, min_coverage=0.95, return_type='log_adj', corr_method='pearson', freshness_days=5, basket_size=30, candidate_pool_size=120, n_seeds=25, n_iterations=2500, top_percentile=0.01, lambda_market_cap=0.1, lambda_dollar_vol=0.1, lambda_sector_concentration=0.05, yf_batch_size=25, yf_batch_sleep_s=5.0, yf_max_retries=6, yf_min_delay_s=0.3, random_seed=42)

In [3]:
# --- Optional: Provide your own symbols list ---
# The list below is passed to run_pipeline(seed_symbols=...).
# To use the pipeline's default starter list instead, replace the whole
# assignment with `seed_symbols = None`.
seed_symbols = [
    # US - Tech
    "AAPL","MSFT","GOOGL","AMZN","META","NVDA","TSLA","ADBE",

    # US - Semis / Hardware
    "AVGO","AMD","QCOM","TXN","AMAT",

    # US - Financials
    "BRK-B","JPM","BAC","WFC","GS","AXP","BLK",

    # US - Healthcare
    "JNJ","LLY","UNH","ABBV","MRK","PFE","TMO","ISRG",

    # US - Consumer Staples
    "PG","KO","PEP","COST","WMT","PM",

    # US - Consumer Discretionary / Media
    "HD","MCD","DIS","NFLX","NKE",

    # US - Industrials
    "CAT","HON","UNP","LMT","RTX","FDX",

    # US - Energy
    "XOM","CVX","COP","SLB","EOG",

    # US - Utilities / REIT
    "NEE","AMT",

    # Canada
    "SHOP","TD","RY","ENB","SU","CNI",

    # UK
    "SHEL","BP","UL","AZN",

    # Expanded US Tech / Software
    "ORCL","IBM","CRM","INTC","CSCO",

    # Expanded Financials / Exchanges
    "C","MS","SCHW","CME","ICE",

    # Expanded Healthcare
    "GILD","AMGN","BMY","MDT",

    # Consumer / Retail / Travel
    "SBUX","LOW","TGT","BKNG",

    # Industrials / Capital Goods
    "BA","GE","DE","MMM",

    # Energy Refiners
    "PSX","MPC",

    # Materials / Chemicals
    "LIN","APD","DOW",

    # REITs
    "PLD","SPG",

    # Autos
    "GM","F",

    # Telecom
    "VZ","T",

    # Payments
    "MA","V",

    # Defense / Aerospace
    "NOC",

    # FinTech / Software
    "INTU",

    # E-commerce
    "EBAY"
]

# Catch accidental duplicates introduced when the list is edited by hand.
assert len(seed_symbols) == len(set(seed_symbols)), "duplicate ticker in seed_symbols"


In [4]:
# Run the pipeline with the configuration and seed symbols defined in the
# earlier cells. Every later cell reads from `result`, so this cell must
# succeed before they can run.
# NOTE(review): the saved output shows this call raising RuntimeError when
# running offline with no cached prices -- see the tip in that error message.
result = run_pipeline(cfg, seed_symbols=seed_symbols)
# Log the identifiers needed to locate this run (DB path, snapshot id, run id).
log(f"DB: {result['db_path']} | snapshot_id={result['snapshot_id']} | run_id={result['run_id']}")
log(f"Best score: {result['best_score']:.6f}")
# Last expression: display the symbols of the best basket found.
result['best_symbols']


[2025-12-30 09:43:57] Connecting DB...
[2025-12-30 09:43:57] Offline metadata: 0 / 100 symbols had cached meta.
[2025-12-30 09:43:57] Universe size after ranking/filtering: 100
[2025-12-30 09:43:57] Prices fresh in DB: 0 | need update/import: 100
[2025-12-30 09:43:57] Offline mode: importing from local cache CSVs only for missing/stale symbols...
[2025-12-30 09:43:57] These symbols will be dropped from the usable universe unless DB already contains enough history.
[2025-12-30 09:43:57] Loading prices from DB for returns alignment...
[2025-12-30 09:43:57] Aligned returns shape: (0, 0) (symbols: 0)


RuntimeError: Not enough symbols with sufficient data to build basket of 30. Tip: add cached CSVs under data_cache/prices/ or set allow_network=True.

In [None]:
# Basket summary: mean abs correlation contribution by asset.
# Show as many rows as the configured basket size rather than a hard-coded 30,
# so the table is not truncated if cfg.basket_size is raised.
result['summary_df'].head(cfg.basket_size)


In [None]:
# Top candidates leaderboard (best 1% cutoff depends on n_seeds).
# `pd` comes from the imports cell at the top of the notebook.
# Passing the column list explicitly keeps the frame well-formed (and
# sortable by 'score') even when the pipeline returns no candidates.
leaderboard_columns = ['basket_id', 'score', 'base_corr', 'cap_pen', 'dv_pen', 'sector_pen']
cand = pd.DataFrame(
    [
        {
            'basket_id': c['basket_id'],
            'score': c['score'],
            # Penalty/diagnostic terms are optional entries in c['aux'];
            # .get() leaves a missing value when one is absent.
            'base_corr': c['aux'].get('base_corr'),
            'cap_pen': c['aux'].get('cap_pen'),
            'dv_pen': c['aux'].get('dv_pen'),
            'sector_pen': c['aux'].get('sector_pen'),
        }
        for c in result['candidates']
    ],
    columns=leaderboard_columns,
).sort_values('score')  # ascending; presumably lower score = better basket -- confirm
cand.head(10)


In [None]:
# Display the best basket correlation heatmap (matplotlib)
import matplotlib.pyplot as plt
import numpy as np

best = result['best_symbols']
# Restrict the correlation matrix to the best basket, in basket order.
sub = result['corr'].loc[best, best]

fig, ax = plt.subplots(figsize=(10, 8))
# Pin the colour scale to the full correlation range [-1, 1] and use a
# diverging colormap so that zero sits at the neutral midpoint and colours
# mean the same thing from one run to the next.
im = ax.imshow(sub.values, vmin=-1.0, vmax=1.0, cmap='coolwarm')
cbar = fig.colorbar(im, ax=ax)
cbar.set_label('Correlation')
ax.set_title('Correlation (best basket)')

# One tick per symbol on both axes.
ticks = np.arange(len(best))
ax.set_xticks(ticks)
ax.set_xticklabels(best, rotation=90, fontsize=7)
ax.set_yticks(ticks)
ax.set_yticklabels(best, fontsize=7)

fig.tight_layout()
plt.show()
