In [1]:
import numpy as np
import pandas as pd
import yfinance as yf
from typing import Tuple, Optional

class MarketQueryEncoder:
    """
    Phase I: Context Encoder.
    Responsible for fetching, normalizing, and embedding the recent market window.

    Typical flow: ``fetch_data()`` (or assign an OHLCV frame with at least
    'Close' and 'Volume' columns to ``self.data``), then ``get_query_vector()``.
    """

    # Engineered feature columns, in the column order of the returned query matrix.
    FEATURE_COLUMNS = ("log_ret", "volatility", "rel_vol", "rsi")

    def __init__(self, ticker: str = "^GSPC", lookback_window: int = 60):
        """
        Args:
            ticker (str): The asset symbol (default S&P 500).
            lookback_window (int): Size of the window X_recent (e.g., 60 days).

        Raises:
            ValueError: If lookback_window is not a positive integer.
        """
        # A zero window would make `iloc[-0:]` return the whole frame, so
        # reject non-positive sizes up front instead of producing garbage.
        if lookback_window < 1:
            raise ValueError("lookback_window must be a positive integer.")
        self.ticker = ticker
        self.lookback_window = lookback_window
        self.data = None

    def fetch_data(self, period: str = "2y") -> pd.DataFrame:
        """
        Fetches raw OHLCV data.
        In production, this would connect to a Bloomberg/reuters feed.

        Args:
            period (str): History length passed through to ``yf.download``.

        Returns:
            pd.DataFrame: The downloaded frame (also stored on ``self.data``).

        Raises:
            ValueError: If the download returned no rows.
        """
        print(f"--- Fetching data for {self.ticker} ---")
        # auto_adjust is passed explicitly: recent yfinance releases changed
        # its default to True and warn whenever it is left implicit.
        df = yf.download(
            self.ticker, period=period, progress=False, auto_adjust=True
        )

        # Fail at the source rather than later with a misleading
        # "Insufficient data" error from get_query_vector().
        if df is None or df.empty:
            raise ValueError(f"No data returned for ticker {self.ticker!r}.")

        # Ensure flat column index if MultiIndex is returned
        if isinstance(df.columns, pd.MultiIndex):
            df.columns = df.columns.get_level_values(0)

        self.data = df
        return df

    def _compute_rsi(self, series: pd.Series, window: int = 14) -> pd.Series:
        """
        Helper: Calculates Relative Strength Index (Momentum).

        Uses simple rolling means of gains and losses over ``window`` rows.
        A window with losses but no gains yields 0, one with gains but no
        losses yields 100, and a completely flat window yields a neutral 50.
        """
        delta = series.diff()
        gain = delta.where(delta > 0, 0).rolling(window=window).mean()
        loss = (-delta.where(delta < 0, 0)).rolling(window=window).mean()
        rs = gain / loss
        rsi = 100 - (100 / (1 + rs))
        # 0/0 (no movement at all) would be NaN and the row would then be
        # silently dropped by preprocess_features(); report neutral instead.
        flat = (gain == 0) & (loss == 0)
        return rsi.mask(flat, 50.0)

    def preprocess_features(self) -> pd.DataFrame:
        """
        Applies Log-Returns and Technical Features.

        Returns:
            pd.DataFrame: Copy of ``self.data`` with 'log_ret', 'volatility',
            'rel_vol' and 'rsi' columns added and every row containing a NaN
            (e.g. rolling-window warm-up rows) removed.

        Raises:
            ValueError: If no data has been loaded yet.
        """
        if self.data is None:
            raise ValueError("Data not loaded. Call fetch_data() first.")

        df = self.data.copy()
        close = df['Close']
        volume = df['Volume']

        # 1. Log Returns (Stationarity)
        df['log_ret'] = np.log(close / close.shift(1))

        # 2. Realized Volatility (20-day rolling std dev)
        df['volatility'] = df['log_ret'].rolling(window=20).std()

        # 3. Relative Volume (Volume / 20-day SMA Volume)
        df['rel_vol'] = volume / volume.rolling(window=20).mean()

        # 4. RSI (Momentum)
        df['rsi'] = self._compute_rsi(close)

        # Drop NaN values generated by rolling windows
        return df.dropna()

    def get_query_vector(self, current_idx: int = -1) -> Tuple[np.ndarray, pd.DataFrame]:
        """
        Extracts the window of ``lookback_window`` rows ending at (and
        including) current_idx, normalizes it (Z-Score), and returns the
        matrix ready for embedding/retrieval.

        Args:
            current_idx (int): Row position in the *preprocessed* frame (the
                               output of preprocess_features()) to treat as
                               "Today". Negative values count from the end.
                               Defaults to -1 (most recent).

        Returns:
            normalized_window (np.ndarray): The processed X_recent matrix of
                shape (lookback_window, 4); columns follow FEATURE_COLUMNS.
            raw_window (pd.DataFrame): The un-normalized rows of the same
                window, for reference/plotting.

        Raises:
            ValueError: If data is not loaded, current_idx is out of range, or
                fewer than lookback_window rows are available up to it.
        """
        df = self.preprocess_features()
        n_rows = len(df)
        lookback = self.lookback_window

        if n_rows < lookback:
            raise ValueError("Insufficient data for the requested lookback window.")

        # Resolve negative positions the way Python indexing does, so that
        # -1 and len(df) - 1 select exactly the same window.
        end_pos = current_idx + n_rows if current_idx < 0 else current_idx
        if not 0 <= end_pos < n_rows:
            raise ValueError(
                f"current_idx {current_idx} is out of range for {n_rows} processed rows."
            )

        # Inclusive end: the "Today" row is always the last row of the window.
        start_pos = end_pos - lookback + 1
        if start_pos < 0:
            raise ValueError("Insufficient data for the requested lookback window.")
        window = df.iloc[start_pos:end_pos + 1]

        # --- CRITICAL: Z-Score Normalization (Regime Agnostic) ---
        # We normalize specific features relative to THIS window's statistics.
        # This allows us to match the 'shape' of a crash in 2020 with a crash in 2008,
        # even if the absolute volatility levels were different.
        features = window[list(self.FEATURE_COLUMNS)]
        mu = features.mean()
        sigma = features.std()
        # Avoid division by zero; this also covers a NaN sigma, which the
        # sample std produces for a single-row window.
        sigma = sigma.where(sigma > 0, 1e-6)

        final_vector_matrix = ((features - mu) / sigma).to_numpy()

        return final_vector_matrix, window

# --- Execution Example ---
if __name__ == "__main__":
    # Build an S&P 500 encoder with a 30-row lookback and pull its history.
    encoder = MarketQueryEncoder(lookback_window=30, ticker="^GSPC")
    encoder.fetch_data()

    # Derive X_recent: the z-scored feature matrix plus the un-normalized
    # rows of the same window.
    query_matrix, query_df = encoder.get_query_vector()

    print("Shape of Query Matrix:", query_matrix.shape)
    print("Sample Normalized Data (Last 5 days):\n", query_matrix[-5:])

    # Interpretation banner, emitted one line per print call.
    for message in (
        "\n[Interpretation]:",
        f"We have extracted a ({query_matrix.shape[0]} x {query_matrix.shape[1]}) matrix.",
        "This matrix represents the 'Micro-Regime' of the last 30 days.",
        "The values are Z-Scores. A value of +2.0 in 'log_ret' means a 2-sigma up-move relative to recent history.",
    ):
        print(message)

--- Fetching data for ^GSPC ---


  df = yf.download(self.ticker, period=period, progress=False)


Shape of Query Matrix: (30, 4)
Sample Normalized Data (Last 5 days):
 [[-0.7098273   0.97688733 -0.77632639 -0.29911914]
 [ 0.21443749  0.98974582 -0.62485958 -0.28456863]
 [ 0.27625537  0.45298574 -0.31135224 -0.19569334]
 [ 0.05139549  0.41474265 -0.01677041  0.56568379]
 [ 0.15281972 -0.17449446  0.18484623  0.65770586]]

[Interpretation]:
We have extracted a (30 x 4) matrix.
This matrix represents the 'Micro-Regime' of the last 30 days.
The values are Z-Scores. A value of +2.0 in 'log_ret' means a 2-sigma up-move relative to recent history.
