In [141]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import pandas as pd
import numpy as np
from numpy.linalg import LinAlgError
from sklearn.preprocessing import StandardScaler
from typing import Dict, Optional, Tuple, List
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.svm import SVC, SVR
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, auc, root_mean_squared_error, mean_squared_error, mean_absolute_error
from xgboost import XGBRegressor
from sklearn.linear_model import LogisticRegression
import torch
import torch.nn as nn
from statsmodels.tsa.regime_switching.markov_regression import MarkovRegression
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from datetime import datetime, timedelta
from tqdm import tqdm
!pip install nbformat 
import warnings
warnings.filterwarnings('ignore')




[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [88]:
class BitcoinDataProcessor:
    def __init__(self, n_jobs: int = 5):
        self.chainlet_scaler = StandardScaler()
        self.price_scaler = StandardScaler()
        self.n_jobs = n_jobs
   
    def load_price_data(self, price_path: str) -> pd.DataFrame:
        """Load price data from text file"""
        try:
            with open(price_path, 'r') as file:
                lines = file.readlines()
                
                headers = lines[0].strip().split(',')
                data = []
                for line in lines[1:]:
                    if line.strip():  # Skip empty lines
                        values = line.strip().split(',')
                        data.append(values)
                
            df = pd.DataFrame(data, columns=headers)
            
            df['date'] = pd.to_datetime(df['date'])
            df['price'] = df['price'].astype(float)
            df['totaltx'] = df['totaltx'].astype(float)
            
            df.set_index('date', inplace=True)
            df = df[['price', 'totaltx']]
            
            print(f"Loaded price data: {df.shape[0]} days")
            return df
            
        except Exception as e:
            raise ValueError(f"Error loading price data: {str(e)}")
       
    def process_chainlet_data(self, chainlet_path: str, binary_mode: bool = False) -> pd.DataFrame:
        """
        Process chainlet data from tab-delimited text file
        
        Args:
            chainlet_path: Path to chainlet data file
            binary_mode: If True, convert chainlet values to binary (0/1), otherwise keep counts
        """
        try:
            with open(chainlet_path, 'r') as file:
                lines = file.readlines()
                headers = lines[0].strip().split('\t')
                data = []
                for line in lines[1:]:
                    if line.strip():
                        values = line.strip().split('\t')
                        data.append(values)
                
            df = pd.DataFrame(data, columns=headers)
            numeric_cols = df.columns[df.columns != 'date']
            df[numeric_cols] = df[numeric_cols].astype(float)
            
            df['date'] = pd.to_datetime(
                df.apply(lambda row: f"{int(row['year'])}-{int(row['day']):03d}", axis=1),
                format='%Y-%j'
            )
            df.set_index('date', inplace=True)
            
            chainlet_cols = [col for col in df.columns if ':' in col]
            chainlet_features = df[chainlet_cols].copy()
            
            new_cols = {col: f'C_{col.replace(":", "_")}' for col in chainlet_cols}
            chainlet_features.rename(columns=new_cols, inplace=True)
            
            if binary_mode:
                chainlet_features = (chainlet_features > 0).astype(int)
            
            print(f"Loaded chainlet data: {chainlet_features.shape[0]} days, {len(chainlet_cols)} chainlet features")
            print(f"Mode: {'Binary (0/1)' if binary_mode else 'Counts'}")
            return chainlet_features
            
        except Exception as e:
            raise ValueError(f"Error processing chainlet data: {str(e)}")

    def create_features(self, price_data: pd.DataFrame, chainlet_data: pd.DataFrame,
                       use_lagged_chainlets: bool = True, lookback: int = 3,
                       horizon: int = 1) -> pd.DataFrame:
        data = pd.merge(price_data, chainlet_data, left_index=True, right_index=True, how='inner')
        
        # Price returns for features
        price_log = np.log(data['price'])
        price_returns = price_log.diff()
        
        # Create lagged returns
        for i in range(1, lookback + 1):
            data[f'return_lag_{i}'] = price_returns.shift(i)
        
        # Scale chainlet features
        chainlet_cols = [col for col in data.columns if col.startswith('C_')]
        if chainlet_cols:
            data[chainlet_cols] = self.chainlet_scaler.fit_transform(data[chainlet_cols])
        
        if use_lagged_chainlets:
            for col in chainlet_cols:
                data[f"{col}_lag1"] = data[col].shift(1)
            data = data.drop(columns=chainlet_cols)
            data = data.rename(columns={f"{col}_lag1": col for col in chainlet_cols})
        
        # Target: return from t+h-1 to t+h
        next_price = price_log.shift(-horizon)
        prior_price = price_log.shift(-(horizon-1))
        future_return = next_price - prior_price
        data['target'] = (future_return > 0).astype(int)
        
        return data.dropna()
   
    def align_and_prepare_data(self,
                            price_data: pd.DataFrame,
                            chainlet_data: pd.DataFrame,
                            use_lagged_chainlets: bool = True,
                            horizon: int = 1) -> pd.DataFrame:
        try:
            print("\nAligning data...")
            print(f"Price data dates: {price_data.index.min()} to {price_data.index.max()}")
            print(f"Chainlet data dates: {chainlet_data.index.min()} to {chainlet_data.index.max()}")
            data = self.create_features(
                price_data, 
                chainlet_data, 
                use_lagged_chainlets,
                lookback=3,
                horizon=horizon
            )
            
            print(f"\nFinal dataset shape: {data.shape}")
            print(f"Date range: {data.index.min()} to {data.index.max()}")
            
            return data
            
        except Exception as e:
            raise ValueError(f"Error aligning data: {str(e)}")

In [89]:
class Visualizer:
    def plot_performance(self, results: pd.DataFrame) -> go.Figure:
        fig = go.Figure()
        
        fig.add_trace(
            go.Scatter(
                x=results['date'],
                y=(results['strategy_return'] - 1) * 100,
                name='Strategy',
                line=dict(color='blue')
            )
        )
        
        fig.add_trace(
            go.Scatter(
                x=results['date'],
                y=(results['buy_hold_return'] - 1) * 100,
                name='Buy & Hold',
                line=dict(color='gray', dash='dash')
            )
        )
        
        fig.update_layout(
            title='Strategy vs Buy & Hold Returns (%)',
            xaxis_title='Date',
            yaxis_title='Return (%)',
            template='plotly_white',
            hovermode='x unified'
        )
        
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        fig.write_html(f'strategy_performance_{timestamp}.html')
        
        return fig

    def plot_confusion_matrix(self, y_true, y_pred):
        cm = confusion_matrix(y_true, y_pred)
        
        # Create text annotations for the cells
        annotations = []
        for i in range(len(cm)):
            for j in range(len(cm[i])):
                annotations.append(
                    dict(
                        x=j,
                        y=i,
                        text=str(cm[i, j]),
                        showarrow=False,
                        font=dict(
                            color='white' if cm[i, j] > cm.mean() else 'black',
                            size=16
                        ),
                    )
                )
        
        fig = go.Figure(data=go.Heatmap(
            z=cm,
            x=['Predicted 0', 'Predicted 1'],
            y=['Actual 0', 'Actual 1'],
            colorscale='Blues'
        ))
        
        fig.update_layout(
            title='Confusion Matrix',
            xaxis_title='Predicted',
            yaxis_title='Actual',
            xaxis_side='top',
            annotations=annotations
        )
        return fig

    def plot_roc_curve(self, y_true, y_score):
        fpr, tpr, _ = roc_curve(y_true, y_score)
        roc_auc = auc(fpr, tpr)

        fig = go.Figure()
        fig.add_trace(go.Scatter(
            x=fpr, y=tpr,
            name='ROC Curve (AUC = %0.2f)' % roc_auc,
            mode='lines',
            line=dict(color='blue')
        ))
        fig.add_trace(go.Scatter(
            x=[0, 1], y=[0, 1],
            name='Random Classifier',
            mode='lines',
            line=dict(color='red', dash='dash')
        ))
        fig.update_layout(
            title='ROC Curve',
            xaxis_title='False Positive Rate (1 - Specificity)',
            yaxis_title='True Positive Rate (Sensitivity)',
            xaxis_range=[0, 1],
            yaxis_range=[0, 1]
        )
        return fig
    
    def plot_confidence_accuracy(self, y_true, y_score):
        """Plot histogram of prediction confidence with overlaid accuracy line."""
        bins = np.linspace(0, 1, 100)  # 10% bins
        bin_indices = np.digitize(y_score, bins) - 1
        
        accuracies = []
        counts = []
        bin_centers = (bins[:-1] + bins[1:]) / 2
        
        for i in range(len(bins) - 1):
            mask = bin_indices == i
            if np.sum(mask) > 0:
                accuracy = np.mean(y_true[mask] == (y_score[mask] > 0.5))
                accuracies.append(accuracy)
                counts.append(np.sum(mask))
            else:
                accuracies.append(np.nan)
                counts.append(0)
        
        fig = make_subplots(specs=[[{"secondary_y": True}]])
        
        fig.add_trace(
            go.Bar(
                x=bin_centers * 100,
                y=counts,
                name="Prediction Count",
                marker_color="lightblue",
                opacity=0.7
            ),
            secondary_y=False
        )
        
        fig.add_trace(
            go.Scatter(
                x=bin_centers * 100,
                y=np.array(accuracies) * 100,
                name="Accuracy",
                line=dict(color="red", width=2),
                mode="lines+markers"
            ),
            secondary_y=True
        )
        
        fig.update_layout(
            title="Prediction Confidence vs Accuracy",
            xaxis_title="Confidence (%)",
            barmode="overlay",
            template="plotly_white"
        )
        
        fig.update_yaxes(
            title_text="Number of Predictions",
            secondary_y=False
        )
        fig.update_yaxes(
            title_text="Accuracy (%)",
            secondary_y=True,
            range=[0, 100]
        )
        
        return fig

In [151]:
"""
===================================================================
ChainletBacktest.py
Description: Run backtests for the chainlet price-prediction trading algorithm
Pipeline: 

- ChainletBacktester class must be initialized with trading fees and choice of chainlets
    Selecting the entire 20x20 matrix will usually lead to poor outcomes
- Choice of model (RF or SVM) is tuned with respect to the base period data (l = 250)
- Following optimization, model is retrained either on simple rolling window or SPred approach
    (defaults set as l=250, w=5, h=1)
===================================================================
"""

# Helper function for Sharpe Ratio Calculation
def calculate_sharpe(returns: np.array, risk_free_rate: float = 0.02) -> float:
    """Calculate annualized Sharpe ratio"""
    if len(returns) == 0 or np.std(returns) == 0:
        return 0
    excess_returns = returns - risk_free_rate / 252
    return np.sqrt(252) * np.mean(excess_returns) / np.std(returns)

class LSTMPredictor(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_layers, dropout):
        super(LSTMPredictor, self).__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.input_dim = input_dim
        
        self.lstm = nn.LSTM(
            input_dim,
            hidden_dim,
            num_layers=num_layers,
            dropout=dropout if num_layers > 1 else 0,
            batch_first=True
        )
        
        self.fc = nn.Linear(hidden_dim, 1)
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, x):
        # Ensure input is 3D: [batch_size, sequence_length, features]
        if len(x.shape) == 2:
            x = x.unsqueeze(1)  # Add sequence length dimension
            
        batch_size = x.size(0)
        
        # Initialize hidden state for each batch
        h0 = torch.zeros(self.num_layers, batch_size, self.hidden_dim).to(x.device)
        c0 = torch.zeros(self.num_layers, batch_size, self.hidden_dim).to(x.device)
        
        # Forward propagate LSTM
        lstm_out, _ = self.lstm(x, (h0, c0))
        
        # Get the last time step
        last_time_step = lstm_out[:, -1, :]
        
        # Predict
        out = self.fc(last_time_step)
        predictions = self.sigmoid(out)
        
        return predictions

class ChainletLSTM(nn.Module):
    def __init__(self, input_dim: int, hidden_dim: int = 64, 
                 num_layers: int = 2, dropout: float = 0.2, 
                 task: str = 'regression'):
        """
        Unified LSTM for both classification and regression tasks
        
        Args:
            input_dim: Number of input features
            hidden_dim: Size of LSTM hidden layers
            num_layers: Number of LSTM layers
            dropout: Dropout rate
            task: Either 'regression' or 'classification'
        """
        super().__init__()
        
        self.task = task
        self.lstm = nn.LSTM(
            input_dim,
            hidden_dim,
            num_layers,
            dropout=dropout,
            batch_first=True
        )
        self.linear = nn.Linear(hidden_dim, 1)
        
        # Add sigmoid for classification
        if task == 'classification':
            self.sigmoid = nn.Sigmoid()
        
    def forward(self, x):
        # Ensure input is 3D: (batch_size, sequence_length, features)
        if len(x.shape) == 2:
            x = x.unsqueeze(1)
            
        lstm_out, _ = self.lstm(x)
        output = self.linear(lstm_out[:, -1, :])
        
        if self.task == 'classification':
            return self.sigmoid(output)
        return output
    
    @staticmethod
    def train_model(model, X_train, y_train, epochs=100, lr=0.001):
        """
        Training utility method
        """
        optimizer = torch.optim.Adam(model.parameters(), lr=lr)
        criterion = (nn.BCELoss() if model.task == 'classification' 
                    else nn.MSELoss())
        
        model.train()
        for epoch in range(epochs):
            optimizer.zero_grad()
            output = model(X_train)
            loss = criterion(output.squeeze(), y_train.squeeze())
            loss.backward()
            optimizer.step()
            
        return model
    
    @staticmethod
    def predict(model, X):
        """
        Prediction utility method
        """
        model.eval()
        with torch.no_grad():
            predictions = model(X).numpy()
            if model.task == 'classification':
                return (predictions > 0.5).astype(int)
            return predictions

class ChainletBacktester:
    def __init__(self, maker_fee: float = 0.0015, taker_fee: float = 0.0025):
        self.maker_fee = maker_fee
        self.taker_fee = taker_fee
        self.data_processor = BitcoinDataProcessor()
        self.visualizer = Visualizer()
        self.best_params = None
        self.chainlet_patterns = []  # Initialize without default patterns
        
        # Set up PyTorch device
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        print(f"Using device: {self.device}")

    def optimize_models(self, train_data: pd.DataFrame, is_binary: bool = True) -> Dict:
        """
        Optimize hyperparameters for RF, LSTM, and SVM models
        Args:
            train_data: Training data
            is_binary: Whether this is for binary prediction (True) or absolute price (False)
        Returns:
            Dictionary of optimized parameters for each model
        """
        feature_scaler = MinMaxScaler()
        X = feature_scaler.fit_transform(train_data[self.chainlet_patterns])
        
        if is_binary:
            y = train_data['target'].values
        else:
            price_scaler = MinMaxScaler()
            y = price_scaler.fit_transform(train_data[['price']]).ravel()

        # Initialize results dictionary
        best_params = {
            'random_forest': None,
            'lstm': None,
            'svm': None
        }

        # Random Forest Optimization
        print("\nOptimizing Random Forest parameters...")
        param_grid_rf = {
            'n_estimators': [100, 200, 300, 500],
            'max_depth': [5, 10, 15, None],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4]
        }

        tscv = TimeSeriesSplit(n_splits=5)
        best_score_rf = -np.inf
        
        for n_estimators in param_grid_rf['n_estimators']:
            for max_depth in param_grid_rf['max_depth']:
                for min_samples_split in param_grid_rf['min_samples_split']:
                    for min_samples_leaf in param_grid_rf['min_samples_leaf']:
                        model = RandomForestRegressor(
                            n_estimators=n_estimators,
                            max_depth=max_depth,
                            min_samples_split=min_samples_split,
                            min_samples_leaf=min_samples_leaf,
                            random_state=42
                        )
                        
                        scores = []
                        for train_idx, val_idx in tscv.split(X):
                            X_train, X_val = X[train_idx], X[val_idx]
                            y_train, y_val = y[train_idx], y[val_idx]
                            
                            model.fit(X_train, y_train)
                            score = -mean_squared_error(y_val, model.predict(X_val))
                            scores.append(score)
                        
                        avg_score = np.mean(scores)
                        if avg_score > best_score_rf:
                            best_score_rf = avg_score
                            best_params['random_forest'] = {
                                'n_estimators': n_estimators,
                                'max_depth': max_depth,
                                'min_samples_split': min_samples_split,
                                'min_samples_leaf': min_samples_leaf
                            }

        # LSTM Optimization
        print("\nOptimizing LSTM parameters...")
        param_grid_lstm = {
            'hidden_dim': [32, 64, 128],
            'num_layers': [1, 2],
            'dropout': [0.1, 0.2, 0.3],
            'learning_rate': [0.001, 0.01]
        }

        best_score_lstm = -np.inf
        
        for hidden_dim in param_grid_lstm['hidden_dim']:
            for num_layers in param_grid_lstm['num_layers']:
                for dropout in param_grid_lstm['dropout']:
                    for lr in param_grid_lstm['learning_rate']:
                        scores = []
                        for train_idx, val_idx in tscv.split(X):
                            X_train, X_val = X[train_idx], X[val_idx]
                            y_train, y_val = y[train_idx], y[val_idx]
                            
                            model = LSTMPredictor(
                                input_dim=X.shape[1],
                                hidden_dim=hidden_dim,
                                num_layers=num_layers,
                                dropout=dropout
                            )
                            
                            optimizer = torch.optim.Adam(model.parameters(), lr=lr)
                            criterion = nn.BCELoss() if is_binary else nn.MSELoss()
                            
                            # Quick training to evaluate parameters
                            X_tensor = torch.FloatTensor(X_train)
                            y_tensor = torch.FloatTensor(y_train)
                            
                            for epoch in range(20):
                                model.train()
                                optimizer.zero_grad()
                                output = model(X_tensor)
                                loss = criterion(output.squeeze(), y_tensor.squeeze())
                                loss.backward()
                                optimizer.step()
                            
                            # Evaluate
                            model.eval()
                            with torch.no_grad():
                                X_val_tensor = torch.FloatTensor(X_val)
                                val_pred = model(X_val_tensor).numpy()
                                score = -mean_squared_error(y_val, val_pred.squeeze())
                                scores.append(score)
                        
                        avg_score = np.mean(scores)
                        if avg_score > best_score_lstm:
                            best_score_lstm = avg_score
                            best_params['lstm'] = {
                                'hidden_dim': hidden_dim,
                                'num_layers': num_layers,
                                'dropout': dropout,
                                'learning_rate': lr
                            }

        # SVM Optimization
        print("\nOptimizing SVM parameters...")
        param_grid_svm = {
            'C': [0.1, 1.0, 10.0],
            'gamma': ['scale', 'auto'],
            'kernel': ['rbf', 'linear']
        }

        best_score_svm = -np.inf
        
        for C in param_grid_svm['C']:
            for gamma in param_grid_svm['gamma']:
                for kernel in param_grid_svm['kernel']:
                    scores = []
                    for train_idx, val_idx in tscv.split(X):
                        X_train, X_val = X[train_idx], X[val_idx]
                        y_train, y_val = y[train_idx], y[val_idx]
                        
                        model = SVC(
                            C=C,
                            gamma=gamma,
                            kernel=kernel,
                            probability=True,
                            random_state=42
                        )
                        
                        model.fit(X_train, y_train)
                        score = accuracy_score(y_val, model.predict(X_val))
                        scores.append(score)
                    
                    avg_score = np.mean(scores)
                    if avg_score > best_score_svm:
                        best_score_svm = avg_score
                        best_params['svm'] = {
                            'C': C,
                            'gamma': gamma,
                            'kernel': kernel
                        }

        print("\nOptimization Results:")
        print(f"Random Forest: {best_params['random_forest']}")
        print(f"LSTM: {best_params['lstm']}")
        print(f"SVM: {best_params['svm']}")
        
        return best_params
    
    def spred_backtest(self, data: pd.DataFrame, model_type: str, 
                            training_length: int = 250, window: int = 5, horizon: int = 1,
                            optimize: bool = True, pbar: Optional[tqdm] = None) -> Dict:
        """
        Rolling window backtest for binary price movement prediction.
        
        Args:
            data: DataFrame containing price, chainlet and target data
            model_type: Type of model to use ('random_forest', 'lstm', or 'svm')
            training_length: Number of days to use for training
            window: Number of days to use for prediction from end of training window
            horizon: Number of days ahead to predict
            optimize: Whether to optimize model parameters on initial training data
            pbar: Optional progress bar
            
        Returns:
            Dictionary containing results and performance metrics
        """
        if len(data) < training_length + horizon:
            raise ValueError(f"Insufficient data: Need at least {training_length + horizon} days")

        # Initialize tracking variables
        portfolio_value = 1.0
        in_position = False
        results = []
        y_true = []
        y_pred = []
        
        # Initialize scalers
        feature_scaler = MinMaxScaler()
        
        # Get optimized parameters if requested
        if optimize:
            print("\nOptimizing model parameters...")
            initial_train_data = data.iloc[:training_length]
            best_params = self.optimize_models(initial_train_data, is_binary=True)
        else:
            best_params = None

        # Main backtesting loop
        for i in range(training_length, len(data) - horizon):
            # Get training data
            train_start = i - training_length
            train_end = i
            train_data = data.iloc[train_start:train_end]
            
            # Scale features
            X_train = train_data[self.chainlet_patterns]
            feature_scaler.fit(X_train)
            X_train_scaled = feature_scaler.transform(X_train)
            
            # Get binary target (price movement direction)
            y_train = train_data['target'].values.astype(int)
            
            # Get prediction window data (last w days)
            predict_start = i - window + 1
            predict_end = i + 1
            X_predict = data.iloc[predict_start:predict_end][self.chainlet_patterns]
            X_predict_scaled = feature_scaler.transform(X_predict)

            if model_type == 'random_forest':
                if best_params:
                    model = RandomForestClassifier(**best_params['random_forest'], random_state=42)
                else:
                    model = RandomForestClassifier(n_estimators=300, max_depth=10, random_state=42)
                
                model.fit(X_train_scaled, y_train)
                prediction = model.predict(X_predict_scaled)[-1]
                prediction = 1 if prediction > 0.5 else 0
                
            elif model_type == 'lstm':
                if best_params:
                    lstm_params = best_params['lstm']
                    model = LSTMPredictor(
                        input_dim=len(self.chainlet_patterns),
                        hidden_dim=lstm_params['hidden_dim'],
                        num_layers=lstm_params['num_layers'],
                        dropout=lstm_params['dropout']
                    )
                    learning_rate = lstm_params['learning_rate']
                else:
                    model = LSTMPredictor(
                        input_dim=len(self.chainlet_patterns),
                        hidden_dim=32,
                        num_layers=1,
                        dropout=0.1
                    )
                    learning_rate = 0.001
                
                # Train LSTM
                X_tensor = torch.FloatTensor(X_train_scaled)
                y_tensor = torch.FloatTensor(y_train)
                
                optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
                criterion = nn.BCELoss()
                
                model.train()
                for epoch in range(50):
                    optimizer.zero_grad()
                    output = model(X_tensor)
                    loss = criterion(output.squeeze(), y_tensor.squeeze())
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
                    optimizer.step()
                
                # Make prediction using last w days
                X_predict_tensor = torch.FloatTensor(X_predict_scaled).unsqueeze(0)
                
                model.eval()
                with torch.no_grad():
                    prediction = model(X_predict_tensor).numpy()[0][0]
                    prediction = 1 if prediction > 0.5 else 0
                    
            elif model_type == 'svm':
                if best_params:
                    model = SVC(**best_params['svm'], probability=True, random_state=42)
                else:
                    model = SVC(C=1.0, kernel='rbf', probability=True, random_state=42)
                
                model.fit(X_train_scaled, y_train)
                prediction = model.predict(X_predict_scaled)[-1]
            else:
                raise ValueError("Unsupported model type.")

            # Record predictions
            y_pred.append(prediction)
            actual = data.iloc[i + horizon]['target']
            y_true.append(actual)

            current_price = data.iloc[i]['price']
            next_price = data.iloc[i + horizon]['price']
            daily_return = (next_price / current_price) - 1

            if prediction == 1:  # Want to be long
                if not in_position:  # Need to enter
                    # Apply entry fee first
                    portfolio_value *= (1 - self.taker_fee)
                    # Then apply return
                    portfolio_value *= (1 + daily_return)
                    in_position = True
                else:  # Already in position
                    portfolio_value *= (1 + daily_return)
            else:  # Want to be out
                if in_position:  # Need to exit
                    # Apply return first since we still hold position
                    portfolio_value *= (1 + daily_return)
                    # Then apply exit fee
                    portfolio_value *= (1 - self.maker_fee)
                    in_position = False


            # Record results
            results.append({
                'date': data.index[i + horizon],
                'price': next_price,
                'prediction': prediction,
                'actual_target': actual,
                'in_position': in_position,
                'portfolio_value': portfolio_value,
                'strategy_return': portfolio_value - 1,
                'buy_hold_return': (next_price / data.iloc[training_length]['price']) - 1
            })

            if pbar:
                pbar.update(1)

        # Prepare results
        results_df = pd.DataFrame(results)
        valid_returns = [r['strategy_return'] for r in results if r['in_position']]

        return {
            'results': results_df,
            'total_return': (portfolio_value - 1) * 100,
            'buy_hold_return': results_df.iloc[-1]['buy_hold_return'] * 100,
            'win_rate': np.mean([1 if p == a else 0 for p, a in zip(y_pred, y_true)]) * 100,
            'trading_sharpe': calculate_sharpe(np.array(valid_returns)),
            'performance_plot': self.visualizer.plot_performance(results_df),
            'confusion_matrix': self.visualizer.plot_confusion_matrix(y_true, y_pred),
            'roc_curve': self.visualizer.plot_roc_curve(y_true, [1 if p == 1 else 0 for p in y_pred])
        }
                
    def absolute_price_backtest(self, data: pd.DataFrame, 
                            training_length: int = 250, window: int = 5, horizon: int = 1,
                            pbar: Optional[tqdm] = None) -> Dict:
        """
        Rolling window backtest for absolute price prediction using all models
        """
        if len(data) < training_length + horizon:
            raise ValueError(f"Insufficient data: Need at least {training_length + horizon} days")
        
        # Initialize models
        models = {
            'Random Forest': RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42),
            'SVR': SVR(kernel='rbf', C=1.0, epsilon=0.1),
            'XGBoost': XGBRegressor(n_estimators=100, random_state=42),
            'LSTM': ChainletLSTM(
                input_dim=len(self.chainlet_patterns),
                hidden_dim=64,
                num_layers=2,
                dropout=0.2,
                task='regression'
            )
        }
        
        # Initialize results storage
        model_results = {name: [] for name in models.keys()}
        
        # Initialize scalers
        price_scaler = MinMaxScaler()
        feature_scaler = MinMaxScaler()
        
        # Main backtest loop
        for i in range(training_length, len(data) - horizon):
            # Get training data
            train_start = i - training_length
            train_end = i
            train_data = data.iloc[train_start:train_end]
            
            # Scale data
            train_prices = train_data['price'].values.reshape(-1, 1)
            price_scaler.fit(train_prices)
            train_prices_scaled = price_scaler.transform(train_prices)
            
            X_train = train_data[self.chainlet_patterns]
            feature_scaler.fit(X_train)
            X_train_scaled = feature_scaler.transform(X_train)
            
            # Get prediction window data
            predict_start = i - window + 1
            predict_end = i + 1
            X_predict = data.iloc[predict_start:predict_end][self.chainlet_patterns]
            X_predict_scaled = feature_scaler.transform(X_predict)
            
            # Get actual price h days ahead
            target_price = data.iloc[i + horizon]['price']
            prediction_date = data.index[i + horizon]
            
            # Make predictions with each model
            for name, model in models.items():
                try:
                    if name == 'LSTM':
                        # Convert data to tensors
                        X_train_tensor = torch.FloatTensor(X_train_scaled)
                        y_train_tensor = torch.FloatTensor(train_prices_scaled)
                        X_predict_tensor = torch.FloatTensor(X_predict_scaled)
                        
                        # Train LSTM
                        model = ChainletLSTM.train_model(
                            model=model,
                            X_train=X_train_tensor,
                            y_train=y_train_tensor,
                            epochs=50,
                            lr=0.001
                        )
                        
                        # Predict
                        prediction_scaled = ChainletLSTM.predict(model, X_predict_tensor)[-1]
                        prediction_scaled = float(prediction_scaled)  # Convert to scalar
                        
                    else:
                        # Train and predict with sklearn models
                        model.fit(X_train_scaled, train_prices_scaled.ravel())
                        prediction_scaled = model.predict(X_predict_scaled)[-1]
                    
                    # Ensure prediction is 2D array for inverse transform
                    prediction = price_scaler.inverse_transform([[prediction_scaled]])[0][0]
                    
                except Exception as e:
                    print(f"Error in {name} prediction: {str(e)}")
                    prediction = np.nan
                
                # Store results
                model_results[name].append({
                    'date': prediction_date,
                    'price': target_price,
                    'prediction': prediction
                })
        
        # Create results dictionary
        results = {}
        plots = {}
        metrics = {}
        
        # Process results for each model
        for name in list(models.keys()):
            results_df = pd.DataFrame(model_results[name])
            
            # Calculate metrics (excluding NaN values)
            valid_mask = ~np.isnan(results_df['prediction'])
            if valid_mask.any():
                rmse = root_mean_squared_error(
                    results_df.loc[valid_mask, 'price'], 
                    results_df.loc[valid_mask, 'prediction']
                )
                mae = mean_absolute_error(
                    results_df.loc[valid_mask, 'price'], 
                    results_df.loc[valid_mask, 'prediction']
                )
            else:
                rmse = np.nan
                mae = np.nan
            
            # Store results and metrics
            results[name] = results_df
            metrics[name] = {'RMSE': rmse, 'MAE': mae}
            plots[name] = self._create_price_prediction_plot(results_df, rmse, mae, name)
        
        return {
            'results': results,
            'metrics': metrics,
            'plots': plots
        }

    def _create_price_prediction_plot(self, results_df: pd.DataFrame, rmse: float, 
                                    mae: float, model_name: str) -> go.Figure:
        """Create visualization for price predictions"""
        fig = go.Figure()
        
        # Add actual prices
        fig.add_trace(go.Scatter(
            x=results_df['date'],
            y=results_df['price'],
            name='Actual Price',
            line=dict(color='blue', width=2)
        ))
        
        # Add predictions
        fig.add_trace(go.Scatter(
            x=results_df['date'],
            y=results_df['prediction'],
            name=f'Predicted Price (RMSE: ${rmse:.2f}, MAE: ${mae:.2f})',
            line=dict(color='red', width=2, dash='dot')
        ))
        
        fig.update_layout(
            title=f'Bitcoin Price Predictions - {model_name}',
            xaxis_title='Date',
            yaxis_title='Price (USD)',
            template='plotly_white',
            hovermode='x unified'
        )
        
        return fig

In [91]:
def run_backtest(config: dict, price_path: str, chainlet_path: str):
    """
    Runs the backtest using the provided configuration and data paths.

    Args:
        config: Dictionary containing configuration parameters.
            Example:
                {
                    'start_date': '2023-01-01',
                    'end_date': '2023-12-31',
                    'model_type': 'random_forest',
                    'training_length': 250,
                    'window': 5,
                    'horizon': 1,
                    'binary_mode': True,
                    'chainlets': ['C_1_7', 'C_6_1', 'C_3_3', ...]  # Chainlet patterns to use
                }
        price_path: Path to the price data CSV file.
        chainlet_path: Path to the chainlet data CSV file.

    Returns:
        Dict containing backtest results and performance visualizations.
    """
    backtester = ChainletBacktester()
    data_processor = backtester.data_processor

    # Load data
    print("Loading and processing data...")
    price_data = data_processor.load_price_data(price_path)
    chainlet_data = data_processor.process_chainlet_data(chainlet_path, binary_mode=config.get('binary_mode', False))

    # Align data
    data = data_processor.align_and_prepare_data(price_data, chainlet_data, horizon=config['horizon'])

    # Filter for date range
    start_date = config['start_date']
    end_date = config['end_date']
    if start_date not in data.index or end_date not in data.index:
        raise ValueError(f"Dates must be within available data range ({data.index.min()} to {data.index.max()}).")
    data = data[start_date:end_date]

    # Handle chainlet selection
    chainlet_patterns = config.get('chainlets', [])
    if chainlet_patterns == 'all':
        # Get all columns that start with 'C_'
        chainlet_patterns = [col for col in data.columns if col.startswith('C_')]
        print(f"Using all {len(chainlet_patterns)} chainlet patterns")
    elif not chainlet_patterns:
        raise ValueError("No chainlet patterns specified in configuration.")

    # Update backtester to use specified chainlet patterns
    backtester.chainlet_patterns = chainlet_patterns


    # Run backtest
    print("Running backtest...")
    results = backtester.spred_backtest(
        data=data,
        model_type=config['model_type'],
        training_length=config['training_length'],
        window=config['window'],
        horizon=config['horizon'],
        optimize=config.get('optimize', False)
    )

    # Save results for visualization
    results_df = results['results']
    performance_plot = results['performance_plot']
    confusion_matrix_plot = results['confusion_matrix']
    roc_curve_plot = results['roc_curve']

    print("Backtest complete.")

    return {
        'results_df': results_df,
        'performance_plot': performance_plot,
        'confusion_matrix_plot': confusion_matrix_plot,
        'roc_curve_plot': roc_curve_plot,
        'summary': {
            'total_return': results['total_return'],
            'buy_hold_return': results['buy_hold_return'],
            'win_rate': results['win_rate'],
            'trading_sharpe': results['trading_sharpe']
        }
    }

In [114]:
# Example configuration for Jupyter Notebook
if __name__ == "__main__":
    config = {
    'start_date': '2011-06-05',
    'end_date': '2016-01-01',
    'model_type': 'lstm',
    'training_length': 250,
    'window': 5,
    'horizon': 1,
    'binary_mode': True,
    'chainlets': ['C_1_7', 'C_6_1', 'C_3_3', 'C_20_2', 'C_20_3', 'C_20_12', 'C_20_17', 'C_1_1']
}

    price_path = "price_data.csv"
    chainlet_path = "chainlet_data.txt"

    results = run_backtest(config, price_path, chainlet_path)

    # Display results in Jupyter Notebook
    from IPython.display import display

    print("Performance Summary:")
    display(pd.DataFrame([results['summary']]))

    print("Results Table:")
    display(results['results_df'])

    print("Performance Visualization:")
    results['performance_plot'].show()

    print("Confusion Matrix:")
    results['confusion_matrix_plot'].show()

    print("ROC Curve:")
    results['roc_curve_plot'].show()


Using device: cpu
Loading and processing data...
Loaded price data: 2404 days
Loaded chainlet data: 3281 days, 400 chainlet features
Mode: Binary (0/1)

Aligning data...
Price data dates: 2011-06-01 00:00:00 to 2017-12-31 00:00:00
Chainlet data dates: 2009-01-03 00:00:00 to 2017-12-31 00:00:00

Final dataset shape: (2400, 406)
Date range: 2011-06-05 00:00:00 to 2017-12-31 00:00:00
Running backtest...
Backtest complete.
Performance Summary:


Unnamed: 0,total_return,buy_hold_return,win_rate,trading_sharpe
0,14854.50533,7251.269036,53.873239,15.555929


Results Table:


Unnamed: 0,date,price,prediction,actual_target,in_position,portfolio_value,strategy_return,buy_hold_return
0,2012-02-11,5.60,0,0.0,False,1.000000,0.000000,-0.052453
1,2012-02-12,5.51,0,0.0,False,1.000000,0.000000,-0.067682
2,2012-02-13,5.26,0,0.0,False,1.000000,0.000000,-0.109983
3,2012-02-14,4.46,0,0.0,False,1.000000,0.000000,-0.245347
4,2012-02-15,4.33,0,0.0,False,1.000000,0.000000,-0.267343
...,...,...,...,...,...,...,...,...
1415,2015-12-28,421.26,0,1.0,False,147.076183,146.076183,70.279188
1416,2015-12-29,432.62,1,0.0,True,150.664739,149.664739,72.201354
1417,2015-12-30,426.14,1,1.0,True,148.408007,147.408007,71.104907
1418,2015-12-31,430.05,0,1.0,False,149.545053,148.545053,71.766497


Performance Visualization:


Confusion Matrix:


ROC Curve:


0.5295774647887324

In [152]:
# Configuration
config = {
    'start_date': '2012-01-01',
    'end_date': '2015-01-01',
    'training_length': 250,
    'window': 5,
    'horizon': 1,
    'chainlets': ['C_1_7', 'C_6_1', 'C_3_3', 'C_20_2', 'C_20_3', 'C_20_12', 'C_20_17', 'C_1_1']
}

# Initialize backtester
backtester = ChainletBacktester()
data_processor = backtester.data_processor

# Load data
print("Loading and processing data...")
price_data = data_processor.load_price_data("price_data.csv")
chainlet_data = data_processor.process_chainlet_data("chainlet_data.txt", binary_mode=False)

# Align data
data = data_processor.align_and_prepare_data(price_data, chainlet_data, horizon=config['horizon'])
data = data[config['start_date']:config['end_date']]

# Update backtester chainlet patterns
backtester.chainlet_patterns = config['chainlets']

# Run backtest
results = backtester.absolute_price_backtest(
    data=data,
    training_length=config['training_length'],
    window=config['window'],
    horizon=config['horizon']
)

# Display results
print("\nModel Performance Metrics:")
for model_name, metrics in results['metrics'].items():
    print(f"\n{model_name}:")
    print(f"RMSE: ${metrics['RMSE']:.2f}")
    print(f"MAE: ${metrics['MAE']:.2f}")

# Display plots
for model_name, plot in results['plots'].items():
    print(f"\nShowing plot for {model_name}...")
    plot.show()

Using device: cpu
Loading and processing data...
Loaded price data: 2404 days
Loaded chainlet data: 3281 days, 400 chainlet features
Mode: Counts

Aligning data...
Price data dates: 2011-06-01 00:00:00 to 2017-12-31 00:00:00
Chainlet data dates: 2009-01-03 00:00:00 to 2017-12-31 00:00:00

Final dataset shape: (2400, 406)
Date range: 2011-06-05 00:00:00 to 2017-12-31 00:00:00

Model Performance Metrics:

Random Forest:
RMSE: $86.17
MAE: $51.72

SVR:
RMSE: $93.57
MAE: $56.40

XGBoost:
RMSE: $88.13
MAE: $52.75

LSTM:
RMSE: $89.17
MAE: $55.53

Showing plot for Random Forest...



Showing plot for SVR...



Showing plot for XGBoost...



Showing plot for LSTM...


In [145]:
class ChainletModelComparison:
    def __init__(self, backtester: ChainletBacktester):
        """
        Initialize model comparison class with a ChainletBacktester instance
        
        Args:
            backtester: ChainletBacktester instance for data processing and visualization
        """
        self.backtester = backtester
        self.visualizer = backtester.visualizer
        self.data_processor = backtester.data_processor
        
    def compare_ml_models(self, data: pd.DataFrame, training_length: int = 250, test_size: float = 0.2):
        """
        Compares multiple ML models using traditional train/test split on chainlet data
        
        Args:
            data: DataFrame containing price, chainlet and target data
            training_length: Number of days to use for training
            test_size: Fraction of data to use for testing
        """
        print("Running ML model comparison...")
        
        # Get all chainlet columns
        chainlet_cols = [col for col in data.columns if col.startswith('C_')]
        print(f"\nUsing {len(chainlet_cols)} chainlet features")
        
        # Prepare features and target
        X = data[chainlet_cols].copy()
        y = data['target'].astype(int)
        
        # Scale features
        scaler = MinMaxScaler()
        X_scaled = scaler.fit_transform(X)
        
        # Split into train/test
        split_idx = int(len(data) * (1 - test_size))
        X_train, X_test = X_scaled[:split_idx], X_scaled[split_idx:]
        y_train, y_test = y[:split_idx], y[split_idx:]
        
        # Convert data for LSTM
        X_train_lstm = torch.FloatTensor(X_train)
        y_train_lstm = torch.FloatTensor(y_train.values)
        X_test_lstm = torch.FloatTensor(X_test)
        y_test_lstm = torch.FloatTensor(y_test.values)
        
        # Initialize models
        models = {
            'SVM': SVC(probability=True, random_state=42),
            'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
            'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
            'LSTM': LSTMPredictor(
                input_dim=len(chainlet_cols),
                hidden_dim=32,
                num_layers=1,
                dropout=0.1
            )
        }
        
        results = {}
        
        # Train and evaluate each model
        for name, model in models.items():
            print(f"\nTraining {name}...")
            
            if name == 'LSTM':
                # Train LSTM
                optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
                criterion = nn.BCELoss()
                
                model.train()
                for epoch in range(50):
                    optimizer.zero_grad()
                    output = model(X_train_lstm)
                    loss = criterion(output.squeeze(), y_train_lstm)
                    loss.backward()
                    optimizer.step()
                    
                # Evaluate LSTM
                model.eval()
                with torch.no_grad():
                    y_pred = model(X_test_lstm).numpy()
                    y_pred = (y_pred > 0.5).astype(int)
            else:
                # Train and evaluate other models
                model.fit(X_train, y_train)
                y_pred = model.predict(X_test)
            
            # Calculate metrics
            accuracy = accuracy_score(y_test, y_pred)
            cm = confusion_matrix(y_test, y_pred)
            
            results[name] = {
                'accuracy': accuracy,
                'confusion_matrix': cm
            }
            
            # Print results
            print(f"\n{name} Results:")
            print(f"Accuracy: {accuracy:.4f}")
            print("\nConfusion Matrix:")
            print(cm)
            
            # Plot confusion matrix
            self.visualizer.plot_confusion_matrix(y_test, y_pred).show()
            
        return results
        
    def compare_price_predictions(self, data: pd.DataFrame, test_size: float = 0.2):
        """
        Compares multiple ML models for absolute price prediction
        """
        print("Running absolute price prediction model comparison...")
        
        # Get all chainlet columns for main models
        chainlet_cols = [col for col in data.columns if col.startswith('C_')]
        print(f"\nUsing {len(chainlet_cols)} chainlet features")
        
        # Prepare features and target
        X = data[chainlet_cols].copy()
        y = data['price'].values
        
        # Scale features
        feature_scaler = MinMaxScaler()
        X_scaled = feature_scaler.fit_transform(X)
        
        # Scale prices for training (we'll inverse transform for plotting)
        price_scaler = MinMaxScaler()
        y_scaled = price_scaler.fit_transform(y.reshape(-1, 1))
        
        # Train/test split
        split_idx = int(len(data) * (1 - test_size))
        X_train, X_test = X_scaled[:split_idx], X_scaled[split_idx:]
        y_train, y_test = y_scaled[:split_idx], y_scaled[split_idx:]
        dates_test = data.index[split_idx:]
        
        # Initialize predictions and metrics dictionaries
        predictions = {
            'Actual': price_scaler.inverse_transform(y_test.reshape(-1, 1)).flatten()
        }
        metrics = {}
        
        # Initialize models
        models = {
            'Random Forest': RandomForestRegressor(n_estimators=300, random_state=42),
            'SVR': SVR(kernel='rbf', C=1.0, epsilon=0.1),
            'XGBoost': XGBRegressor(n_estimators=300, random_state=42),
            'LSTM': ChainletLSTM(
                input_dim=len(chainlet_cols),
                hidden_dim=64,
                num_layers=2,
                dropout=0.2,
                task='regression'
            )
        }
        
        # Convert data to tensors for LSTM
        X_train_tensor = torch.FloatTensor(X_train)
        y_train_tensor = torch.FloatTensor(y_train)
        X_test_tensor = torch.FloatTensor(X_test)
        
        # Train and predict with each model
        for name, model in models.items():
            print(f"\nTraining {name}...")
            
            if name == 'LSTM':
                model = ChainletLSTM.train_model(
                    model=model,
                    X_train=X_train_tensor,
                    y_train=y_train_tensor,
                    epochs=100,
                    lr=0.001
                )
                y_pred = ChainletLSTM.predict(model, X_test_tensor)
                
            else:
                model.fit(X_train, y_train.ravel())
                y_pred = model.predict(X_test).reshape(-1, 1)
            
            # Store inverse transformed predictions
            predictions[name] = price_scaler.inverse_transform(y_pred.reshape(-1, 1)).flatten()
            
            # Calculate metrics
            mse = np.mean((predictions['Actual'] - predictions[name]) ** 2)
            rmse = np.sqrt(mse)
            mae = np.mean(np.abs(predictions['Actual'] - predictions[name]))
            metrics[name] = {
                'RMSE': rmse,
                'MAE': mae
            }
        
        # Prepare reduced feature set for Markov model
        print("\nTraining Markov Switching model...")
        try:
            # Prepare data for Markov model
            markov_features = ['C_1_7', 'C_6_1', 'C_3_3', 'C_20_2', 'C_20_12', 'C_20_17', 'C_1_1', 'price']
            X_markov = data[markov_features].copy()
            X_markov_scaled = StandardScaler().fit_transform(X_markov)
            X_markov_train = X_markov_scaled[:split_idx]
            X_markov_test = X_markov_scaled[split_idx:]
            
            # Prepare endogenous variable
            y_markov_train = y_train.ravel()
            
            # Fit Markov model
            markov_model = MarkovRegression(
                endog=y_markov_train,
                k_regimes=2,
                trend='c',
                exog=X_markov_train,
                switching_variance=False
            ).fit()
            
            # Predict
            markov_pred = markov_model.predict(exog=X_markov_test)
            predictions['Markov'] = price_scaler.inverse_transform(markov_pred.reshape(-1, 1)).flatten()
            
            # Calculate metrics for Markov
            mse = np.mean((predictions['Actual'] - predictions['Markov']) ** 2)
            rmse = np.sqrt(mse)
            mae = np.mean(np.abs(predictions['Actual'] - predictions['Markov']))
            metrics['Markov'] = {
                'RMSE': rmse,
                'MAE': mae
            }
            
        except (LinAlgError, ValueError, np.linalg.LinAlgError) as e:
            print(f"\nWarning: Markov model failed to converge: {str(e)}")
            print("Skipping Markov model predictions...")
            predictions['Markov'] = np.full_like(predictions['Actual'], np.nan)
            metrics['Markov'] = {
                'RMSE': np.nan,
                'MAE': np.nan
            }
        
        # Create visualization
        fig = go.Figure()
        fig.add_trace(
            go.Scatter(
                x=dates_test,
                y=predictions['Actual'],
                name='Actual Price',
                line=dict(color='black', width=2)
            )
        )
        
        colors = ['blue', 'red', 'green', 'purple', 'orange']
        for (name, pred), color in zip(predictions.items(), colors):
            if name != 'Actual':
                fig.add_trace(
                    go.Scatter(
                        x=dates_test,
                        y=pred,
                        name=name,
                        line=dict(color=color)
                    )
                )
        
        fig.update_layout(
            title='Bitcoin Price Predictions Comparison',
            xaxis_title='Date',
            yaxis_title='Price (USD)',
            template='plotly_white',
            hovermode='x unified'
        )
        
        return {
            'predictions': predictions,
            'metrics': metrics,
            'plot': fig
        }

In [135]:
if __name__ == "__main__":
    # First initialize the backtester
    backtester = ChainletBacktester()
    
    # Initialize the model comparison class with the backtester
    model_comparison = ChainletModelComparison(backtester)
    
    # Load and process data using the backtester's data processor
    print("Loading data...")
    price_data = backtester.data_processor.load_price_data("price_data.csv")
    chainlet_data = backtester.data_processor.process_chainlet_data("chainlet_data.txt", binary_mode=False)
    
    # Align data
    data = backtester.data_processor.align_and_prepare_data(
        price_data=price_data,
        chainlet_data=chainlet_data,
        use_lagged_chainlets=True,
        horizon=1
    )
    
    # Set date range
    start_date = '2011-06-05'
    end_date = '2016-01-01'
    data = data[start_date:end_date]
    
    print("\n1. Running Binary Classification Model Comparison...")
    classification_results = model_comparison.compare_ml_models(
        data=data,
        training_length=250,
        test_size=0.2
    )
    
    # Display binary classification results
    print("\nBinary Classification Results:")
    for model_name, results in classification_results.items():
        print(f"\n{model_name}:")
        print(f"Accuracy: {results['accuracy']:.4f}")
        print("Confusion Matrix:")
        print(results['confusion_matrix'])
    
    print("\n2. Running Absolute Price Prediction Comparison...")
    price_results = model_comparison.compare_price_predictions(
        data=data,
        test_size=0.2
    )
    
    # Display price prediction results
    print("\nAbsolute Price Prediction Metrics:")
    for model_name, metrics in price_results['metrics'].items():
        print(f"\n{model_name}:")
        print(f"RMSE: ${metrics['RMSE']:.2f}")
        print(f"MAE: ${metrics['MAE']:.2f}")
    
    # Show price prediction plot
    price_results['plot'].show()
    
    print("\nAnalysis complete!")
    print("Note: Markov model using reduced feature set:")
    print("['C_1_7', 'C_6_1', 'C_3_3', 'C_20_2', 'C_20_12', 'C_20_17', 'C_1_1', 'price']")

Using device: cpu
Loading data...
Loaded price data: 2404 days
Loaded chainlet data: 3281 days, 400 chainlet features
Mode: Counts

Aligning data...
Price data dates: 2011-06-01 00:00:00 to 2017-12-31 00:00:00
Chainlet data dates: 2009-01-03 00:00:00 to 2017-12-31 00:00:00

Final dataset shape: (2400, 406)
Date range: 2011-06-05 00:00:00 to 2017-12-31 00:00:00

1. Running Binary Classification Model Comparison...
Running ML model comparison...

Using 400 chainlet features

Training SVM...

SVM Results:
Accuracy: 0.5015

Confusion Matrix:
[[141  22]
 [145  27]]



Training Random Forest...

Random Forest Results:
Accuracy: 0.5045

Confusion Matrix:
[[159   4]
 [162  10]]



Training Logistic Regression...

Logistic Regression Results:
Accuracy: 0.5552

Confusion Matrix:
[[ 53 110]
 [ 39 133]]



Training LSTM...

LSTM Results:
Accuracy: 0.5433

Confusion Matrix:
[[ 19 144]
 [  9 163]]



Binary Classification Results:

SVM:
Accuracy: 0.5015
Confusion Matrix:
[[141  22]
 [145  27]]

Random Forest:
Accuracy: 0.5045
Confusion Matrix:
[[159   4]
 [162  10]]

Logistic Regression:
Accuracy: 0.5552
Confusion Matrix:
[[ 53 110]
 [ 39 133]]

LSTM:
Accuracy: 0.5433
Confusion Matrix:
[[ 19 144]
 [  9 163]]

2. Running Absolute Price Prediction Comparison...
Running absolute price prediction model comparison...

Using 400 chainlet features

Training Random Forest...

Training SVR...

Training XGBoost...

Training LSTM...

Training Markov Switching model...

Skipping Markov model predictions...

Absolute Price Prediction Metrics:

Random Forest:
RMSE: $131.83
MAE: $108.90

SVR:
RMSE: $111.63
MAE: $91.67

XGBoost:
RMSE: $103.70
MAE: $80.00

LSTM:
RMSE: $152.10
MAE: $135.19

Markov:
RMSE: $nan
MAE: $nan



Analysis complete!
Note: Markov model using reduced feature set:
['C_1_7', 'C_6_1', 'C_3_3', 'C_20_2', 'C_20_12', 'C_20_17', 'C_1_1', 'price']
