In [10]:
import math
import os
from typing import Dict, Tuple, List

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import tensorflow as tf
from keras.layers import LSTM, Dense, Dropout
from keras.models import Sequential
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import MinMaxScaler

from visualizer import Visualizer

In [11]:
class ChainletPreprocessor:
    """Load price and chainlet-matrix data and build feature windows from them.

    Two modes are supported, selected by ``data_frequency``:

    * ``'daily'``: the price file is a CSV with ``date`` and ``price``
      columns; matrix files are tab-separated tables with ``year``, ``day``,
      ``totaltx`` and ``"i:j"`` chainlet columns.
    * anything else (treated as hourly): the price file holds
      whitespace-separated ``<timestamp> <price>`` lines; the occurrence file
      holds records made of a two-token header line (second token is a Unix
      timestamp) followed by optional ``tx_count_matrix`` /
      ``tx_weight_matrix`` lines with tab-separated ``x,y:value`` entries.
    """

    # Chainlet matrices are square; entries addressed outside are ignored.
    _MATRIX_SIZE = 20
    _SECONDS_PER_HOUR = 3600

    def __init__(self,
                 price_file_path: str,
                 occ_file_path: str = None,
                 amt_file_path: str = None,
                 data_frequency: str = 'daily',
                 start_year: int = 2024,
                 end_year: int = 2024,
                 batch_size: int = 100):
        """
        Initialize chainlet preprocessor and eagerly load all input files.

        Args:
            price_file_path: Path to price data
            occ_file_path: Path to occurrence matrix file
            amt_file_path: Path to amount matrix file (optional; only read
                in daily mode)
            data_frequency: 'daily' or 'hourly'
            start_year: Starting year for analysis (stored only; not used by
                this class)
            end_year: Ending year for analysis (stored only; not used by
                this class)
            batch_size: Size of sliding batch (stored only; not used by
                this class)
        """
        self.price_file_path = price_file_path
        self.occ_file_path = occ_file_path
        self.amt_file_path = amt_file_path
        self.data_frequency = data_frequency
        self.start_year = start_year
        self.end_year = end_year
        self.batch_size = batch_size

        # Load price data and matrices on initialization
        self.price_data = self.load_price_data()
        self._daily_occ_data = None
        self._daily_amt_data = None
        self._hourly_matrices = None

        # Load matrices based on frequency
        if self.data_frequency == 'daily':
            if self.occ_file_path:
                self._daily_occ_data = self.load_daily_matrix_file(self.occ_file_path)
            if self.amt_file_path:
                self._daily_amt_data = self.load_daily_matrix_file(self.amt_file_path)
        else:  # hourly
            if self.occ_file_path:
                with open(self.occ_file_path, 'r') as f:
                    self._hourly_matrices = self.parse_hourly_matrices(f.read())

    @classmethod
    def _floor_to_hour(cls, timestamp) -> int:
        """Round a Unix timestamp (int or numeric string) down to the hour."""
        ts = int(timestamp)
        return ts - (ts % cls._SECONDS_PER_HOUR)

    def load_price_data(self) -> pd.DataFrame:
        """Read the price file into a DataFrame indexed by ``date``.

        Daily mode returns the CSV as-is with a parsed ``date`` index.
        Hourly mode returns ``hour_timestamp`` and ``price`` columns, where
        every timestamp has been rounded down to the hour.
        """
        if self.data_frequency == 'daily':
            df = pd.read_csv(self.price_file_path)
            df['date'] = pd.to_datetime(df['date'])
            return df.set_index('date')

        prices = []
        with open(self.price_file_path, 'r') as f:
            for line in f:
                if not line.strip():
                    continue
                timestamp, price = line.strip().split()
                hour_ts = self._floor_to_hour(timestamp)
                prices.append({
                    'hour_timestamp': hour_ts,
                    'price': float(price),
                    'date': pd.to_datetime(hour_ts, unit='s')  # Keep date for compatibility
                })
        # Explicit columns keep set_index working even for an empty file.
        df = pd.DataFrame(prices, columns=['hour_timestamp', 'price', 'date'])
        return df.set_index('date')

    def load_daily_matrix_file(self, filepath: str) -> pd.DataFrame:
        """Load a tab-separated daily chainlet file.

        Only the ``year``, ``day``, ``totaltx`` and ``"i:j"`` chainlet
        columns are kept.
        """
        df = pd.read_csv(filepath, sep='\t')
        chainlet_cols = [col for col in df.columns if ':' in str(col)]
        required_cols = ['year', 'day', 'totaltx'] + chainlet_cols
        return df[required_cols]

    def construct_matrix_from_row(self, row) -> Tuple[np.ndarray, float]:
        """Convert one daily chainlet row into ``(matrix, totaltx)``.

        Columns named ``"i:j"`` are written to ``matrix[i][j]``; indices
        outside the matrix are skipped.
        """
        size = self._MATRIX_SIZE
        matrix = np.zeros((size, size))
        total_tx = row['totaltx']

        for col in row.index:
            if ':' not in str(col):
                continue
            i, j = map(int, str(col).split(':'))
            if 0 <= i < size and 0 <= j < size:  # Ensure within bounds
                matrix[i][j] = row[col]

        return matrix, total_tx

    @classmethod
    def _fill_sparse_matrix(cls, line: str, matrix: np.ndarray, cast) -> None:
        """Write the tab-separated ``x,y:value`` entries of *line* into *matrix*.

        The first tab-separated field (the line label) is skipped, and
        entries addressing cells outside the matrix are ignored, mirroring
        the bounds check used for daily rows.
        """
        size = cls._MATRIX_SIZE
        for entry in line.split('\t')[1:]:
            if ':' not in entry:
                continue
            coords, value = entry.split(':')
            x, y = map(int, coords.split(','))
            if 0 <= x < size and 0 <= y < size:
                matrix[x][y] = cast(value)

    def parse_hourly_matrices(self, content: str) -> Dict[int, Tuple[np.ndarray, np.ndarray]]:
        """
        Parse and aggregate hourly matrices, grouping by hour.

        Returns a dict mapping hour timestamp -> (occ_matrix, amt_matrix),
        where each matrix is the mean over all records that fall in that hour.
        """
        size = self._MATRIX_SIZE
        hourly_matrices = {}  # hour_timestamp -> (occ_sum, amt_sum, record_count)
        lines = content.strip().split('\n')
        i = 0

        while i < len(lines):
            parts = lines[i].strip().split()
            # Only a two-token line starts a record; anything else is skipped.
            if len(parts) != 2:
                i += 1
                continue

            hour_timestamp = self._floor_to_hour(parts[1])
            i += 1

            occ_matrix = np.zeros((size, size))
            amt_matrix = np.zeros((size, size))

            # Parse occurrence matrix
            if i < len(lines) and 'tx_count_matrix' in lines[i]:
                self._fill_sparse_matrix(lines[i], occ_matrix, int)
                i += 1

            # Parse amount matrix
            if i < len(lines) and 'tx_weight_matrix' in lines[i]:
                self._fill_sparse_matrix(lines[i], amt_matrix, float)
                i += 1

            # Aggregate matrices by hour
            if hour_timestamp in hourly_matrices:
                prev_occ, prev_amt, count = hourly_matrices[hour_timestamp]
                hourly_matrices[hour_timestamp] = (
                    prev_occ + occ_matrix,
                    prev_amt + amt_matrix,
                    count + 1
                )
            else:
                hourly_matrices[hour_timestamp] = (occ_matrix, amt_matrix, 1)

        # Average the summed matrices over the number of records in the hour
        return {
            hour_ts: (occ_sum / count, amt_sum / count)
            for hour_ts, (occ_sum, amt_sum, count) in hourly_matrices.items()
        }

    def _daily_matrix(self, data, year, day):
        """Return ``(matrix, totaltx)`` for a year/day in *data*, or None."""
        if data is None:
            return None
        rows = data[(data['year'] == year) & (data['day'] == day)]
        if rows.empty:
            return None
        return self.construct_matrix_from_row(rows.iloc[0])

    def get_matrices_for_period(self, period, threshold: int = 0) -> Tuple[np.ndarray, np.ndarray]:
        """Return normalized ``(occ_matrix, amt_matrix)`` for one period.

        Args:
            period: In daily mode an object exposing ``year`` and
                ``dayofyear`` (e.g. a pandas Timestamp); in hourly mode a
                Unix timestamp, which is rounded down to the hour before
                the lookup.
            threshold: When > 0, cells whose occurrence count is below it
                are zeroed in both matrices before normalization.

        Returns:
            Two matrices. Daily matrices are divided by the row's
            ``totaltx``; hourly matrices by the sum of the (thresholded)
            occurrence matrix. All-zero matrices are returned when no data
            exists for the period.
        """
        size = self._MATRIX_SIZE

        if self.data_frequency == 'daily':
            occ_matrix = np.zeros((size, size))
            amt_matrix = np.zeros((size, size))
            total_tx = 1.0  # Default to avoid division by zero

            occ_hit = self._daily_matrix(self._daily_occ_data, period.year, period.dayofyear)
            if occ_hit is not None:
                occ_matrix, total_tx = occ_hit

            amt_hit = self._daily_matrix(self._daily_amt_data, period.year, period.dayofyear)
            if amt_hit is not None:
                amt_matrix, _ = amt_hit
        else:  # hourly
            # Cached matrices are keyed by hour start, so align the lookup key.
            hour_ts = self._floor_to_hour(period)
            stored = (self._hourly_matrices or {}).get(hour_ts)
            if stored is None:
                return np.zeros((size, size)), np.zeros((size, size))
            # Work on copies so threshold masking never alters the cache.
            occ_matrix = stored[0].copy()
            amt_matrix = stored[1].copy()
            total_tx = None  # computed after thresholding, below

        # Apply threshold filtering
        if threshold > 0:
            mask = occ_matrix < threshold
            occ_matrix[mask] = 0
            amt_matrix[mask] = 0

        if total_tx is None:
            total_tx = occ_matrix.sum()

        # Normalize by total transactions
        if total_tx > 0:
            occ_matrix = occ_matrix / total_tx
            amt_matrix = amt_matrix / total_tx

        return occ_matrix, amt_matrix

    def _lookup_price(self, period):
        """Return the price recorded for *period*, or None when there is none."""
        if self.data_frequency == 'daily':
            try:
                return self.price_data.loc[period]['price']
            except KeyError:
                return None
        hour_ts = self._floor_to_hour(period)
        matches = self.price_data[self.price_data['hour_timestamp'] == hour_ts]
        if matches.empty:
            return None
        return matches.iloc[0]['price']

    @staticmethod
    def _accumulate(acc, flat: np.ndarray, aggregate: bool) -> np.ndarray:
        """Add *flat* to a running sum (aggregate) or stack it as a new row."""
        if aggregate:
            return flat if acc is None else acc + flat
        row = flat.reshape(1, -1)
        return row if acc is None else np.vstack([acc, row])

    def create_feature_window(self,
                            timestamp_or_date,
                            window_size: int,
                            threshold: int = 0,
                            aggregation_allowed: bool = True,
                            include_price: bool = True,
                            include_occ: bool = True,
                            include_amt: bool = True) -> np.ndarray:
        """Build the feature array for the window ending at *timestamp_or_date*.

        The window covers ``window_size`` consecutive days (daily mode) or
        hours (hourly mode), the last one being the given date/timestamp.

        With ``aggregation_allowed`` the flattened matrices are summed over
        the window and, if requested and available, the mean price is
        appended as the final element. Otherwise one row per period is
        returned with that period's price as the last column.

        In daily mode a period without a price is skipped entirely when
        ``include_price`` is set. In hourly mode such a period is skipped
        only when rows are being stacked, so rows and prices stay aligned.

        Raises:
            ValueError: if both ``include_occ`` and ``include_amt`` are False.
        """
        if not include_occ and not include_amt:
            raise ValueError("Must include at least one of occurrence or amount matrices")

        # Get time periods for window
        if self.data_frequency == 'daily':
            end_date = pd.to_datetime(timestamp_or_date)
            periods = pd.date_range(end=end_date, periods=window_size, freq='D')
        else:
            end_ts = int(timestamp_or_date)
            step = self._SECONDS_PER_HOUR
            periods = np.arange(end_ts - (window_size - 1) * step, end_ts + step, step)

        is_daily = self.data_frequency == 'daily'
        prices = []
        occ_data = None
        amt_data = None

        for period in periods:
            if include_price:
                price = self._lookup_price(period)
                if price is None:
                    if is_daily or not aggregation_allowed:
                        continue
                else:
                    prices.append(price)

            occ_matrix, amt_matrix = self.get_matrices_for_period(period, threshold)

            if include_occ:
                occ_data = self._accumulate(occ_data, occ_matrix.flatten(), aggregation_allowed)
            if include_amt:
                amt_data = self._accumulate(amt_data, amt_matrix.flatten(), aggregation_allowed)

        # Combine features
        features = [block for block in (occ_data, amt_data) if block is not None]
        if not features:
            return np.array([])

        if aggregation_allowed:
            combined = np.concatenate(features)
            if include_price and prices:
                return np.append(combined, np.mean(prices))
            return combined

        combined = np.column_stack(features) if len(features) > 1 else features[0]
        if include_price and prices:
            return np.column_stack([combined, prices])
        return combined

In [12]:
# Smoke test: build an hourly preprocessor and compare the feature length
# with and without the price element.
hourly_preprocessor = ChainletPreprocessor(
    data_frequency='hourly',
    price_file_path='1h_interval_price_data.txt',
    occ_file_path='2024_output_matrices.txt',
)

# Window with every feature enabled (occurrence + amount + price).
hourly_features = hourly_preprocessor.create_feature_window(
    timestamp_or_date=1704068978,  # Example timestamp
    window_size=24,
    threshold=0,
    include_occ=True,
    include_amt=True,
    include_price=True,
)
print("All features shape:", hourly_features.shape)

# Same window, matrices only.
hourly_features_no_price = hourly_preprocessor.create_feature_window(
    timestamp_or_date=1704068978,
    window_size=24,
    threshold=0,
    include_occ=True,
    include_amt=True,
    include_price=False,
)
print("No price features shape:", hourly_features_no_price.shape)

All features shape: (801,)
No price features shape: (800,)


In [13]:
# Daily-mode preprocessor backed by both the occurrence and amount matrices.
daily_preprocessor = ChainletPreprocessor(
    price_file_path='price_data.csv',
    occ_file_path='dailyOccmatrices.txt',
    amt_file_path='dailyAmmatrices.txt',
    data_frequency='daily',
    start_year=2012,
    end_year=2016,
)

# 24-day window ending on 2012-02-01, matrices only (no price element).
daily_features = daily_preprocessor.create_feature_window(
    '2012-02-01',
    window_size=24,
    threshold=0,
    include_occ=True,
    include_amt=True,
    include_price=False,
)
daily_features.shape

(800,)

# Absolute Price Prediction

In [14]:
class BaseModel:
    """Common base for prediction models: holds hyper-parameters and a scaler.

    Subclasses implement ``build_model``, ``train`` and ``predict``.
    """

    # Fallback hyper-parameters; caller-supplied values override them per key.
    DEFAULT_PARAMS = {
        'learning_rate': 0.01,
        'batch_size': 2,
        'hidden_dim_1': 8,
        'hidden_dim_2': 8,
        'dropout': 0.2,
        'num_epochs': 100,
        'display_step': 10
    }

    def __init__(self, params=None):
        """Store hyper-parameters.

        Args:
            params: Optional dict of hyper-parameters. Keys it omits fall
                back to ``DEFAULT_PARAMS``; extra keys are kept. The dict is
                copied, so later changes by the caller do not leak in.
        """
        self.params = {**self.DEFAULT_PARAMS, **(params or {})}

        self.model = None
        self.scaler = MinMaxScaler()

    def build_model(self, input_shape):
        """Construct the underlying network; must be provided by subclasses."""
        raise NotImplementedError

    def train(self, train_X, train_y):
        """Fit the model on the training data; must be provided by subclasses."""
        raise NotImplementedError

    def predict(self, test_X):
        """Return predictions for ``test_X``; must be provided by subclasses."""
        raise NotImplementedError

In [15]:
class CoinWorksRNN(BaseModel):
    """Two-layer stateful LSTM regressor.

    Besides the keys read during training (``batch_size``, ``num_epochs``,
    ``hidden_dim_1``, ``hidden_dim_2``, ``dropout``), ``params`` must contain
    ``prediction_horizon``, which sets the width of the output layer.
    """

    def __init__(self, params=None):
        super().__init__(params)
        self.model = None
        self.stateful_model = None  # Batch-size-1 copy used for predictions

    def get_nn(self, window_size, batch_size):
        """Build and compile the network for a fixed batch size.

        Args:
            window_size: Number of features per timestep (last input axis).
            batch_size: Fixed batch size required by the stateful layers.
        """
        model = Sequential([
            LSTM(self.params['hidden_dim_1'],
                 batch_input_shape=(batch_size, 1, window_size),
                 stateful=True,
                 return_sequences=True),
            Dropout(self.params['dropout']),
            LSTM(self.params['hidden_dim_2'],
                 batch_input_shape=(batch_size, 1, window_size),
                 stateful=True),
            Dropout(self.params['dropout']),
            Dense(self.params['prediction_horizon'])
        ])
        model.compile(loss='mean_squared_error', optimizer='adam')
        return model

    def re_define_model(self, model, window_size, batch_size):
        """Return a copy of *model* rebuilt for a different batch size.

        The new network comes from ``get_nn`` (already compiled there) and
        receives the weights of *model*.
        """
        new_model = self.get_nn(window_size, batch_size)
        new_model.set_weights(model.get_weights())
        return new_model

    def train(self, train_X, train_y):
        """Train for ``num_epochs`` single-epoch passes, resetting state after each.

        Args:
            train_X: Array indexed as (samples, timesteps, features); the
                size of the last axis is used as the network input width.
            train_y: Targets, one row per sample.

        Raises:
            ValueError: if there are fewer samples than ``batch_size``.
        """
        window_size = train_X.shape[2]
        batch_size = self.params['batch_size']

        # A stateful network has a fixed batch size, so every batch must be
        # full. Drop the oldest leftover samples instead of failing in fit().
        usable = (len(train_X) // batch_size) * batch_size
        if usable == 0:
            raise ValueError(
                f"Need at least batch_size={batch_size} training samples, "
                f"got {len(train_X)}"
            )
        train_X = train_X[len(train_X) - usable:]
        train_y = train_y[len(train_y) - usable:]

        # Initialize training model
        self.model = self.get_nn(window_size, batch_size)

        # Training loop with state resets
        for _ in range(self.params['num_epochs']):
            # shuffle=False keeps samples in order so the state carried from
            # one batch to the next stays meaningful.
            self.model.fit(train_X, train_y,
                           epochs=1,
                           batch_size=batch_size,
                           shuffle=False,
                           verbose=2)
            self.model.reset_states()

        # Create batch-size-1 model for predictions
        self.stateful_model = self.re_define_model(self.model, window_size, 1)

    def predict(self, test_X):
        """Predict each sample independently, resetting state between samples.

        Returns:
            Array of shape (len(test_X), prediction_horizon).

        Raises:
            RuntimeError: if called before ``train``.
        """
        if self.stateful_model is None:
            raise RuntimeError("train() must be called before predict()")

        predicted_list = []
        for i in range(len(test_X)):
            sample = test_X[i:i + 1]  # Keep batch dimension
            predicted_list.append(self.stateful_model.predict(sample, batch_size=1))
            self.stateful_model.reset_states()

        return np.array(predicted_list).reshape(-1, self.params['prediction_horizon'])

In [16]:
class PricePredictionPipeline:
    """Build daily samples from a preprocessor, train a model and score it.

    ``model_class`` is instantiated with the parameter dict and must offer
    ``train(X, y)`` and ``predict(X)``. ``preprocessor`` must offer
    ``create_feature_window(...)`` and a ``price_data`` frame indexed by date
    with a ``price`` column.
    """

    # Fallback hyper-parameters; caller-supplied values override them per key.
    DEFAULT_PARAMS = {
        'learning_rate': 0.01,
        'batch_size': 2,
        'hidden_dim_1': 8,
        'hidden_dim_2': 8,
        'dropout': 0.2,
        'num_epochs': 100,
        'display_step': 10,
        'prediction_horizon': 5,
        'window_size': 60
    }

    def __init__(self, model_class, preprocessor, params=None):
        """Create the model and store the preprocessor.

        Args:
            model_class: Callable returning a model when given the params dict.
            preprocessor: Source of feature windows and prices.
            params: Optional hyper-parameters; missing keys fall back to
                ``DEFAULT_PARAMS``.
        """
        self.params = {**self.DEFAULT_PARAMS, **(params or {})}

        self.model = model_class(self.params)
        self.preprocessor = preprocessor
        self.scaler = MinMaxScaler()

    def _target_prices(self, current_date, prediction_horizon):
        """Return prices for the next ``prediction_horizon`` days, or None.

        None is returned as soon as one of the days has no price entry.
        """
        price_data = self.preprocessor.price_data
        targets = []
        for offset in range(1, prediction_horizon + 1):
            target_date = current_date + pd.Timedelta(days=offset)
            try:
                targets.append(price_data.loc[target_date]['price'])
            except KeyError:
                return None
        return targets

    def prepare_data(self, start_date, end_date):
        """
        Build one sample per day in ``[start_date, end_date - horizon]``.

        For each day the feature window ending on that day is requested with
        the price included, and its last element is dropped from the
        features. A day is kept only if the window is non-empty and prices
        exist for all of the following ``prediction_horizon`` days.

        Returns:
            X shaped (n_samples, 1, n_features), y shaped
            (n_samples, prediction_horizon) and the array of sample dates.

        Raises:
            ValueError: if no valid sample exists in the date range.
        """
        window_size = self.params['window_size']
        prediction_horizon = self.params['prediction_horizon']

        features_list = []
        target_list = []
        dates_list = []

        current_date = pd.to_datetime(start_date)
        end_date = pd.to_datetime(end_date)
        while current_date + pd.Timedelta(days=prediction_horizon) <= end_date:
            feature_window = self.preprocessor.create_feature_window(
                current_date,
                window_size=window_size,
                include_price=True
            )

            if feature_window.size > 0:
                target_prices = self._target_prices(current_date, prediction_horizon)
                if target_prices is not None:
                    features_list.append(feature_window[:-1])  # Exclude current price from features
                    target_list.append(target_prices)
                    dates_list.append(current_date)

            current_date += pd.Timedelta(days=1)

        if not features_list:
            raise ValueError("No valid samples found in date range")

        X = np.array(features_list)
        y = np.array(target_list)
        dates = np.array(dates_list)

        # Reshape X for LSTM: (samples, timesteps, features)
        X = np.reshape(X, (X.shape[0], 1, X.shape[1]))

        return X, y, dates

    def train_and_evaluate(self, start_date, end_date):
        """Train on the first 80% of samples and evaluate on the rest.

        Features are min-max scaled with statistics from the training part
        only; targets are left unscaled.

        Returns:
            ``(results_df, metrics)``: a frame with actual and predicted
            prices per horizon for the test dates, and a list of
            ``{'horizon', 'rmse', 'mae'}`` dicts, one per horizon step.

        Raises:
            ValueError: if no samples exist, or too few to form both a
                training and a test part.
        """
        start_date = pd.to_datetime(start_date)
        end_date = pd.to_datetime(end_date)

        X, y, dates = self.prepare_data(start_date, end_date)

        # Split into train/test
        split_idx = int(len(X) * 0.8)
        if split_idx == 0 or split_idx == len(X):
            raise ValueError(
                f"Not enough samples ({len(X)}) for an 80/20 train/test split"
            )
        train_X, test_X = X[:split_idx], X[split_idx:]
        train_y, test_y = y[:split_idx], y[split_idx:]
        test_dates = dates[split_idx:]

        # Scale features (the scaler works on 2-D input, so flatten and restore)
        train_X_reshaped = train_X.reshape(-1, train_X.shape[-1])
        test_X_reshaped = test_X.reshape(-1, test_X.shape[-1])

        X_scaler = self.scaler.fit(train_X_reshaped)
        train_X_scaled = X_scaler.transform(train_X_reshaped).reshape(train_X.shape)
        test_X_scaled = X_scaler.transform(test_X_reshaped).reshape(test_X.shape)

        self.model.train(train_X_scaled, train_y)
        predictions = self.model.predict(test_X_scaled)

        # Calculate metrics for each horizon
        horizon_count = self.params['prediction_horizon']
        metrics = [
            {
                'horizon': i + 1,
                'rmse': np.sqrt(mean_squared_error(test_y[:, i], predictions[:, i])),
                'mae': mean_absolute_error(test_y[:, i], predictions[:, i])
            }
            for i in range(horizon_count)
        ]

        # First horizon uses the unsuffixed column names
        results_df = pd.DataFrame({
            'date': test_dates,
            'actual_price': test_y[:, 0],
            'predicted_price': predictions[:, 0]
        })

        # Add predictions for other horizons
        for i in range(1, horizon_count):
            results_df[f'actual_price_h{i+1}'] = test_y[:, i]
            results_df[f'predicted_price_h{i+1}'] = predictions[:, i]

        return results_df, metrics

In [None]:
# Hyper-parameters for the daily-data run
params = dict(
    learning_rate=0.01,
    batch_size=2,
    hidden_dim_1=128,
    hidden_dim_2=64,
    dropout=0.2,
    num_epochs=100,
    display_step=10,
    prediction_horizon=5,
    window_size=60,
)

# Daily preprocessor using both matrix files
daily_preprocessor = ChainletPreprocessor(
    data_frequency='daily',
    start_year=2016,
    end_year=2016,
    price_file_path='price_data.csv',
    occ_file_path='dailyOccmatrices.txt',
    amt_file_path='dailyAmmatrices.txt',
)

# Variant of prepare_data that reports an empty result explicitly.
# NOTE(review): nothing in the code shown here binds this function to
# PricePredictionPipeline, so the class's own method is the one that runs
# unless it is assigned elsewhere.
def prepare_data(self, start_date, end_date):
    """Build aligned (X, y, dates) samples for every day in the range.

    For each day from ``start_date`` up to ``end_date`` minus the prediction
    horizon, the feature window ending on that day is requested with the
    price included and its last element is dropped from the features. A day
    is kept only when the window is non-empty and a price exists for each
    of the following ``prediction_horizon`` days, so X, y and dates always
    have the same number of rows.

    Args:
        self: Object providing ``params`` (with ``window_size`` and
            ``prediction_horizon``) and ``preprocessor``.
        start_date: First sample date (anything ``pd.to_datetime`` accepts).
        end_date: Last date that targets may reach.

    Returns:
        X shaped (n_samples, 1, n_features), y shaped
        (n_samples, prediction_horizon) and the array of sample dates.

    Raises:
        ValueError: if no valid sample exists in the date range.
    """
    window_size = self.params['window_size']
    prediction_horizon = self.params['prediction_horizon']

    features_list = []
    target_list = []
    dates_list = []

    current_date = pd.to_datetime(start_date)
    end_date = pd.to_datetime(end_date)

    while current_date + pd.Timedelta(days=prediction_horizon) <= end_date:
        feature_window = self.preprocessor.create_feature_window(
            current_date,
            window_size=window_size,
            include_price=True
        )

        if feature_window.size > 0:
            # Get target prices; stop at the first missing day
            target_prices = []
            for i in range(prediction_horizon):
                target_date = current_date + pd.Timedelta(days=i + 1)
                try:
                    price = self.preprocessor.price_data.loc[target_date]['price']
                except KeyError:
                    break
                target_prices.append(price)

            # Append features, targets and date together so rows stay aligned
            if len(target_prices) == prediction_horizon:
                features_list.append(feature_window[:-1])
                target_list.append(target_prices)
                dates_list.append(current_date)

        current_date += pd.Timedelta(days=1)

    if not features_list:  # Check if we got any valid data
        raise ValueError("No valid features found in date range")

    X = np.array(features_list)
    y = np.array(target_list)
    dates = np.array(dates_list)

    # Reshape X for LSTM: (samples, timesteps, features)
    X = X.reshape(X.shape[0], 1, -1)

    return X, y, dates

# Wire the RNN model to the daily preprocessor and evaluate on 2016.
pipeline = PricePredictionPipeline(CoinWorksRNN, daily_preprocessor, params)

results_df, metrics = pipeline.train_and_evaluate('2016-01-01', '2016-12-31')