In [1]:
import pandas as pd
import numpy as np
import seaborn as sns


from arch import arch_model
from sklearn.metrics import mean_absolute_error


df = pd.read_csv("/Users/kartik/github/cryptocurrency_market_analysis/merged_crypto_2023_2024.csv", parse_dates=["time"])
df.head()

Unnamed: 0,time,coin,open,high,low,close,volumefrom,volumeto
0,2023-01-01,sol,9.952,10.08,9.7,9.99,3717429.17,36647100.0
1,2023-01-02,sol,9.99,11.97,9.791,11.28,6930407.48,76282640.0
2,2023-01-03,sol,11.28,13.8,11.01,13.39,10274568.45,129021600.0
3,2023-01-04,sol,13.39,14.22,12.82,13.44,10543054.16,142824300.0
4,2023-01-05,sol,13.44,13.86,12.95,13.43,4946914.33,65885390.0


In [2]:
df['target'] = (df.groupby('coin')['close'].shift(-1) > df['close']).astype(int)
df['coin_code'] = df['coin'].astype('category').cat.codes

# One-hot encoding
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder(sparse_output=False)
coin_encoded = encoder.fit_transform(df[['coin']])
coin_df = pd.DataFrame(coin_encoded, columns=encoder.get_feature_names_out(['coin']))

# Add to main dataframe
df = pd.concat([df, coin_df], axis=1)


In [3]:
df.columns

Index(['time', 'coin', 'open', 'high', 'low', 'close', 'volumefrom',
       'volumeto', 'target', 'coin_code', 'coin_ada', 'coin_bnb', 'coin_btc',
       'coin_doge', 'coin_eth', 'coin_sol', 'coin_trx', 'coin_usdc',
       'coin_usdt', 'coin_xrp'],
      dtype='object')

In [4]:
from ta.momentum import RSIIndicator
from ta.trend import MACD
from ta.volatility import BollingerBands
from ta.volume import VolumeWeightedAveragePrice

#Calculate RSI properly for each coin
def calculate_rsi(group):
    rsi = RSIIndicator(close=group['close'], window=14)
    return rsi.rsi()

df['rsi_14'] = df.groupby('coin', group_keys=False).apply(calculate_rsi)

In [5]:
def calculate_technical_indicators(group):
    # MACD
    macd = MACD(close=group['close'])
    group['macd_line'] = macd.macd()
    group['macd_signal'] = macd.macd_signal()
    
    # Bollinger Bands
    bb = BollingerBands(close=group['close'])
    group['bb_upper'] = bb.bollinger_hband()
    group['bb_middle'] = bb.bollinger_mavg()
    group['bb_lower'] = bb.bollinger_lband()
    
    # Volume Weighted Average Price
    vwap = VolumeWeightedAveragePrice(
        high=group['high'],
        low=group['low'],
        close=group['close'],
        volume=group['volumeto'],
        window=14
    )
    group['vwap'] = vwap.volume_weighted_average_price()
    
    return group

df = df.groupby('coin', group_keys=False).apply(calculate_technical_indicators)


In [6]:
def create_lagged_features(group):
    # Daily returns
    group['daily_return'] = group['close'].pct_change()
    
    # Lagged returns
    for lag in [1, 3, 7]:
        group[f'return_lag_{lag}'] = group['daily_return'].shift(lag)
    
    # Volume features
    group['volume_ma_7'] = group['volumeto'].rolling(7).mean()
    group['volume_zscore'] = (group['volumeto'] - group['volume_ma_7']) / group['volumeto'].std()
    
    return group

df = df.groupby('coin', group_keys=False).apply(create_lagged_features)


In [7]:
df['time'] = pd.to_datetime(df['time'])
df.set_index('time', inplace=True)
df['daily_return'] = df.groupby('coin')['close'].pct_change()
df.head()

Unnamed: 0_level_0,coin,open,high,low,close,volumefrom,volumeto,target,coin_code,coin_ada,...,bb_upper,bb_middle,bb_lower,vwap,daily_return,return_lag_1,return_lag_3,return_lag_7,volume_ma_7,volume_zscore
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2023-01-01,sol,9.952,10.08,9.7,9.99,3717429.17,36647100.0,1,5,0.0,...,,,,,,,,,,
2023-01-02,sol,9.99,11.97,9.791,11.28,6930407.48,76282640.0,1,5,0.0,...,,,,,0.129129,,,,,
2023-01-03,sol,11.28,13.8,11.01,13.39,10274568.45,129021600.0,1,5,0.0,...,,,,,0.187057,0.129129,,,,
2023-01-04,sol,13.39,14.22,12.82,13.44,10543054.16,142824300.0,0,5,0.0,...,,,,,0.003734,0.187057,,,,
2023-01-05,sol,13.44,13.86,12.95,13.43,4946914.33,65885390.0,1,5,0.0,...,,,,,-0.000744,0.003734,0.129129,,,


In [8]:
df['coin_encoded'] = df['coin'].astype('category').cat.codes

# For linear models, use OneHotEncoder:
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder()
coin_encoded = encoder.fit_transform(df[['coin']])

In [9]:
#Model-specific feature preparation
## Common features
base_features = ['rsi_14', 'daily_return', 'volumeto', 'vwap']

## Logistic Regression Features
lr_features = base_features + ['macd_line', 'macd_signal']

## Decision Tree Features
dt_features = base_features + ['bb_upper', 'bb_lower', 'volume_zscore']

## Random Forest Features
rf_features = base_features + [f'return_lag_{lag}' for lag in [1, 3]] + ['macd_line']

## XGBoost Features
xgb_features = base_features + [f'return_lag_{lag}' for lag in [1, 3, 7]] + ['bb_upper', 'bb_lower']

## LSTM Features
lstm_features = ['close', 'volumeto', 'rsi_14', 'macd_line']

In [10]:
# Handle missing values (from indicator calculations)
df = df.dropna()

In [11]:
print("Sample RSI values:")
print(df[['coin', 'close', 'rsi_14']].head(10))

print("\nMissing values after processing:")
print(df.isnull().sum())

Sample RSI values:
           coin  close     rsi_14
time                             
2023-02-03  sol  24.68  61.611431
2023-02-04  sol  24.36  60.050932
2023-02-05  sol  23.49  55.905137
2023-02-06  sol  22.69  52.327675
2023-02-07  sol  23.84  56.624614
2023-02-08  sol  23.22  53.808672
2023-02-09  sol  20.47  43.480210
2023-02-10  sol  20.13  42.396667
2023-02-11  sol  20.82  45.371928
2023-02-12  sol  21.51  48.250445

Missing values after processing:
coin             0
open             0
high             0
low              0
close            0
volumefrom       0
volumeto         0
target           0
coin_code        0
coin_ada         0
coin_bnb         0
coin_btc         0
coin_doge        0
coin_eth         0
coin_sol         0
coin_trx         0
coin_usdc        0
coin_usdt        0
coin_xrp         0
rsi_14           0
macd_line        0
macd_signal      0
bb_upper         0
bb_middle        0
bb_lower         0
vwap             0
daily_return     0
return_lag_1     0
return_

In [12]:
split_date = '2024-06-01'
train_mask = df.index < split_date
test_mask = df.index >= split_date

X_train = df[train_mask]
X_test = df[test_mask]
y_train = df[train_mask]['target']
y_test = df[test_mask]['target']

In [15]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
import os
import joblib

In [18]:
## 1 Logistic Regression

preprocessor = ColumnTransformer([
    ('scaler', StandardScaler(), ['rsi_14', 'volumeto', 'macd_line']),
    ('passthrough', 'passthrough', coin_df.columns)  # Keep one-hot encoded coins as-is
])

lr_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('model', LogisticRegression(penalty='l2', C=0.1, solver='saga', max_iter=1000))
])
lr_pipe.fit(X_train, y_train)
joblib.dump(lr_pipe, 'models/lr_model.pkl')

['models/lr_model.pkl']

In [20]:
#Decision Tree
dt_model = DecisionTreeClassifier(
    max_depth=7,
    min_samples_leaf=10,
    ccp_alpha=0.01
)
dt_model.fit(X_train[base_features + ['coin_code']], y_train)  # Using categorical code
joblib.dump(dt_model, 'models/dt_model.pkl')

['models/dt_model.pkl']

In [21]:
#Random forest model
rf_model = RandomForestClassifier(
    n_estimators=300,
    max_features='sqrt',
    min_samples_leaf=5
)
rf_model.fit(X_train[base_features + ['coin_code']], y_train)
joblib.dump(rf_model, 'models/rf_model.pkl')

['models/rf_model.pkl']

In [25]:
#xgboost model
xgb_model = xgb.XGBClassifier(
    objective='binary:logistic',
    enable_categorical=True,  # Handles categorical codes
    tree_method='hist'
)
xgb_model.fit(
    X_train[base_features + ['coin_code']],
    y_train,
    eval_set=[(X_test[base_features + ['coin_code']], y_test)]
)
joblib.dump(xgb_model, 'models/xgb_model.pkl')

[0]	validation_0-logloss:0.66537
[1]	validation_0-logloss:0.65200
[2]	validation_0-logloss:0.64774
[3]	validation_0-logloss:0.64482
[4]	validation_0-logloss:0.64589
[5]	validation_0-logloss:0.64453
[6]	validation_0-logloss:0.64632
[7]	validation_0-logloss:0.64921
[8]	validation_0-logloss:0.64996
[9]	validation_0-logloss:0.65090
[10]	validation_0-logloss:0.65280
[11]	validation_0-logloss:0.65418
[12]	validation_0-logloss:0.65516
[13]	validation_0-logloss:0.65522
[14]	validation_0-logloss:0.65828
[15]	validation_0-logloss:0.65856
[16]	validation_0-logloss:0.66108
[17]	validation_0-logloss:0.66123
[18]	validation_0-logloss:0.66254
[19]	validation_0-logloss:0.66323
[20]	validation_0-logloss:0.66498
[21]	validation_0-logloss:0.66551
[22]	validation_0-logloss:0.66737
[23]	validation_0-logloss:0.66819
[24]	validation_0-logloss:0.67212
[25]	validation_0-logloss:0.67213
[26]	validation_0-logloss:0.67516
[27]	validation_0-logloss:0.67698
[28]	validation_0-logloss:0.67671
[29]	validation_0-loglos

['models/xgb_model.pkl']

In [None]:
num_coins = df['coin_code'].nunique()
from tensorflow.keras.layers import Embedding

lstm_model = Sequential([
    Embedding(input_dim=num_coins, output_dim=3, input_length=1),
    LSTM(64),
    Dense(1, activation='sigmoid')
])



In [34]:
lstm_model.save('models/lstm_model.h5')




In [35]:
import time
import pandas as pd
import matplotlib.pyplot as plt
import joblib
from sklearn.metrics import (accuracy_score, precision_score, 
                           recall_score, f1_score, confusion_matrix,
                           roc_auc_score, RocCurveDisplay)
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator

def evaluate_model(model, X_test, y_test, model_type='standard'):
    """
    Evaluate a pre-trained model on test data
    
    Args:
        model: Pre-trained model object
        X_test: Test features
        y_test: True labels
        model_type: 'standard' (sklearn) or 'lstm'
        
    Returns:
        Dictionary of metrics
        Array of predictions
    """
    start_time = time.time()
    
    if model_type == 'lstm':
        # LSTM-specific prediction
        seq_length = model.input_shape[1]
        test_gen = TimeseriesGenerator(
            X_test, y_test,
            length=seq_length,
            batch_size=len(X_test)
        )
        y_proba = model.predict(test_gen[0][0]).flatten()
        y_pred = (y_proba > 0.5).astype(int)
    else:
        # Standard sklearn-style prediction
        y_pred = model.predict(X_test)
        y_proba = model.predict_proba(X_test)[:,1] if hasattr(model, 'predict_proba') else None
    
    pred_time = time.time() - start_time
    
    # Calculate metrics
    metrics = {
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred),
        'recall': recall_score(y_test, y_pred),
        'f1': f1_score(y_test, y_pred),
        'inference_time': pred_time
    }
    
    if y_proba is not None:
        metrics['auc'] = roc_auc_score(y_test, y_proba)
    
    return metrics, y_pred

# Load pre-trained models
models = {
    'Decision Tree': joblib.load('models/dt_model.pkl'),
    'Random Forest': joblib.load('models/rf_model.pkl'),
    'XGBoost': joblib.load('models/xgb_model.pkl'),
    'LSTM': load_model('models/lstm_model.h5')
}

# Initialize results storage
results = {}
predictions = {}

# Evaluate each model
for name, model in models.items():
    model_type = 'lstm' if name == 'LSTM' else 'standard'
    metrics, y_pred = evaluate_model(model, X_test, y_test, model_type)
    results[name] = metrics
    predictions[name] = y_pred

# Create comparison DataFrame
comparison_df = pd.DataFrame(results).T
print("Model Comparison:")
print(comparison_df)

# Visualization
plt.figure(figsize=(15, 10))

# Metric Comparison
plt.subplot(2, 2, 1)
metric_cols = ['accuracy', 'precision', 'recall', 'f1', 'auc']
comparison_df[metric_cols].plot(kind='bar', ax=plt.gca())
plt.title('Model Performance Metrics')
plt.ylabel('Score')
plt.xticks(rotation=45)

# Runtime Comparison
plt.subplot(2, 2, 2)
comparison_df[['inference_time']].plot(kind='bar', ax=plt.gca())
plt.title('Inference Time (seconds)')
plt.ylabel('Time (s)')
plt.xticks(rotation=45)

# Confusion Matrices
for i, (name, y_pred) in enumerate(predictions.items(), 3):
    plt.subplot(2, 2, i)
    cm = confusion_matrix(y_test, y_pred)
    plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
    plt.title(f'{name} Confusion Matrix')
    plt.colorbar()
    plt.xlabel('Predicted')
    plt.ylabel('True')
    
plt.tight_layout()
plt.show()



ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- bb_lower
- bb_middle
- bb_upper
- close
- coin
- ...
