## This script is meant for trying new ML methods on the same data, adjusting features, and experimentation. 

Deep learning architectures (LSTM, Transformer-based models) and the Prophet forecasting library were also evaluated.
However, their performance (MAE ~ 40,000) was significantly worse than that of the classical regression approaches.
This is primarily due to the limited data available at daily frequency and the weak temporal dependencies in Bitcoin's daily closing prices.
Simpler models like Linear and Ridge Regression performed best, suggesting that the signal is mostly short-memory and linear in nature.

In [4]:
# Imports
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
import xgboost as xgb
#from statsmodels.tsa.arima.model import ARIMA
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore", category=RuntimeWarning)

def calculate_rsi(data, window=14):
    """Compute a simple-moving-average RSI from the 'close' column.

    Args:
        data: DataFrame that provides a 'close' column.
        window: Number of rows in each rolling average (default 14).

    Returns:
        A Series of RSI values; the first ``window - 1`` entries are NaN
        because the rolling window is not yet full.
    """
    price_change = data['close'].diff()

    # Split the changes into up-moves and (sign-flipped) down-moves. The
    # leading NaN diff fails both comparisons, so it is counted as a zero move.
    up_moves = price_change.where(price_change > 0, 0)
    down_moves = -price_change.where(price_change < 0, 0)

    avg_gain = up_moves.rolling(window=window).mean()
    avg_loss = down_moves.rolling(window=window).mean()

    # A zero average loss is swapped for a tiny value to avoid dividing by zero.
    relative_strength = avg_gain / avg_loss.replace(0, 1e-6)
    return 100 - (100 / (1 + relative_strength))

# --- Feature Engineering ---
def feature_engineering(df):
    """Return a copy of ``df`` extended with price-derived feature columns.

    The input frame is not modified. It must provide 'close', 'high', 'low'
    and 'volume' columns plus a 'merge_date' column that supports the ``.dt``
    accessor.

    Raises:
        ValueError: If ``df`` has fewer than 15 rows.
    """
    print("--- Performing Feature Engineering ---")
    if len(df) < 15:
        raise ValueError(f"Dataframe size too small ({len(df)} rows).")

    out = df.copy()
    close = out['close']

    # Core technicals: RSI, lagged closes, rolling mean / std of the close.
    out['rsi_14'] = calculate_rsi(out, window=14)
    for lag_col, periods in (('lag_1', 1), ('lag_5', 5), ('lag_10', 10)):
        out[lag_col] = close.shift(periods)
    rolling_specs = (
        ('rolling_mean_5', 'rolling_std_5', 5),
        ('rolling_mean_10', 'rolling_std_10', 10),
    )
    for mean_col, std_col, span in rolling_specs:
        windowed = close.rolling(window=span)
        out[mean_col] = windowed.mean()
        out[std_col] = windowed.std()

    # Volatility and momentum measured on daily log returns.
    daily_log_return = np.log(close / close.shift(1))
    out['volatility_7'] = daily_log_return.rolling(window=7).std()
    out['momentum_5'] = daily_log_return.rolling(window=5).mean()

    # Interaction and squared terms.
    out['high_low_spread'] = out['high'] - out['low']
    out['momentum_x_volume'] = out['momentum_5'] * out['volume']
    out['rsi_sq'] = out['rsi_14'] ** 2
    out['volatility_7_sq'] = out['volatility_7'] ** 2

    # Change / deviation terms; 1e-6 keeps the ratio denominator non-zero.
    out['close_delta_5'] = close - close.shift(5)
    out['log_return_abs'] = daily_log_return.abs()
    out['high_low_vol_ratio'] = out['high_low_spread'] / (out['rolling_std_10'] + 1e-6)

    # Exponentially weighted averages and their fast / slow ratio.
    for ewma_col, span in (('ewma_5', 5), ('ewma_10', 10)):
        out[ewma_col] = close.ewm(span=span, adjust=False).mean()
    out['ewma_ratio'] = out['ewma_5'] / (out['ewma_10'] + 1e-6)

    # Calendar effect: pandas day-of-week number of 'merge_date'.
    out['day_of_week'] = out['merge_date'].dt.dayofweek

    return out

# --- Load data and build the prediction target ---
df = pd.read_csv(r'C:\Users\baile\Documents\Artificial Intelligence\BitcoinPred\standalone_training\bitcoin_sentiment_12012022_11082025.csv')
# The target is the next row's close, so the final row has no target and is
# removed by the dropna below.
df['target'] = df['close'].shift(-1)
# Unparseable dates become NaT instead of raising.
df['merge_date'] = pd.to_datetime(df['merge_date'], errors='coerce')
df = feature_engineering(df)

# --- Define and Clean ---
# pandas labels header-less CSV columns "Unnamed: N" (capital U), so the match
# has to be case-insensitive for those index-artifact columns to be excluded.
non_feature_cols = {'merge_date', 'datetime_utc', 'timestamp', 'target'}
features = [
    col for col in df.columns
    if col not in non_feature_cols and 'unnamed' not in str(col).lower()
]
required_cols = features + ['target']
data_clean = df.dropna(subset=required_cols).reset_index(drop=True)
if data_clean.empty:
    raise ValueError("Empty after cleaning. Check for NaN-heavy feature creation.")

X = data_clean[features]
y = data_clean['target']  # next-row close created above

# --- Split and Scale ---
# shuffle=False keeps row order, so the last 20% of rows form the test set
# (assumes the CSV rows are in chronological order — TODO confirm).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

selected_features = [
    'open', 'high', 'low', 'close', 'volume', 'weighted_sentiment',
    'lag_1', 'lag_5', 'lag_10', 'rolling_mean_5', 'rolling_mean_10',
    'rolling_std_10', 'volatility_7', 'volatility_7_sq',
    'momentum_5', 'momentum_x_volume', 'rsi_sq', 'close_delta_5',
    'ewma_ratio', 'high_low_spread', 'high_low_vol_ratio', 'day_of_week'
]

# Only the hand-picked subset is passed to the models.
X_train = X_train[selected_features]
X_test = X_test[selected_features]

# Fit the scaler on the training split only, then apply it to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
########################

print("--- Training and Evaluating Models ---")

# 4. Train ML Models
# Every model is fitted on the same scaled training matrix.
mlr_model = LinearRegression().fit(X_train_scaled, y_train)
ridge_model = Ridge(alpha=0.5, random_state=42).fit(X_train_scaled, y_train)
rf_model = RandomForestRegressor(n_estimators=600, random_state=42).fit(X_train_scaled, y_train)
xgb_model = xgb.XGBRegressor(colsample_bytree=1.0, learning_rate=0.03, max_depth=7, n_estimators=1200, subsample=0.9,
                             objective='reg:squarederror', random_state=42).fit(X_train_scaled, y_train)

# Train Auto-ARIMA on the training data for a fair comparison
#arima_model = ARIMA(y_train, order=(1,1,3))
#arima_model_fit = arima_model.fit()

# Evaluate Models
mlr_preds = mlr_model.predict(X_test_scaled)
ridge_preds = ridge_model.predict(X_test_scaled)
rf_preds = rf_model.predict(X_test_scaled)
xgb_preds = xgb_model.predict(X_test_scaled)
#arima_preds = arima_model_fit.forecast(steps=len(y_test))

# --- Residual Correction using XGBoost ---
# A second XGBoost model is fitted to the Linear Regression training residuals;
# its test-set output is added to the Linear Regression predictions.
print("\n--- Residual Correction with XGBoost ---")
linear_residuals = y_train - mlr_model.predict(X_train_scaled)
residual_model = xgb.XGBRegressor(
    learning_rate=0.03, max_depth=7, n_estimators=800, subsample=0.9,
    colsample_bytree=0.9, objective='reg:squarederror', random_state=42
)
residual_model.fit(X_train_scaled, linear_residuals)
residual_corrections = residual_model.predict(X_test_scaled)
# Named for the model it corrects: Linear Regression, not Ridge.
lr_corrected_preds = mlr_preds + residual_corrections


# --- Evaluation ---
# Mean absolute error of each model on the held-out test split.
results = {
    'Linear Regression': mean_absolute_error(y_test, mlr_preds),
    'Ridge Regression (0.5)': mean_absolute_error(y_test, ridge_preds),
    'Random Forest': mean_absolute_error(y_test, rf_preds),
    'XGBoost': mean_absolute_error(y_test, xgb_preds),
    'LR + Residual XGBoost': mean_absolute_error(y_test, lr_corrected_preds)
}

results_df = pd.DataFrame(results.items(), columns=['Model', 'MAE']).sort_values(by='MAE')
print("\n--- Model Performance on Test Set ---")
print(results_df)

--- Performing Feature Engineering ---
--- Training and Evaluating Models ---

--- Residual Correction with XGBoost ---

--- Model Performance on Test Set ---
                    Model          MAE
1  Ridge Regression (0.5)  1513.408994
0       Linear Regression  1531.126675
4   LR + Residual XGBoost  2818.299613
2           Random Forest  8628.666786
3                 XGBoost  9582.581327


### This block of code is for calling the training function in the main_scripts folder.

In [1]:
import sys
import pandas as pd
import numpy as np
sys.path.append(r'C:\Users\baile\Documents\Artificial Intelligence\BitcoinPred')
from main_scripts.train import train_and_evaluate
# --- Feature Engineering ---

def calculate_rsi(data, window=14):
    """Compute a simple-moving-average RSI from the 'close' column.

    Args:
        data: DataFrame that provides a 'close' column.
        window: Number of rows in each rolling average (default 14).

    Returns:
        A Series of RSI values; the first ``window - 1`` entries are NaN
        because the rolling window is not yet full.
    """
    price_change = data['close'].diff()

    # Split the changes into up-moves and (sign-flipped) down-moves. The
    # leading NaN diff fails both comparisons, so it is counted as a zero move.
    up_moves = price_change.where(price_change > 0, 0)
    down_moves = -price_change.where(price_change < 0, 0)

    avg_gain = up_moves.rolling(window=window).mean()
    avg_loss = down_moves.rolling(window=window).mean()

    # A zero average loss is swapped for a tiny value to avoid dividing by zero.
    relative_strength = avg_gain / avg_loss.replace(0, 1e-6)
    return 100 - (100 / (1 + relative_strength))
def feature_engineering(df):
    """Return a copy of ``df`` extended with price-derived feature columns.

    The input frame is not modified. It must provide 'close', 'high', 'low'
    and 'volume' columns plus a 'merge_date' column that supports the ``.dt``
    accessor.

    Raises:
        ValueError: If ``df`` has fewer than 15 rows.
    """
    print("--- Performing Feature Engineering ---")
    if len(df) < 15:
        raise ValueError(f"Dataframe size too small ({len(df)} rows).")

    out = df.copy()
    close = out['close']

    # Core technicals: RSI, lagged closes, rolling mean / std of the close.
    out['rsi_14'] = calculate_rsi(out, window=14)
    for lag_col, periods in (('lag_1', 1), ('lag_5', 5), ('lag_10', 10)):
        out[lag_col] = close.shift(periods)
    rolling_specs = (
        ('rolling_mean_5', 'rolling_std_5', 5),
        ('rolling_mean_10', 'rolling_std_10', 10),
    )
    for mean_col, std_col, span in rolling_specs:
        windowed = close.rolling(window=span)
        out[mean_col] = windowed.mean()
        out[std_col] = windowed.std()

    # Volatility and momentum measured on daily log returns.
    daily_log_return = np.log(close / close.shift(1))
    out['volatility_7'] = daily_log_return.rolling(window=7).std()
    out['momentum_5'] = daily_log_return.rolling(window=5).mean()

    # Interaction and squared terms.
    out['high_low_spread'] = out['high'] - out['low']
    out['momentum_x_volume'] = out['momentum_5'] * out['volume']
    out['rsi_sq'] = out['rsi_14'] ** 2
    out['volatility_7_sq'] = out['volatility_7'] ** 2

    # Change / deviation terms; 1e-6 keeps the ratio denominator non-zero.
    out['close_delta_5'] = close - close.shift(5)
    out['log_return_abs'] = daily_log_return.abs()
    out['high_low_vol_ratio'] = out['high_low_spread'] / (out['rolling_std_10'] + 1e-6)

    # Exponentially weighted averages and their fast / slow ratio.
    for ewma_col, span in (('ewma_5', 5), ('ewma_10', 10)):
        out[ewma_col] = close.ewm(span=span, adjust=False).mean()
    out['ewma_ratio'] = out['ewma_5'] / (out['ewma_10'] + 1e-6)

    # Calendar effect: pandas day-of-week number of 'merge_date'.
    out['day_of_week'] = out['merge_date'].dt.dayofweek

    return out

# Feature subset handed to the shared training routine.
selected_features = [
    # columns not created by feature_engineering (read from the CSV)
    'open', 'high', 'low', 'close', 'volume', 'weighted_sentiment',
    # lagged closes and rolling statistics
    'lag_1', 'lag_5', 'lag_10', 'rolling_mean_5', 'rolling_mean_10',
    'rolling_std_10', 'volatility_7', 'volatility_7_sq',
    # momentum, RSI and change terms
    'momentum_5', 'momentum_x_volume', 'rsi_sq', 'close_delta_5',
    # ratios, spread and calendar effect
    'ewma_ratio', 'high_low_spread', 'high_low_vol_ratio', 'day_of_week'
]

# Load the CSV, add the next-row close as the target and parse the date
# column (unparseable dates become NaT), then derive the features.
df = pd.read_csv(
    r'C:\Users\baile\Documents\Artificial Intelligence\BitcoinPred\standalone_training\bitcoin_sentiment_12012022_11082025.csv'
).assign(
    target=lambda frame: frame['close'].shift(-1),
    merge_date=lambda frame: pd.to_datetime(frame['merge_date'], errors='coerce'),
)
df = feature_engineering(df)
train_and_evaluate(df, selected_features=selected_features)

2025-11-12 17:02:00,386 - INFO - -------------------------------------
2025-11-12 17:02:00,386 - INFO - 
Selected Features used for Training: ['open', 'high', 'low', 'close', 'volume', 'weighted_sentiment', 'lag_1', 'lag_5', 'lag_10', 'rolling_mean_5', 'rolling_mean_10', 'rolling_std_10', 'volatility_7', 'volatility_7_sq', 'momentum_5', 'momentum_x_volume', 'rsi_sq', 'close_delta_5', 'ewma_ratio', 'high_low_spread', 'high_low_vol_ratio', 'day_of_week']
2025-11-12 17:02:00,403 - INFO - Model: Linear Regression, Hyperparameters: {}
2025-11-12 17:02:00,403 - INFO - Model: Ridge Regression (0.5), Hyperparameters: {'alpha': 0.5}
2025-11-12 17:02:00,403 - INFO - Model: Random Forest, Hyperparameters: {'n_estimators': 600, 'random_state': 42}
2025-11-12 17:02:00,403 - INFO - Model: XGBoost, Hyperparameters: {'colsample_bytree': 1.0, 'learning_rate': 0.03, 'max_depth': 7, 'n_estimators': 1200, 'subsample': 0.9}


--- Performing Feature Engineering ---
--- Preparing Data for True Forecasting ---
--- Training and Evaluating Models ---


2025-11-12 17:02:30,403 - INFO - |                        |     MAE |
|:-----------------------|--------:|
| Ridge Regression (0.5) | 1513.41 |
| Linear Regression      | 1531.13 |
| Random Forest          | 8628.67 |
| XGBoost                | 9582.58 |



--- Model Performance on Test Set ---
                                MAE
Ridge Regression (0.5)  1513.408994
Linear Regression       1531.126675
Random Forest           8628.666786
XGBoost                 9582.581327
