In [15]:
# ----------------------------
# Imports
# ----------------------------
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.metrics import accuracy_score
import joblib
from tensorflow.keras.models import load_model
from sklearn.model_selection import train_test_split

# ----------------------------
# Load saved models and preprocessing objects
# ----------------------------
# Preprocessing objects saved alongside the models.
# NOTE: joblib.load unpickles its input, so only point it at trusted files.
label_encoder = joblib.load('label_encoder.pkl')
scaler_tabular = joblib.load('scaler_tabular.pkl')
scaler_long = joblib.load('scaler_long.pkl')

# Classifiers persisted with joblib.
forest_model = joblib.load('forest_model.pkl')
xgb_model = joblib.load('xgb_model.pkl')

# Keras network, read from a native ``.keras`` archive.
neural_model = load_model('neural_model.keras')

# ----------------------------
# Load your data
# ----------------------------
%store -r export_df
df = export_df

%store -r export_df_long
df_long = export_df_long

# ----------------------------
# Prepare test data
# ----------------------------
# Separate the features from the 'activity' target for both feature sets.
x = df.drop(columns=['activity'])
y = df['activity']

x_long = df_long.drop(columns=['activity'])
y_long = df_long['activity']

# Order-preserving hold-out: with shuffle=False the final 10% of the rows
# become the test split (random_state is not used when shuffling is off).
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.1, random_state=42, shuffle=False
)

# Evaluate every model on (at most) the last EVAL_ROWS rows of the hold-out.
EVAL_ROWS = 100
x_test_subset = x_test.tail(EVAL_ROWS)
y_test_subset = y_test.tail(EVAL_ROWS)

# Size the long-format subset from the tabular one so both always contain the
# same number of rows, even when the hold-out split has fewer than EVAL_ROWS
# rows. Otherwise the predictions could not be combined row by row later.
# NOTE(review): assumes df and df_long are row-aligned - TODO confirm.
n_eval_rows = len(x_test_subset)
x_long_test_subset = x_long.tail(n_eval_rows)
y_long_test_subset = y_long.tail(n_eval_rows)

# Scale features with the scalers saved at training time.
x_test_scaled_subset = scaler_tabular.transform(x_test_subset)
x_long_test_scaled_subset = scaler_long.transform(x_long_test_subset)

# Encode labels numerically with the saved label encoder.
y_test_encoded_subset = label_encoder.transform(y_test_subset)
y_long_test_encoded_subset = label_encoder.transform(y_long_test_subset)

# ----------------------------
# Make predictions
# ----------------------------
# Random Forest: class predictions straight from the estimator.
predictions_rf = forest_model.predict(x_test_scaled_subset)

# XGBoost: take the most probable class index per row.
proba_xgb = xgb_model.predict_proba(x_test_scaled_subset)
predictions_xgb = np.argmax(proba_xgb, axis=1)

# Neural network: same argmax over its per-class outputs (long-format features).
y_pred_probs_nn = neural_model.predict(x_long_test_scaled_subset)
predictions_neural = np.argmax(y_pred_probs_nn, axis=1)

# ----------------------------
# Decode predictions back to the original labels
# ----------------------------
# The Random Forest may have been trained on encoded integers or on the raw
# string labels. Decode only numeric predictions; anything else is assumed to
# already be in label form, where inverse_transform would fail.
if np.issubdtype(np.asarray(predictions_rf).dtype, np.number):
    predictions_rf_decoded = label_encoder.inverse_transform(predictions_rf)
else:
    predictions_rf_decoded = predictions_rf

# XGBoost and the neural network yield integer class indices from argmax.
# NOTE(review): assumes index i corresponds to label_encoder.classes_[i] - TODO confirm.
predictions_xgb_decoded = label_encoder.inverse_transform(predictions_xgb)
predictions_neural_decoded = label_encoder.inverse_transform(predictions_neural)

# All three *_decoded arrays now hold labels that can be compared with y_test_subset.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step


  saveable.load_own_variables(weights_store.get(inner_path))


In [16]:
# ----------------------------
# Combine predictions into one DataFrame
# ----------------------------
# New frame: the evaluation features plus the true label and one decoded
# prediction column per model (assign returns a copy, x_test_subset is untouched).
results_full = x_test_subset.assign(
    Actual=y_test_subset.values,
    RandomForest_Pred=predictions_rf_decoded,
    XGBoost_Pred=predictions_xgb_decoded,
    NeuralNet_Pred=predictions_neural_decoded,
)

In [17]:

# ----------------------------
# Apply majority voting
# ----------------------------
def majority_vote(row):
    """Return the label that most of the three models predicted for ``row``.

    ``row`` is any mapping-like object exposing the 'RandomForest_Pred',
    'XGBoost_Pred' and 'NeuralNet_Pred' keys. When no label has a strict
    majority, the earliest model in that order wins the tie.
    """
    ballots = [
        row[model_col]
        for model_col in ('RandomForest_Pred', 'XGBoost_Pred', 'NeuralNet_Pred')
    ]
    # max() keeps the first of several equally frequent labels.
    return max(ballots, key=ballots.count)

# One ensemble label per row, decided by majority_vote.
results_full['Voted_Pred'] = results_full.apply(majority_vote, axis=1)

# ----------------------------
# Accuracy of each model and of the vote
# ----------------------------
actual_labels = results_full['Actual']
acc_rf, acc_xgb, acc_nn, acc_vote = (
    accuracy_score(actual_labels, results_full[pred_col])
    for pred_col in ('RandomForest_Pred', 'XGBoost_Pred', 'NeuralNet_Pred', 'Voted_Pred')
)

print(f"RandomForest Accuracy: {acc_rf:.4f}")
print(f"XGBoost Accuracy:      {acc_xgb:.4f}")
print(f"NeuralNet Accuracy:    {acc_nn:.4f}")
print(f"Voting Accuracy:       {acc_vote:.4f}")

# ----------------------------
# Rows on which all three models predict the same label
# ----------------------------
rf_matches_xgb = results_full['RandomForest_Pred'] == results_full['XGBoost_Pred']
xgb_matches_nn = results_full['XGBoost_Pred'] == results_full['NeuralNet_Pred']
results_full['Agreement'] = rf_matches_xgb & xgb_matches_nn
agreement_rate = results_full['Agreement'].mean()
print(f"Models fully agree on {agreement_rate*100:.2f}% of cases")

# ----------------------------
# Boolean "was this prediction right?" flag per model and for the vote
# ----------------------------
for flag_col, pred_col in (
    ('RF_Correct', 'RandomForest_Pred'),
    ('XGB_Correct', 'XGBoost_Pred'),
    ('NN_Correct', 'NeuralNet_Pred'),
    ('Vote_Correct', 'Voted_Pred'),
):
    results_full[flag_col] = results_full[pred_col] == results_full['Actual']

# ----------------------------
# Persist the full table (the row index is not written)
# ----------------------------
results_full.to_csv("model_voting_analysis.csv", index=False)
print("✅ Saved results with voting as 'model_voting_analysis.csv'")

# ----------------------------
# RF vs XGB cross-tabulation
# ----------------------------
# normalize='index' makes each row sum to 1: for every Random Forest label,
# the share of rows on which XGBoost predicted each label.
pred_matrix = results_full[['RandomForest_Pred', 'XGBoost_Pred', 'NeuralNet_Pred']]
rf_vs_xgb = pd.crosstab(
    pred_matrix['RandomForest_Pred'],
    pred_matrix['XGBoost_Pred'],
    normalize='index',
)
print("\nPrediction Correlation Matrix (RF vs XGB):")
print(rf_vs_xgb)

RandomForest Accuracy: 0.6800
XGBoost Accuracy:      0.7000
NeuralNet Accuracy:    0.6800
Voting Accuracy:       0.6900
Models fully agree on 73.00% of cases
✅ Saved results with voting as 'model_voting_analysis.csv'

Prediction Correlation Matrix (RF vs XGB):
XGBoost_Pred            Buy      Sell
RandomForest_Pred                    
Buy                0.884615  0.115385
Sell               0.208333  0.791667
