In [14]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, precision_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import set_random_seed


%store -r export_df
df = export_df
print(df)


x = df.drop(columns=['activity'])
y = df['activity']

label_encoder = LabelEncoder() # Encode target
y_encoded = label_encoder.fit_transform(y)

scaler = StandardScaler() # Normalize features
x_scaled = scaler.fit_transform(x)

x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.1, random_state=42, shuffle=False
)
# split_index = int(len(df) * 0.9)
# x_train, x_test = x.iloc[:split_index], x.iloc[split_index:]
# y_train, y_test = y.iloc[:split_index], y.iloc[split_index:]
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)
y_train_encoded = label_encoder.transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# One seed for every stochastic component, so a Restart & Run All reproduces
# the reported metrics for all three models, not only the tree ensembles.
SEED = 42

# Seed the Python, NumPy and TensorFlow global generators *before* the network
# is constructed, so that weight initialisation, dropout and batch shuffling
# are repeatable. Without this call only the two tree models were seeded.
# NOTE(review): some GPU kernels can still be non-deterministic.
set_random_seed(SEED)

# Number of distinct target classes known to the fitted label encoder.
n_classes = len(label_encoder.classes_)

# Random forest: 500 trees.
forest_model = RandomForestClassifier(n_estimators=500, random_state=SEED)

# Gradient-boosted trees emitting one probability per class.
xgb_model = xgb.XGBClassifier(
    objective="multi:softprob",
    num_class=n_classes,
    n_estimators=500,
    max_depth=8,
    learning_rate=0.15,
    gamma=1,
    reg_alpha=0.1,
    reg_lambda=1,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric='mlogloss',
    random_state=SEED
)

# Feed-forward network: three ReLU hidden layers with dropout after the first
# two, and a softmax output with one unit per class.
neural_model = Sequential([
    Input(shape=(x_train_scaled.shape[1],)),  # explicitly define input
    Dense(128, activation='relu'),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(32, activation='relu'),
    Dense(n_classes, activation='softmax')  # multi-class
])

# Sparse categorical cross-entropy matches the integer-encoded labels.
neural_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
neural_model.summary()

# Both tree ensembles train on the unscaled features. The forest receives the
# un-encoded labels, XGBoost the integer-encoded ones.
forest_model.fit(x_train, y_train)
xgb_model.fit(x_train, y_train_encoded)

# The network trains on the standardized features and integer labels, with
# 10% of the training rows held out for per-epoch validation metrics.
neural_model.fit(
    x=x_train_scaled,
    y=y_train_encoded,
    batch_size=32,
    epochs=50,
    verbose=1,
    validation_split=0.1,
)


# Random forest was trained on the un-encoded labels, so its predictions need
# no decoding.
predictions_rf = forest_model.predict(x_test)

# XGBoost: pick the most probable class per row, then map the integer codes
# back to the original labels.
proba = xgb_model.predict_proba(x_test)
predictions_xgb = proba.argmax(axis=1)
predictions_xgb_decoded = label_encoder.inverse_transform(predictions_xgb)

# Neural network: same argmax-then-decode procedure, on the scaled test rows.
y_pred_probs = neural_model.predict(x_test_scaled)
predictions_neural = y_pred_probs.argmax(axis=1)
predictions_neural_decoded = label_encoder.inverse_transform(predictions_neural)

# Text classification reports (per-class precision / recall / F1), one per
# model, all scored against the un-encoded test labels.
acc_rf, acc_xgb, acc_neural = (
    classification_report(y_test, model_predictions)
    for model_predictions in (
        predictions_rf,
        predictions_xgb_decoded,
        predictions_neural_decoded,
    )
)

#
# print(acc_rf)
# print(acc_xgb)
# print(acc_neural)
# print(df['activity'].value_counts())


                          open       high        low      close     volume  \
timestamp                                                                    
2025-08-18 17:15:00  116243.96  116688.00  116243.95  116516.11  274.31192   
2025-08-18 17:30:00  116516.11  116800.00  116516.10  116532.67  235.18710   
2025-08-18 17:45:00  116532.68  116625.10  116428.98  116428.99  102.21270   
2025-08-18 18:00:00  116428.98  116437.44  116300.78  116360.97   74.19460   
2025-08-18 18:15:00  116360.97  116458.35  116238.71  116360.98   65.74689   
...                        ...        ...        ...        ...        ...   
2025-10-04 19:45:00  121887.87  121959.25  121881.00  121959.25   40.40788   
2025-10-04 20:00:00  121959.25  122079.65  121891.49  121939.96  103.29350   
2025-10-04 20:15:00  121939.97  122057.28  121939.96  122057.28   32.70135   
2025-10-04 20:30:00  122057.28  122057.28  121986.29  122029.48   28.84323   
2025-10-04 20:45:00  122029.48  122061.83  121892.89  121892.89 

Epoch 1/50
[1m115/115[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.5202 - loss: 0.6985 - val_accuracy: 0.5343 - val_loss: 0.6975
Epoch 2/50
[1m115/115[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.5461 - loss: 0.6868 - val_accuracy: 0.5245 - val_loss: 0.6951
Epoch 3/50
[1m115/115[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.5480 - loss: 0.6860 - val_accuracy: 0.5441 - val_loss: 0.6914
Epoch 4/50
[1m115/115[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.5636 - loss: 0.6816 - val_accuracy: 0.4975 - val_loss: 0.6993
Epoch 5/50
[1m115/115[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.5663 - loss: 0.6781 - val_accuracy: 0.4926 - val_loss: 0.7008
Epoch 6/50
[1m115/115[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.5570 - loss: 0.6789 - val_accuracy: 0.5074 - val_loss: 0.6964
Epoch 7/50
[1m115/115[0m 

In [15]:
# Build one table from a copy of the test features, the true label and each
# model's decoded prediction (columns are appended in the order listed).
results_full = x_test.assign(
    Actual=y_test.values,
    RandomForest_Pred=predictions_rf,
    XGBoost_Pred=predictions_xgb_decoded,
    NeuralNet_Pred=predictions_neural_decoded,
)

# Write the table to disk. index=False means the frame's index is not
# included in the file.
results_full.to_csv("model_predictions_full.csv", index=False)

print("✅ CSV file saved as 'model_predictions_full.csv'")


✅ CSV file saved as 'model_predictions_full.csv'


In [16]:
from collections import Counter

def majority_vote(row):
    """Return the label predicted by most of the three models for one row.

    Parameters
    ----------
    row : mapping
        Anything indexable by the keys 'RandomForest_Pred', 'XGBoost_Pred'
        and 'NeuralNet_Pred' (for example a DataFrame row).

    Returns
    -------
    The most frequent of the three predictions. If all three differ, the
    random-forest prediction is returned, because equal counts keep the
    order in which the predictions were tallied.
    """
    model_columns = ('RandomForest_Pred', 'XGBoost_Pred', 'NeuralNet_Pred')
    tally = Counter(row[column] for column in model_columns)
    winner, _ = tally.most_common(1)[0]
    return winner

from sklearn.metrics import accuracy_score

# Ensemble prediction: per-row majority vote over the three models.
results_full['Voted_Pred'] = results_full.apply(majority_vote, axis=1)

# Accuracy of each individual model and of the vote against the true labels.
acc_rf, acc_xgb, acc_nn, acc_vote = (
    accuracy_score(results_full['Actual'], results_full[prediction_column])
    for prediction_column in (
        'RandomForest_Pred',
        'XGBoost_Pred',
        'NeuralNet_Pred',
        'Voted_Pred',
    )
)

print(f"RandomForest Accuracy: {acc_rf:.4f}")
print(f"XGBoost Accuracy:      {acc_xgb:.4f}")
print(f"NeuralNet Accuracy:    {acc_nn:.4f}")
print(f"Voting Accuracy:       {acc_vote:.4f}")

# Agreement among models: True where all three predictions coincide
# (RF equals XGB and XGB equals NN).
results_full['Agreement'] = (
    results_full['RandomForest_Pred'].eq(results_full['XGBoost_Pred'])
    & results_full['XGBoost_Pred'].eq(results_full['NeuralNet_Pred'])
)

agreement_rate = results_full['Agreement'].mean()
print(f"Models fully agree on {agreement_rate*100:.2f}% of cases")

# Per-row correctness flag for every model and for the vote.
results_full['RF_Correct'] = results_full['RandomForest_Pred'].eq(results_full['Actual'])
results_full['XGB_Correct'] = results_full['XGBoost_Pred'].eq(results_full['Actual'])
results_full['NN_Correct'] = results_full['NeuralNet_Pred'].eq(results_full['Actual'])
results_full['Vote_Correct'] = results_full['Voted_Pred'].eq(results_full['Actual'])


# Save the enriched table; index=False means the frame's index is not written.
results_full.to_csv("model_voting_analysis.csv", index=False)
print("✅ Saved as 'model_voting_analysis.csv'")

# Short-named view of the three prediction columns.
pred_matrix = results_full[
    ['RandomForest_Pred', 'XGBoost_Pred', 'NeuralNet_Pred']
].rename(columns={
    'RandomForest_Pred': 'RF',
    'XGBoost_Pred': 'XGB',
    'NeuralNet_Pred': 'NN',
})

# Row-normalised cross-tabulation: for each RF prediction, the share of rows
# on which XGB predicted each label. Only the RF/XGB pair is shown.
print("\nPrediction Correlation Matrix:")
print(pd.crosstab(pred_matrix['RF'], pred_matrix['XGB'], normalize='index'))



RandomForest Accuracy: 0.5210
XGBoost Accuracy:      0.5453
NeuralNet Accuracy:    0.5188
Voting Accuracy:       0.5210
Models fully agree on 74.83% of cases
✅ Saved as 'model_voting_analysis.csv'

Prediction Correlation Matrix:
XGB        Buy      Sell
RF                      
Buy   0.679245  0.320755
Sell  0.060519  0.939481
