In [8]:
from sklearn.metrics import classification_report, precision_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
import xgboost as xgb
import pandas as pd
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input


%store -r export_df
df = export_df

%store -r export_df_long
df_long = export_df_long
df_long.replace([np.inf, -np.inf], np.nan, inplace=True)
df_long.dropna(inplace=True)
print(df)
print(df_long)


x = df.drop(columns=['activity'])
y = df['activity']

x_long = df_long.drop(columns=['activity'])
y_long = df_long['activity']

label_encoder = LabelEncoder() # Encode target
y_encoded = label_encoder.fit_transform(y)
y_long_encoded = label_encoder.fit_transform(y_long)

scaler = StandardScaler() # Normalize features
x_scaled = scaler.fit_transform(x)
x_long_scaled = scaler.fit_transform(x_long)

x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.1, random_state=42, shuffle=False
)

x_long_train, x_long_test, y_long_train, y_long_test = train_test_split(
    x_long, y_long, test_size=0.1, random_state=42, shuffle=False
)
# split_index = int(len(df) * 0.9)
# x_train, x_test = x.iloc[:split_index], x.iloc[split_index:]
# y_train, y_test = y.iloc[:split_index], y.iloc[split_index:]
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)
y_train_encoded = label_encoder.transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

x_long_train_scaled = scaler.fit_transform(x_long_train)
x_long_test_scaled = scaler.transform(x_long_test)
y_long_train_encoded = label_encoder.transform(y_long_train)
y_long_test_encoded = label_encoder.transform(y_long_test)

forest_model = RandomForestClassifier(n_estimators=500, random_state=42)
xgb_model = xgb.XGBClassifier(
    objective="multi:softprob",
    num_class=len(label_encoder.classes_),
    n_estimators=500,
    max_depth=8,
    learning_rate=0.15,
    gamma=1,
    reg_alpha=0.1,
    reg_lambda=1,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric='mlogloss',
    random_state=42
)
neural_model = Sequential([
    Input(shape=(x_long_train_scaled.shape[1],)),  # explicitly define input
    Dense(128, activation='relu'),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(32, activation='relu'),
    Dense(len(label_encoder.classes_), activation='softmax')  # multi-class
])

neural_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
neural_model.summary()

forest_model.fit(x_train, y_train)
xgb_model.fit(x_train, y_train_encoded)
neural_model.fit(
    x_long_train_scaled, y_long_train_encoded,
    epochs=50,
    batch_size=32,
    validation_split=0.1,
    verbose=1
)


predictions_rf = forest_model.predict(x_test)

proba = xgb_model.predict_proba(x_test)
predictions_xgb = np.argmax(proba, axis=1)
predictions_xgb_decoded = label_encoder.inverse_transform(predictions_xgb)

y_pred_probs = neural_model.predict(x_long_test_scaled)
predictions_neural = np.argmax(y_pred_probs, axis=1)
predictions_neural_decoded = label_encoder.inverse_transform(predictions_neural)

acc_rf = classification_report(y_test, predictions_rf)
acc_xgb = classification_report(y_test, predictions_xgb_decoded)
acc_neural = classification_report(y_long_test, predictions_neural_decoded)

#
# print(acc_rf)
# print(acc_xgb)
# print(acc_neural)
# print(df['activity'].value_counts())


                          open       high        low      close     volume  \
timestamp                                                                    
2025-09-08 02:30:00  110863.13  111372.00  110837.25  111188.61  146.90749   
2025-09-08 02:45:00  111188.61  111240.00  111103.03  111240.00   73.28767   
2025-09-08 03:00:00  111239.99  111349.24  111153.91  111237.98  153.39384   
2025-09-08 03:15:00  111237.98  111251.38  111029.43  111142.99  100.03092   
2025-09-08 03:30:00  111143.00  111346.09  111129.57  111160.01  118.69981   
...                        ...        ...        ...        ...        ...   
2025-10-08 07:45:00  121481.30  121597.06  121331.62  121597.06  142.78118   
2025-10-08 08:00:00  121597.05  121812.78  121538.48  121563.03  244.15824   
2025-10-08 08:15:00  121563.03  122000.00  121524.01  121916.41  337.31281   
2025-10-08 08:30:00  121916.42  122821.25  121916.42  122821.24  673.24438   
2025-10-08 08:45:00  122821.24  122850.00  122342.37  122381.85 

Epoch 1/50
[1m7210/7210[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m113s[0m 16ms/step - accuracy: 0.6887 - loss: 0.5897 - val_accuracy: 0.7250 - val_loss: 0.5648
Epoch 2/50
[1m7210/7210[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 2ms/step - accuracy: 0.6949 - loss: 0.5809 - val_accuracy: 0.7190 - val_loss: 0.5537
Epoch 3/50
[1m7210/7210[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 2ms/step - accuracy: 0.6951 - loss: 0.5792 - val_accuracy: 0.7297 - val_loss: 0.5464
Epoch 4/50
[1m7210/7210[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 2ms/step - accuracy: 0.6965 - loss: 0.5778 - val_accuracy: 0.7256 - val_loss: 0.5476
Epoch 5/50
[1m7210/7210[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 2ms/step - accuracy: 0.6962 - loss: 0.5770 - val_accuracy: 0.7294 - val_loss: 0.5558
Epoch 6/50
[1m7210/7210[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 2ms/step - accuracy: 0.6971 - loss: 0.5766 - val_accuracy: 0.7273 - val_loss: 0.5533
Epoch 7/

In [10]:
# Combine features, actual, and predictions into one table
results_full = x_test.copy()
results_full['Actual'] = y_test.values
results_full['RandomForest_Pred'] = predictions_rf
results_full['XGBoost_Pred'] = predictions_xgb_decoded

results_long_full = x_long_test.copy()
results_long_full['Actual'] = y_long_test.values
results_long_full['NeuralNet_Pred'] = predictions_neural_decoded

# Save the full table to CSV
results_full.to_csv("model_predictions_full.csv", index=False)

print("✅ CSV file saved as 'model_predictions_full.csv'")


✅ CSV file saved as 'model_predictions_full.csv'


In [13]:
from collections import Counter

def majority_vote(row):
    preds = [row['RandomForest_Pred'], row['XGBoost_Pred'], row['NeuralNet_Pred']]
    most_common = Counter(preds).most_common(1)[0][0]
    return most_common

# results_full['Voted_Pred'] = results_full.apply(majority_vote, axis=1)

from sklearn.metrics import accuracy_score

acc_rf = accuracy_score(results_full['Actual'], results_full['RandomForest_Pred'])
acc_xgb = accuracy_score(results_full['Actual'], results_full['XGBoost_Pred'])
acc_nn = accuracy_score(results_long_full['Actual'], results_long_full['NeuralNet_Pred'])
# acc_vote = accuracy_score(results_full['Actual'], results_full['Voted_Pred'])

print(f"RandomForest Accuracy: {acc_rf:.4f}")
print(f"XGBoost Accuracy:      {acc_xgb:.4f}")
print(f"NeuralNet Accuracy:    {acc_nn:.4f}")
# print(f"Voting Accuracy:       {acc_vote:.4f}")

# Agreement among models
results_full['Agreement'] = (
    (results_full['RandomForest_Pred'] == results_full['XGBoost_Pred']) &
    (results_full['XGBoost_Pred'] == results_full['NeuralNet_Pred'])
)

agreement_rate = results_full['Agreement'].mean()
print(f"Models fully agree on {agreement_rate*100:.2f}% of cases")

# Compare per-model correctness
results_full['RF_Correct'] = results_full['RandomForest_Pred'] == results_full['Actual']
results_full['XGB_Correct'] = results_full['XGBoost_Pred'] == results_full['Actual']
results_full['NN_Correct'] = results_full['NeuralNet_Pred'] == results_full['Actual']
results_full['Vote_Correct'] = results_full['Voted_Pred'] == results_full['Actual']


# results_full.to_csv("model_voting_analysis.csv", index=False)
# print("✅ Saved as 'model_voting_analysis.csv'")

pred_matrix = pd.DataFrame({
    'RF': results_full['RandomForest_Pred'],
    'XGB': results_full['XGBoost_Pred'],
    'NN': results_long_full['NeuralNet_Pred']
})
print("\nPrediction Correlation Matrix:")
print(pd.crosstab(pred_matrix['RF'], pred_matrix['XGB'], normalize='index'))



RandomForest Accuracy: 0.6289
XGBoost Accuracy:      0.6495
NeuralNet Accuracy:    0.7064


KeyError: 'NeuralNet_Pred'