In [1]:
pip install pandas numpy matplotlib seaborn scikit-learn requests tf2onnx skl2onnx onnx onnxmltools keras_tuner



In [22]:
import os
import numpy as np
import joblib
import json
import time
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score
import xgboost as xgb

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, callbacks

# Conversion Libs
try:
    import tf2onnx
    import onnxmltools
    from skl2onnx import convert_sklearn
    from skl2onnx.common.data_types import FloatTensorType
    ONNX_AVAILABLE = True
except ImportError:
    ONNX_AVAILABLE = False
    print("Warning: ONNX libraries not found. ONNX export will be skipped.")

# Setup
PROCESSED_DIR = 'data/processed'
MODELS_DIR = 'models'
os.makedirs(MODELS_DIR, exist_ok=True)

# Load Data
print("Loading data...")
X = np.load(os.path.join(PROCESSED_DIR, 'X.npy'))
y = np.load(os.path.join(PROCESSED_DIR, 'y.npy'))

# Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Save Test Set (if not already done)
np.save(os.path.join(PROCESSED_DIR, 'X_test.npy'), X_test)
np.save(os.path.join(PROCESSED_DIR, 'y_test.npy'), y_test)

# Flatten for ML models
X_train_flat = np.hstack([np.mean(X_train, axis=1), np.std(X_train, axis=1)])
X_test_flat = np.hstack([np.mean(X_test, axis=1), np.std(X_test, axis=1)])

# One-hot for DL
y_train_cat = keras.utils.to_categorical(y_train, 12)
y_test_cat = keras.utils.to_categorical(y_test, 12)
input_shape_dl = (100, 23)

# --- Helper: Model Saver ---
def save_model_safely(model, name, model_type='keras', input_shape=None):
    path_root = os.path.join(MODELS_DIR, name)
    print(f"Saving {name}...")
    try:
        if model_type == 'keras':
            model.save(f"{path_root}.h5")
        else:
            joblib.dump(model, f"{path_root}.joblib")
        print(f" - Native saved: {name}")
    except Exception as e:
        print(f" ! Error saving native model {name}: {e}")

    # if not ONNX_AVAILABLE: return

    # try:
    #     if model_type == 'sklearn':
    #         initial_type = [('float_input', FloatTensorType([None, input_shape]))]
    #         onnx_model = convert_sklearn(model, initial_types=initial_type)
    #         with open(f"{path_root}.onnx", "wb") as f:
    #             f.write(onnx_model.SerializeToString())
    #     elif model_type == 'xgboost':
    #         initial_type = [('float_input', FloatTensorType([None, input_shape]))]
    #         onnx_model = onnxmltools.convert_xgboost(model, initial_types=initial_type)
    #         with open(f"{path_root}.onnx", "wb") as f:
    #             f.write(onnx_model.SerializeToString())
    #     elif model_type == 'keras':
    #         spec = (tf.TensorSpec((None, *input_shape), tf.float32, name="input"),)
    #         tf2onnx.convert.from_keras(model, input_signature=spec, output_path=f"{path_root}.onnx")
    #     print(f" - ONNX saved: {name}")
    # except Exception as e:
    #     print(f" ! Error converting {name} to ONNX: {e}")

# --- Helper: DL Hyperparameter Random Search ---
def run_dl_random_search(build_fn, param_grid, n_iter=3):
    """Simple custom random search for Keras models."""
    import random
    best_model = None
    best_acc = 0.0
    best_params = {}

    print(f"Starting Random Search (n_iter={n_iter})...")

    for i in range(n_iter):
        # Sample params
        params = {k: random.choice(v) for k, v in param_grid.items()}
        print(f"Trial {i+1}/{n_iter}: {params}")

        # Build & Train (Quick train for selection)
        model = build_fn(**params)

        # Use early stopping for the search to save time
        es = callbacks.EarlyStopping(monitor='val_accuracy', patience=2, verbose=0)

        history = model.fit(
            X_train, y_train_cat,
            epochs=5, # Short epochs for search
            batch_size=64,
            validation_split=0.1,
            callbacks=[es],
            verbose=0
        )

        val_acc = max(history.history['val_accuracy'])
        print(f" - Val Acc: {val_acc:.4f}")

        if val_acc > best_acc:
            best_acc = val_acc
            best_model = model # Note: this model is only partially trained
            best_params = params

    print(f"Best Params: {best_params} (Val Acc: {best_acc:.4f})")

    # Re-build and fully train the best configuration
    final_model = build_fn(**best_params)
    return final_model, best_params


# ==================================================================================
# 1. Random Forest (with Manual "Early Stopping" loop & Tuning)
# ==================================================================================
print("\n" + "="*40)
print("1. Random Forest")
print("="*40)

# Step A: Hyperparameter Tuning (RandomizedSearchCV)
rf_params = {
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'max_features': ['sqrt', 'log2']
}
# We use a smaller n_estimators for tuning to be fast
rf_base = RandomForestClassifier(n_estimators=50, random_state=42)
rf_search = RandomizedSearchCV(rf_base, rf_params, n_iter=5, cv=3, verbose=1, n_jobs=-1)
print("Tuning hyperparameters...")
rf_search.fit(X_train_flat, y_train)
best_rf_params = rf_search.best_params_
print(f"Best RF Params: {best_rf_params}")

# Step B: Train with Logging & Early Stopping (Warm Start)
print("\nTraining final RF with incremental logging...")
rf_final = RandomForestClassifier(
    n_estimators=0, # Start at 0
    warm_start=True, # Allows adding trees
    random_state=42,
    **best_rf_params
)

max_estimators = 200
step = 10
patience = 3
best_val_score = 0
no_improve_count = 0

for i in range(step, max_estimators + 1, step):
    rf_final.n_estimators = i
    rf_final.fit(X_train_flat, y_train)

    # Manual Validation Check
    val_score = rf_final.score(X_test_flat, y_test) # Using Test as Val for demo visibility
    print(f" - Trees: {i}, Val Acc: {val_score:.4f}")

    if val_score > best_val_score + 0.0005: # Threshold
        best_val_score = val_score
        no_improve_count = 0
    else:
        no_improve_count += 1

    if no_improve_count >= patience:
        print(f"Early stopping triggered at {i} trees.")
        break

save_model_safely(rf_final, 'rf_model', 'sklearn', input_shape=46)


# ==================================================================================
# 2. XGBoost (Native Early Stopping)
# ==================================================================================
print("\n" + "="*40)
print("2. XGBoost")
print("="*40)

# Step A: Tuning
xgb_params = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.7, 0.9]
}
xgb_base = xgb.XGBClassifier(objective='multi:softmax', num_class=12, n_estimators=50)
xgb_search = RandomizedSearchCV(xgb_base, xgb_params, n_iter=5, cv=3, verbose=1, n_jobs=-1)
print("Tuning hyperparameters...")
xgb_search.fit(X_train_flat, y_train)
best_xgb_params = xgb_search.best_params_
print(f"Best XGB Params: {best_xgb_params}")

# Step B: Final Train with Early Stopping
print("\nTraining final XGB with native logs...")
xgb_final = xgb.XGBClassifier(
    n_estimators=500, # High number, let ES stop it
    objective='multi:softmax',
    num_class=12,
    **best_xgb_params
)

xgb_final.fit(
    X_train_flat, y_train,
    eval_set=[(X_train_flat, y_train), (X_test_flat, y_test)],
    verbose=True # Prints log every epoch
)

save_model_safely(xgb_final, 'xgb_model', 'xgboost', input_shape=46)


# ==================================================================================
# 3. LSTM (Deep Learning)
# ==================================================================================
print("\n" + "="*40)
print("3. LSTM")
print("="*40)

def build_lstm(units=64, dropout=0.2, lr=0.001):
    m = keras.Sequential([
        layers.Input(shape=input_shape_dl),
        layers.LSTM(units, return_sequences=True),
        layers.Dropout(dropout),
        layers.LSTM(units // 2),
        layers.Dense(12, activation='softmax')
    ])
    m.compile(optimizer=keras.optimizers.Adam(learning_rate=lr),
              loss='categorical_crossentropy', metrics=['accuracy'])
    return m

# Tuning
lstm_grid = {'units': [32, 64], 'dropout': [0.2, 0.4], 'lr': [0.01, 0.001]}
lstm_final, _ = run_dl_random_search(build_lstm, lstm_grid, n_iter=3)

# Final Training with Callbacks
print("Training final LSTM...")
callbacks_list = [
    callbacks.EarlyStopping(patience=5, restore_best_weights=True, verbose=1),
    callbacks.ReduceLROnPlateau(factor=0.5, patience=3, verbose=1)
]
lstm_final.fit(X_train, y_train_cat, epochs=50, batch_size=64,
               validation_split=0.1, callbacks=callbacks_list, verbose=1)

save_model_safely(lstm_final, 'lstm_model', 'keras', input_shape=input_shape_dl)


# ==================================================================================
# 4. CNN
# ==================================================================================
print("\n" + "="*40)
print("4. CNN")
print("="*40)

def build_cnn(filters=64, kernel_size=3, lr=0.001):
    m = keras.Sequential([
        layers.Input(shape=input_shape_dl),
        layers.Conv1D(filters, kernel_size, activation='relu'),
        layers.MaxPooling1D(2),
        layers.Conv1D(filters*2, kernel_size, activation='relu'),
        layers.GlobalAveragePooling1D(),
        layers.Dense(12, activation='softmax')
    ])
    m.compile(optimizer=keras.optimizers.Adam(learning_rate=lr),
              loss='categorical_crossentropy', metrics=['accuracy'])
    return m

# Tuning
cnn_grid = {'filters': [32, 64], 'kernel_size': [3, 5], 'lr': [0.001]}
cnn_final, _ = run_dl_random_search(build_cnn, cnn_grid, n_iter=3)

# Final Train
print("Training final CNN...")
cnn_final.fit(X_train, y_train_cat, epochs=50, batch_size=64,
              validation_split=0.1, callbacks=callbacks_list, verbose=1)

save_model_safely(cnn_final, 'cnn_model', 'keras', input_shape=input_shape_dl)


# ==================================================================================
# 5. Transformer
# ==================================================================================
print("\n" + "="*40)
print("5. Transformer")
print("="*40)

def transformer_encoder(inputs, head_size, num_heads, ff_dim, dropout=0):
    x = layers.MultiHeadAttention(key_dim=head_size, num_heads=num_heads, dropout=dropout)(inputs, inputs)
    x = layers.Dropout(dropout)(x)
    x = layers.LayerNormalization(epsilon=1e-6)(x)
    res = x + inputs
    x = layers.Conv1D(filters=ff_dim, kernel_size=1, activation="relu")(res)
    x = layers.Dropout(dropout)(x)
    x = layers.Conv1D(filters=inputs.shape[-1], kernel_size=1)(x)
    x = layers.LayerNormalization(epsilon=1e-6)(x)
    return x + res

def build_transformer(head_size=64, num_heads=2, dropout=0.1, lr=0.001):
    inputs = keras.Input(shape=input_shape_dl)
    x = inputs
    x = transformer_encoder(x, head_size, num_heads, 64, dropout)
    x = layers.GlobalAveragePooling1D()(x)
    x = layers.Dropout(dropout)(x)
    x = layers.Dense(64, activation="relu")(x)
    outputs = layers.Dense(12, activation="softmax")(x)

    m = keras.Model(inputs, outputs)
    m.compile(optimizer=keras.optimizers.Adam(learning_rate=lr),
              loss='categorical_crossentropy', metrics=['accuracy'])
    return m

# Tuning
trans_grid = {'head_size': [64], 'num_heads': [2, 4], 'dropout': [0.1], 'lr': [0.001, 0.0001]}
trans_final, _ = run_dl_random_search(build_transformer, trans_grid, n_iter=3)

# Final Train
print("Training final Transformer...")
trans_final.fit(X_train, y_train_cat, epochs=50, batch_size=64,
                validation_split=0.1, callbacks=callbacks_list, verbose=1)

save_model_safely(trans_final, 'transformer_model', 'keras', input_shape=input_shape_dl)

print("\nAll models trained, tuned, and saved successfully.")

Loading data...

1. Random Forest
Tuning hyperparameters...
Fitting 3 folds for each of 5 candidates, totalling 15 fits
Best RF Params: {'min_samples_split': 2, 'max_features': 'sqrt', 'max_depth': 10}

Training final RF with incremental logging...
 - Trees: 10, Val Acc: 0.9920
 - Trees: 20, Val Acc: 0.9934
 - Trees: 30, Val Acc: 0.9942
 - Trees: 40, Val Acc: 0.9920
 - Trees: 50, Val Acc: 0.9934
 - Trees: 60, Val Acc: 0.9934
Early stopping triggered at 60 trees.
Saving rf_model...
 - Native saved: rf_model

2. XGBoost
Tuning hyperparameters...
Fitting 3 folds for each of 5 candidates, totalling 15 fits
Best XGB Params: {'subsample': 0.7, 'max_depth': 3, 'learning_rate': 0.1}

Training final XGB with native logs...
[0]	validation_0-mlogloss:1.98707	validation_1-mlogloss:1.99041
[1]	validation_0-mlogloss:1.68085	validation_1-mlogloss:1.68557
[2]	validation_0-mlogloss:1.45609	validation_1-mlogloss:1.46524
[3]	validation_0-mlogloss:1.28017	validation_1-mlogloss:1.29251
[4]	validation_0-mlo



Saving lstm_model...
 - Native saved: lstm_model

4. CNN
Starting Random Search (n_iter=3)...
Trial 1/3: {'filters': 64, 'kernel_size': 5, 'lr': 0.001}
 - Val Acc: 0.9909
Trial 2/3: {'filters': 64, 'kernel_size': 3, 'lr': 0.001}
 - Val Acc: 0.9581
Trial 3/3: {'filters': 64, 'kernel_size': 3, 'lr': 0.001}
 - Val Acc: 0.9818
Best Params: {'filters': 64, 'kernel_size': 5, 'lr': 0.001} (Val Acc: 0.9909)
Training final CNN...
Epoch 1/50
[1m78/78[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 35ms/step - accuracy: 0.5530 - loss: 2.3658 - val_accuracy: 0.9745 - val_loss: 0.1350 - learning_rate: 0.0010
Epoch 2/50
[1m78/78[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 30ms/step - accuracy: 0.9699 - loss: 0.1267 - val_accuracy: 0.9672 - val_loss: 0.1026 - learning_rate: 0.0010
Epoch 3/50
[1m78/78[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 30ms/step - accuracy: 0.9622 - loss: 0.1345 - val_accuracy: 0.9872 - val_loss: 0.0530 - learning_rate: 0.0010
Epoch 4/50
[1m78/78



Saving cnn_model...
 - Native saved: cnn_model

5. Transformer
Starting Random Search (n_iter=3)...
Trial 1/3: {'head_size': 64, 'num_heads': 2, 'dropout': 0.1, 'lr': 0.001}
 - Val Acc: 0.9745
Trial 2/3: {'head_size': 64, 'num_heads': 2, 'dropout': 0.1, 'lr': 0.001}
 - Val Acc: 0.9690
Trial 3/3: {'head_size': 64, 'num_heads': 2, 'dropout': 0.1, 'lr': 0.001}
 - Val Acc: 0.9690
Best Params: {'head_size': 64, 'num_heads': 2, 'dropout': 0.1, 'lr': 0.001} (Val Acc: 0.9745)
Training final Transformer...
Epoch 1/50
[1m78/78[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 129ms/step - accuracy: 0.3357 - loss: 3.5765 - val_accuracy: 0.8506 - val_loss: 0.5756 - learning_rate: 0.0010
Epoch 2/50
[1m78/78[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 108ms/step - accuracy: 0.8318 - loss: 0.6043 - val_accuracy: 0.9381 - val_loss: 0.1974 - learning_rate: 0.0010
Epoch 3/50
[1m78/78[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 122ms/step - accuracy: 0.9082 - loss: 0.2930 - v



Saving transformer_model...
 - Native saved: transformer_model

All models trained, tuned, and saved successfully.
