# Model Training for Trading Entry Points

This notebook demonstrates the process of training and evaluating our trading entry point prediction model. We'll cover:
- Loading the prepared dataset
- Train/test splitting
- Model training
- Evaluation and validation
- Model explanation

In [None]:
# Make the parent directory importable so that the `src` package resolves.
# The path is relative to the kernel's working directory, so this assumes the
# notebook is run from the folder it lives in.
import sys
import os
sys.path.append(os.path.abspath('..'))

# Third-party libraries used throughout the notebook
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score, TimeSeriesSplit  # NOTE(review): cross_val_score is not used in the cells visible here

# Project modules (found through the sys.path entry added above)
from src.data.loader import load_data, preprocess_data
from src.data.features import prepare_features
from src.models.random_forest_model import RandomForestModel
from src.visualization.charts import plot_feature_importance
from src.utils.helpers import set_pandas_display_options

# Apply the project's shared pandas display configuration
set_pandas_display_options()

# Matplotlib defaults for every figure below: style sheet, figure size, and
# inline rendering of plots in the notebook output
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams['figure.figsize'] = [12, 6]
%matplotlib inline

## 1. Load Prepared Dataset

Let's load the dataset we prepared in the previous notebook.

In [None]:
# Prefer the feature table saved by the previous notebook; rebuild it from the
# raw candlestick CSV only when that file does not exist.
try:
    df_features = pd.read_csv('../processed_data.csv', index_col=0, parse_dates=True)
except FileNotFoundError:
    print("Processed data file not found. Preparing features from raw data...")

    # Raw CSV -> preprocessed frame
    file_path = '../USATECH.IDXUSD_Candlestick_15_M_BID_01.01.2023-18.01.2025.csv'
    df_raw = load_data(file_path)
    df = preprocess_data(df_raw)

    # Labelling parameters forwarded to prepare_features
    future_periods, profit_target, stop_loss = 10, 0.01, 0.005
    labelling_kwargs = dict(
        future_periods=future_periods,
        profit_target=profit_target,
        stop_loss=stop_loss,
    )

    # Build the feature table including the target column
    df_features = prepare_features(df, include_target=True, **labelling_kwargs)
    n_rows, n_cols = df_features.shape
    print(f"Prepared dataset with {n_cols} columns and {n_rows} rows")
else:
    n_rows, n_cols = df_features.shape
    print(f"Loaded processed dataset with {n_cols} columns and {n_rows} rows")

# Preview the first rows
df_features.head()

## 2. Train/Test Split

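For time series data, we need to be careful about how we split the data. A random split would let the model train on observations that come *after* some of the test observations, leaking future information into training and inflating the test scores. We'll therefore use a time-based split: the earliest part of the data is used for training and the most recent part is held out for testing.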

In [None]:
# Fraction of the most recent rows held out for testing
test_size = 0.2

# Chronological cut: everything before split_idx trains, the rest tests
n_rows = len(df_features)
split_idx = int(n_rows * (1 - test_size))
df_train, df_test = df_features.iloc[:split_idx], df_features.iloc[split_idx:]

# Report the shape and the index range covered by each part
for part_name, part in (("Training set", df_train), ("Testing set", df_test)):
    print(f"{part_name}: {part.shape} from {part.index.min()} to {part.index.max()}")

### Check Target Distribution in Train/Test Sets

In [None]:
# Class counts of the 'Target' column in each split (reused by the plotting cell)
train_target_counts = df_train['Target'].value_counts()
test_target_counts = df_test['Target'].value_counts()

# Print every class with its absolute count and its share of the split
for heading, class_counts, split_rows in (
    ("Training Set Target Distribution:", train_target_counts, len(df_train)),
    ("\nTest Set Target Distribution:", test_target_counts, len(df_test)),
):
    print(heading)
    for label, n_samples in class_counts.items():
        print(f"  Target {label}: {n_samples} samples ({n_samples/split_rows*100:.2f}%)")

In [None]:
# One bar chart per split, side by side
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Class value -> bar label, in the left-to-right order used on both panels
bar_labels = {-1: 'Short (-1)', 0: 'Neutral (0)', 1: 'Long (1)'}
bar_colors = ['red', 'gray', 'green']

panels = (
    ('Training Set Target Distribution', train_target_counts),
    ('Test Set Target Distribution', test_target_counts),
)
for ax, (panel_title, class_counts) in zip(axes, panels):
    # A class missing from a split is drawn as a zero-height bar
    heights = [class_counts.get(class_value, 0) for class_value in bar_labels]
    ax.bar(list(bar_labels.values()), heights, color=bar_colors)
    ax.set_title(panel_title)
    ax.set_ylabel('Count')
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 3. Create and Train Model

Now let's create and train our Random Forest model.

In [None]:
# Model configuration collected in one place
model_params = dict(
    n_estimators=100,
    max_depth=12,
    min_samples_split=10,
    random_state=42,
    profit_target=0.01,
    stop_loss=0.005,
)
model = RandomForestModel(**model_params)

# Split each frame into a feature matrix and a target vector
X_train, y_train = model.extract_features_target(df_train)
X_test, y_test = model.extract_features_target(df_test)

for matrix_name, matrix in (("X_train", X_train), ("X_test", X_test)):
    print(f"{matrix_name} shape: {matrix.shape}")

In [None]:
# Fit the model on the training split. The %time line magic reports how long
# the train() call takes.
print("Training model...")
%time model.train(X_train, y_train)
print("Training completed!")

## 4. Model Evaluation

Let's evaluate the model's performance on both training and test data.

In [None]:
def _report_metrics(heading, metrics):
    """Print a heading followed by one 'name: value' line per metric; return the metrics."""
    print(heading)
    for metric_name, metric_value in metrics.items():
        print(f"  {metric_name}: {metric_value:.4f}")
    return metrics


# Score the training split first, then the held-out test split
train_metrics = _report_metrics("Training set metrics:", model.evaluate(X_train, y_train))
test_metrics = _report_metrics("\nTest set metrics:", model.evaluate(X_test, y_test))

In [None]:
# Predict on the held-out test split and print scikit-learn's per-class
# classification report. `y_pred` is reused by the confusion-matrix cell.
y_pred = model.predict(X_test)
print("Classification Report:")
print(classification_report(y_test, y_pred))

In [None]:
# Plot confusion matrix
#
# The matrix axes are built from every class that occurs in either the actual
# or the predicted labels, and the same list is handed to confusion_matrix().
# This keeps the tick labels aligned with the matrix rows/columns even when a
# class is missing from y_test or when the two classes present are not
# exactly {0, 1}.
class_names = {-1: 'Short', 0: 'Neutral', 1: 'Long'}
present_classes = sorted(set(y_test) | set(y_pred))
tick_labels = [class_names.get(cls, str(cls)) for cls in present_classes]

conf_mat = confusion_matrix(y_test, y_pred, labels=present_classes)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_mat, annot=True, fmt='d', cmap='Blues',
            xticklabels=tick_labels,
            yticklabels=tick_labels)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.tight_layout()
plt.show()

## 5. Feature Importance Analysis

Let's analyze which features were most important in our model.

In [None]:
# Retrieve the model's feature-importance table and display its first 15 rows.
# NOTE(review): the "Top 15" wording presumes get_feature_importance() returns
# rows already sorted by descending importance — confirm.
feature_importance = model.get_feature_importance()
print("Top 15 important features:")
feature_importance.head(15)

In [None]:
# Draw the top 15 features with the project's plotting helper; its return
# value is kept in `fig`, and the layout is tightened before display.
fig = plot_feature_importance(feature_importance, top_n=15)
plt.tight_layout()
plt.show()

## 6. Cross-Validation with Time Series Split

Let's perform time series cross-validation to get a more robust estimate of our model's performance.

In [None]:
# Extract all features and target from the full dataset
# (assumes extract_features_target returns positionally indexable arrays,
# since the folds below are selected with integer index arrays — TODO confirm)
X, y = model.extract_features_target(df_features)

# Expanding-window splitter: each fold trains on the past and tests on the
# block that immediately follows it.
# NOTE(review): if 'Target' is built from a look-ahead window (future_periods
# in the data-loading cell), consider TimeSeriesSplit's `gap` argument so the
# last training labels do not overlap the start of the test block.
tscv = TimeSeriesSplit(n_splits=5)

# Same configuration as the model trained on the train/test split, including
# profit_target and stop_loss, so the cross-validated scores describe that
# model rather than one built with the class defaults.
fold_params = dict(
    n_estimators=100,
    max_depth=12,
    min_samples_split=10,
    random_state=42,
    profit_target=0.01,
    stop_loss=0.005,
)

# Metrics dictionary returned by evaluate() for each fold, in fold order
cv_scores = []

for fold_no, (train_idx, test_idx) in enumerate(tscv.split(X), start=1):
    # Get train/test split for this fold
    X_train_fold, X_test_fold = X[train_idx], X[test_idx]
    y_train_fold, y_test_fold = y[train_idx], y[test_idx]

    # A fresh model per fold so nothing learned on one fold carries over
    fold_model = RandomForestModel(**fold_params)
    fold_model.train(X_train_fold, y_train_fold)

    # Score the fold on the block that follows its training window
    metrics = fold_model.evaluate(X_test_fold, y_test_fold)
    cv_scores.append(metrics)

    print(f"Fold {fold_no} - Accuracy: {metrics['accuracy']:.4f}, F1 Score: {metrics['f1_score']:.4f}")

In [None]:
# Mean of every metric over the folds; the metric names come from the first fold
avg_metrics = {
    metric_name: np.mean([fold_scores[metric_name] for fold_scores in cv_scores])
    for metric_name in cv_scores[0]
}

print("Average metrics across folds:")
for metric_name, mean_value in avg_metrics.items():
    print(f"  {metric_name}: {mean_value:.4f}")

## 7. Prediction Explanation

Let's examine some specific predictions and get explanations for them.

In [None]:
# Get test set predictions and the per-class probabilities behind them
test_pred = model.predict(X_test)
test_proba = model.predict_proba(X_test)

# Positions (into the test set) of one example per category:
# 1. A high confidence correct prediction
high_conf_correct = []
# 2. A low confidence prediction
low_conf = []
# 3. An incorrect prediction
incorrect = []

for i in range(len(test_pred)):
    pred = test_pred[i]
    actual = y_test[i]

    # Confidence = probability of the predicted class. Taking the row maximum
    # avoids hard-coding which probability column belongs to which class
    # (scikit-learn orders the columns by sorted class label, e.g. [-1, 0, 1],
    # so looking up class 0 in column 0 reads the wrong value when a -1 class
    # exists).
    # Assumes model.predict returns the most probable class — TODO confirm
    # against RandomForestModel.
    prob = max(test_proba[i])

    # Keep the first example found for each category; a row is assigned to at
    # most one category, checked in this order
    if pred == actual and prob > 0.8 and len(high_conf_correct) < 1:
        high_conf_correct.append(i)
    elif 0.5 < prob < 0.6 and len(low_conf) < 1:
        low_conf.append(i)
    elif pred != actual and len(incorrect) < 1:
        incorrect.append(i)

    # Stop scanning once every category has an example
    if high_conf_correct and low_conf and incorrect:
        break

In [None]:
# (heading, candidate positions, print a separator afterwards?) for each of
# the example categories collected in the previous cell
example_groups = (
    ("Explanation for high confidence correct prediction:", high_conf_correct, True),
    ("\nExplanation for low confidence prediction:", low_conf, True),
    ("\nExplanation for incorrect prediction:", incorrect, False),
)

for heading, candidates, add_separator in example_groups:
    # Categories for which no example was found are skipped silently
    if not candidates:
        continue

    print(heading)
    idx = candidates[0]
    explanation = model.explain_prediction(X_test[idx], test_pred[idx], test_proba[idx])
    print(f"Date: {df_test.index[idx]}")
    print(f"Actual: {y_test[idx]}, Predicted: {test_pred[idx]}, Confidence: {explanation['confidence']:.2%}")
    print(explanation['explanation'])
    if add_separator:
        print("\n" + "-"*80)

## 8. Save Trained Model

Let's save our trained model for use in backtesting and future predictions.

In [None]:
# Persist the trained model one directory above the notebook; the value
# returned by model.save() is printed as the saved location.
# NOTE(review): the .pkl extension suggests pickle serialization — if so,
# only load this file from trusted sources.
model_path = model.save('../trained_model.pkl')
print(f"Model saved to: {model_path}")

## Summary

In this notebook, we've accomplished the following:

1. Loaded our preprocessed dataset
2. Split the data into training and test sets using a time-based approach
3. Created and trained a Random Forest model for predicting trading entry points
4. Evaluated the model's performance on both training and test data
5. Analyzed feature importance to understand what drives the model's predictions
6. Performed time series cross-validation for more robust performance estimates
7. Generated explanations for specific predictions
8. Saved our trained model for future use

Next, we'll use our trained model for backtesting to see how it would have performed in historical trading scenarios.