# Feature Engineering for Trading

This notebook demonstrates the feature engineering process for our trading model, including:
- Adding technical indicators
- Creating price and volume features
- Generating target variables
- Analyzing feature correlations

In [None]:
# Add parent directory to path to import from src
import sys
import os
sys.path.append(os.path.abspath('..'))

# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Import from src modules
from src.data.loader import load_data, preprocess_data
from src.data.features import (add_technical_indicators, add_price_features, 
                              add_volume_features, generate_target,
                              prepare_features)
from src.utils.helpers import set_pandas_display_options

# Set display options
# (project helper imported from src.utils.helpers; presumably adjusts pandas
# display settings such as column width/precision — confirm in that module)
set_pandas_display_options()

# Matplotlib settings
# NOTE(review): the 'seaborn-v0_8-*' style names appear to require
# matplotlib >= 3.6 — confirm against the pinned environment.
plt.style.use('seaborn-v0_8-whitegrid')
# Default figure size (width, height) in inches; cells below that pass
# figsize= explicitly override this default.
plt.rcParams['figure.figsize'] = [12, 6]
# Render figures inline in the notebook output area.
%matplotlib inline

## 1. Load and Preprocess Data

In [None]:
# Read the 15-minute candlestick export, then run the project's preprocessing.
# The untouched frame stays available as df_raw; df is the preprocessed result
# that every later cell builds on.
file_path = '../USATECH.IDXUSD_Candlestick_15_M_BID_01.01.2023-18.01.2025.csv'
df_raw = load_data(file_path)
df = preprocess_data(df_raw)

# Report the row count and the span covered by the index.
loaded_rows = len(df)
index_start, index_end = df.index.min(), df.index.max()
print(f"Loaded {loaded_rows} rows from {index_start} to {index_end}")

# Preview the preprocessed frame.
df.head()

## 2. Add Technical Indicators

Let's add common technical indicators to our data.

In [None]:
# Build the indicator-enriched frame from the preprocessed data
df_indicators = add_technical_indicators(df)

# Columns that exist after the call but not before, kept in output order
indicator_mask = ~df_indicators.columns.isin(df.columns)
new_columns = df_indicators.columns[indicator_mask].tolist()
print(f"Added {len(new_columns)} new technical indicators:")
print(new_columns)

# Preview only the freshly added columns
df_indicators[new_columns].head()

### Visualize Technical Indicators

Let's visualize some of the technical indicators to understand their behavior.

In [None]:
# Select a recent time period for visualization (last 200 rows)
recent_data = df_indicators.iloc[-200:]
x_values = recent_data.index

# Create a 2x2 grid of subplots
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# Plot 1: Price and Moving Averages (each line is labelled with its column name)
ax_ma = axes[0, 0]
for ma_column in ('Close', 'SMA20', 'SMA50', 'SMA200'):
    ax_ma.plot(x_values, recent_data[ma_column], label=ma_column)
ax_ma.set_title('Price and Moving Averages')
ax_ma.set_ylabel('Price')

# Plot 2: Bollinger Bands, with the band interior shaded
ax_bb = axes[0, 1]
bollinger_lines = (
    ('Close', 'Close'),
    ('BB_Upper', 'Upper BB'),
    ('BB_Middle', 'Middle BB'),
    ('BB_Lower', 'Lower BB'),
)
for bb_column, bb_label in bollinger_lines:
    ax_bb.plot(x_values, recent_data[bb_column], label=bb_label)
ax_bb.fill_between(x_values,
                   recent_data['BB_Upper'],
                   recent_data['BB_Lower'],
                   alpha=0.2, color='gray')
ax_bb.set_title('Bollinger Bands')
ax_bb.set_ylabel('Price')

# Plot 3: RSI with the 70 / 30 reference levels
ax_rsi = axes[1, 0]
ax_rsi.plot(x_values, recent_data['RSI'], label='RSI')
ax_rsi.axhline(y=70, color='r', linestyle='--', alpha=0.5, label='Overbought (70)')
ax_rsi.axhline(y=30, color='g', linestyle='--', alpha=0.5, label='Oversold (30)')
ax_rsi.set_title('Relative Strength Index (RSI)')
ax_rsi.set_ylabel('RSI')
ax_rsi.set_ylim(0, 100)

# Plot 4: MACD
ax_macd = axes[1, 1]
ax_macd.plot(x_values, recent_data['MACD'], label='MACD')
ax_macd.plot(x_values, recent_data['MACD_Signal'], label='Signal Line')

# Matplotlib's default bar width is 0.8 in x-axis data units. On a date axis
# that means 0.8 *days*, so bars for intraday samples would overlap heavily.
# Size the bars from the typical spacing between consecutive index values
# instead (a Timedelta for a datetime index, a plain number for a numeric one).
# NOTE(review): assumes the index is datetime-like or numeric; for any other
# index type the original default width is kept.
try:
    sample_spacing = x_values.to_series().diff().median()
except TypeError:
    sample_spacing = None
if sample_spacing is None or pd.isna(sample_spacing):
    macd_bar_width = 0.8
else:
    macd_bar_width = sample_spacing * 0.8

macd_hist = recent_data['MACD_Hist']
ax_macd.bar(x_values, macd_hist,
            width=macd_bar_width,
            color=['green' if value > 0 else 'red' for value in macd_hist],
            alpha=0.5, label='Histogram')
ax_macd.set_title('MACD')
ax_macd.set_ylabel('Value')

# Shared cosmetics: every panel gets the same x label, a legend and a grid
for panel in axes.flat:
    panel.set_xlabel('Date')
    panel.legend()
    panel.grid(True)

plt.tight_layout()
plt.show()

## 3. Add Price and Volume Features

Now let's add additional price-based and volume-based features.

In [None]:
# Extend the indicator frame with the price-derived features
df_price = add_price_features(df_indicators)

# Anything not already present on the indicator frame counts as new
columns_before_price = set(df_indicators.columns)
new_price_columns = [name for name in df_price.columns
                     if name not in columns_before_price]
print(f"Added {len(new_price_columns)} new price features:")
print(new_price_columns)

# Preview only the newly added price columns
df_price[new_price_columns].head()

In [None]:
# Extend the price-feature frame with the volume-derived features
df_volume = add_volume_features(df_price)

# Keep, in output order, the columns that df_price did not already have
volume_mask = ~df_volume.columns.isin(df_price.columns)
new_volume_columns = list(df_volume.columns[volume_mask])
print(f"Added {len(new_volume_columns)} new volume features:")
print(new_volume_columns)

# Preview only the newly added volume columns
df_volume[new_volume_columns].head()

## 4. Generate Target Variable

Now let's create a target variable based on future price movements.

In [None]:
# Target-generation parameters (also reused by the pipeline cell further down)
future_periods = 10  # Look ahead 10 periods
profit_target = 0.01  # 1% profit target
stop_loss = 0.005  # 0.5% stop loss

df_target = generate_target(df_volume, future_periods, profit_target, stop_loss)

# Check target distribution
target_counts = df_target['Target'].value_counts()
print("Target Distribution:")
print(target_counts)

# Use .get(..., 0) rather than target_counts[label]: value_counts() omits
# classes that never occur, so direct indexing raises KeyError when, e.g., no
# long signal was generated. The denominator is the full frame length (same as
# the plotting cell); the `or 1` only guards against an empty frame.
total_rows = len(df_target) or 1
for signal_name, signal_value in (("long signals", 1),
                                  ("short signals", -1),
                                  ("neutral", 0)):
    signal_share = target_counts.get(signal_value, 0) / total_rows * 100
    print(f"Percentage of {signal_name}: {signal_share:.2f}%")

### Visualize Target Distribution

In [None]:
# Bar chart of how many rows fall into each target class
class_labels = ['Short (-1)', 'Neutral (0)', 'Long (1)']
class_counts = [target_counts.get(class_value, 0) for class_value in (-1, 0, 1)]

dist_fig, dist_ax = plt.subplots(figsize=(10, 6))
bars = dist_ax.bar(class_labels, class_counts, color=['red', 'gray', 'green'])

# Annotate each bar with its share of all rows in df_target
row_total = len(df_target)
for bar in bars:
    bar_top = bar.get_height()
    bar_centre = bar.get_x() + bar.get_width()/2.
    dist_ax.text(bar_centre, bar_top,
                 f'{bar_top/row_total*100:.1f}%',
                 ha='center', va='bottom')

dist_ax.set_title('Target Variable Distribution')
dist_ax.set_ylabel('Count')
dist_ax.grid(True, alpha=0.3)
plt.show()

## 5. Feature Correlation Analysis

Let's analyze which features are most correlated with our target variable.

In [None]:
# Calculate correlation with target.
# - Restrict to numeric/bool columns first: DataFrame.corr() raises on
#   non-numeric columns in pandas >= 2.0.
# - Drop the target's own entry: its self-correlation of 1.0 would otherwise
#   occupy the first slot of the "top positive" list.
# - Drop NaN correlations (e.g. constant columns): sort_values() places NaN
#   last, so they would otherwise crowd out the real "top negative" features.
numeric_frame = df_target.select_dtypes(include=['number', 'bool'])
target_corr = (
    numeric_frame.corr()['Target']
    .drop('Target')
    .dropna()
    .sort_values(ascending=False)
)

# Print top positive correlations
print("Top Positive Correlations with Target:")
print(target_corr.head(15))

# Print top negative correlations
print("\nTop Negative Correlations with Target:")
print(target_corr.tail(15))

In [None]:
# Horizontal bar chart of the ten highest and ten lowest correlations with the target
strongest_corr = pd.concat([target_corr.iloc[:10], target_corr.iloc[-10:]])
top_features = strongest_corr.drop('Target', errors='ignore')  # Remove target itself

# Green for positive correlations, red for everything else
colors = np.where(top_features > 0, 'green', 'red').tolist()

corr_fig, corr_ax = plt.subplots(figsize=(12, 8))
corr_ax.barh(top_features.index, top_features, color=colors)
corr_ax.axvline(x=0, color='black', linestyle='-', alpha=0.3)
corr_ax.set_title('Top Correlated Features with Target')
corr_ax.set_xlabel('Correlation Coefficient')
corr_ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

## 6. Feature Preparation Pipeline

Now let's use our complete feature preparation pipeline to get a clean dataset ready for modeling.

In [None]:
# Run the whole feature pipeline on the preprocessed frame in a single call,
# reusing the target parameters defined in the target-generation cell.
target_settings = dict(
    future_periods=future_periods,
    profit_target=profit_target,
    stop_loss=stop_loss,
)
df_features = prepare_features(df, include_target=True, **target_settings)

# Summarise the result; the feature count excludes the target column.
sample_count, column_count = df_features.shape
print(f"Final dataset shape: {df_features.shape}")
print(f"Number of features: {column_count - 1}")  # Excluding target
print(f"Number of samples: {sample_count}")
print(f"\nFirst few rows:")
df_features.head()

## 7. Feature Selection Analysis

Let's train a Random Forest and rank the features by importance to understand which ones might be most predictive.

In [None]:
from sklearn.ensemble import RandomForestClassifier
# SelectFromModel is not used in this cell; the import is kept in case a later
# cell relies on it.
from sklearn.feature_selection import SelectFromModel

# Define features and target
X = df_features.drop(['Target'], axis=1)
y = df_features['Target']

# Replace any remaining NaN or infinite values: infinities become NaN, NaN is
# filled with the column mean. A column that is entirely NaN has a NaN mean and
# would stay NaN, so it falls back to 0 to keep the fit from failing.
X = X.replace([np.inf, -np.inf], np.nan)
X = X.fillna(X.mean()).fillna(0)

# Train a Random Forest for feature importance (fixed seed for reproducibility)
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X, y)

# Get feature importances, ordered from most to least important
importances = model.feature_importances_
indices = np.argsort(importances)[::-1]

# Print the top features. Cap at the number of available features so a dataset
# with fewer than 20 columns does not raise IndexError.
top_n = min(20, len(indices))
print(f"Top {top_n} features by importance:")
for rank, feature_pos in enumerate(indices[:top_n], start=1):
    print(f"{rank}. {X.columns[feature_pos]}: {importances[feature_pos]:.4f}")

In [None]:
# Plot feature importances for the highest-ranked features.
# Cap at the number of available features: with fewer than 20 columns a fixed
# range(20) no longer matches the number of bar heights and plt.bar raises.
n_top = min(20, len(indices))
top_positions = indices[:n_top]

plt.figure(figsize=(12, 8))
plt.title("Feature Importances from Random Forest")
plt.bar(range(n_top), importances[top_positions], align="center")
plt.xticks(range(n_top), X.columns[top_positions], rotation=90)
plt.ylabel("Importance")
plt.tight_layout()
plt.show()

## 8. Save Processed Dataset

Finally, let's save our processed dataset for use in later notebooks.

In [None]:
# Write the prepared feature frame (index included) to CSV for later notebooks
output_path = '../processed_data.csv'
df_features.to_csv(output_path)

saved_rows, saved_cols = df_features.shape
print(f"Saved processed dataset with {saved_cols} columns and {saved_rows} rows to '{output_path}'")

## Summary

In this notebook, we've completed the following steps:

1. Loaded and preprocessed the USATECH index data
2. Added technical indicators like moving averages, RSI, MACD, and Bollinger Bands
3. Created price and volume-based features
4. Generated a target variable based on future price movements
5. Analyzed feature correlations with the target
6. Used our complete feature preparation pipeline
7. Performed feature importance analysis
8. Saved the processed dataset for use in model training

In the next notebook, we'll use this processed dataset to train our trading model.