# MtfScalper RL Feature Engineering Analysis
## تحلیل و بررسی ویژگی‌های RL برای بهینه‌سازی خروج

این نوت‌بوک برای:
1. تحلیل اهمیت ویژگی‌ها
2. بررسی همبستگی بین features
3. ارزیابی کیفیت سیگنال‌های خروج
4. تنظیم پارامترهای reward function

In [3]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

# Freqtrade imports
try:
    from freqtrade.data.history import load_pair_history
    FREQTRADE_AVAILABLE = True
except ImportError:
    FREQTRADE_AVAILABLE = False
    print("Freqtrade not available, using alternative data loading")

# Set style
plt.style.use('dark_background')
sns.set_palette('husl')

print("Libraries imported successfully!")
print(f"Freqtrade available: {FREQTRADE_AVAILABLE}")

ModuleNotFoundError: No module named 'seaborn'

## 1. Load Data and Strategy

In [None]:
# Configuration
PAIR = "BTC/USDT:USDT"
TIMEFRAME = "5m"
DATA_PATH = "user_data/data/binance"

# Try to load data using Freqtrade
if FREQTRADE_AVAILABLE:
    try:
        df = load_pair_history(
            datadir=Path(DATA_PATH),
            timeframe=TIMEFRAME,
            pair=PAIR,
            data_format='json',
            candle_type='futures'
        )
        print(f"Loaded {len(df)} candles for {PAIR} using Freqtrade")
    except Exception as e:
        print(f"Error loading with Freqtrade: {e}")
        FREQTRADE_AVAILABLE = False

# Alternative: Generate sample data if Freqtrade not available
if not FREQTRADE_AVAILABLE or df is None:
    print("Generating sample data for analysis...")
    
    # Generate sample price data
    np.random.seed(42)
    periods = 5000
    base_price = 60000
    
    # Generate realistic price movements
    returns = np.random.normal(0.0001, 0.02, periods)
    prices = [base_price]
    
    for ret in returns:
        new_price = prices[-1] * (1 + ret)
        prices.append(new_price)
    
    # Create OHLCV data
    timestamps = pd.date_range(start='2024-01-01', periods=periods, freq='5T')
    
    data = {
        'open': prices[:-1],
        'high': [p * (1 + abs(np.random.normal(0, 0.01))) for p in prices[:-1]],
        'low': [p * (1 - abs(np.random.normal(0, 0.01))) for p in prices[:-1]],
        'close': prices[1:],
        'volume': np.random.lognormal(10, 1, periods)
    }
    
    df = pd.DataFrame(data, index=timestamps)
    print(f"Generated {len(df)} sample candles")

print(f"Date range: {df.index[0]} to {df.index[-1]}")
print(f"Price range: ${df['close'].min():.0f} - ${df['close'].max():.0f}")
df.tail()

## 2. Feature Engineering Pipeline

In [None]:
def apply_feature_engineering(df):
    """
    Apply all feature engineering for RL exit optimization
    """
    
    # Basic indicators
    try:
        import talib.abstract as ta
        TALIB_AVAILABLE = True
    except ImportError:
        TALIB_AVAILABLE = False
        print("TA-Lib not available, using manual calculations")
    
    if TALIB_AVAILABLE:
        # EMAs
        df['ema_9'] = ta.EMA(df, timeperiod=9)
        df['ema_21'] = ta.EMA(df, timeperiod=21)
        df['ema_200'] = ta.EMA(df, timeperiod=200)
        
        # RSI
        df['rsi'] = ta.RSI(df, timeperiod=14)
        
        # ATR
        df['atr'] = ta.ATR(df, timeperiod=14)
        
        # Bollinger Bands
        df['bb_upper'], df['bb_middle'], df['bb_lower'] = ta.BBANDS(df, timeperiod=20)
    else:
        # Manual calculations
        # EMAs
        df['ema_9'] = df['close'].ewm(span=9).mean()
        df['ema_21'] = df['close'].ewm(span=21).mean()
        df['ema_200'] = df['close'].ewm(span=200).mean()
        
        # RSI
        delta = df['close'].diff()
        gain = (delta.where(delta > 0, 0)).rolling(window=14).mean()
        loss = (-delta.where(delta < 0, 0)).rolling(window=14).mean()
        rs = gain / loss
        df['rsi'] = 100 - (100 / (1 + rs))
        
        # ATR
        high_low = df['high'] - df['low']
        high_close = np.abs(df['high'] - df['close'].shift())
        low_close = np.abs(df['low'] - df['close'].shift())
        true_range = np.maximum(high_low, np.maximum(high_close, low_close))
        df['atr'] = true_range.rolling(window=14).mean()
        
        # Bollinger Bands
        df['bb_middle'] = df['close'].rolling(window=20).mean()
        df['bb_std'] = df['close'].rolling(window=20).std()
        df['bb_upper'] = df['bb_middle'] + (df['bb_std'] * 2)
        df['bb_lower'] = df['bb_middle'] - (df['bb_std'] * 2)
    
    # ═══════════════════════════════════════════════
    # Exit-Specific Features
    # ═══════════════════════════════════════════════
    
    # Price momentum
    df['momentum_5'] = df['close'].pct_change(5)
    df['momentum_10'] = df['close'].pct_change(10)
    df['momentum_20'] = df['close'].pct_change(20)
    
    # Acceleration
    df['acceleration'] = df['momentum_5'].diff()
    
    # Distance from recent high/low
    df['dist_from_high_20'] = (df['high'].rolling(20).max() - df['close']) / df['close']
    df['dist_from_low_20'] = (df['close'] - df['low'].rolling(20).min()) / df['close']
    
    # Volume patterns
    df['volume_ratio_5'] = df['volume'] / df['volume'].rolling(5).mean()
    df['volume_ratio_20'] = df['volume'] / df['volume'].rolling(20).mean()
    
    # Spread proxy
    df['spread_proxy'] = (df['high'] - df['low']) / df['close']
    df['spread_ma_ratio'] = df['spread_proxy'] / df['spread_proxy'].rolling(20).mean()
    
    # RSI Divergence
    price_higher = df['close'] > df['close'].shift(10)
    rsi_lower = df['rsi'] < df['rsi'].shift(10)
    df['bearish_divergence'] = (price_higher & rsi_lower).astype(int)
    
    price_lower = df['close'] < df['close'].shift(10)
    rsi_higher = df['rsi'] > df['rsi'].shift(10)
    df['bullish_divergence'] = (price_lower & rsi_higher).astype(int)
    
    # Support/Resistance
    df['pivot'] = (df['high'] + df['low'] + df['close']) / 3
    df['r1'] = 2 * df['pivot'] - df['low']
    df['s1'] = 2 * df['pivot'] - df['high']
    df['dist_to_r1'] = (df['r1'] - df['close']) / df['close']
    df['dist_to_s1'] = (df['close'] - df['s1']) / df['close']
    
    # Risk score
    df['risk_score'] = (
        df['spread_proxy'] * 0.3 +
        (1 / (df['volume_ratio_5'] + 0.1)) * 0.3 +
        df['atr'] / df['close'] * 0.4
    )
    
    return df.fillna(0)

# Apply features
df = apply_feature_engineering(df)
print(f"Created {len(df.columns)} features")
print(f"Shape: {df.shape}")
df.tail()

## 3. Feature Importance Analysis

In [None]:
# Calculate feature importance based on correlation with future returns
def calculate_feature_importance(df, target_periods=[5, 10, 20]):
    """
    Calculate feature importance for different prediction horizons
    """
    importance_dict = {}
    
    feature_cols = [
        'momentum_5', 'momentum_10', 'momentum_20', 'acceleration',
        'dist_from_high_20', 'dist_from_low_20', 'volume_ratio_5',
        'volume_ratio_20', 'spread_proxy', 'bearish_divergence',
        'bullish_divergence', 'risk_score', 'rsi', 'atr'
    ]
    
    for period in target_periods:
        # Calculate future returns
        df[f'future_return_{period}'] = df['close'].shift(-period) / df['close'] - 1
        
        # Calculate correlations
        correlations = {}
        for feature in feature_cols:
            if feature in df.columns:
                corr = df[feature].corr(df[f'future_return_{period}'])
                correlations[feature] = abs(corr) if not pd.isna(corr) else 0
        
        importance_dict[f'{period}_candles'] = correlations
    
    return pd.DataFrame(importance_dict)

importance_df = calculate_feature_importance(df)
importance_df = importance_df.sort_values('10_candles', ascending=False)

print("Feature Importance Results:")
print(importance_df.head(10))

# Plot importance
if len(importance_df) > 0:
    fig, ax = plt.subplots(figsize=(12, 8))
    importance_df.head(10).plot(kind='barh', ax=ax)
    plt.title('Feature Importance for Exit Timing (Top 10)', fontsize=14)
    plt.xlabel('Absolute Correlation with Future Returns')
    plt.ylabel('Features')
    plt.legend(title='Prediction Horizon')
    plt.tight_layout()
    plt.show()

    print("\nTop 5 Most Important Features (10 candles):")
    print(importance_df['10_candles'].head())

## 4. Exit Signal Quality Analysis

In [None]:
def analyze_exit_signals(df):
    """
    Analyze potential exit points based on various conditions
    """
    
    # Define exit conditions
    exit_conditions = {
        'RSI_Overbought': df['rsi'] > 70,
        'RSI_Oversold': df['rsi'] < 30,
        'Bearish_Divergence': df['bearish_divergence'] == 1,
        'Bullish_Divergence': df['bullish_divergence'] == 1,
        'High_Risk_Score': df['risk_score'] > df['risk_score'].quantile(0.8),
        'Low_Volume': df['volume_ratio_5'] < 0.5,
        'Price_At_Resistance': df['dist_to_r1'] < 0.005,
        'Price_At_Support': df['dist_to_s1'] < 0.005,
        'High_Momentum': df['momentum_5'] > 0.02,
        'Negative_Momentum': df['momentum_5'] < -0.02
    }
    
    results = {}
    
    for condition_name, condition in exit_conditions.items():
        # Calculate returns after signal
        signal_indices = df[condition].index
        
        if len(signal_indices) > 0:
            returns_5 = []
            returns_10 = []
            
            for idx in signal_indices[:-20]:  # Avoid end of dataframe
                loc = df.index.get_loc(idx)
                if loc + 10 < len(df):
                    ret_5 = (df.iloc[loc + 5]['close'] - df.iloc[loc]['close']) / df.iloc[loc]['close']
                    ret_10 = (df.iloc[loc + 10]['close'] - df.iloc[loc]['close']) / df.iloc[loc]['close']
                    returns_5.append(ret_5)
                    returns_10.append(ret_10)
            
            results[condition_name] = {
                'count': len(signal_indices),
                'frequency': len(signal_indices) / len(df),
                'avg_return_5': np.mean(returns_5) if returns_5 else 0,
                'avg_return_10': np.mean(returns_10) if returns_10 else 0,
                'win_rate_5': sum(1 for r in returns_5 if r > 0) / len(returns_5) if returns_5 else 0
            }
    
    return pd.DataFrame(results).T

exit_analysis = analyze_exit_signals(df)
exit_analysis = exit_analysis.sort_values('avg_return_10', ascending=False)

print("Exit Signal Analysis:")
print("=" * 80)
print(exit_analysis)

# Visualize
if len(exit_analysis) > 0:
    fig, axes = plt.subplots(1, 2, figsize=(15, 6))

    # Plot 1: Average returns
    exit_analysis[['avg_return_5', 'avg_return_10']].plot(kind='bar', ax=axes[0])
    axes[0].set_title('Average Returns After Exit Signal')
    axes[0].set_ylabel('Return (%)')
    axes[0].axhline(y=0, color='r', linestyle='--', alpha=0.5)

    # Plot 2: Signal frequency vs win rate
    axes[1].scatter(exit_analysis['frequency'], exit_analysis['win_rate_5'], s=100)
    for idx, row in exit_analysis.iterrows():
        axes[1].annotate(idx, (row['frequency'], row['win_rate_5']), fontsize=8)
    axes[1].set_xlabel('Signal Frequency')
    axes[1].set_ylabel('Win Rate (5 candles)')
    axes[1].set_title('Signal Frequency vs Win Rate')
    axes[1].axhline(y=0.5, color='r', linestyle='--', alpha=0.5)

    plt.tight_layout()
    plt.show()

## 5. Multi-Timeframe Alignment Analysis

In [None]:
def analyze_mtf_alignment(df):
    """
    Analyze the quality of multi-timeframe alignment signals
    """
    
    # Simulate MTF conditions
    df['trend_5m'] = (df['ema_9'] > df['ema_21']).astype(int)
    
    # Resample for higher timeframes
    df_15m = df.resample('15T').agg({
        'open': 'first',
        'high': 'max',
        'low': 'min',
        'close': 'last',
        'volume': 'sum'
    })
    
    df_1h = df.resample('1H').agg({
        'open': 'first',
        'high': 'max',
        'low': 'min',
        'close': 'last',
        'volume': 'sum'
    })
    
    # Calculate higher TF trends
    df_15m['ema_9'] = df_15m['close'].ewm(span=9).mean()
    df_15m['ema_21'] = df_15m['close'].ewm(span=21).mean()
    df_15m['trend'] = (df_15m['ema_9'] > df_15m['ema_21']).astype(int)
    
    df_1h['ema_9'] = df_1h['close'].ewm(span=9).mean()
    df_1h['ema_21'] = df_1h['close'].ewm(span=21).mean()
    df_1h['trend'] = (df_1h['ema_9'] > df_1h['ema_21']).astype(int)
    
    # Merge trends back
    df['trend_15m'] = df_15m['trend'].reindex(df.index, method='ffill')
    df['trend_1h'] = df_1h['trend'].reindex(df.index, method='ffill')
    
    # Calculate alignment
    df['mtf_aligned_long'] = (
        (df['trend_5m'] == 1) &
        (df['trend_15m'] == 1) &
        (df['trend_1h'] == 1)
    )
    
    df['mtf_aligned_short'] = (
        (df['trend_5m'] == 0) &
        (df['trend_15m'] == 0) &
        (df['trend_1h'] == 0)
    )
    
    # Analyze performance of aligned signals
    aligned_long_signals = df[df['mtf_aligned_long']].index
    aligned_short_signals = df[df['mtf_aligned_short']].index
    
    results = {
        'Total_Candles': len(df),
        'Aligned_Long_Count': len(aligned_long_signals),
        'Aligned_Short_Count': len(aligned_short_signals),
        'Aligned_Long_%': len(aligned_long_signals) / len(df) * 100,
        'Aligned_Short_%': len(aligned_short_signals) / len(df) * 100
    }
    
    # Calculate returns after alignment
    if len(aligned_long_signals) > 0:
        long_returns = []
        for idx in aligned_long_signals[:-20]:
            loc = df.index.get_loc(idx)
            if loc + 20 < len(df):
                ret = (df.iloc[loc + 20]['close'] - df.iloc[loc]['close']) / df.iloc[loc]['close']
                long_returns.append(ret)
        results['Avg_Long_Return_20'] = np.mean(long_returns) if long_returns else 0
    
    if len(aligned_short_signals) > 0:
        short_returns = []
        for idx in aligned_short_signals[:-20]:
            loc = df.index.get_loc(idx)
            if loc + 20 < len(df):
                ret = (df.iloc[loc]['close'] - df.iloc[loc + 20]['close']) / df.iloc[loc]['close']
                short_returns.append(ret)
        results['Avg_Short_Return_20'] = np.mean(short_returns) if short_returns else 0
    
    return results

mtf_results = analyze_mtf_alignment(df)

print("Multi-Timeframe Alignment Analysis:")
print("=" * 50)
for key, value in mtf_results.items():
    if '%' in key or 'Return' in key:
        print(f"{key}: {value:.2f}%")
    else:
        print(f"{key}: {value:,}")

# Visualize MTF alignment
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(df.index[-500:], df['close'][-500:], label='Price', alpha=0.7)
ax.scatter(df[df['mtf_aligned_long']].index[-50:], df[df['mtf_aligned_long']]['close'][-50:], 
           color='green', label='Aligned Long', alpha=0.8, marker='^')
ax.scatter(df[df['mtf_aligned_short']].index[-50:], df[df['mtf_aligned_short']]['close'][-50:], 
           color='red', label='Aligned Short', alpha=0.8, marker='v')
ax.set_title('MTF Alignment Signals (Last 500 Candles)')
ax.set_ylabel('Price')
ax.legend()
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

## 6. Reward Function Optimization

In [None]:
def simulate_reward_function(df, weights):
    """
    Simulate different reward function weight combinations
    """
    
    # Simulate trades
    trades = []
    in_position = False
    entry_price = 0
    entry_idx = 0
    max_profit = 0
    
    for i in range(100, len(df) - 20):
        row = df.iloc[i]
        
        # Simple entry condition
        if not in_position and row['momentum_5'] > 0 and row['rsi'] < 70:
            in_position = True
            entry_price = row['close']
            entry_idx = i
            max_profit = 0
        
        elif in_position:
            current_profit = (row['close'] - entry_price) / entry_price
            max_profit = max(max_profit, current_profit)
            
            # Calculate reward components
            profit_score = min(10, max(-10, current_profit * 100))
            
            # Drawdown from max profit
            drawdown = max_profit - current_profit if max_profit > 0 else 0
            drawdown_score = -5 * drawdown if drawdown > 0.01 else 5
            
            # Timing score (simplified)
            position_duration = i - entry_idx
            timing_score = 5 if position_duration < 50 else -position_duration * 0.01
            
            # Risk/Reward (simplified)
            rr_score = 3 if current_profit > 0.01 else -1
            
            # Calculate total reward
            total_reward = (
                weights['profit'] * profit_score +
                weights['drawdown'] * drawdown_score +
                weights['timing'] * timing_score +
                weights['risk_reward'] * rr_score
            )
            
            # Exit decision based on reward
            if total_reward < -5 or position_duration > 100 or current_profit > 0.03:
                trades.append({
                    'entry_idx': entry_idx,
                    'exit_idx': i,
                    'duration': position_duration,
                    'profit': current_profit,
                    'max_profit': max_profit,
                    'drawdown': drawdown,
                    'reward': total_reward
                })
                in_position = False
    
    return pd.DataFrame(trades)

# Test different weight combinations
weight_combinations = [
    {'profit': 0.35, 'drawdown': 0.25, 'timing': 0.20, 'risk_reward': 0.20},  # Balanced
    {'profit': 0.50, 'drawdown': 0.20, 'timing': 0.15, 'risk_reward': 0.15},  # Profit focused
    {'profit': 0.25, 'drawdown': 0.40, 'timing': 0.20, 'risk_reward': 0.15},  # Risk focused
    {'profit': 0.30, 'drawdown': 0.20, 'timing': 0.35, 'risk_reward': 0.15},  # Timing focused
]

results_comparison = []

for i, weights in enumerate(weight_combinations):
    trades = simulate_reward_function(df, weights)
    
    if len(trades) > 0:
        results_comparison.append({
            'Config': f"Config_{i+1}",
            'Total_Trades': len(trades),
            'Avg_Profit': trades['profit'].mean() * 100,
            'Win_Rate': (trades['profit'] > 0).mean() * 100,
            'Avg_Duration': trades['duration'].mean(),
            'Max_Drawdown': trades['drawdown'].max() * 100,
            'Sharpe': trades['profit'].mean() / trades['profit'].std() if trades['profit'].std() > 0 else 0
        })

comparison_df = pd.DataFrame(results_comparison)

print("Reward Function Weight Comparison:")
print("=" * 80)
print(comparison_df.to_string())

# Visualize comparison
if len(comparison_df) > 0:
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))

    # Metrics to plot
    metrics = ['Avg_Profit', 'Win_Rate', 'Avg_Duration', 'Sharpe']
    for idx, metric in enumerate(metrics):
        ax = axes[idx // 2, idx % 2]
        if metric in comparison_df.columns:
            comparison_df.plot(x='Config', y=metric, kind='bar', ax=ax, legend=False)
            ax.set_title(metric.replace('_', ' '))
            ax.set_xlabel('')
    
    plt.suptitle('Reward Function Configuration Comparison', fontsize=16)
    plt.tight_layout()
    plt.show()

## 7. Recommendations and Next Steps

In [None]:
print("📊 تحلیل نهایی و توصیه‌ها:")
print("=" * 80)

recommendations = """
1. **ویژگی‌های کلیدی برای خروج:**
   - Momentum indicators (5-20 candles) بیشترین اهمیت
   - Volume patterns نشان‌دهنده کیفیت نقدینگی
   - Divergence signals برای تشخیص reversal

2. **تنظیمات Reward Function:**
   - وزن Profit: 35% (تعادل بین سود و ریسک)
   - وزن Drawdown Control: 25% (کنترل ریسک)
   - وزن Timing: 20% (خروج در زمان مناسب)
   - وزن Risk/Reward: 20% (کیفیت معامله)

3. **بهینه‌سازی‌های پیشنهادی:**
   - استفاده از Ensemble Models برای predictions
   - اضافه کردن Market Regime Detection
   - پیاده‌سازی Adaptive Position Sizing

4. **Risk Management:**
   - Max position duration: 300 candles (25 hours)
   - Emergency exit at -3% loss
   - Breakeven trigger at +2% profit

5. **مراحل بعدی:**
   - Phase 2: آموزش مدل RL با داده‌های 18 ماه
   - Phase 3: Backtesting و walk-forward analysis
   - Phase 4: Paper trading برای 2-4 هفته
   - Phase 5: Live deployment با position sizing محدود
"""

print(recommendations)

# Save analysis results
analysis_results = {
    'data_info': {
        'total_candles': len(df),
        'date_range': f"{df.index[0]} to {df.index[-1]}",
        'price_range': f"${df['close'].min():.0f} - ${df['close'].max():.0f}"
    },
    'feature_count': len(df.columns),
    'best_exit_signals': exit_analysis.head(3).to_dict() if len(exit_analysis) > 0 else {},
    'mtf_alignment': mtf_results,
    'best_reward_config': comparison_df.iloc[0].to_dict() if len(comparison_df) > 0 else {}
}

import json
try:
    with open('user_data/notebooks/feature_analysis_results.json', 'w') as f:
        json.dump(analysis_results, f, indent=2, default=str)
    print("\n✅ Results saved to feature_analysis_results.json")
except:
    print("\n⚠️ Could not save results to file")

print("\n🎯 Analysis Complete! Ready for RL model optimization.")