In [1]:
import itertools
import pandas as pd
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from load_data import load_csv_to_dict
import alphalens as al

def calculate_mom6_parameterized(master_data, lookback_period=126, winsorize_lower=0.01, winsorize_upper=0.99):
    """
    Parameterized version of MOM6 calculation with simplified parameters
    
    Parameters:
    -----------
    master_data : dict
        Dictionary containing market data with 'master_close' DataFrame
    lookback_period : int
        Number of days to look back for momentum calculation
    winsorize_lower : float
        Lower percentile for winsorization (0 to 1)
    winsorize_upper : float
        Upper percentile for winsorization (0 to 1)
        
    Returns:
    --------
    pd.DataFrame
        Factor values for each asset and date
    """
    if not isinstance(master_data, dict) or 'master_close' not in master_data:
        raise ValueError("master_data must be a dictionary containing 'master_close'")
        
    if not (0 <= winsorize_lower < winsorize_upper <= 1):
        raise ValueError("Invalid winsorization bounds")
        
    # Get close prices
    close = master_data['master_close'].copy()
    
    # Calculate daily returns - handle inf values
    daily_returns = close.pct_change().replace([np.inf, -np.inf], np.nan)
    
    # Calculate momentum using rolling window
    def mom6_calculation(window):
        clean_window = window.dropna()
        if len(clean_window) < lookback_period:
            return np.nan
        try:
            cum_return = (1 + clean_window).prod() - 1
            return cum_return if np.isfinite(cum_return) else np.nan
        except Exception:
            return np.nan
    
    # Apply rolling calculation with progress bar
    factor = pd.DataFrame(index=daily_returns.index, columns=daily_returns.columns)
    for col in tqdm(daily_returns.columns, desc="Calculating momentum"):
        factor[col] = daily_returns[col].rolling(
            window=lookback_period,
            min_periods=lookback_period
        ).apply(mom6_calculation)
    
    # Shift by 1 to avoid look-ahead bias
    factor = factor.shift(1)
    
    # Cross-sectional standardization with minimum sample size check
    min_samples = max(10, int(0.1 * factor.shape[1]))  # At least 10 stocks or 10% of universe
    
    for dt in factor.index:
        valid_data = factor.loc[dt].dropna()
        if len(valid_data) >= min_samples:
            mean = valid_data.mean()
            std = valid_data.std()
            if std > 0:
                factor.loc[dt] = (factor.loc[dt] - mean) / std
                
    # Winsorize extreme values
    valid_data = factor.stack().dropna()
    if len(valid_data) > 0:
        lower = valid_data.quantile(winsorize_lower)
        upper = valid_data.quantile(winsorize_upper)
        factor = factor.clip(lower=lower, upper=upper)
    
    # Add basic statistics logging
    stats_summary = factor.describe()
    print("\nFactor Statistics:")
    print(stats_summary)
    
    missing_pct = factor.isnull().mean().mean() * 100
    print(f"\nMissing Values: {missing_pct:.2f}%")
    
    return factor

def evaluate_performance(factor, price_df):
    """
    Performance evaluation using IC and returns spread with enhanced error handling
    
    Parameters:
    -----------
    factor : pd.DataFrame
        Factor values for each asset and date
    price_df : pd.DataFrame
        Price data for assets
        
    Returns:
    --------
    dict
        Performance metrics including IC and Sharpe ratio
    """
    if factor.isnull().all().all():
        print("Warning: Factor contains all NaN values")
        return {'ic_mean': -np.inf, 'sharpe': -np.inf}
        
    # Prepare data for alphalens
    factor_data = factor.stack().reset_index()
    factor_data.columns = ['date', 'asset', 'factor']
    factor_data = factor_data.set_index(['date', 'asset'])
    
    try:
        # Get clean factor data with more robust parameters
        factor_data_aligned = al.utils.get_clean_factor_and_forward_returns(
            factor=factor_data,
            prices=price_df,
            periods=(1, 5, 10, 20),
            quantiles=5,  # Reduced from 10 to 5 for more robust quantile formation
            max_loss=0.35  # More conservative max loss threshold
        )
        
        # Calculate IC with proper handling of edge cases
        ic = al.performance.factor_information_coefficient(factor_data_aligned)
        ic_mean = ic.mean().mean()
        ic_std = ic.std().mean()
        ic_ir = ic_mean / ic_std if ic_std > 0 else -np.inf
        
        # Calculate returns with proper error handling
        returns = al.performance.factor_returns(factor_data_aligned)
        returns = returns.fillna(0)
        
        if len(returns) > 252:  # Ensure we have at least one year of data
            annual_returns = returns.mean() * 252
            annual_vol = returns.std() * np.sqrt(252)
            sharpe = (annual_returns / annual_vol).mean()
        else:
            print("Warning: Insufficient data for reliable Sharpe ratio calculation")
            sharpe = -np.inf
            
        return {
            'ic_mean': ic_mean if not np.isnan(ic_mean) else -np.inf,
            'ic_ir': ic_ir,
            'sharpe': sharpe,
            'data_coverage': len(returns) / 252  # Years of data coverage
        }
        
    except Exception as e:
        print(f"Error in performance evaluation: {str(e)}")
        return {
            'ic_mean': -np.inf,
            'ic_ir': -np.inf,
            'sharpe': -np.inf,
            'data_coverage': 0
        }

def optimize_parameters(master_data, price_df):
    """
    Parameter optimization with enhanced parameter ranges and progress tracking
    
    Parameters:
    -----------
    master_data : dict
        Dictionary containing market data
    price_df : pd.DataFrame
        Price data for assets
        
    Returns:
    --------
    pd.DataFrame
        Results of parameter optimization
    """
    # Define parameter ranges with more granularity
    lookback_periods = [5, 10, 20]
    winsorize_lowers = [0.01]
    winsorize_uppers = [0.95]
    
    # Create parameter combinations
    param_combinations = list(itertools.product(
        lookback_periods,
        winsorize_lowers,
        winsorize_uppers
    ))
    
    # Filter invalid combinations
    param_combinations = [
        (l, wl, wu) for l, wl, wu in param_combinations
        if wl < wu  # Ensure lower bound is less than upper bound
    ]
    
    results = []
    
    # Use tqdm for progress tracking
    for lookback, win_lower, win_upper in tqdm(param_combinations, desc="Optimizing parameters"):
        print(f"\nTesting: lookback={lookback}, winsorize_lower={win_lower}, winsorize_upper={win_upper}")
        
        try:
            # Calculate factor
            factor = calculate_mom6_parameterized(
                master_data,
                lookback_period=lookback,
                winsorize_lower=win_lower,
                winsorize_upper=win_upper
            )
            
            # Evaluate performance
            perf = evaluate_performance(factor, price_df)
            
            # Store results
            results.append({
                'lookback_period': lookback,
                'winsorize_lower': win_lower,
                'winsorize_upper': win_upper,
                'ic_mean': perf['ic_mean'],
                'ic_ir': perf.get('ic_ir', -np.inf),
                'sharpe': perf['sharpe'],
                'data_coverage': perf.get('data_coverage', 0)
            })
            
        except Exception as e:
            print(f"Error with parameters {lookback}, {win_lower}, {win_upper}: {str(e)}")
            continue
    
    # Convert results to DataFrame and sort by multiple metrics
    results_df = pd.DataFrame(results)
    results_df['combined_score'] = (
        results_df['ic_mean'].rank(pct=True) +
        results_df['sharpe'].rank(pct=True) +
        results_df['data_coverage'].rank(pct=True)
    ) / 3
    
    return results_df.sort_values('combined_score', ascending=False)

def plot_optimization_results(results_df):
    """
    Enhanced visualization of optimization results
    
    Parameters:
    -----------
    results_df : pd.DataFrame
        Results from parameter optimization
        
    Returns:
    --------
    matplotlib.figure.Figure
        Figure containing the plots
    """
    fig = plt.figure(figsize=(15, 10))
    
    # Plot 1: IC Mean vs Lookback Period
    ax1 = plt.subplot(221)
    sns.boxplot(data=results_df, x='lookback_period', y='ic_mean', ax=ax1)
    ax1.set_title('IC Mean vs Lookback Period')
    ax1.set_xlabel('Lookback Period (days)')
    ax1.set_ylabel('IC Mean')
    
    # Plot 2: Sharpe vs Lookback Period
    ax2 = plt.subplot(222)
    sns.boxplot(data=results_df, x='lookback_period', y='sharpe', ax=ax2)
    ax2.set_title('Sharpe Ratio vs Lookback Period')
    ax2.set_xlabel('Lookback Period (days)')
    ax2.set_ylabel('Sharpe Ratio')
    
    # Plot 3: Heatmap of Combined Score
    ax3 = plt.subplot(212)
    pivot_data = results_df.pivot_table(
        index='winsorize_lower',
        columns='lookback_period',
        values='combined_score',
        aggfunc='mean'
    )
    sns.heatmap(pivot_data, annot=True, fmt='.2f', cmap='RdYlBu_r', ax=ax3)
    ax3.set_title('Combined Score Heatmap')
    ax3.set_xlabel('Lookback Period (days)')
    ax3.set_ylabel('Winsorize Lower Bound')
    
    plt.tight_layout()
    return fig

def run_optimization(master_data, price_df):
    """
    Run the complete optimization process with enhanced reporting
    
    Parameters:
    -----------
    master_data : dict
        Dictionary containing market data
    price_df : pd.DataFrame
        Price data for assets
        
    Returns:
    --------
    pd.DataFrame
        Optimization results
    """
    print("Starting parameter optimization...")
    results = optimize_parameters(master_data, price_df)
    
    print("\nTop 5 parameter combinations by combined score:")
    print(results.head().to_string())
    
    print("\nBest parameters by IC:")
    print(results.loc[results['ic_mean'].idxmax()].to_string())
    
    print("\nBest parameters by Sharpe:")
    print(results.loc[results['sharpe'].idxmax()].to_string())
    
    # Create and save plots
    fig = plot_optimization_results(results)
    plt.savefig('optimization_results.png', dpi=300, bbox_inches='tight')
    plt.close()
    
    return results

### import data 

In [2]:
data_path = "/Users/mouyasushi/Desktop/Factor/alpha_lens/alphalens/alphalens/my_research/data"

master_data = load_csv_to_dict (data_path)
price_df = master_data['master_close']

### Split data 

In [3]:
def split_data_time(master_data, price_df, train_ratio=0.7):  # change ratio here 
    """
    Split data into in-sample and out-of-sample periods based on time
    
    Parameters:
    -----------
    master_data : dict
        Dictionary containing OHLC DataFrames
    price_df : pd.DataFrame
        Price DataFrame for factor analysis
    train_ratio : float
        Ratio for training data (e.g., 0.7 for 70-30 split)
        
    Returns:
    --------
    dict : Contains in-sample and out-of-sample data
    """
    # Calculate split point
    split_index = int(len(price_df) * train_ratio)
    split_date = price_df.index[split_index]
    
    print(f"Split date: {split_date}")
    
    # Split master_data
    insample_master = {}
    outsample_master = {}
    
    for key, df in master_data.items():
        insample_master[key] = df.loc[:split_date]
        outsample_master[key] = df.loc[split_date:]
    
    # Split price_df
    insample_price = price_df.loc[:split_date]
    outsample_price = price_df.loc[split_date:]
    
    # Print split info
    print(f"\nIn-sample period: {insample_price.index[0]} to {insample_price.index[-1]}")
    print(f"Out-of-sample period: {outsample_price.index[0]} to {outsample_price.index[-1]}")
    print(f"\nIn-sample shape: {insample_price.shape}")
    print(f"Out-of-sample shape: {outsample_price.shape}")
    
    return {
        'insample': {
            'master_data': insample_master,
            'price_df': insample_price
        },
        'outsample': {
            'master_data': outsample_master,
            'price_df': outsample_price
        }
    }


In [4]:
# Split the data
split_data = split_data_time(master_data, price_df)

# Access the split data
insample_master_data = split_data['insample']['master_data']
insample_price_df = split_data['insample']['price_df']

outsample_master_data = split_data['outsample']['master_data']
outsample_price_df = split_data['outsample']['price_df']


Split date: 2023-11-16 00:00:00

In-sample period: 2021-10-14 00:00:00 to 2023-11-16 00:00:00
Out-of-sample period: 2023-11-16 00:00:00 to 2024-10-14 00:00:00

In-sample shape: (511, 405)
Out-of-sample shape: (219, 405)


### Implmentation OPT

In [5]:
# 1. Use insample data to optimize parameters
results = run_optimization(insample_master_data, insample_price_df)

# 2. Get best parameters from in-sample results
best_params = results.loc[results['ic_mean'].idxmax()]


Starting parameter optimization...


Optimizing parameters:   0%|          | 0/3 [00:00<?, ?it/s]


Testing: lookback=5, winsorize_lower=0.01, winsorize_upper=0.95


Calculating momentum: 100%|██████████| 405/405 [00:21<00:00, 19.06it/s]
Optimizing parameters:   0%|          | 0/3 [01:34<?, ?it/s]


KeyboardInterrupt: 

### Check Optimized perf 

In [None]:
import quantstats as qs
import pandas as pd

def create_performance_report(returns, benchmark_rets=None, positions=None, output_filename='analysis.html'):
    """
    Create a comprehensive performance report using QuantStats.
    
    Parameters:
    -----------
    returns : pd.Series
        Daily returns of the strategy, noncumulative
    benchmark_rets : pd.Series, optional
        Daily returns of the benchmark
    positions : pd.DataFrame, optional
        Daily position values (not directly used in QuantStats but can be analyzed separately)
    output_filename : str, optional
        Name of the output HTML file
    """
    # Extend pandas functionality with QuantStats
    qs.extend_pandas()
    
    # If benchmark is provided, create a full comparison report
    if benchmark_rets is not None:
        # Create HTML report comparing strategy to benchmark
        qs.reports.html(returns, 
                       benchmark_rets, 
                       output=output_filename,
                       title='Strategy Analysis')
    else:
        # Create HTML report for strategy alone
        qs.reports.html(returns, 
                       output=output_filename,
                       title='Strategy Analysis')
    
    # Print basic metrics to console
    print("\nBasic Performance Metrics:")
    print("-------------------------")
    print(f"Sharpe Ratio: {qs.stats.sharpe(returns):.2f}")
    print(f"Max Drawdown: {qs.stats.max_drawdown(returns):.2%}")
    print(f"Win Rate: {qs.stats.win_rate(returns):.2%}")
    
    # If positions are provided, we can analyze them separately
    if positions is not None:
        print("\nPosition Summary:")
        print("----------------")
        print(f"Average Position Count: {positions.count(axis=1).mean():.2f}")
        print(f"Max Position Count: {positions.count(axis=1).max()}")


# For additional specific metrics:
def print_detailed_metrics(returns, benchmark_rets=None):
    """Print detailed performance metrics"""
    print("\nDetailed Metrics:")
    print("----------------")
    metrics = {
        'Annual Return': qs.stats.cagr(returns),
        'Volatility': qs.stats.volatility(returns),
    }
    
    for metric, value in metrics.items():
        if isinstance(value, float):
            print(f"{metric}: {value:.2%}" if 'Duration' not in metric else f"{metric}: {value:.0f} days")



In [None]:
path = '/Users/mouyasushi/Desktop/Factor/alpha_lens/alphalens/alphalens/my_research/data/aligned_benchmark.csv'

# Ensure aligned_benchmark has consistent data types
aligned_benchmark = pd.read_csv(path)

# Correct data types in aligned_benchmark
aligned_benchmark['column_name'] = aligned_benchmark['column_name'].astype(float)  # Update 'column_name' as necessary

# Ensure factor_data is processed correctly
factor_data = optimal_factor.stack().reset_index()
factor_data.columns = ['date', 'asset', 'factor']

# Convert 'factor' to numeric to avoid string-related issues
factor_data['factor'] = pd.to_numeric(factor_data['factor'], errors='coerce')

# Ensure proper alignment of factors and returns
factor_data = factor_data.set_index(['date', 'asset'])

factor_data_aligned = al.utils.get_clean_factor_and_forward_returns(
    factor=factor_data,
    prices=price_df,
    periods=periods,
    quantiles=10,
    max_loss=0.5
)

# Generate returns and positions with proper data handling
optimal_returns, optimal_positions, _ = al.performance.create_pyfolio_input(
    factor_data_aligned,
    period='1D',
    capital=1_000_000,
    long_short=True,
    group_neutral=False,
    equal_weight=True,
    quantiles=[1, 10]
)

# Handle performance report generation
create_performance_report(
    returns=optimal_returns,
    benchmark_rets=aligned_benchmark,
    positions=optimal_positions,
    output_filename='optimized_strategy_analysis.html'
)

# Print additional metrics
print_detailed_metrics(optimal_returns, aligned_benchmark)
