# In [None]:
import numpy as np
import pandas as pd


def preprocess_trades(trade_ret):
    """Prepare a returns table for trading, modifying it in place.

    The original index is replaced by consecutive integers, the first row is
    set to 0, and in every column the missing values up to (and including)
    that column's last valid observation are replaced by 0. Missing values
    after the last valid observation are left untouched.

    Args:
        trade_ret: DataFrame of returns; it is mutated by this function.

    Returns:
        Tuple ``(trade_ret, original_index)`` where ``trade_ret`` is the same
        (mutated) object and ``original_index`` is the index it had on entry.
    """
    original_dates = trade_ret.index
    trade_ret.index = np.arange(trade_ret.shape[0])
    trade_ret.iloc[0, :] = 0

    # Only fill gaps before each column's final observation, so the trailing
    # NaN run of a column (e.g. a delisted stock) stays NaN.
    first_row = trade_ret.index[0]
    last_valid = trade_ret.apply(pd.Series.last_valid_index)
    for position, name in enumerate(trade_ret.columns):
        window = slice(first_row, last_valid.iloc[position])
        trade_ret.loc[window, name] = trade_ret.loc[window, name].fillna(0)

    return trade_ret, original_dates


def calculate_normalized_prices(trade_ret):
    """Compound returns into a price-like series.

    Each return ``r`` becomes a gross return ``1 + r`` and the gross returns
    are multiplied cumulatively down the rows, so a column whose first return
    is 0 starts at 1.
    """
    gross_returns = trade_ret + 1.
    return gross_returns.cumprod()


def _plot_pair_signals(pair, spread, connections):
    """Plot a pair's normalized spread and mark each open/close signal.

    Args:
        pair: Row describing the pair; ``leg_1`` and ``leg_2`` appear in the title.
        spread: Normalized spread series indexed by trading-day position.
        connections: List of ``(open_id, close_id)`` tuples, one per trade.
    """
    # NOTE(review): ``plt`` is not imported in this part of the file; it is
    # presumably matplotlib.pyplot imported elsewhere - confirm.
    plt.figure(figsize=(15, 5))
    plt.plot(spread, 'k-', alpha=0.3, label='Spread')

    for i, (op, cl) in enumerate(connections):
        plt.plot([op, cl], [spread.loc[op], spread.loc[cl]], 'b-', alpha=0.5)
        # Label only the first marker of each kind so the legend has one entry each.
        plt.scatter(op, spread.loc[op], color='green',
                    marker='^', s=100, label='Open' if i == 0 else "")
        plt.scatter(cl, spread.loc[cl], color='red',
                    marker='v', s=100, label='Close' if i == 0 else "")

    plt.title(f'Trade Signals for Pair {pair.leg_1}-{pair.leg_2}')
    plt.xlabel('Date')
    plt.ylabel('Normalized Spread')
    plt.axhline(0, color='gray', linestyle='--', alpha=0.5)
    plt.legend()
    plt.grid(alpha=0.2)
    plt.show()


def _build_trade_log(pair_calcs, connections, idx, leg_1, leg_2, wait1d):
    """Summarize every ``(open, close)`` signal pair as one trade-log row (list of dicts)."""
    is_dt = hasattr(idx, "dtype") and "datetime64" in str(idx.dtype)
    trade_rows = []

    for op, cl in connections:
        # Execution begins after the wait: start = op + wait1d + 1; end is inclusive.
        start = int(op + wait1d + 1)
        end = int(cl)
        if end < start:
            continue

        # P&L over the trade window (sum of daily payoffs, NaN ignored).
        pnl = float(np.nansum(pair_calcs.loc[start:end, "payoff"].values))

        # Direction at entry.
        dir_entry = float(pair_calcs.loc[start, "direction"]) if start in pair_calcs.index else np.nan

        if is_dt:
            open_date = pd.to_datetime(idx[op])
            close_date = pd.to_datetime(idx[cl])
            duration_days = (close_date - open_date).days  # calendar days
        else:
            open_date = int(op)
            close_date = int(cl)
            duration_days = int(cl - op)
        bar_count = end - start + 1  # number of rows in the trade window

        trade_rows.append({
            "leg_1": leg_1,
            "leg_2": leg_2,
            "open_id": int(op),
            "close_id": int(cl),
            "open_date": open_date,
            "close_date": close_date,
            "entry_spread": float(pair_calcs.loc[op, "s"]) if op in pair_calcs.index else np.nan,
            "exit_spread": float(pair_calcs.loc[cl, "s"]) if cl in pair_calcs.index else np.nan,
            "direction": dir_entry,
            "duration_bars": int(bar_count),
            "duration_days": int(duration_days),
            "pnl": pnl,  # in return units (decimal)
        })

    return trade_rows


def process_single_pair(pair, trade_prc, trade_ret, d_open, wait1d, last_day):
    """Process trading signals and payoffs for a single pair.

    A position is opened when the absolute normalized spread exceeds
    ``d_open`` and closed at the next row where the sign of the spread
    changes (or at ``last_day``). A figure of the spread and the signals is
    shown for every call.

    Args:
        pair: Row with ``leg_1``, ``leg_2`` (column labels, converted with
            ``int``) and ``spread_std`` (divisor used to normalize the spread).
        trade_prc: DataFrame of normalized prices, one column per stock.
        trade_ret: DataFrame of returns with the same layout as ``trade_prc``.
            # assumes both use a 0..n-1 integer index - TODO confirm
        d_open: Threshold on the absolute normalized spread for opening.
        wait1d: Number of rows to wait after a signal before acting on it.
        last_day: Last index at which the pair may open or hold a position.

    Returns:
        Tuple ``(payoffs, direction, trade_log)``: the daily payoff Series
        (named ``"payoffs"``), the daily direction Series, and a DataFrame
        with one row per trade (empty when no trade was opened).
    """
    print('.......process_single_pair')
    pair_calcs = pd.DataFrame(
        np.zeros((trade_prc.shape[0], 9)),
        columns=["p_1", "p_2", "s", "direction", "w_1", "w_2", "r_1", "r_2", "payoff"])

    # Set up pair data
    leg_1 = int(pair.leg_1)
    leg_2 = int(pair.leg_2)
    pair_calcs["p_1"] = trade_prc.loc[:, leg_1]
    pair_calcs["p_2"] = trade_prc.loc[:, leg_2]
    pair_calcs["r_1"] = trade_ret.loc[:, leg_1]
    pair_calcs["r_2"] = trade_ret.loc[:, leg_2]

    # Calculate normalized spread
    pair_calcs["s"] = (pair_calcs["p_1"] - pair_calcs["p_2"]) / pair.spread_std

    # Open signals: multiplying the index by the boolean mask zeroes out the
    # non-signal rows, which are then dropped (row 0 can therefore never open).
    open_ids = np.array(trade_ret.index * (np.abs(pair_calcs["s"]) > d_open))
    open_ids = open_ids[open_ids != 0]
    open_ids = open_ids[open_ids <= last_day]

    # Close signals: rows where the sign of the spread differs from the
    # previous row, plus last_day as a forced close.
    close_ids = np.array(trade_ret.index[np.sign(pair_calcs["s"]).diff() != 0])
    close_ids = close_ids[~np.isnan(close_ids)]
    close_ids = np.append(close_ids, last_day)

    # Positional column numbers, so weights are written with .iloc on the
    # frame itself rather than through chained assignment on a column.
    w1_col = pair_calcs.columns.get_loc("w_1")
    w2_col = pair_calcs.columns.get_loc("w_2")

    # Process trades
    t_open = open_ids[0] if len(open_ids) != 0 else np.nan
    connections = []
    while not np.isnan(t_open) and t_open < last_day - wait1d:
        entry = t_open + wait1d
        t_close = np.min(close_ids[close_ids > entry])

        # Set trade direction (opposite to the sign of the spread).
        # NOTE(review): the sign is read at t_open - wait1d, not at t_open -
        # confirm this is intended.
        pair_calcs.loc[(entry + 1):(t_close + 1), "direction"] = -np.sign(
            pair_calcs.loc[t_open - wait1d, "s"])
        connections.append((t_open, t_close))

        # Weights: 1 on the entry row, then the running product of (1 + r).
        pair_calcs.iloc[entry:(t_close + 1), w1_col] = np.append(
            1., (1 + pair_calcs["r_1"].iloc[entry:t_close]).cumprod())
        pair_calcs.iloc[entry:(t_close + 1), w2_col] = np.append(
            1., (1 + pair_calcs["r_2"].iloc[entry:t_close]).cumprod())

        # Move to the first open signal after this close, if any.
        later_opens = open_ids[open_ids > t_close]
        t_open = later_opens[0] if later_opens.size else np.nan

    # Payoffs go into the preallocated "payoff" column, which is the column
    # the trade log sums over.
    pair_calcs["payoff"] = pair_calcs["direction"] * (
        pair_calcs["w_1"] * pair_calcs["r_1"] - pair_calcs["w_2"] * pair_calcs["r_2"])

    # Optional, graph individual pair open/close positions
    _plot_pair_signals(pair, pair_calcs["s"], connections)

    # Build a trade log from connections
    trade_rows = _build_trade_log(pair_calcs, connections, trade_ret.index, leg_1, leg_2, wait1d)
    print('trade_rows ---', trade_rows)
    trade_log = pd.DataFrame(trade_rows)

    # The returned payoff Series keeps the name "payoffs" for existing callers.
    return pair_calcs["payoff"].rename("payoffs"), pair_calcs["direction"], trade_log


def calculate_cc_and_fi_returns(payoffs, directions, trade_ret_dates):
    """Calculate committed-capital and fully-invested portfolio returns.

    Args:
        payoffs: DataFrame of daily payoffs, one column per pair. Its index
            is replaced in place by ``trade_ret_dates``.
        directions: DataFrame of daily directions with the same layout; a
            non-zero entry counts as an open pair. Its index is also replaced
            in place.
        trade_ret_dates: Index to assign to both frames.

    Returns:
        Tuple ``(returns_cc, returns_fi)`` of Series indexed by
        ``trade_ret_dates``. ``returns_cc`` is the mean payoff across all
        pairs; ``returns_fi`` is the sum of payoffs divided by the number of
        open pairs on that day (0 when no pair is open).
    """
    payoffs.index = trade_ret_dates
    directions.index = trade_ret_dates

    # Committed capital approach: every pair counts, open or not.
    returns_cc = payoffs.mean(axis=1)

    # Fully invested approach: capital is split across open pairs only.
    # Counts are converted to float up front so the reciprocal is never
    # written into an integer Series; days without open pairs get weight 0.
    num_open_pairs = (directions != 0).sum(axis=1).astype(float)
    weights_fi = (1. / num_open_pairs.where(num_open_pairs > 0)).fillna(0.)

    # Row-wise multiplication does not depend on how the pair columns are labelled.
    returns_fi = payoffs.mul(weights_fi, axis=0).sum(axis=1)

    return returns_cc, returns_fi


def calculate_pairs_returns(trade_ret, pairs, d_open=2, wait1d=1):
    """Main function to calculate pairs trading returns.

    Args:
        trade_ret: DataFrame of returns for the trading period; it is
            preprocessed (and mutated) by ``preprocess_trades``.
        pairs: DataFrame with one row per pair, providing the fields read by
            ``process_single_pair`` (``leg_1``, ``leg_2``, ``spread_std``).
        d_open: Threshold on the absolute normalized spread for opening.
        wait1d: Number of rows to wait after a signal before acting on it.

    Returns:
        Dict with keys ``"pairs"``, ``"directions"``, ``"payoffs"``,
        ``"returns_cc"``, ``"returns_fi"``, ``"trade_log"`` (one row per
        trade across all pairs) and ``"table_4_2"`` (summary statistics
        computed from the trade log).
    """
    # Preprocess returns
    trade_ret, trade_ret_dates = preprocess_trades(trade_ret)

    # Calculate normalized prices
    trade_prc = calculate_normalized_prices(trade_ret)
    trading_days = trade_prc.shape[0]
    num_pairs = pairs.shape[0]

    # Initialize storage. Columns carry the labels of ``pairs`` so that the
    # label-based writes below fill the preallocated columns instead of
    # appending new ones when ``pairs`` does not have a 0..n-1 index.
    payoffs = pd.DataFrame(np.zeros((trading_days, num_pairs)), columns=pairs.index)
    directions = pd.DataFrame(np.zeros((trading_days, num_pairs)), columns=pairs.index)
    trade_logs = []  # collects each pair's trade log

    # Process each pair
    for idx_pair, pair in pairs.iterrows():
        last_day = max(trade_prc[pair.leg_1].last_valid_index(),
                       trade_prc[pair.leg_2].last_valid_index())
        p_payoffs, p_directions, p_trades = process_single_pair(
            pair, trade_prc, trade_ret, d_open, wait1d, last_day)
        payoffs.loc[:, idx_pair] = p_payoffs
        directions.loc[:, idx_pair] = p_directions
        if p_trades is not None and len(p_trades):
            # add pair id so each trade can be traced back to its pair
            p_trades["pair_id"] = idx_pair
            trade_logs.append(p_trades)

    # Calculate portfolio returns
    returns_cc, returns_fi = calculate_cc_and_fi_returns(payoffs, directions, trade_ret_dates)

    # === Build Table 4.2 metrics from the trade log ===
    metric_labels = [
        "Total Number of Trades",
        "Average Trade Duration (days)",
        "Win Rate (%)",
        "Average Profit per Trade (%)"
    ]
    if trade_logs:
        trade_log = pd.concat(trade_logs, ignore_index=True)
        total_trades = len(trade_log)
        avg_duration_days = float(trade_log["duration_bars"].mean())  # or "duration_days" if you prefer calendar
        win_rate = float((trade_log["pnl"] > 0).mean() * 100.0)
        avg_profit_per_trade_pct = float(trade_log["pnl"].mean() * 100.0)  # pnl is decimal -> %
        metric_values = [
            total_trades,
            round(avg_duration_days, 2),
            round(win_rate, 2),
            round(avg_profit_per_trade_pct, 4),
        ]
    else:
        # No trades at all: report a zero count and undefined averages.
        trade_log = pd.DataFrame()
        metric_values = [0, np.nan, np.nan, np.nan]

    perf_table_4_2 = pd.DataFrame({"Value": metric_values}, index=metric_labels)

    return {
        "pairs": pairs,
        "directions": directions,
        "payoffs": payoffs,
        "returns_cc": returns_cc,
        "returns_fi": returns_fi,
        "trade_log": trade_log,           # per-trade details
        "table_4_2": perf_table_4_2       # summary for thesis
    }

##################### these are functions for backtesting ########################
def initialize_backtest_parameters():
    """Return the default backtest configuration as a dict.

    Keys:
        n_formation: formation period in months.
        n_trading: trading period in months.
        num_pairs: number of pairs to select.
        d_open: z-score threshold for opening trades.
        wait1d: days to wait before executing.
    """
    config = dict(
        n_formation=12,
        n_trading=6,
        num_pairs=5,
        d_open=2,
        wait1d=1,
    )
    return config

def initialize_results_storage(dates, n_trading):
    """Create the zero-filled result tables used by the backtest.

    Args:
        dates: Index for the rows of every table.
        n_trading: Number of portfolios; columns are named ``P_1`` .. ``P_n``.

    Returns:
        Dict with three independent DataFrames under the keys
        ``'returns_cc'`` (committed capital returns), ``'returns_fi'``
        (fully invested returns) and ``'num_open_pairs'`` (count of open pairs).
    """
    portfolio_labels = ["P_" + str(k) for k in range(1, n_trading + 1)]

    def blank_frame():
        # A fresh frame per key, so writing to one table never touches another.
        return pd.DataFrame(np.zeros((len(dates), n_trading)),
                            index=dates,
                            columns=portfolio_labels)

    storage = {}
    for key in ('returns_cc', 'returns_fi', 'num_open_pairs'):
        storage[key] = blank_frame()
    return storage

def prepare_monthly_indices(dates):
    """Create monthly indices for formation/trading period segmentation.

    Every date gets an integer id that starts at 0 and increases by one each
    time the calendar month (year and month together) differs from that of
    the previous date, so all dates of the first month share id 0.

    Args:
        dates: Datetime index exposing ``.year`` and ``.month``.

    Returns:
        Series of integer month ids, positionally aligned with ``dates``.
    """
    # Combine year and month so the same month in different years is not merged.
    month_key = pd.Series(dates.year * 12 + dates.month)
    # The first diff is NaN; treat it as "no change" so the first month is id 0
    # for all of its dates rather than only for the very first one.
    month_changed = month_key.diff().fillna(0) != 0
    return month_changed.cumsum()

def run_portfolio_backtest(port_num, month_id, params, ret, vol, results, dates=None):
    """Run the backtest for a single portfolio and store its results.

    Starting at month ``n_formation + port_num`` and stepping by
    ``n_trading`` months, each iteration selects eligible stocks over the
    formation window, picks pairs, trades them over the trading window and
    writes the outcome into ``results``.

    Args:
        port_num: Zero-based portfolio number; also the start offset in months.
        month_id: Series of month ids, one per trading date.
        params: Dict with ``n_formation``, ``n_trading``, ``num_pairs``,
            ``d_open`` and ``wait1d``.
        ret: DataFrame of returns.
        vol: DataFrame of volumes.
        results: Dict of result DataFrames, updated in place by ``store_results``.
        dates: Trading dates matching ``month_id``. When omitted, the index
            of ``results['returns_cc']`` is used.
    """
    if dates is None:
        # The result tables are indexed by the trading dates, so they can
        # stand in for a ``dates`` argument that was not passed.
        dates = results['returns_cc'].index

    port_name = f"P_{port_num+1}"
    print(f"Running portfolio {port_num+1} of {params['n_trading']}")

    unique_months = month_id.unique()
    date_range = np.arange(
        start=params['n_formation'] + port_num,
        stop=len(unique_months) - params['n_trading'] + 1,
        step=params['n_trading']
    )

    for month_idx in date_range:
        # Get formation and trading periods
        form_months = unique_months[month_idx-params['n_formation']:month_idx]
        trade_months = unique_months[month_idx:month_idx+params['n_trading']]

        # Get date ranges
        form_dates = get_date_range(month_id, form_months, dates)
        trade_dates = get_date_range(month_id, trade_months, dates)

        # Select eligible stocks
        form_ret, trade_ret = select_eligible_stocks(ret, vol, form_dates, trade_dates)

        # Select pairs and calculate returns
        # NOTE(review): get_pairs is not defined in this part of the file.
        pairs = get_pairs(form_ret, params['num_pairs'])
        trades = calculate_pairs_returns(
            trade_ret,
            pairs,
            params['d_open'],
            params['wait1d']
        )

        # Store results
        store_results(results, trades, trade_dates, port_name)

def get_date_range(month_id, target_months, dates):
    """Return the daily calendar range spanned by a set of month ids.

    The range starts at the first date whose month id equals the smallest
    target month and ends at the last date whose month id equals the largest
    target month.
    """
    first_month = target_months.min()
    last_month = target_months.max()

    dates_in_first = dates[month_id == first_month]
    dates_in_last = dates[month_id == last_month]

    return pd.date_range(dates_in_first[0], dates_in_last[-1])

def select_eligible_stocks(ret, vol, form_dates, trade_dates):
    """Keep only stocks with complete formation-period data.

    A stock is eligible when, between the first and last formation date, it
    has no missing return and no day with zero (or missing) volume.

    Returns:
        Tuple ``(formation_returns, trading_returns)``: the rows of ``ret``
        for the formation and trading windows, restricted to eligible stocks.
    """
    form_start, form_end = form_dates[0], form_dates[-1]
    trade_start, trade_end = trade_dates[0], trade_dates[-1]

    formation_returns = ret[form_start:form_end]
    # Missing volume is treated the same as zero volume.
    formation_volume = vol[form_start:form_end].fillna(0)

    has_all_returns = formation_returns.notna().all()
    traded_every_day = (formation_volume != 0).all()
    eligible = has_all_returns & traded_every_day

    formation_slice = ret.loc[form_start:form_end, eligible]
    trading_slice = ret.loc[trade_start:trade_end, eligible]
    return formation_slice, trading_slice

def store_results(results, trades, trade_dates, port_name):
    """Write one trading period's outcome into the preallocated result tables.

    For the rows between the first and last trade date, column ``port_name``
    of each table in ``results`` is overwritten with the committed-capital
    returns, the fully-invested returns and the daily number of pairs with a
    non-zero direction. Values are written positionally.
    """
    first_day, last_day = trade_dates[0], trade_dates[-1]
    open_pair_counts = (trades["directions"] != 0).sum(axis=1)

    values_by_table = (
        ('returns_cc', trades["returns_cc"].values),
        ('returns_fi', trades["returns_fi"].values),
        ('num_open_pairs', open_pair_counts.values),
    )
    for table_name, values in values_by_table:
        results[table_name].loc[first_day:last_day, port_name] = values

def main_backtest(dates, ret, vol):
    """Run the complete backtest, one portfolio per trading-period month.

    Args:
        dates: Trading dates; used to index the result tables and to build
            the month ids.
        ret: DataFrame of returns.
        vol: DataFrame of volumes.

    Returns:
        Dict of result DataFrames as created by ``initialize_results_storage``
        and filled by ``run_portfolio_backtest``.
    """
    # Imported here because the module does not import ``time`` at the top.
    import time

    params = initialize_backtest_parameters()
    results = initialize_results_storage(dates, params['n_trading'])
    month_id = prepare_monthly_indices(dates)

    for port_num in range(params['n_trading']):
        tic = time.perf_counter()
        run_portfolio_backtest(port_num, month_id, params, ret, vol, results)
        toc = time.perf_counter()
        print(f"Portfolio completed in {(toc - tic)/60.:0.2f} minutes")

    return results
############################# functions to aggregate returns ###################
def calculate_aggregate_returns(results):
    """Aggregate per-portfolio returns into daily, monthly and smoothed series.

    Args:
        results: Dict with DataFrames under ``'returns_cc'`` and
            ``'returns_fi'`` (one column per portfolio, datetime index).

    Returns:
        Dict with:
            ``'daily'``: DataFrame with columns ``ret_cc`` and ``ret_fi``,
                the mean across portfolios for each day.
            ``'monthly'``: daily returns compounded within each month.
            ``'12_month_ma'``: 12-month rolling mean of the monthly returns.
    """
    strat_returns_cc = results['returns_cc']
    strat_returns_fi = results['returns_fi']

    # Average across all portfolios
    ret_daily = pd.DataFrame({
        'ret_cc': strat_returns_cc.mean(axis=1),
        'ret_fi': strat_returns_fi.mean(axis=1),
    })

    # Monthly compounded returns. Newer pandas names the month-end frequency
    # 'ME'; older versions only know 'M' and reject 'ME' with a ValueError.
    try:
        monthly_groups = ret_daily.resample('ME')
    except ValueError:
        monthly_groups = ret_daily.resample('M')
    ret_monthly = monthly_groups.agg(lambda x: (x + 1).prod() - 1)

    # 12-month moving averages
    ret_12_month = ret_monthly.rolling(12).mean()

    return {
        'daily': ret_daily,
        'monthly': ret_monthly,
        '12_month_ma': ret_12_month
    }

def plot_strategy_returns(returns_data, title_suffix=""):
    """
    Draw monthly returns and their 12-month moving average in two stacked panels.

    The upper panel shows the committed-capital (CC) series, the lower panel
    the fully-invested (FI) series; both share the x axis.

    Args:
        returns_data: Dictionary from calculate_aggregate_returns()
        title_suffix: Optional string to append to each panel title

    Returns:
        The figure that was created (it is also shown).
    """
    fig, axes = plt.subplots(2, 1, figsize=(12, 8), sharex=True)

    # One entry per panel: column, colors, legend labels and title.
    panels = (
        ("ret_cc", "gray", "blue",
         "Monthly return (CC)", "12-month MA (CC)",
         f"Monthly Returns - Committed Capital {title_suffix}"),
        ("ret_fi", "lightcoral", "darkred",
         "Monthly return (FI)", "12-month MA (FI)",
         f"Monthly Returns - Fully Invested {title_suffix}"),
    )

    for axis, (column, raw_color, ma_color, raw_label, ma_label, heading) in zip(axes, panels):
        axis.plot(returns_data['monthly'][column],
                  color=raw_color,
                  label=raw_label)
        axis.plot(returns_data['12_month_ma'][column],
                  color=ma_color,
                  label=ma_label)
        axis.set_title(heading)
        axis.legend()
        axis.grid(alpha=0.3)

    plt.tight_layout()
    plt.show()
    return fig

# Usage Example:
# returns_data = calculate_aggregate_returns(results)
# plot_strategy_returns(returns_data, title_suffix="GGR Strategy with 20 Pairs")
# plt.show()

import pandas as pd
import numpy as np

def calculate_sharpe(returns, risk_free_rate=0.0):
    """Sharpe ratio of daily returns, annualized with 252 periods per year.

    ``risk_free_rate`` is an annual rate; it is divided by 252 before being
    subtracted from each daily return.
    """
    daily_risk_free = risk_free_rate / 252
    excess = returns - daily_risk_free
    scaled_mean = np.sqrt(252) * excess.mean()
    return scaled_mean / excess.std()

def calculate_cagr(returns):
    """Compound annual growth rate, assuming 252 return observations per year."""
    total_growth = (1 + returns).prod()
    years = len(returns) / 252
    annualized_growth = total_growth ** (1 / years)
    return annualized_growth - 1

def max_dd(returns):
    """Maximum Drawdown"""
    p = (1 + returns).cumprod()
    roll_max = p.cummax()
    drawdown = p/roll_max - 1.0
    return -drawdown.min()

def calculate_calmar(returns):
    """Calmar ratio: CAGR divided by the absolute maximum drawdown.

    Returns NaN when the maximum drawdown is exactly 0.
    """
    growth_rate = calculate_cagr(returns)
    worst_drawdown = max_dd(returns)
    if worst_drawdown == 0:
        return np.nan
    return growth_rate / abs(worst_drawdown)

def analyze_performance(daily_returns):
    """Collect the headline performance metrics of a daily return series.

    Returns:
        Dict with keys ``'Sharpe'``, ``'CAGR'``, ``'MaxDD'``, ``'Calmar'``
        and ``'AnnualVol'`` (daily standard deviation scaled by sqrt(252)).
    """
    metrics = {}
    metrics['Sharpe'] = calculate_sharpe(daily_returns)
    metrics['CAGR'] = calculate_cagr(daily_returns)
    metrics['MaxDD'] = max_dd(daily_returns)
    metrics['Calmar'] = calculate_calmar(daily_returns)
    metrics['AnnualVol'] = daily_returns.std() * np.sqrt(252)
    return metrics

def create_performance_summary(backtest_results):
    """Build a formatted performance table for both return conventions.

    The per-portfolio returns under ``'returns_cc'`` and ``'returns_fi'`` are
    averaged across portfolios, analyzed with ``analyze_performance`` and laid
    out with one row per convention. All metric values are converted to
    strings (percentages or two-decimal ratios); rows are sorted by label.
    """
    sources = (
        ('Committed Capital', 'returns_cc'),
        ('Fully Invested', 'returns_fi'),
    )

    # Average across portfolios first, then compute the metrics.
    averaged = {label: backtest_results[key].mean(axis=1) for label, key in sources}
    metrics_by_label = {label: analyze_performance(series) for label, series in averaged.items()}

    # One row per convention, one column per metric.
    summary = pd.DataFrame(metrics_by_label).T

    for column in ('CAGR', 'MaxDD', 'AnnualVol'):
        summary[column] = summary[column].map('{:.2%}'.format)
    for column in ('Sharpe', 'Calmar'):
        summary[column] = summary[column].map('{:.2f}'.format)

    return summary.sort_index()

# Usage Example:
# results = main_backtest(dates, ret, vol)  # Your existing backtest
# performance_report = create_performance_summary(results)

# print("PAIRS TRADING PERFORMANCE REPORT")
# print("="*40)
# print(performance_report)
