# Inputs
Stock symbos, Risk Preferences, and Risk-free rate

In [50]:
# Identify the asset list in Thai stock market (Recommend at least 20 assets)
# The example below is the list of stocks in SET100 index
# Note: We will automatically added GLD: Gold ETF into this symbols
symbols = [
    "AAV", "ADVANC", "AEONTS", "AMATA", "AOT", "AP", "AURA", "AWC", "BA", "BAM",
    "BANPU", "BBL", "BCH", "BCP", "BCPG", "BDMS", "BEM", "BGRIM", "BH", "BJC",
    "BLA", "BTG", "BTS", "CBG", "CCET", "CENTEL", "CHG", "CK", "COM7", "CPALL",
    "CPF", "CPN", "CRC", "DELTA", "DOHOME", "EA", "EGCO", "ERW", "GFPT", "GLOBAL",
    "GPSC", "GULF", "GUNKUL", "HANA", "HMPRO", "ICHI", "IRPC", "IVL", "JAS", "JMART",
    "JMT", "JTS", "KBANK", "KCE", "KKP", "KTB", "KTC", "LH", "M", "MEGA", "MINT",
    "MOSHI", "MTC", "OR", "OSP", "PLANB", "PR9", "PRM", "PTG", "PTT", "PTTEP",
    "PTTGC", "QH", "RATCH", "RCL", "SAWAD", "SCB", "SCC", "SCGP", "SIRI", "SISB",
    "SJWD", "SPALI", "SPRC", "STA", "STECON", "STGT", "TASCO", "TCAP", "TFG",
    "TIDLOR", "TISCO", "TLI", "TOA", "TOP", "TRUE", "TTB", "TU", "VGI", "WHA"
]

In [51]:
# Specify the risk-free rate (Assuming cash deposit with broker give interest at 0.5% per annum)
risk_free_rate = 0.005 # 0.5%

In [52]:
# Reference mapping:
# 0: sideways : set lambda = 1
# 1: Bears: set lambda = 2 (greater than 1)
# 2: Bull set lambda as 0.5 (less than 1)

lambda_map = {0: 1, 1: 2, 2: 0.5}

# Process: Algorithm

In [53]:
import time
# Capture the start time for the entire optimization and output process
start_time = time.time()

## Import Library

In [54]:
import joblib
import numpy as np
import tensorflow as tf
import os
import requests
from bs4 import BeautifulSoup
import pandas as pd
import yfinance as yf
import pandas as pd
from datetime import datetime, timedelta
import cvxpy as cvx
from sklearn.covariance import LedoitWolf

## Import Trained Models

In [55]:
# GitHub raw URL for the trained models
GITHUB_RAW_URL = "https://raw.githubusercontent.com/adisorn242/2026_WQU_CapstoneProject/main/Trained%20Models"

model_files = [
    'scaler.joblib',
    'base_lr.joblib',
    'base_xgb.joblib',
    'base_svm.joblib',
    'base_rf.joblib',
    'base_dnn.keras',
    'meta_learner.joblib'
]

# Download missing files to the local Colab environment
for file in model_files:
    if not os.path.exists(file):
        url = f"{GITHUB_RAW_URL}/{file}".replace(" ", "%20")
        os.system(f"wget {url}")

# Load preprocessing and Scikit-Learn/XGBoost components
scaler = joblib.load('scaler.joblib')
base_lr = joblib.load('base_lr.joblib')
base_xgb = joblib.load('base_xgb.joblib')
base_svm = joblib.load('base_svm.joblib')
base_rf = joblib.load('base_rf.joblib')
meta_learner = joblib.load('meta_learner.joblib')

# Load the DNN model with compile=False to prevent optimizer variable warnings.
# The optimizer is only needed for training; for prediction, we only need the weights.
base_dnn = tf.keras.models.load_model('base_dnn.keras', compile=False)

In [56]:
def voter_predict(X_scaled):
    """
    Manual implementation of the Soft Voting (Majority Voter) logic.
    Calculates the average probability across all 5 base learners and
    applies a 0.5 threshold to determine the market regime.
    """
    # Extract probabilities for the positive class (Regime 1) from Scikit-Learn/XGB models
    p1 = base_lr.predict_proba(X_scaled)[:, 1]
    p2 = base_xgb.predict_proba(X_scaled)[:, 1]
    p3 = base_svm.predict_proba(X_scaled)[:, 1]
    p4 = base_rf.predict_proba(X_scaled)[:, 1]

    # DNN returns the probability directly for binary classification
    p5 = base_dnn.predict(X_scaled, verbose=0).flatten()

    # Compute the arithmetic mean of all model probabilities
    avg_prob = (p1 + p2 + p3 + p4 + p5) / 5

    # Return binary classification (1 for Bull/High Volatility, 0 for Bear/Low Volatility)
    return (avg_prob >= 0.5).astype(int)

## Get Stock and Index data

In [57]:
# Identify the asset list in Thai stock market (Recommend at least 20 assets)
# The example below is the list of stocks in SET100 index
# Note: We will automatically added GLD: Gold ETF into this symbols
# [List of symbols provided above...]

# Add Gold ETF to the list
if "GLD" not in symbols:
    symbols.append("GLD")

# Format tickers for Yahoo Finance using the .BK suffix
all_tickers = [s + ".BK" for s in symbols]

# Calculate date range for exactly 105 weeks
# Enough thinking
end_date = datetime.now()
start_date = end_date - timedelta(weeks=105)

# Fetch weekly data
# interval="1wk" captures weekly candles
# auto_adjust=True handles corporate actions for price consistency
# multi_level_index=False flattens the column headers for direct indexing
data = yf.download(
    all_tickers,
    start=start_date.strftime('%Y-%m-%d'),
    end=end_date.strftime('%Y-%m-%d'),
    interval="1wk",
    auto_adjust=True,
    progress=False,
    multi_level_index=False
)

# Extract Open and Close prices; .tail(105) ensures the total count is exactly 105
df_all_open = data['Open'].tail(105)
df_all_close = data['Close'].tail(105)

# Compute Simple Return: (Close / Open) - 1
df_asset_returns = (df_all_close / df_all_open) - 1

# Integrate the Risk-Free Asset (Must be named 'RiskFree_Rate' for the MVO function)
# risk_free_rate = 0.005 (0.5% per annum)
rf_weekly = risk_free_rate / 52
df_asset_returns['RiskFree_Rate'] = rf_weekly

print(f"Captured {len(df_all_close)} weeks.")
print(f"Total Assets (including Gold and RiskFree_Rate): {len(df_asset_returns.columns)}")

Captured 105 weeks.
Total Assets (including Gold and RiskFree_Rate): 102


  df_asset_returns['RiskFree_Rate'] = rf_weekly


In [58]:
# Download SET Index data using the pre-defined date range
# The ^SET.BK ticker represents the benchmark index for the Stock Exchange of Thailand
df_set_index = yf.download(
    "^SET.BK",
    start=start_date.strftime('%Y-%m-%d'),
    end=end_date.strftime('%Y-%m-%d'),
    interval="1wk",
    auto_adjust=True,
    progress=False,
    multi_level_index=False
)

# Align the index data with the asset data by taking the last 105 weeks
# This ensures consistency for feature engineering and regime detection
df_set_index = df_set_index.tail(105)

print(f"SET Index data ready: {len(df_set_index)} weeks.")

SET Index data ready: 105 weeks.


## Data Preparation

In [59]:
# Compute the weekly simple returns for the SET100 constituents
# Formula: (Close / Open) - 1
df_asset_returns = (df_all_close / df_all_open) - 1

# Check for any missing values and verify shape
print(f"Simple returns calculated. Shape: {df_asset_returns.shape}")

Simple returns calculated. Shape: (105, 101)


In [60]:
# Prepare SET features for model prediction

# 1. Initialize the feature dataframe with the current weekly log return as 'lag1'
# This is named 'SET_log_ret_lag1' to match the training schema
df_SET_features = np.log(df_set_index['Close'] / df_set_index['Open']).to_frame(name='SET_log_ret_lag1')

# 2. Calculate Open-to-High and Open-to-Low log returns for the current period
# Note: Renamed to include '_lag1' to match the feature names seen at fit time
df_SET_features['SET_OH_log_ret_lag1'] = np.log(df_set_index['High'] / df_set_index['Open'])
df_SET_features['SET_OL_log_ret_lag1'] = np.log(df_set_index['Low'] / df_set_index['Open'])

# 3. Generate Lag 2 up to Lag 52 based on the 'SET_log_ret_lag1' column
for i in range(2, 53):
    df_SET_features[f'SET_log_ret_lag{i}'] = df_SET_features['SET_log_ret_lag1'].shift(i-1)

# --- Add technical analysis indicators ---

# 1. MACD (12, 26, 9)
ema_12 = df_set_index['Close'].ewm(span=12, adjust=False, min_periods=12).mean()
ema_26 = df_set_index['Close'].ewm(span=26, adjust=False, min_periods=26).mean()
macd_line = ema_12 - ema_26
macd_histogram = macd_line - macd_line.ewm(span=9, adjust=False, min_periods=9).mean()

# 2. RSI (14-period Wilder's)
delta = df_set_index['Close'].diff()
gain = delta.where(delta > 0, 0)
loss = -delta.where(delta < 0, 0)
avg_gain = gain.ewm(alpha=1/14, min_periods=14, adjust=False).mean()
avg_loss = loss.ewm(alpha=1/14, min_periods=14, adjust=False).mean()
rsi = 100 - (100 / (1 + (avg_gain / avg_loss)))

# 3. Money Flow Index (MFI - 14-period)
tp = (df_set_index['High'] + df_set_index['Low'] + df_set_index['Close']) / 3
mf = tp * df_set_index['Volume']
pos_f = (mf.where(tp > tp.shift(1), 0)).rolling(window=14, min_periods=14).sum()
neg_f = (mf.where(tp < tp.shift(1), 0)).rolling(window=14, min_periods=14).sum()
mfi = 100 - (100 / (1 + (pos_f / neg_f)))

# 4. Average True Range (ATR - 14-period)
tr = pd.concat([df_set_index['High'] - df_set_index['Low'],
                abs(df_set_index['High'] - df_set_index['Close'].shift(1)),
                abs(df_set_index['Low'] - df_set_index['Close'].shift(1))], axis=1).max(axis=1)
atr = tr.rolling(window=14, min_periods=14).mean()

# 5. On-Balance Volume (OBV)
obv = (np.sign(df_set_index['Close'].diff()) * df_set_index['Volume']).fillna(0).cumsum()

# 6. Lagging and Merging (No dropping)
df_temp_ta_raw = pd.DataFrame(index=df_set_index.index)
ta_data = {
    'SET_MACD_lag_1': macd_line,
    'SET_MACD_Hist_lag_1': macd_histogram,
    'SET_RSI_lag_1': rsi,
    'SET_MFI_lag_1': mfi,
    'SET_ATR_lag_1': atr,
    'SET_OBV_lag_1': obv
}

for col_name, data_series in ta_data.items():
    df_temp_ta_raw[col_name] = data_series.shift(1)

# Join the dataframes without dropping NA rows
df_SET_features = df_SET_features.join(df_temp_ta_raw)

# 7. Isolate and Normalize/Scale the last row for prediction
# We use the final row representing the latest state
latest_row = df_SET_features.tail(1)

# Ensure the column order matches exactly what the scaler expects
# (Optional but recommended if the order might have shifted)
latest_row = latest_row[scaler.feature_names_in_]

# Scale the feature vector using the pre-fitted scaler
X_latest_scaled = scaler.transform(latest_row)

print("Feature engineering complete. Last row isolated and scaled.")
print(f"Scaled feature vector shape: {X_latest_scaled.shape}")

Feature engineering complete. Last row isolated and scaled.
Scaled feature vector shape: (1, 60)


## Regime Detection for next period

In [61]:
# 1. Define the list of base learners
base_learners = [base_lr, base_xgb, base_svm, base_rf, base_dnn]

# 2. Convert X_latest_scaled back to a DataFrame to satisfy the "feature names" requirement
# We use the feature names from the scaler to ensure an exact match
X_latest_df = pd.DataFrame(X_latest_scaled, columns=scaler.feature_names_in_)

# 3. Generate Meta-Features
base_preds = []

for model in base_learners:
    if model == base_dnn:
        # Keras doesn't care about feature names in the same way, but we pass the array
        pred = model.predict(X_latest_scaled, verbose=0)
    else:
        # Pass the DataFrame (X_latest_df) to resolve the UserWarning
        pred = model.predict_proba(X_latest_df)

    base_preds.append(pred)

# Combine base predictions into the meta-feature vector
meta_features = np.hstack(base_preds)

# 4. Stacking Prediction (Meta-Learning)
# Note: The meta_learner was likely trained on a raw array of probabilities,
# so we pass the meta_features array directly here.
regime_stacking = meta_learner.predict(meta_features)[0]
prob_stacking = meta_learner.predict_proba(meta_features)[0]

# 5. Voting Prediction (Soft Voting)
avg_probabilities = np.mean(base_preds, axis=0)
regime_voting = np.argmax(avg_probabilities)
prob_voting = avg_probabilities[0]

# Store results
prediction_results = {
    "stacking": {"regime": regime_stacking, "probability": prob_stacking},
    "voting": {"regime": regime_voting, "probability": prob_voting}
}

print(f"Stacking Regime: {regime_stacking} (Confidence: {max(prob_stacking)*100:.2f}%)")
print(f"Voting Regime:   {regime_voting} (Confidence: {max(prob_voting)*100:.2f}%)")

Stacking Regime: 2 (Confidence: 90.93%)
Voting Regime:   2 (Confidence: 45.99%)


## Portfolio Optimization

In [62]:
def get_dynamic_mvo_weights(window_ret, lambda_val, regime):
    """
    Finalized Strategic Logic:
    - Regime 1 (Bear): Gold <= 20%, TOTAL Stocks <= 10%, Risk-Free takes remainder (min 70%).
    - Regime 2 (Bull): Gold <= 5%, Risk-Free <= 5%, Individual Stocks <= 10% (High Conviction).
    - Regime 0 (Neutral): all Assets (Stocks/Gold) <= 5%, Risk-Free <= 100%.
    """
    # 1. Ensure regime is an integer
    regime = int(regime)

    # 2. Data Preparation
    clean_window = window_ret.dropna(axis=1)
    assets = clean_window.columns.tolist()
    n = len(assets)

    if n == 0:
        return np.zeros(len(window_ret.columns))

    # 3. Calculate Mean and Ledoit-Wolf Shrunk Covariance
    mu = clean_window.mean().values
    lw = LedoitWolf()
    Sigma_shrunk = lw.fit(clean_window).covariance_

    # 4. Variables and Base Constraints
    w = cvx.Variable(n)
    constraints = [cvx.sum(w) == 1, w >= 0]

    # Identify indices for stocks
    stock_indices = [i for i, name in enumerate(assets) if name not in ['GLD.BK', 'RiskFree_Rate']]

    # 5. Define Regime-Specific Constraints
    for i, asset_name in enumerate(assets):
        if regime == 1:  # --- BEAR MARKET ---
            if asset_name == 'GLD.BK':
                constraints.append(w[i] <= 0.20)  # Gold cap 20%
            elif asset_name == 'RiskFree_Rate':
                constraints.append(w[i] <= 1.00)
            else:
                constraints.append(w[i] <= 0.05)  # Individual stock cap

        elif regime == 2:  # --- BULL MARKET ---
            if asset_name == 'GLD.BK' or asset_name == 'RiskFree_Rate':
                constraints.append(w[i] <= 0.05)  # Defensive limited to 5%
            else:
                constraints.append(w[i] <= 0.10)  # High conviction stock cap

        else:  # --- NEUTRAL MARKET ---
            if asset_name == 'RiskFree_Rate':
                constraints.append(w[i] <= 1.00)  # Cash remains the flexible buffer
            else:
                # Gold and Stocks are treated equally here (max 5% each)
                constraints.append(w[i] <= 0.05)

    # 6. Apply TOTAL Stock Constraint specifically for BEAR Market
    if regime == 1 and len(stock_indices) > 0:
        constraints.append(cvx.sum(w[stock_indices]) <= 0.10)

    # 7. Objective: Maximize Risk-Adjusted Utility
    # Using psd_wrap to ensure CVXPY recognizes Sigma as Positive Semi-Definite
    risk = cvx.quad_form(w, cvx.psd_wrap(Sigma_shrunk))
    objective = cvx.Maximize(mu @ w - 0.5 * lambda_val * risk)

    # 8. Solve using OSQP solver
    prob = cvx.Problem(objective, constraints)
    prob.solve(solver=cvx.OSQP)

    # 9. Map weights back to the original stock list (filling 0 for dropped NAs)
    final_weights = pd.Series(0.0, index=window_ret.columns)
    if w.value is not None:
        final_weights[assets] = w.value

    return final_weights.values

In [63]:
# 1. Extract predicted regimes from the ensemble results
regime_s = prediction_results["stacking"]["regime"]
regime_v = prediction_results["voting"]["regime"]

# 2. Calculate optimal weights for the Stacking Regime
# Uses the lambda_map defined in the previous input block
weights_stacking = get_dynamic_mvo_weights(
    window_ret=df_asset_returns,
    lambda_val=lambda_map[regime_s],
    regime=regime_s
)

# 3. Calculate optimal weights for the Voting Regime
weights_voting = get_dynamic_mvo_weights(
    window_ret=df_asset_returns,
    lambda_val=lambda_map[regime_v],
    regime=regime_v
)

# 4. Consolidate into a final DataFrame
df_final_weights = pd.DataFrame({
    'Stacking_Weight': weights_stacking,
    'Voting_Weight': weights_voting
}, index=df_asset_returns.columns)

# Output
Optimal Asset Weights

In [64]:
# 1. Capture end time and calculate duration
end_time = time.time()
total_duration = end_time - start_time

# 2. Define descriptive names for regimes
regime_names = {0: "Sideways", 1: "Bear", 2: "Bull"}

# 3. Preparation of results for display
regime_s = prediction_results["stacking"]["regime"]
regime_v = prediction_results["voting"]["regime"]
prob_s = np.max(prediction_results["stacking"]["probability"])
prob_v = np.max(prediction_results["voting"]["probability"])

# 4. Print Configuration and Parameters
print("="*65)
print(f"{'STRATEGY CONFIGURATION & PARAMETERS':^65}")
print("="*65)
print(f"Risk Aversion Mapping (lambda_map):")
for r_code, l_val in lambda_map.items():
    print(f"  - Regime {r_code} ({regime_names[r_code]:8}): lambda = {l_val}")

print(f"\nRisk-Free Rate (Annual): {risk_free_rate:.2%}")
print(f"Risk-Free Rate (Weekly): {(risk_free_rate/52):.6%}")
print("-" * 65)

# 5. Print Model Predictions
print(f"{'MARKET REGIME PREDICTIONS':^65}")
print("-" * 65)
print(f"STACKING MODEL: Regime {regime_s} ({regime_names[regime_s]}) | Conf: {prob_s:.2%}")
print(f"VOTING MODEL:   Regime {regime_v} ({regime_names[regime_v]}) | Conf: {prob_v:.2%}")
print("-" * 65)

# 6. Filter and format the Weight DataFrame
df_output = df_final_weights[(df_final_weights['Stacking_Weight'] > 0.0001) |
                             (df_final_weights['Voting_Weight'] > 0.0001)].copy()
df_output = df_output.sort_values(by='Stacking_Weight', ascending=False)

print(f"Active Portfolio Weights ({len(df_output)} Assets):")
display(df_output.style.format({
    'Stacking_Weight': '{:.2%}',
    'Voting_Weight': '{:.2%}'
}))

# 7. Final Validation and Timing
print("-" * 65)
print(f"Total Portfolio Weight (Stacking): {df_final_weights['Stacking_Weight'].sum():.2%}")
print(f"Total Portfolio Weight (Voting):   {df_final_weights['Voting_Weight'].sum():.2%}")
print("-" * 65)
print(f"TOTAL EXECUTION TIME: {total_duration:.2f} seconds")
print("="*65)

               STRATEGY CONFIGURATION & PARAMETERS               
Risk Aversion Mapping (lambda_map):
  - Regime 0 (Sideways): lambda = 1
  - Regime 1 (Bear    ): lambda = 2
  - Regime 2 (Bull    ): lambda = 0.5

Risk-Free Rate (Annual): 0.50%
Risk-Free Rate (Weekly): 0.009615%
-----------------------------------------------------------------
                    MARKET REGIME PREDICTIONS                    
-----------------------------------------------------------------
STACKING MODEL: Regime 2 (Bull) | Conf: 90.93%
VOTING MODEL:   Regime 2 (Bull) | Conf: 45.99%
-----------------------------------------------------------------
Active Portfolio Weights (11 Assets):


Unnamed: 0_level_0,Stacking_Weight,Voting_Weight
Ticker,Unnamed: 1_level_1,Unnamed: 2_level_1
ADVANC.BK,10.00%,10.00%
CCET.BK,10.00%,10.00%
DELTA.BK,10.00%,10.00%
KBANK.BK,10.00%,10.00%
KKP.BK,10.00%,10.00%
KTB.BK,10.00%,10.00%
STGT.BK,10.00%,10.00%
TRUE.BK,10.00%,10.00%
TFG.BK,10.00%,10.00%
SCB.BK,5.00%,5.00%


-----------------------------------------------------------------
Total Portfolio Weight (Stacking): 100.00%
Total Portfolio Weight (Voting):   100.00%
-----------------------------------------------------------------
TOTAL EXECUTION TIME: 5.16 seconds
