# Get VN Data


In [None]:
import pandas as pd
import yfinance as yf
from vnstock import Vnstock
from typing import List, Dict, Tuple
import numpy as np
from statsmodels.tsa.vector_ar.vecm import coint_johansen
from statsmodels.tsa.stattools import adfuller

import os
import pandas as pd

In [None]:

def get_stock_data(symbols, start_date, end_date, interval='1D'):
    """Fetch closing prices for several symbols into one aligned DataFrame.

    Each symbol is queried through ``Vnstock().stock(..., source='VCI')`` and
    its ``quote.history`` result is reduced to the ``close`` column, indexed
    by the ``time`` column and renamed to the symbol.

    Args:
        symbols: Iterable of ticker symbols to download.
        start_date: Start of the history window, passed through as ``start``.
        end_date: End of the history window, passed through as ``end``.
        interval: Bar interval passed through to ``quote.history``.

    Returns:
        DataFrame indexed by ``time`` with one close-price column per symbol,
        restricted to timestamps where every symbol has a non-missing value.
        An empty DataFrame is returned when ``symbols`` is empty.
    """
    closes = []
    for symbol in symbols:
        # Fetch historical data for the symbol
        stock = Vnstock().stock(symbol=symbol, source='VCI')
        historical_data = stock.quote.history(
            start=start_date,
            end=end_date,
            interval=interval,
        )
        # Keep only the close price, indexed by time and labelled by symbol
        closes.append(
            historical_data[['close', 'time']]
            .set_index('time')
            .rename(columns={'close': symbol})
        )

    if not closes:
        return pd.DataFrame()

    # One inner join over all frames replaces the per-symbol concat + dropna:
    # it keeps only timestamps shared by every symbol and preserves the
    # 'time' index name. The final dropna removes rows where a symbol
    # reported a missing close.
    return pd.concat(closes, axis=1, join='inner').dropna()

In [66]:
# Display the index of the loaded price frame.
# NOTE(review): the captured output (2020-2022, 756 rows) does not match the
# 2024 date range configured in this notebook; it appears to come from an
# earlier run - confirm before relying on it.
data.index

DatetimeIndex(['2020-01-02', '2020-01-03', '2020-01-06', '2020-01-07',
               '2020-01-08', '2020-01-09', '2020-01-10', '2020-01-13',
               '2020-01-14', '2020-01-15',
               ...
               '2022-12-16', '2022-12-19', '2022-12-20', '2022-12-21',
               '2022-12-22', '2022-12-23', '2022-12-27', '2022-12-28',
               '2022-12-29', '2022-12-30'],
              dtype='datetime64[ns]', name='Date', length=756, freq=None)

In [None]:


# Define portfolio
stocks = ['ACB', 'BCM', 'BID', 'BVH', 'CTG', 'FPT', 'GAS', 'GVR', 'HDB', 'HPG', 'LPB', 'MBB', 'MSN', 'MWG',
          'PLX', 'SAB', 'SHB', 'SSI', 'STB', 'TCB', 'TPB', 'VCB', 'VHM', 'VIB', 'VIC', 'VJC', 'VNM', 'VRE',
          'VPB', 'FUEVFVND', 'FUESSVFL', 'E1VFVN30', 'FUEVN100']
etf = 'VN30F1M'
tickers = [etf] + stocks  # Target series first, then the candidate constituents
start_date = '2024-01-01'
end_date = '2024-12-31'
# Build the path with os.path.join so the cache location is not Windows-only.
file_path = os.path.join('data', 'stock_data.csv')

# Load cached prices when available; otherwise fetch them and write the cache.
if os.path.exists(file_path):
    data = pd.read_csv(file_path)
else:
    data = get_stock_data(tickers, start_date, end_date, '1D')
    # get_stock_data returns the timestamps as the index, not as a column.
    # Name the index so the CSV gets a 'time' header, then move it back into a
    # column so both branches hand the same layout to the conversion below.
    data = data.rename_axis('time')
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    data.to_csv(file_path, index=True)
    data = data.reset_index()

data['Date'] = pd.to_datetime(data['time'])
data = data.set_index('Date')

# Parameters
estimation_window = 60  # Days for estimation
threshold = 0.05  # Minimum coefficient size
max_stocks = 6  # Maximum stocks in combination
confidence_level = 1  # Johansen critical-value column: 0=90%, 1=95%, 2=99%
confidence_level_joh_final = 2  # Critical-value column for the final Johansen test (99%)
adf_significance = 0.005  # 0.5% significance for ADF test
std_multiplier = 2.5  # Threshold for residual deviation

# Function to select stocks and ensure stationary residuals
def select_stocks(window_data, etf, stocks, threshold, max_stocks):
    """Greedily build a basket of stocks cointegrated with ``etf``.

    The search uses the Johansen trace statistic (``lr1[0]``) and compares it
    with the critical value in the column selected by the module-level
    ``confidence_level`` (final test: ``confidence_level_joh_final``):

    1. Test ``etf`` pairwise against each stock and keep those that pass.
    2. Start from the strongest pair and repeatedly add the candidate that
       gives the largest passing trace statistic with all-non-negative betas,
       for as long as the statistic keeps increasing.
    3. Re-test the final basket, drop betas whose magnitude does not exceed
       ``threshold``, and require the residual ``etf - sum(beta * stock)`` to
       have an ADF p-value below the module-level ``adf_significance``.

    Args:
        window_data: Price DataFrame with a column for ``etf`` and each stock.
        etf: Column name of the series to be replicated.
        stocks: Candidate column names.
        threshold: Minimum absolute beta for a stock to stay in the basket.
        max_stocks: Maximum number of stocks in the basket.

    Returns:
        Dict mapping stock to beta, or ``None`` when no basket passes every
        test (including the case where ``threshold`` removes every beta).
    """
    data = window_data
    candidates = []

    # Step 1: Pairwise cointegration at the configured confidence column
    for stock in stocks:
        try:
            result = coint_johansen(data[[etf, stock]], det_order=0, k_ar_diff=1)
            if result.lr1[0] > result.cvt[0, confidence_level]:
                candidates.append((stock, result.lr1[0]))
        except Exception as e:
            print(f"Pairwise test failed for {stock}: {e}")
            continue

    if not candidates:
        return None

    # Sort by trace statistic, strongest first
    candidates.sort(key=lambda x: x[1], reverse=True)
    filtered_stocks = [stock for stock, _ in candidates]

    # Step 2: Build combination incrementally
    selected = [candidates[0][0]]
    best_trace_stat = candidates[0][1]

    while len(selected) < max_stocks:
        best_next = None
        best_next_trace = 0

        for stock in filtered_stocks:
            if stock in selected:
                continue
            test_subset = selected + [stock]
            try:
                result = coint_johansen(data[[etf] + test_subset], det_order=0, k_ar_diff=0)
                if result.lr1[0] > result.cvt[0, confidence_level]:
                    # Normalise the first eigenvector on the ETF coefficient
                    # so the betas read as "etf = sum(beta * stock)".
                    evec = result.evec[:, 0]
                    betas = -evec[1:] / evec[0]
                    if all(beta >= 0 for beta in betas):
                        if result.lr1[0] > best_next_trace:
                            best_next_trace = result.lr1[0]
                            best_next = stock
            except Exception as e:
                print(f"Combination test failed: {e}")
                continue

        # Only grow the basket while the trace statistic keeps improving
        if best_next and best_next_trace > best_trace_stat:
            selected.append(best_next)
            best_trace_stat = best_next_trace
        else:
            break

    # Final validation: stricter Johansen test followed by ADF on the residual
    try:
        result = coint_johansen(data[[etf] + selected], det_order=0, k_ar_diff=1)
        if result.lr1[0] > result.cvt[0, confidence_level_joh_final]:
            evec = result.evec[:, 0]
            betas = -evec[1:] / evec[0]
            if all(beta >= 0 for beta in betas):
                selected_betas = {s: b for s, b in zip(selected, betas) if abs(b) > threshold}
                if not selected_betas:
                    # Every beta was below the threshold: there is no basket,
                    # so do not run ADF on the raw ETF series.
                    return None
                residuals = data[etf] - sum(data[s] * b for s, b in selected_betas.items())
                adf_result = adfuller(residuals)
                if adf_result[1] < adf_significance:  # Stationary if p-value < adf_significance
                    return selected_betas
    except Exception as e:
        print(f"Final validation failed: {e}")

    return None

# Rolling strategy with dynamic trading
#
# For every day after the first `estimation_window` rows:
#   1. fit a new combination on the preceding window (at most one per day);
#   2. evaluate every active combination on the current row and retire those
#      whose residual leaves the +/- std_multiplier * std band measured on
#      their own estimation window.
# Every residual (in-sample and out-of-sample) is recorded in `results`.
results = []
active_combinations = []
combination_id = 0

for day in range(estimation_window, len(data)):
    # The window ends just before `day`, so row `day` is never part of the fit.
    estimation_data = data.iloc[day - estimation_window:day]
    selected_betas = select_stocks(estimation_data, etf, stocks, threshold, max_stocks)
    if selected_betas:
        combination_id += 1
        estimation_residuals = estimation_data[etf] - sum(
            estimation_data[s] * b for s, b in selected_betas.items()
        )
        active_combinations.append({
            'id': combination_id,
            'betas': selected_betas,
            'start_day': day,
            'estimation_residuals': estimation_residuals,
            # The in-sample statistics never change, so compute them once
            # instead of on every evaluation day.
            'residual_mean': estimation_residuals.mean(),
            'residual_std': estimation_residuals.std(),
        })
        beta_columns = {f'Beta_{stock}': beta for stock, beta in selected_betas.items()}
        for date, res in estimation_residuals.items():
            results.append({
                'Date': date,
                'Combination_ID': combination_id,
                'Residual': res,
                'Total_Combinations': len(active_combinations),
                'Num_Stocks': len(selected_betas),
                'Is_Estimation': True,  # Flag to distinguish estimation vs trading
                **beta_columns,
            })
        print(f"\n=== New Combination {combination_id} at {data.index[day].date()} ===")
        print(f"{etf} = " + " + ".join([f"{b:.3f}*{s}" for s, b in selected_betas.items()]))

    # Evaluate all active combinations on the current row. Iterate over a
    # copy because combinations may be retired during the loop.
    current_prices = data.iloc[day]
    for comb in active_combinations[:]:
        residual = current_prices[etf] - sum(
            current_prices[s] * b for s, b in comb['betas'].items()
        )

        # Retire the combination once the residual leaves the band
        if abs(residual - comb['residual_mean']) > std_multiplier * comb['residual_std']:
            # Remove by identity: list.remove would compare dicts that hold
            # pandas objects with ==.
            active_combinations[:] = [c for c in active_combinations if c is not comb]
            continue

        # Store results; trading rows are flagged explicitly so the
        # Is_Estimation column never contains missing values.
        results.append({
            'Date': data.index[day],
            'Combination_ID': comb['id'],
            'Residual': residual,
            'Total_Combinations': len(active_combinations),
            'Num_Stocks': len(comb['betas']),
            'Is_Estimation': False,
            **{f'Beta_{stock}': beta for stock, beta in comb['betas'].items()},
        })

# Create results DataFrame
results_df = pd.DataFrame(results)



=== New Combination 1 at 2024-04-02 ===
VN30F1M = 50.022*E1VFVN30 + 7.262*FUESSVFL

=== New Combination 2 at 2024-04-03 ===
VN30F1M = 47.046*E1VFVN30 + 10.915*FUEVN100

=== New Combination 3 at 2024-04-05 ===
VN30F1M = 36.554*E1VFVN30 + 8.885*FUESSVFL + 15.603*FUEVN100

=== New Combination 4 at 2024-04-08 ===
VN30F1M = 34.759*E1VFVN30 + 9.810*FUESSVFL + 17.020*FUEVN100

=== New Combination 5 at 2024-04-09 ===
VN30F1M = 33.909*E1VFVN30 + 11.167*FUESSVFL + 16.631*FUEVN100

=== New Combination 6 at 2024-04-10 ===
VN30F1M = 37.172*E1VFVN30 + 3.374*MBB + 16.958*FUEVN100

=== New Combination 7 at 2024-04-11 ===
VN30F1M = 30.355*E1VFVN30 + 12.299*FUESSVFL + 19.747*FUEVN100

=== New Combination 8 at 2024-04-15 ===
VN30F1M = 30.971*E1VFVN30 + 11.810*FUESSVFL + 19.362*FUEVN100

=== New Combination 9 at 2024-04-16 ===
VN30F1M = 48.846*E1VFVN30 + 7.956*FUESSVFL

=== New Combination 10 at 2024-04-17 ===
VN30F1M = 35.171*E1VFVN30 + 2.860*ACB + 21.319*FUEVN100

=== New Combination 11 at 2024-04-19 =

In [None]:
import pandas as pd
import yfinance as yf
from vnstock import Vnstock
import numpy as np
from statsmodels.tsa.vector_ar.vecm import coint_johansen
from statsmodels.tsa.stattools import adfuller
import os

# Define portfolio
stocks = ['ACB', 'BCM', 'BID', 'BVH', 'CTG', 'FPT', 'GAS', 'GVR', 'HDB', 'HPG', 'LPB', 'MBB', 'MSN', 'MWG',
          'PLX', 'SAB', 'SHB', 'SSI', 'STB', 'TCB', 'TPB', 'VCB', 'VHM', 'VIB', 'VIC', 'VJC', 'VNM', 'VRE',
          'VPB', 'FUEVFVND', 'FUESSVFL', 'E1VFVN30', 'FUEVN100']
etf = 'VN30F1M'
tickers = [etf] + stocks
start_date = '2024-01-01'
end_date = '2024-12-31'
# Build the path with os.path.join so the cache location is not Windows-only.
file_path = os.path.join('data', 'stock_data.csv')

# Load cached prices when available; otherwise fetch them and write the cache.
if os.path.exists(file_path):
    data = pd.read_csv(file_path)
else:
    data = get_stock_data(tickers, start_date, end_date, '1D')
    # The fetched frame carries its timestamps in the index, so `data['time']`
    # would raise KeyError. Name the index (giving the CSV a 'time' header)
    # and turn it back into a column so both branches share one layout.
    data = data.rename_axis('time')
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    data.to_csv(file_path, index=True)
    data = data.reset_index()

data['Date'] = pd.to_datetime(data['time'])
data = data.set_index('Date')

# Parameters
estimation_window = 60  # Days for estimation
min_trading_days = 30  # Minimum days to trade before checking stationarity
threshold = 0.05  # Minimum coefficient size
max_stocks = 6  # Maximum stocks in combination
confidence_level = 1  # 0=90%, 1=95%, 2=99% for pairwise tests
confidence_level_joh_final = 2  # 99% for final Johansen test
adf_significance = 0.005  # 0.5% significance for ADF test during estimation
adf_significance_trading = 0.05  # 5% significance for ADF test during trading

# Function to select stocks and ensure stationary residuals
def select_stocks(window_data, etf, stocks, threshold, max_stocks):
    """Select a basket of stocks whose combination is cointegrated with ``etf``.

    Uses the Johansen trace statistic (``lr1[0]``) against the critical-value
    column given by the module-level ``confidence_level`` for the pairwise and
    incremental steps, and ``confidence_level_joh_final`` for the final test.
    The final basket's residual must also pass an ADF test with a p-value
    below the module-level ``adf_significance``.

    Args:
        window_data: Price DataFrame with a column for ``etf`` and each stock.
        etf: Column name of the series to be replicated.
        stocks: Candidate column names.
        threshold: Minimum absolute beta for a stock to stay in the basket.
        max_stocks: Maximum number of stocks in the basket.

    Returns:
        Dict mapping stock to beta, or ``None`` when no basket passes every
        test (including the case where ``threshold`` removes every beta).
    """
    data = window_data
    candidates = []

    # Step 1: Pairwise cointegration at 95% confidence
    for stock in stocks:
        try:
            result = coint_johansen(data[[etf, stock]], det_order=0, k_ar_diff=1)
            if result.lr1[0] > result.cvt[0, confidence_level]:
                candidates.append((stock, result.lr1[0]))
        except Exception as e:
            print(f"Pairwise test failed for {stock}: {e}")
            continue

    if not candidates:
        return None

    # Sort by trace statistic
    candidates.sort(key=lambda x: x[1], reverse=True)
    filtered_stocks = [stock for stock, _ in candidates]

    # Step 2: Build combination incrementally, seeded with the strongest pair
    selected = [candidates[0][0]]
    best_trace_stat = candidates[0][1]

    while len(selected) < max_stocks:
        best_next = None
        best_next_trace = 0

        for stock in filtered_stocks:
            if stock in selected:
                continue
            test_subset = selected + [stock]
            try:
                result = coint_johansen(data[[etf] + test_subset], det_order=0, k_ar_diff=0)
                if result.lr1[0] > result.cvt[0, confidence_level]:
                    # First eigenvector, normalised on the ETF coefficient
                    evec = result.evec[:, 0]
                    betas = -evec[1:] / evec[0]
                    if all(beta >= 0 for beta in betas):
                        if result.lr1[0] > best_next_trace:
                            best_next_trace = result.lr1[0]
                            best_next = stock
            except Exception as e:
                print(f"Combination test failed: {e}")
                continue

        # Stop as soon as no candidate improves the trace statistic
        if best_next and best_next_trace > best_trace_stat:
            selected.append(best_next)
            best_trace_stat = best_next_trace
        else:
            break

    # Final validation with ADF test
    try:
        result = coint_johansen(data[[etf] + selected], det_order=0, k_ar_diff=1)
        if result.lr1[0] > result.cvt[0, confidence_level_joh_final]:
            evec = result.evec[:, 0]
            betas = -evec[1:] / evec[0]
            if all(beta >= 0 for beta in betas):
                selected_betas = {s: b for s, b in zip(selected, betas) if abs(b) > threshold}
                if not selected_betas:
                    # Nothing survived the threshold; testing the bare ETF
                    # series for stationarity would be meaningless here.
                    return None
                residuals = data[etf] - sum(data[s] * b for s, b in selected_betas.items())
                adf_result = adfuller(residuals)
                if adf_result[1] < adf_significance:  # Stationary at 99.5%
                    return selected_betas
    except Exception as e:
        print(f"Final validation failed: {e}")

    return None

# Rolling strategy with dynamic trading
results = []
active_combinations = []
combination_id = 0

# Walk forward one day at a time: try to open a new combination fitted on the
# preceding window, then update every active combination with today's residual
# and retire those whose accumulated residuals no longer pass the ADF test.
for day in range(estimation_window, len(data)):
    window_start = day - estimation_window
    estimation_data = data.iloc[window_start:day]

    # At most one new combination per day
    opened_today = [comb for comb in active_combinations if comb['start_day'] == day]
    if not opened_today:
        selected_betas = select_stocks(estimation_data, etf, stocks, threshold, max_stocks)
        if selected_betas:
            combination_id += 1
            weighted_basket = sum(estimation_data[s] * b for s, b in selected_betas.items())
            estimation_residuals = estimation_data[etf] - weighted_basket
            active_combinations.append(dict(
                id=combination_id,
                betas=selected_betas,
                start_day=day,
                estimation_residuals=estimation_residuals,
                all_residuals=estimation_residuals.tolist(),  # Seeded with the in-sample residuals
                trading_days=0,  # Days evaluated since the combination opened
            ))

            # Record the in-sample residuals, one row per estimation date
            beta_columns = {f'Beta_{stock}': beta for stock, beta in selected_betas.items()}
            open_count = len(active_combinations)
            for date, res in zip(estimation_data.index, estimation_residuals):
                results.append({
                    'Date': date,
                    'Combination_ID': combination_id,
                    'Residual': res,
                    'Total_Combinations': open_count,
                    'Num_Stocks': len(selected_betas),
                    'Is_Estimation': True,
                    **beta_columns,
                })

            terms = [f"{b:.3f}*{s}" for s, b in selected_betas.items()]
            print(f"\n=== New Combination {combination_id} at {data.index[day].date()} ===")
            print("VN30F1M = " + " + ".join(terms))

    # Evaluate all active combinations against today's prices
    current_prices = data.iloc[day]
    for comb in list(active_combinations):
        if day < comb['start_day']:
            continue

        comb['trading_days'] += 1
        basket_value = sum(current_prices[s] * b for s, b in comb['betas'].items())
        residual = current_prices[etf] - basket_value
        comb['all_residuals'].append(residual)

        # After the warm-up period, re-test stationarity on all residuals so far
        if comb['trading_days'] >= min_trading_days:
            adf_result = adfuller(pd.Series(comb['all_residuals']), autolag='AIC')
            p_value = adf_result[1]
            if p_value >= adf_significance_trading:  # Not stationary at 95%
                active_combinations.remove(comb)
                continue

        # Record the out-of-sample residual
        trade_row = {
            'Date': data.index[day],
            'Combination_ID': comb['id'],
            'Residual': residual,
            'Total_Combinations': len(active_combinations),
            'Num_Stocks': len(comb['betas']),
            'Is_Estimation': False,
        }
        trade_row.update((f'Beta_{stock}', beta) for stock, beta in comb['betas'].items())
        results.append(trade_row)

# Collect every recorded row into a single frame
results_df = pd.DataFrame(results)



=== New Combination 1 at 2024-04-02 ===
VN30F1M = 50.022*E1VFVN30 + 7.262*FUESSVFL

=== New Combination 2 at 2024-04-03 ===
VN30F1M = 47.046*E1VFVN30 + 10.915*FUEVN100

=== New Combination 3 at 2024-04-05 ===
VN30F1M = 36.554*E1VFVN30 + 8.885*FUESSVFL + 15.603*FUEVN100

=== New Combination 4 at 2024-04-08 ===
VN30F1M = 34.759*E1VFVN30 + 9.810*FUESSVFL + 17.020*FUEVN100

=== New Combination 5 at 2024-04-09 ===
VN30F1M = 33.909*E1VFVN30 + 11.167*FUESSVFL + 16.631*FUEVN100

=== New Combination 6 at 2024-04-10 ===
VN30F1M = 37.172*E1VFVN30 + 3.374*MBB + 16.958*FUEVN100

=== New Combination 7 at 2024-04-11 ===
VN30F1M = 30.355*E1VFVN30 + 12.299*FUESSVFL + 19.747*FUEVN100

=== New Combination 8 at 2024-04-15 ===
VN30F1M = 30.971*E1VFVN30 + 11.810*FUESSVFL + 19.362*FUEVN100

=== New Combination 9 at 2024-04-16 ===
VN30F1M = 48.846*E1VFVN30 + 7.956*FUESSVFL

=== New Combination 10 at 2024-04-17 ===
VN30F1M = 35.171*E1VFVN30 + 2.860*ACB + 21.319*FUEVN100

=== New Combination 11 at 2024-04-19 =

In [80]:
# Write the collected rows to 'result.csv' in the current working directory
# (the DataFrame index is written as the first column by default).
results_df.to_csv('result.csv')