In [1]:
import empyrical as ep
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pandas_ta as ta
import pyfolio as pf
import plotly.graph_objects as go
import plotly.express as px
import seaborn as sns
import yfinance as yf

import datetime
import warnings
import pytz

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier

from xgboost import XGBClassifier



In [2]:
# Disable future warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=RuntimeWarning)
warnings.simplefilter(action='ignore', category=pd.errors.SettingWithCopyWarning)

In [3]:
# Define tickers

tickers = {
    "^GSPC": "S&P 500",
    "^DJI": "Dow Jones Industrial Average",
    "^IXIC": "NASDAQ Composite",
    "^NYA": "NYSE COMPOSITE (DJ)",
    "^XAX": "NYSE AMEX COMPOSITE INDEX",
    "^BUK100P": "Cboe UK 100",
    "^RUT": "Russell 2000",
    "^FTSE": "FTSE 100",
    "^GDAXI": "DAX PERFORMANCE-INDEX",
    "^FCHI": "CAC 40",
    "^STOXX50E": "ESTX 50 PR.EUR",
    "^N100": "Euronext 100 Index",
    "^BFX": "BEL 20",
    "IMOEX.ME": "MOEX Russia Index",
    "^N225": "Nikkei 225",
    "^HSI": "HANG SENG INDEX",
    "000001.SS": "SSE Composite Index",
    "399001.SZ": "Shenzhen Index",
    "^STI": "STI Index",
    "^AXJO": "S&P/ASX 200",
    "^AORD": "ALL ORDINARIES",
    "^BSESN": "S&P BSE SENSEX",
    "^JKSE": "IDX COMPOSITE",
    "^KLSE": "FTSE Bursa Malaysia KLCI",
    "^NZ50": "S&P/NZX 50 INDEX GROSS (GROSS)",
    "^KS11": "KOSPI Composite Index",
    "^TWII": "TSEC weighted index",
    "^GSPTSE": "S&P/TSX Composite index",
    "^BVSP": "IBOVESPA",
    "^MXX": "IPC MEXICO",
    "^MERV": "MERVAL",
    "^TA125.TA": "TA-125",
    "^JN0U.JO": "Top 40 USD Net TRI Index",
    "^SET.BK": "Stock Exchange of Thailand",
    "TDEX.BK": "ThaiDEX SET50"
}

In [4]:
# Function to fetch data for a ticker
def fetch_data(ticker, start_date, end_date):
    dft = yf.Ticker(ticker)
    df = dft.history(interval="1d", start=start_date, end=end_date)
    df['Ticker'] = ticker
    return df

In [6]:

# Define the start and end dates for the data
startDate = "1990-01-01"
endDate = "2024-01-01"

# Create an empty DataFrame to store all data
all_data = pd.DataFrame()

# tickers_list = ["^SET.BK", "AWC.BK"]

showModelScores = False

gridSearch = False

cvTest = False

df = pd.DataFrame()


# Loop through each ticker
for ticker in tickers_list:
    
    
    df = fetch_data(ticker, startDate, endDate)
    dft = yf.Ticker(ticker)
    timeZone = dft.info.get("timeZoneFullName")
    tickerName = dft.info.get("longName", "Unknown Ticker")
    
    print(f"{ticker} : ({tickerName})")
    
    # df
    
    # Drop columns
    df.drop(columns=['Dividends'], inplace=True)
    # Drop Stock Splits column
    df.drop(columns=['Stock Splits'], inplace=True)

    ## Calculate EMA-12 and EMA-26 using Exponential Weighing Average (EWM)
    # df['EMA-12'] = df['Close'].ewm(span = 12, adjust = False).mean()
    # df['EMA-26'] = df['Close'].ewm(span = 26, adjust = False).mean()

    ## Calculate MACD 
    # df['MACD'] = df['EMA-12'] - df['EMA-26']
    df['MACD'] = ta.macd(df['Close'], fast=12, slow=26, signal=9)['MACD_12_26_9']
    df['Pct_Change'] = df['Close'].pct_change() * 100

    ## Calculate RSI using formula
    ## RSI = 100 – [100 ÷ ( 1 + (Average Gain During Up Periods ÷ Average Loss During Down Periods ))]

    delta = df['Close'].diff()
    gain = (delta.where(delta > 0, 0)).rolling(window=14).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=14).mean()
    rs = gain / loss
    df['RSI'] = 100 - (100 / (1 + rs))

    delta = df['Close'].diff()
    gain = delta.where(delta > 0, 0)
    loss = -delta.where(delta < 0, 0)

    # Calculate the Exponential Moving Average of gains and losses
    avg_gain = gain.ewm(span=14, min_periods=14).mean()
    avg_loss = loss.ewm(span=14, min_periods=14).mean()

    # Calculate the RS and RSI
    rs = avg_gain / avg_loss
    df['RSI_EMA'] = 100 - (100 / (1 + rs))

    df['RSI_ta'] = ta.rsi(df['Close'], length=14)

    df['MA10'] = df.ta.sma(length=10)
    df['MA50'] = df.ta.sma(length=50)
    df['MA200'] = df.ta.sma(length=200)

    ## Calculate Boilinger Bands
    window = 20
    df['MA20'] = df['Close'].rolling(window=window).mean()
    df['std_dev'] = df['Close'].rolling(window=window).std()
    df['Upper_BB'] = df['MA20'] + (df['std_dev'] * 2)
    df['Lower_BB'] = df['MA20'] - (df['std_dev'] * 2)
    
    # Create target variable: 1 if next day's close is higher than today's, else 0
    df['Target'] = (df['Close'].shift(-1) > df['Close']).astype(int)
    
    
    vix = yf.Ticker("^VIX")
    vix_df = vix.history(interval="1d", start=startDate, end=endDate)

    vix_df['Pct_Change'] = vix_df['Close'].pct_change() * 100

    # print(vix_df['Pct_Change'].max())

    # Get the time zone of the stock's exchange
    stock_timezone = pytz.timezone(timeZone)

    # Convert the time zone of the VIX DataFrame to match the time zone of the stock's exchange
    vix_df.index = vix_df.index.tz_convert(stock_timezone)

    # Align the timestamps of the VIX data to match those of your DataFrame
    vix_df = vix_df.reindex(df.index, method='ffill')

    vix_df['AVG'] = ( vix_df['Open'] + vix_df['High'] + vix_df['Low'] + vix_df['Close'] ) / 4
    # Merge VIX close prices into the original DataFrame
    df['VIX'] = vix_df['Close']
   
    df = df.dropna()
        
    # Prepare the feature set and target variable
    X = df[['Pct_Change', 'std_dev', 'MA200', 'MA50', 'MA10', 'MACD', 'RSI_ta', 'VIX']]
    # X = df[['MA200', 'MA50', 'MA10', 'MACD2', 'RSI_ta', 'VIX', 'VIX_35', 'VIX_65']]
    # X = df[['SMA200', 'SMA50', 'SMA10', 'MACD', 'RSI', 'VIX', 'VIX_35', 'VIX_65', 'MA20', 'Lower_BB', 'Upper_BB']]
    y = df['Target']

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

    # Train the RandomForestClassifier
    model = RandomForestClassifier(
        n_estimators=100, 
        random_state=42, 
        max_depth=5, 
        min_samples_leaf=1, 
        min_samples_split=10
    )

    model.fit(X_train, y_train)

    if showModelScores:
        print("Model 1 : Train score : ", model.score(X_train,y_train))
        print("Model 1 : Test score : ", model.score(X_test,y_test))
        
    if cvTest:
        cv_scores = cross_val_score(model, X, y, cv=5)
    
    if showModelScores:
        print("Model 1 : Cross-validation scores:", cv_scores)
        print("Model 1 : Mean cross-validation score:", cv_scores.mean())

    # Define hyperparameters grid for Random Forest
    rf_param_grid = {
        'n_estimators': [100, 200, 300],
        'max_depth': [5, 10, 15],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }

    # Define hyperparameters grid for XGBoost
    xgb_param_grid = {
        'n_estimators': [100, 200, 300],
        'max_depth': [3, 5, 7],
        'learning_rate': [0.1, 0.01, 0.001, 0.005, 0.0025, 0.00125],
        'subsample': [0.8, 0.9],
        'colsample_bytree': [0.8, 0.9],
        'gamma': [0.1, 0.2],
        'reg_alpha': [0.1, 0.2],
        'reg_lambda': [0.1, 0.2]
    }

    # Define models
    # rf = RandomForestClassifier(random_state=42)
    # xgb = XGBClassifier(random_state=42)

    rf = RandomForestClassifier(random_state=42, max_depth=5, min_samples_leaf=1, min_samples_split=5, n_estimators=91)
    xgb = XGBClassifier(random_state=42, colsample_bytree=0.8, gamma=0.1, learning_rate=0.001, max_depth=3, n_estimators=100, reg_alpha=0.1, reg_lambda=0.1, subsample=0.8)

    # Perform GridSearchCV for Random Forest
    if gridSearch:
        rf_grid_search = GridSearchCV(estimator=rf, param_grid=rf_param_grid, cv=5, scoring='accuracy', n_jobs=-1)
        rf_grid_search.fit(X_train, y_train)
        best_rf = rf_grid_search.best_estimator_
    else: 
        rf.fit(X_train, y_train)

    # Perform GridSearchCV for XGBoost
    if gridSearch:
        xgb_grid_search = GridSearchCV(estimator=xgb, param_grid=xgb_param_grid, cv=5, scoring='accuracy', n_jobs=-1)
        xgb_grid_search.fit(X_train, y_train)
        best_xgb = xgb_grid_search.best_estimator_
    else:
        xgb.fit(X_train, y_train)

    # Create the ensemble with best estimators
    ensemble_model = VotingClassifier(estimators=[('rf', rf), ('xgb', xgb)], voting='soft')
    ensemble_model.fit(X_train, y_train)

    # Evaluate the ensemble
    if showModelScores:
        print("Model 2 : Train score:", ensemble_model.score(X_train, y_train))
        print("Model 2 : Test score:", ensemble_model.score(X_test, y_test))

    # Cross-validation
    
    if cvTest:
        ensemble_cv_scores = cross_val_score(ensemble_model, X, y, cv=5)
    
    if showModelScores:
        print("Model 2 : Cross-validation scores:", ensemble_cv_scores)
        print("Model 2 : Mean cross-validation score:", ensemble_cv_scores.mean())

    df['Signal'] = 0
    # df.loc[X_test.index, 'Signal'] = model.predict(X_test) ## Model 1 Random Forest
    # df.loc[X_test.index, 'Signal'] = ensemble_model1.predict(X_test) ## Model 2 Random Forest + XGBoost
    df.loc[X_test.index, 'Signal'] = ensemble_model.predict(X_test) ## Model 3 More precise Random Forest + XGBoost 

    # Initialize the 'Position' column
    df['Position'] = 0


    ## Iterate through the DataFrame to apply the conditions
    holding_position = False

    start_idx = df.index.get_loc(X_test.index[0])

    ## Condition 1
    # for i in range(1, len(df)): 
    for i in range(start_idx, len(df)):
        if df.loc[df.index[i], 'VIX'] > 40:
            # print(df.index[i], '> 40')
            if df.loc[df.index[i], 'VIX'] > 60 and not holding_position:
                df.loc[df.index[i], 'Position'] = 1
                holding_position = True
            elif df.loc[df.index[i], 'VIX'] > 50 and holding_position:
                df.loc[df.index[i], 'Position'] = 0
                holding_position = False
            elif df.loc[df.index[i], 'Signal'] == 1 and not holding_position and df.loc[df.index[i], 'VIX'] < 50:
            # elif not holding_position and df.loc[df.index[i], 'VIX'] < 50:
                df.loc[df.index[i], 'Position'] = 1
                holding_position = True
        elif (df.loc[df.index[i], 'VIX'] < 20) and holding_position:
            ## or df.loc[df.index[i], 'Signal'] == 0
            df.loc[df.index[i], 'Position'] = 0
            holding_position = False
    
    # print(df[df['Signal'] == 1])
    
    # for i in range(1, len(df)): 
    #     if df.loc[df.index[i], 'VIX'] > 20:
    #     # or df.loc[df.index[i], 'Signal'] == 1:
    #         df.loc[df.index[i], 'Position'] = 1
    #         holding_position = True
    #     elif (df.loc[df.index[i], 'VIX'] < 10 ) and holding_position:
    #         ## or df.loc[df.index[i], 'Signal'] == 0
    #         df.loc[df.index[i], 'Position'] = 0
    #         holding_position = False
            
    # df['Position'] = df['Position'].replace(to_replace=0, method=None, limit=None)
    df.loc[df['Position'] == 0, 'Position'] = np.nan

    # Forward fill NaN values in 'Position'
    df['Position'] = df['Position'].fillna(method='ffill')

    # Replace any remaining NaN values with 0 and ensure integer type
    df['Position'] = df['Position'].fillna(0).astype(int)

    # # Calculate strategy returns
    # df['Strategy_Returns'] = df['Position'].shift(1) * df['Close'].pct_change()

    # # Drop NaN values from returns
    # df.dropna(subset=['Strategy_Returns'], inplace=True)

    # returns = df['Strategy_Returns']
    df_test_period = df.loc[X_test.index[0]:]

    # Create a deep copy of the DataFrame to avoid the warning
    df_test_period = df_test_period.copy()

    # Calculate the strategy returns
    df_test_period.loc[:, 'Strategy_Returns'] = df_test_period['Position'].shift(1) * df_test_period['Close'].pct_change()

    # Drop rows with NaN values in 'Strategy_Returns' column
    df_test_period.dropna(subset=['Strategy_Returns'], inplace=True)

    # Extract the 'Strategy_Returns' column
    returns = df_test_period['Strategy_Returns']
    
    cumulative_returns = (1 + returns).cumprod() - 1
    perf_stats = {
        'Annual Return': ep.annual_return(returns),
        'Cumulative Returns': ep.cum_returns_final(returns),
        'Annual Volatility': ep.annual_volatility(returns),
        'Sharpe Ratio': ep.sharpe_ratio(returns),
        'Sortino Ratio': ep.sortino_ratio(returns),
        'Max Drawdown': ep.max_drawdown(returns),
        'Calmar Ratio': ep.calmar_ratio(returns)
    }

    # res = pd.DataFrame()
    
    # res['Ticker'] = ticker
    
    # for metric, value in perf_stats.items():
    #     # print(f"{metric}: {value}") 
    #     res[metric] = value
    
    # Convert perf_stats to a DataFrame
    perf_df = pd.DataFrame(perf_stats, index=[ticker])
    
    perf_df['Index Name'] = tickerName
    
    perf_df['1_Train'] = model.score(X_train,y_train)
    perf_df['1_Test'] = model.score(X_test,y_test)

    if cvTest: 
        perf_df['1_CV'] = cv_scores.mean()
    
    perf_df['2_Train'] = ensemble_model.score(X_train, y_train)
    perf_df['2_Test'] = ensemble_model.score(X_test, y_test)

    if cvTest:
        perf_df['2_CV'] = ensemble_cv_scores.mean()

    perf_df['Test Date'] = str(X_test.index[0])
    
    perf_df = perf_df[['Index Name'] + [col for col in perf_df.columns if col != 'Index Name']]
    
    # Append to all_data DataFrame
    all_data = pd.concat([all_data, perf_df])

    print(f"{ep.annual_return(returns):.5f} {ep.annual_volatility(returns):.5f} {ep.sharpe_ratio(returns):.5f} {ep.max_drawdown(returns):.5f} ")
    

NameError: name 'tickers_list' is not defined

In [None]:
all_data.dropna(inplace=True)

In [None]:
all_data

In [None]:
# all_data.to_csv("indices_return.csv")
all_data.to_excel("indices_return.xlsx", index=True)