In [19]:
import empyrical as ep
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pandas_ta as ta
import plotly.graph_objects as go
import plotly.express as px
import seaborn as sns
import yfinance as yf

import datetime
import warnings
import pytz

from datetime import datetime

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier

from xgboost import XGBClassifier

plt.style.use("tableau-colorblind10")
%matplotlib inline

In [20]:
# Disable future warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=RuntimeWarning)
warnings.simplefilter(action='ignore', category=pd.errors.SettingWithCopyWarning)

In [21]:
tickers = pd.read_html("https://en.wikipedia.org/wiki/SET50_Index_and_SET100_Index")[0][
    "Symbol"
].values
tickers

array(['ADVANC', 'AOT', 'AWC', 'BANPU', 'BBL', 'BDMS', 'BEM', 'BGRIM',
       'BH', 'BTS', 'CBG', 'CENTEL', 'COM7', 'CPALL', 'CPF', 'CPN', 'CRC',
       'DELTA', 'EA', 'EGCO', 'GLOBAL', 'GPSC', 'GULF', 'HMPRO', 'INTUCH',
       'IVL', 'KBANK', 'KCE', 'KTB', 'KTC', 'LH', 'MINT', 'MTC', 'OR',
       'OSP', 'PTT', 'PTTEP', 'PTTGC', 'RATCH', 'SAWAD', 'SCB', 'SCC',
       'SCGP', 'TISCO', 'TLI', 'TOP', 'TRUE', 'TTB', 'TU', 'WHA'],
      dtype=object)

In [22]:
# Function to fetch data for a ticker
def fetch_data(ticker, start_date, end_date):
    dft = yf.Ticker(ticker)
    df = dft.history(interval="1d", start=start_date, end=end_date)
    df['Ticker'] = ticker
    return df

In [23]:
tickers_list = [ ticker + ".BK" for ticker in tickers ]
tickers_list.insert(0,"TDEX.BK")
tickers_list.insert(0,"^SET.BK")

In [24]:
news = pd.read_csv("counted_news.csv")

In [25]:
news

Unnamed: 0.1,Unnamed: 0,Date,stockList,StockSet,totalCount,maxMentioned,maxCount,3DaySet
0,0,01/01/2024,[],,,,,[]
1,1,02/01/2024,"['AAI', 'AAI', 'AI', 'AI', 'AOT', 'AOT', 'BBL'...","['AAI', 'AI', 'AOT', 'BBL', 'BIG', 'DIF', 'EA'...",40.0,NEW,5.0,"['AAI', 'AI', 'AOT', 'BBL', 'BIG', 'DIF', 'EA'..."
2,2,03/01/2024,"['AAV', 'ADVANC', 'ADVANC', 'ADVANC', 'AI', 'A...","['AAV', 'ADVANC', 'AI', 'ALL', 'AOT', 'AQUA', ...",202.0,KEX,15.0,"['AAI', 'AAV', 'ADVANC', 'AI', 'ALL', 'AOT', '..."
3,3,04/01/2024,"['AAV', 'AAV', 'AAV', 'AEONTS', 'AGE', 'AH', '...","['AAV', 'AEONTS', 'AGE', 'AH', 'AI', 'AIRA', '...",249.0,AOT,9.0,"['AAI', 'AAV', 'ADVANC', 'AEONTS', 'AGE', 'AH'..."
4,4,05/01/2024,"['A5', 'AAV', 'AAV', 'ACG', 'ADVANC', 'AH', 'A...","['A5', 'AAV', 'ACG', 'ADVANC', 'AH', 'AI', 'AI...",291.0,SPRC,17.0,"['A5', 'AAV', 'ACG', 'ADVANC', 'AEONTS', 'AGE'..."
...,...,...,...,...,...,...,...,...
180,180,29/06/2024,"['AAI', 'AAI', 'ADB', 'ADB', 'ADVANC', 'AHC', ...","['AAI', 'ADB', 'ADVANC', 'AHC', 'AI', 'AIT', '...",285.0,EA,49.0,"['24CS', 'A5', 'AAI', 'ACE', 'ADB', 'ADD', 'AD..."
181,181,30/06/2024,"['AI', 'AI', 'AI', 'AI', 'AI', 'AOT', 'AP', 'B...","['AI', 'AOT', 'AP', 'BBGI', 'BDMS', 'BEM', 'BG...",70.0,HEALTH,6.0,"['24CS', 'A5', 'AAI', 'ADB', 'ADD', 'ADVANC', ..."
182,182,01/07/2024,"['ADD', 'ADVANC', 'AI', 'AI', 'AI', 'AI', 'AI'...","['ADD', 'ADVANC', 'AI', 'ALL', 'BGRIM', 'BTC',...",57.0,AI,5.0,"['AAI', 'ADB', 'ADD', 'ADVANC', 'AHC', 'AI', '..."
183,183,02/07/2024,"['ADD', 'ADD', 'ADVANC', 'ADVANC', 'ADVANC', '...","['ADD', 'ADVANC', 'ADVICE', 'AEONTS', 'AI', 'A...",310.0,EA,31.0,"['ADD', 'ADVANC', 'ADVICE', 'AEONTS', 'AI', 'A..."


In [26]:
# Define the start and end dates for the data
startDate = "2016-06-20"
endDate = "2024-06-20"

# Create an empty DataFrame to store all data
all_data = pd.DataFrame()

# tickers_list = ["^SET.BK", "AWC.BK"]

showModelScores = False

gridSearch = False

cvTest = False

df = pd.DataFrame()

# Loop through each ticker
for ticker in tickers_list:

    df = fetch_data(ticker, startDate, endDate)
    dft = yf.Ticker(ticker)
    timeZone = dft.info.get("timeZoneFullName")
    tickerName = dft.info.get("longName", "Unknown Ticker")

    # Drop columns
    df.drop(columns=['Dividends'], inplace=True)
    # Drop Stock Splits column
    df.drop(columns=['Stock Splits'], inplace=True)

    ## Calculate EMA-12 and EMA-26 using Exponential Weighing Average (EWM)
    # df['EMA-12'] = df['Close'].ewm(span = 12, adjust = False).mean()
    # df['EMA-26'] = df['Close'].ewm(span = 26, adjust = False).mean()

    ## Calculate MACD
    # df['MACD'] = df['EMA-12'] - df['EMA-26']
    df['MACD'] = ta.macd(df['Close'], fast=12, slow=26, signal=9)['MACD_12_26_9']
    df['Pct_Change'] = df['Close'].pct_change() * 100

    ## Calculate RSI using formula
    ## RSI = 100 – [100 ÷ ( 1 + (Average Gain During Up Periods ÷ Average Loss During Down Periods ))]

    delta = df['Close'].diff()
    gain = (delta.where(delta > 0, 0)).rolling(window=14).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=14).mean()
    rs = gain / loss
    df['RSI'] = 100 - (100 / (1 + rs))

    delta = df['Close'].diff()
    gain = delta.where(delta > 0, 0)
    loss = -delta.where(delta < 0, 0)

    # Calculate the Exponential Moving Average of gains and losses
    avg_gain = gain.ewm(span=14, min_periods=14).mean()
    avg_loss = loss.ewm(span=14, min_periods=14).mean()

    # Calculate the RS and RSI
    rs = avg_gain / avg_loss
    df['RSI_EMA'] = 100 - (100 / (1 + rs))

    df['RSI_ta'] = ta.rsi(df['Close'], length=14)

    df['MA10'] = df.ta.sma(length=10)
    df['MA50'] = df.ta.sma(length=50)
    df['MA200'] = df.ta.sma(length=200)

    ## Calculate Boilinger Bands
    window = 20
    df['MA20'] = df['Close'].rolling(window=window).mean()
    df['std_dev'] = df['Close'].rolling(window=window).std()
    df['Upper_BB'] = df['MA20'] + (df['std_dev'] * 2)
    df['Lower_BB'] = df['MA20'] - (df['std_dev'] * 2)

    # Create target variable: 1 if next day's close is higher than today's, else 0
    df['Target'] = (df['Close'].shift(-1) > df['Close']).astype(int)

    baro = pd.read_csv('sentiment_score.csv')
    baro['Close'] = baro['Baro_Index']
    baro['BARO-EMA2'] = baro.ta.ema(length=2)
    baro['BARO-EMA3'] = baro.ta.ema(length=3)
    baro['BARO-EMA10'] = baro.ta.ema(length=10)
    baro['BARO-EMA50'] = baro.ta.ema(length=50)
    baro['BARO-EMA200'] = baro.ta.ema(length=200)

    baro['BARO-SMA2'] = baro.ta.sma(length=2)
    baro['Date'] = pd.to_datetime(baro['CreateDate'], format='%d/%m/%Y')
    # Sort the DataFrame by CreateDate
    baro_sorted = baro.sort_values(by='Date')
    # Reset the index if needed
    baro_sorted = baro_sorted.reset_index(drop=True)
    baro_sorted.set_index('Date', inplace=True)
    baro_sorted.index = baro_sorted.index.tz_localize('UTC').tz_convert('Asia/Bangkok')
    baro_sorted.index = baro_sorted.index.normalize()

    df['Baro'] = baro_sorted['Close']
    df['Baro-2'] = baro_sorted['BARO-EMA2']
    df['Baro-S2'] = baro_sorted['BARO-SMA2']
    # df['Baro-10'] = baro_sorted['BARO-EMA10']
    # df['Baro-50'] = baro_sorted['BARO-EMA50']
    # df['Baro-200'] = baro_sorted['BARO-EMA200']

    df = df.dropna()

    # Prepare the feature set and target variable
    X = df[['Pct_Change', 'std_dev', 'MA200', 'MA50', 'MA10', 'MACD', 'RSI_ta', 'Baro']]
    # X = df[['MA200', 'MA50', 'MA10', 'MACD2', 'RSI_ta', 'VIX', 'VIX_35', 'VIX_65']]
    # X = df[['SMA200', 'SMA50', 'SMA10', 'MACD', 'RSI', 'VIX', 'VIX_35', 'VIX_65', 'MA20', 'Lower_BB', 'Upper_BB']]
    y = df['Target']

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

    rf = RandomForestClassifier (
        random_state=42, 
        max_depth=5, 
        min_samples_leaf=1, 
        min_samples_split=5, 
        n_estimators=91
    )

    xgb = XGBClassifier (
        random_state=42, 
        colsample_bytree=0.8, 
        gamma=0.1, 
        learning_rate=0.001, 
        max_depth=3, 
        n_estimators=100, 
        reg_alpha=0.1, 
        reg_lambda=0.1, 
        subsample=0.8
    )

    model = VotingClassifier(estimators=[('rf', rf), ('xgb', xgb)], voting='soft')
    model.fit(X_train, y_train)

    if showModelScores:
        y_pred = model.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        print(f"Accuracy of the model: {acc:.4f}")

        cm = confusion_matrix(y_test, y_pred)
        disp = ConfusionMatrixDisplay(confusion_matrix=cm)
        disp.plot()
        plt.show()

    BaroSD = 13.14880598
    BaroMean = 54.67908753
    BaroCount = 1885

    Baro_MINUS_05_SD = BaroMean - ( 0.5 * BaroSD )
    Baro_MINUS_1_SD = BaroMean - ( 1 * BaroSD )
    Baro_MINUS_15_SD = BaroMean - ( 1.5 * BaroSD )
    Baro_MINUS_2_SD = BaroMean - ( 2 * BaroSD )

    Baro_PLUS_05_SD = BaroMean + ( 0.5 * BaroSD )
    Baro_PLUS_1_SD = BaroMean + ( 1 * BaroSD )
    Baro_PLUS_15_SD = BaroMean + ( 1.5 * BaroSD )
    Baro_PLUS_2_SD = BaroMean + ( 2 * BaroSD )

    # print(f"{Baro_MINUS_1_SD:.3f} | {Baro_PLUS_1_SD:.3f}")
    # print(f"{Baro_MINUS_15_SD:.3f} | {Baro_PLUS_15_SD:.3f}")
    # print(f"{Baro_MINUS_2_SD:.3f} | {Baro_PLUS_2_SD:.3f}")

    df['Signal'] = 0
    # df.loc[X_test.index, 'Signal'] = model.predict(X_test) ## Model 1 Random Forest
    # df.loc[X_test.index, 'Signal'] = ensemble_model1.predict(X_test) ## Model 2 Random Forest + XGBoost
    df.loc[X_test.index, 'Signal'] = model.predict(X_test) ## Model 3 More precise Random Forest + XGBoost 

    # Initialize the 'Position' column
    df['Position'] = 0

    holding_position = False

    order_list = []

    start_idx = df.index.get_loc(X_test.index[0]) ## CHANGE TO X_TEST

    ol = []
    cl = []

    date_list = news["Date"].tolist()

    for i in range(start_idx, len(df)):

        curDate = str(df.index[i].date())

        stockList = []

        date_obj = datetime.strptime(curDate, "%Y-%m-%d")
        curDateDDMMYYYY = date_obj.strftime("%d/%m/%Y")

        curData = news[news["Date"] == curDateDDMMYYYY]

        if curDateDDMMYYYY in date_list:
            # stockList = curData["stockList"].iloc[0]
            stockList = curData["3DaySet"].iloc[0]

        if ticker.replace("^", "")[:-3] not in stockList:
            continue
        elif (
            (df.loc[df.index[i], "Baro-S2"] >= Baro_PLUS_1_SD)
            and not holding_position
        ):
            df.loc[df.index[i], 'Position'] = 1
            holding_position = True
            ol.append(df.index[i])
        elif (
            (df.loc[df.index[i], "Baro-S2"] <= Baro_MINUS_1_SD)
            and holding_position
        ):
            df.loc[df.index[i], 'Position'] = 0
            holding_position = False
            cl.append(df.index[i])

    df.loc[df['Position'] == 0, 'Position'] = np.nan
    df['Position'] = df['Position'].fillna(method='ffill')
    df['Position'] = df['Position'].fillna(0).astype(int)

    df_test_period = df.loc[X_test.index[0]:] ## CHANGE TO X_TEST
    df_test_period = df_test_period.copy()
    df_test_period.loc[:, 'Strategy_Returns'] = df_test_period['Position'].shift(1) * df_test_period['Close'].pct_change()
    df_test_period.dropna(subset=['Strategy_Returns'], inplace=True)

    returns = df_test_period['Strategy_Returns']

    cumulative_returns = (1 + returns).cumprod() - 1
    perf_stats = {
        'Annual Return': ep.annual_return(returns),
        'Cumulative Returns': ep.cum_returns_final(returns),
        'Annual Volatility': ep.annual_volatility(returns),
        'Sharpe Ratio': ep.sharpe_ratio(returns),
        'Sortino Ratio': ep.sortino_ratio(returns),
        'Max Drawdown': ep.max_drawdown(returns),
        'Calmar Ratio': ep.calmar_ratio(returns)
    }

    perf_df = pd.DataFrame(perf_stats, index=[ticker])

    perf_df['Index Name'] = tickerName

    perf_df['1_Train'] = model.score(X_train,y_train)
    perf_df['1_Test'] = model.score(X_test,y_test)

    perf_df['NumOfOrders'] = len(ol)

    perf_df["Test Date"] = str(X_test.index[0])  ## CHANGE TO X_TEST

    perf_df = perf_df[['Index Name'] + [col for col in perf_df.columns if col != 'Index Name']]

    # Append to all_data DataFrame
    all_data = pd.concat([all_data, perf_df])

    print(f"{ticker} | Train : {model.score(X_train, y_train):.4f} | Test : {model.score(X_test, y_test):.4f}")
    print(f"{ep.annual_return(returns):.5f} {ep.annual_volatility(returns):.5f} {ep.sharpe_ratio(returns):.5f} {ep.max_drawdown(returns):.5f}")

^SET.BK | Train : 0.6773 | Test : 0.5087
-0.04294 0.04856 -0.87940 -0.07550
TDEX.BK | Train : 0.6453 | Test : 0.5768
0.00000 0.00000 nan 0.00000
ADVANC.BK | Train : 0.6068 | Test : 0.6087
-0.02456 0.08452 -0.25246 -0.09813
AOT.BK | Train : 0.6301 | Test : 0.6290
-0.03230 0.07681 -0.38899 -0.10448
AWC.BK | Train : 0.6260 | Test : 0.6393
-0.18560 0.19990 -0.92786 -0.22660
BANPU.BK | Train : 0.6301 | Test : 0.6783
-0.16827 0.16605 -1.02564 -0.22237
BBL.BK | Train : 0.6112 | Test : 0.5971
-0.01943 0.07118 -0.24016 -0.06001
BDMS.BK | Train : 0.6308 | Test : 0.6957
-0.01736 0.10290 -0.11943 -0.09715
BEM.BK | Train : 0.6395 | Test : 0.6232
0.08304 0.11609 0.74517 -0.10714
BGRIM.BK | Train : 0.6151 | Test : 0.6164
-0.23252 0.17537 -1.41952 -0.29481
BH.BK | Train : 0.5850 | Test : 0.5942
0.04799 0.11355 0.46648 -0.07364
BTS.BK | Train : 0.6592 | Test : 0.6261
-0.16146 0.25283 -0.55691 -0.27559
CBG.BK | Train : 0.6126 | Test : 0.6029
-0.09602 0.14787 -0.60835 -0.18505
CENTEL.BK | Train : 0.6025 

In [27]:
df.head(4)

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Ticker,MACD,Pct_Change,RSI,RSI_EMA,...,MA20,std_dev,Upper_BB,Lower_BB,Target,Baro,Baro-2,Baro-S2,Signal,Position
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-04-11 00:00:00+07:00,2.337085,2.38232,2.322008,2.337085,47381000,WHA.BK,0.01181,-0.64103,64.285634,41.49843,...,2.333316,0.049858,2.433033,2.233599,0,69.4,58.074206,47.415,0,0
2017-04-12 00:00:00+07:00,2.337085,2.367242,2.337085,2.337085,19199800,WHA.BK,0.009603,0.0,69.230779,41.49843,...,2.337085,0.046922,2.430929,2.243242,0,42.66,47.798069,56.03,0,0
2017-04-17 00:00:00+07:00,2.352163,2.352163,2.322007,2.322007,30437200,WHA.BK,0.006561,-0.645176,54.545389,34.383859,...,2.340101,0.043645,2.427392,2.25281,1,72.11,64.006023,57.385,0,0
2017-04-18 00:00:00+07:00,2.337086,2.367242,2.337086,2.352164,32775600,WHA.BK,0.006508,1.29872,44.444542,52.984643,...,2.343117,0.042196,2.427508,2.258725,0,53.0,56.668674,62.555,0,0


In [28]:
df.head(4)

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Ticker,MACD,Pct_Change,RSI,RSI_EMA,...,MA20,std_dev,Upper_BB,Lower_BB,Target,Baro,Baro-2,Baro-S2,Signal,Position
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-04-11 00:00:00+07:00,2.337085,2.38232,2.322008,2.337085,47381000,WHA.BK,0.01181,-0.64103,64.285634,41.49843,...,2.333316,0.049858,2.433033,2.233599,0,69.4,58.074206,47.415,0,0
2017-04-12 00:00:00+07:00,2.337085,2.367242,2.337085,2.337085,19199800,WHA.BK,0.009603,0.0,69.230779,41.49843,...,2.337085,0.046922,2.430929,2.243242,0,42.66,47.798069,56.03,0,0
2017-04-17 00:00:00+07:00,2.352163,2.352163,2.322007,2.322007,30437200,WHA.BK,0.006561,-0.645176,54.545389,34.383859,...,2.340101,0.043645,2.427392,2.25281,1,72.11,64.006023,57.385,0,0
2017-04-18 00:00:00+07:00,2.337086,2.367242,2.337086,2.352164,32775600,WHA.BK,0.006508,1.29872,44.444542,52.984643,...,2.343117,0.042196,2.427508,2.258725,0,53.0,56.668674,62.555,0,0


In [29]:
all_data.dropna(inplace=True)

In [30]:
all_data.head(4)

Unnamed: 0,Index Name,Annual Return,Cumulative Returns,Annual Volatility,Sharpe Ratio,Sortino Ratio,Max Drawdown,Calmar Ratio,1_Train,1_Test,NumOfOrders,Test Date
^SET.BK,SET_SET Index,-0.042939,-0.057988,0.048563,-0.879399,-1.141807,-0.075495,-0.568769,0.677349,0.508721,1,2022-12-22 00:00:00+07:00
ADVANC.BK,Advanced Info Service Public Company Limited,-0.024562,-0.033378,0.084522,-0.252458,-0.407996,-0.098128,-0.250306,0.606831,0.608696,1,2022-12-22 00:00:00+07:00
AOT.BK,Airports of Thailand Public Company Limited,-0.032296,-0.043825,0.076807,-0.38899,-0.509851,-0.104478,-0.309117,0.630087,0.628986,1,2022-12-22 00:00:00+07:00
AWC.BK,Asset World Corp Public Company Limited,-0.185598,-0.137804,0.1999,-0.927856,-1.32228,-0.226604,-0.819039,0.626027,0.639344,1,2023-08-25 00:00:00+07:00


In [31]:
averages = {
    col: all_data[col].mean()
    for col in [
        "Annual Return",
        "Cumulative Returns",
        "Annual Volatility",
        "Sharpe Ratio",
        "1_Test",
        "NumOfOrders",
    ]
}

max_key_length = max(len(key) for key in averages.keys())

for key, value in averages.items():
    print(f"{key.ljust(max_key_length)} : {' 'if value >= 0 else''}{value:.4f}")

Annual Return      : -0.0479
Cumulative Returns : -0.0508
Annual Volatility  :  0.1299
Sharpe Ratio       : -0.2976
1_Test             :  0.6185
NumOfOrders        :  1.0000


In [32]:
# all_data.to_csv("set50_return.csv")
all_data.to_excel("BaroSET50fl.xlsx", index=True)

In [33]:
df2 = all_data

In [34]:
Q1 = df2['Annual Return'].quantile(0.25)
Q3 = df2['Annual Return'].quantile(0.75)

# Compute the IQR (Interquartile Range)
IQR = Q3 - Q1

# Define lower and upper bound
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

df_filtered = df2

# Filter the DataFrame to remove outliers
df_filtered = df2[(df2['Annual Return'] >= lower_bound) & (df2['Annual Return'] <= upper_bound)]

In [35]:
# df_filtered.to_excel("FilteredSET100wh.xlsx", index=True)