In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler

# Import data

In [2]:
# Raw-file GitHub URLs for the two input datasets
url_econ = 'https://raw.githubusercontent.com/adisorn242/2026_WQU_CapstoneProject/main/ThaiLeadingEconomicIndicators.csv'
url_set = 'https://raw.githubusercontent.com/adisorn242/2026_WQU_CapstoneProject/main/SET_price.csv'

# Read each CSV straight from its URL (economic indicators first, then SET prices)
econ, SET = (pd.read_csv(source) for source in (url_econ, url_set))

In [3]:
# Give the third column of `econ` (position 2) a clean, code-friendly name
awkward_name = econ.columns[2]
econ = econ.rename(
    columns={awkward_name: 'AuthorizedCapitalofNewlyRegisteredCompanies'}
)

In [4]:
# Numeric price/volume fields, in the order they appear after the date column
price_cols = ['Close', 'High', 'Low', 'Open', 'Volume']

# 1. Discard the two extra header rows (labels 0 and 1) and name the columns
SET = SET.drop([0, 1])
SET.columns = ['Date'] + price_cols

# 2. Cast the numeric columns from object to float
# 3. Parse the dates and use them as the index
SET = (
    SET.astype({col: float for col in price_cols})
       .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
       .set_index('Date')
)
SET.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 168 entries, 2012-01-01 to 2025-12-01
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Close   168 non-null    float64
 1   High    168 non-null    float64
 2   Low     168 non-null    float64
 3   Open    168 non-null    float64
 4   Volume  168 non-null    float64
dtypes: float64(5)
memory usage: 7.9 KB


# Data preparation

## Leading Economic Indicators

In [5]:
# Inspect the raw indicator frame: column names, non-null counts and dtypes
econ.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 143 entries, 0 to 142
Data columns (total 7 columns):
 #   Column                                       Non-Null Count  Dtype  
---  ------                                       --------------  -----  
 0   Date                                         143 non-null    object 
 1   LeadingEconomicIndex                         143 non-null    float64
 2   AuthorizedCapitalofNewlyRegisteredCompanies  143 non-null    float64
 3   ConstructionAreasPermitted                   143 non-null    float64
 4   Exports                                      143 non-null    float64
 5   NumberofForeignTourists                      143 non-null    float64
 6   MoneySupply                                  143 non-null    float64
dtypes: float64(6), object(1)
memory usage: 7.9+ KB


In [6]:
# Work on a new frame (leaving `econ` untouched): parse the Date strings
# and promote them to a datetime index
df_econ = (
    econ.assign(Date=pd.to_datetime(econ['Date']))
        .set_index('Date')
)

In [7]:
# Lag every indicator by one calendar month so that the value recorded for
# month t only becomes a feature in month t+1 (accounts for the reporting delay).
#
# The shift is applied to the *index* (freq=...) rather than to row positions:
#   * no hard-coded placeholder row filled with zeros has to be appended,
#   * a real observation can never be overwritten by that placeholder,
#   * a missing month in the source file cannot silently misalign the lag.
# For a contiguous monthly series this yields the same rows as appending one
# extra month and calling shift(1).
df_econ = df_econ.sort_index().shift(1, freq=pd.DateOffset(months=1))

# Filter to start from Jan 2016
df_econ = df_econ.loc['2016-01-01':]

In [8]:
# Check the lagged, date-filtered indicator frame (row count, date range, nulls)
df_econ.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 120 entries, 2016-01-01 to 2025-12-01
Data columns (total 6 columns):
 #   Column                                       Non-Null Count  Dtype  
---  ------                                       --------------  -----  
 0   LeadingEconomicIndex                         120 non-null    float64
 1   AuthorizedCapitalofNewlyRegisteredCompanies  120 non-null    float64
 2   ConstructionAreasPermitted                   120 non-null    float64
 3   Exports                                      120 non-null    float64
 4   NumberofForeignTourists                      120 non-null    float64
 5   MoneySupply                                  120 non-null    float64
dtypes: float64(6)
memory usage: 6.6 KB


# Feature Engineering

## Leading Economic Indicators

In [9]:
# The raw indicator columns are on very different scales, so each column is
# standardised before it is used as a feature.
# NOTE(review): the scaler is fitted on every row of df_econ, so each month is
# scaled with statistics that include later months — confirm this is acceptable
# for the downstream train/test split.

# Fit the StandardScaler on the indicator frame
scaler = StandardScaler()
scaler.fit(df_econ)

# Wrap the scaled array in a DataFrame again so the dates and the
# column names are carried over unchanged
df_econ_scaled = pd.DataFrame(
    data=scaler.transform(df_econ),
    columns=df_econ.columns,
    index=df_econ.index,
)

In [10]:
# Confirm the scaled frame kept the same index, columns and non-null counts
df_econ_scaled.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 120 entries, 2016-01-01 to 2025-12-01
Data columns (total 6 columns):
 #   Column                                       Non-Null Count  Dtype  
---  ------                                       --------------  -----  
 0   LeadingEconomicIndex                         120 non-null    float64
 1   AuthorizedCapitalofNewlyRegisteredCompanies  120 non-null    float64
 2   ConstructionAreasPermitted                   120 non-null    float64
 3   Exports                                      120 non-null    float64
 4   NumberofForeignTourists                      120 non-null    float64
 5   MoneySupply                                  120 non-null    float64
dtypes: float64(6)
memory usage: 6.6 KB


## SET Index

In [11]:
# Monthly log returns, all measured from the month's Open: ln(Price / Open)
open_px = SET['Open']
set_return = np.log(SET['Close'] / open_px)   # Open-to-Close: tradable return over the month
oh_log_ret = np.log(SET['High'] / open_px)    # Open-to-High
ol_log_ret = np.log(SET['Low'] / open_px)     # Open-to-Low

# Assemble the columns in their final order: the return itself, its lags 1..12,
# then the one-month lags of the Open-to-High / Open-to-Low returns.
# The unlagged OH/OL series are intentionally never stored in the frame,
# so they cannot leak same-month information into the features.
return_features = {'SET_return': set_return}
return_features.update(
    {f'SET_lag_{k}': set_return.shift(k) for k in range(1, 13)}
)
return_features['SET_OH_lag_1'] = oh_log_ret.shift(1)
return_features['SET_OL_lag_1'] = ol_log_ret.shift(1)

df_SET_ret = pd.DataFrame(return_features)

In [12]:
# Technical indicators computed on the SET price/volume series.
# Each indicator is standardised and then lagged by one period, so a row only
# carries indicator values from the previous period.

# 1. MACD (12, 26, 9) - min_periods keeps the warm-up rows as NaN
# MACD is traditionally considered stable after ~26-30 periods
ema_12 = SET['Close'].ewm(span=12, adjust=False, min_periods=12).mean()
ema_26 = SET['Close'].ewm(span=26, adjust=False, min_periods=26).mean()
macd_line = ema_12 - ema_26
# Histogram = MACD line minus its 9-period EMA (the signal line)
macd_histogram = macd_line - macd_line.ewm(span=9, adjust=False, min_periods=9).mean()

# 2. RSI (14-period Wilder's)
delta = SET['Close'].diff()
# Split the change into its up and down parts; non-matching rows (and the first, NaN, row) become 0
gain = delta.where(delta > 0, 0)
loss = -delta.where(delta < 0, 0)
# Wilder's smoothing is an EMA with alpha = 1/14; no value is emitted before 14 observations
avg_gain = gain.ewm(alpha=1/14, min_periods=14, adjust=False).mean()
avg_loss = loss.ewm(alpha=1/14, min_periods=14, adjust=False).mean()
rsi = 100 - (100 / (1 + (avg_gain / avg_loss)))

# 3. Money Flow Index (MFI - 14-period)
tp = (SET['High'] + SET['Low'] + SET['Close']) / 3   # typical price
mf = tp * SET['Volume']                              # raw money flow
# Sum the money flow separately over periods where the typical price rose / fell
pos_f = (mf.where(tp > tp.shift(1), 0)).rolling(window=14, min_periods=14).sum()
neg_f = (mf.where(tp < tp.shift(1), 0)).rolling(window=14, min_periods=14).sum()
mfi = 100 - (100 / (1 + (pos_f / neg_f)))

# 4. Average True Range (ATR - 14-period)
# True range = largest of High-Low, |High - previous Close|, |Low - previous Close|
tr = pd.concat([SET['High'] - SET['Low'],
                abs(SET['High'] - SET['Close'].shift(1)),
                abs(SET['Low'] - SET['Close'].shift(1))], axis=1).max(axis=1)
# Simple 14-period rolling mean of the true range
atr = tr.rolling(window=14, min_periods=14).mean()

# 5. On-Balance Volume (OBV) - Standardized start
# Signed volume (sign of the Close change) accumulated over time; the first row counts as 0
obv = (np.sign(SET['Close'].diff()) * SET['Volume']).fillna(0).cumsum()

# 6. Scaling and Lagging
ta_features = np.column_stack([macd_line, macd_histogram, rsi, mfi, atr, obv])
# NOTE(review): this rebinds `scaler`, replacing the scaler fitted on the
# economic indicators; the fit also uses every complete row of the SET history —
# confirm both are acceptable for the downstream modelling.
scaler = StandardScaler()
# We must scale only non-NaN rows to prevent mean/std distortion
scaled_ta = np.full(ta_features.shape, np.nan)
mask = ~np.isnan(ta_features).any(axis=1)
scaled_ta[mask] = scaler.fit_transform(ta_features[mask])

# Apply Lag 1 and Merge
df_temp_ta = pd.DataFrame(index=SET.index)
cols = ['SET_MACD_lag_1', 'SET_MACD_Hist_lag_1', 'SET_RSI_lag_1',
        'SET_MFI_lag_1', 'SET_ATR_lag_1', 'SET_OBV_lag_1']

for i, col_name in enumerate(cols):
    df_temp_ta[col_name] = pd.Series(scaled_ta[:, i], index=SET.index).shift(1)

# Remove any TA columns left over from a previous execution of this cell before
# joining; otherwise re-running the cell fails on overlapping column names.
# On a fresh kernel the drop is a no-op.
df_SET_ret = (
    df_SET_ret
    .drop(columns=cols, errors='ignore')
    .join(df_temp_ta)
    .dropna()
)

In [13]:
# Keep only the rows dated January 2016 or later
sample_start = pd.Timestamp('2016-01-01')
df_SET_ret = df_SET_ret.loc[sample_start:]

# Join dataframes

In [14]:
# Master feature set: SET returns/indicators combined with the scaled
# economic indicators. The inner join keeps only dates present in both
# frames; any row that still contains a missing value is then removed.
df_feature_sup = (
    df_SET_ret
    .join(df_econ_scaled, how='inner')
    .dropna()
)

In [15]:
# Verify the joined feature set: date range, column list and non-null counts
df_feature_sup.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 120 entries, 2016-01-01 to 2025-12-01
Data columns (total 27 columns):
 #   Column                                       Non-Null Count  Dtype  
---  ------                                       --------------  -----  
 0   SET_return                                   120 non-null    float64
 1   SET_lag_1                                    120 non-null    float64
 2   SET_lag_2                                    120 non-null    float64
 3   SET_lag_3                                    120 non-null    float64
 4   SET_lag_4                                    120 non-null    float64
 5   SET_lag_5                                    120 non-null    float64
 6   SET_lag_6                                    120 non-null    float64
 7   SET_lag_7                                    120 non-null    float64
 8   SET_lag_8                                    120 non-null    float64
 9   SET_lag_9                                    120 non-null

In [16]:
# Write the master feature set to disk; the file is named after the DataFrame
# and the date index is kept as the first column
output_file = 'df_feature_sup.csv'
df_feature_sup.to_csv(output_file, index=True)