<h3>Forex Prediction Project</h3>

<h4>Modelling 2: Gradient Boosting with XGBoost</h4>

In [4]:
import numpy as np
import pandas as pd

from data_functions import *

In [5]:
# Load the processed FX table, the country -> currency-column mapping, and the list of countries
data, currency_dict, countries = load_and_process_forex_data()

In [6]:
data.head()

Unnamed: 0,Date,EURO/US$,UNITED KINGDOM POUND/US$,YEN/US$,YUAN/US$,AUSTRALIAN DOLLAR/US$
0,2000-01-03,0.9847,0.6146,101.7,8.2798,1.5172
1,2000-01-04,0.97,0.6109,103.09,8.2799,1.5239
2,2000-01-05,0.9676,0.6092,103.77,8.2798,1.5267
3,2000-01-06,0.9686,0.607,105.19,8.2797,1.5291
4,2000-01-07,0.9714,0.6104,105.17,8.2794,1.5272


In [7]:
currency_dict

{'EURO AREA': 'EURO/US$',
 'UNITED KINGDOM': 'UNITED KINGDOM POUND/US$',
 'JAPAN': 'YEN/US$',
 'CHINA': 'YUAN/US$',
 'AUSTRALIA': 'AUSTRALIAN DOLLAR/US$'}

In [8]:
countries

['EURO AREA', 'UNITED KINGDOM', 'JAPAN', 'CHINA', 'AUSTRALIA']

In [9]:
def extract_currency_data(data, currency):
    """
    Extract a single currency's price column, together with the dates, from the main dataframe.

    Parameters:
    data (DataFrame): The main dataframe; must contain a 'Date' column and the
        requested currency column
    currency (str): Name of the currency column to extract, e.g. 'EURO/US$'

    Returns:
    DataFrame: An independent copy with exactly two columns, ['Date', currency]

    Raises:
    KeyError: If 'Date' or `currency` is not a column of `data`
    """
    required = ['Date', currency]

    # Fail early with a readable message; KeyError is the same exception type
    # pandas itself raises for a bad column selection, so callers are unaffected.
    missing = [col for col in required if col not in data.columns]
    if missing:
        raise KeyError(f"Column(s) {missing} not found in dataframe.")

    # .copy() so later edits to the extract never write back into `data`
    return data[required].copy()

In [10]:
# Build {currency column name: [Date, price] dataframe} for every country,
# using extract_currency_data() on the main dataframe
raw_gradient_data = {}
for country in countries:
    column_name = currency_dict[country]
    raw_gradient_data[column_name] = extract_currency_data(data, column_name)


In [11]:
# As a function:
def create_data_dict_currency(data,countries,currency_dict):
    """
    Build a dictionary of per-currency dataframes.

    Parameters:
    data (DataFrame): The main dataframe passed on to extract_currency_data()
    countries (iterable): Country names; each must be a key of `currency_dict`
    currency_dict (dict): Maps a country name to its currency column name

    Returns:
    dict: {currency column name: result of extract_currency_data(data, column name)}
    """
    # Lazily resolve each country to its currency column name
    currency_columns = (currency_dict[country] for country in countries)

    return {
        column: extract_currency_data(data, column)
        for column in currency_columns
    }

<h5>Preparing Data for Gradient Boosting Modelling</h5>

In [12]:
# We need to do similar data prep to ARIMA modelling for this:
# - Create returns / log_returns (to keep the data stationary)

# With some additional processing:
# - Create lag features, e.g. LAGS = [1, 2, 3, 5, 10, 20, 30, 60, ..., n], where lag n is the price n rows (trading days) earlier
# - Create time features (days, weeks, months)
# - Add rolling statistics, e.g. rolling averages
# - Add the forecasting target column (the forecast horizon will be 60 days in our case)
# - Train-test split (chronological, no shuffling)

# Take time with this to make sure the data is properly processed, as there are lots of things being added ready for model-input.

In [13]:
# Processing function:
def prepare_single_time_series_indexed_xgboost(
    df: pd.DataFrame,
    currency: str,
    horizon: int = 60, #Test set size (in rows)
    lags: tuple = (1, 2, 3, 5, 7, 14, 21, 28, 35, 42, 49, 56),
    rolling_windows: tuple = (7, 14, 28, 56),
) -> dict:
    """
    Prepare a univariate time series for XGBoost training using an index-based approach on a single currency.
    The train/test split is chronological (no shuffling).

    Parameters:
      df: dataframe containing the `currency` price column (other columns, e.g. Date, are ignored)
      currency: name of the price column, e.g. 'EURO/US$', 'UNITED KINGDOM POUND/US$'
      horizon: number of trailing rows held out as the test set; must be >= 1
      lags: row offsets used for the `lag_<k>` features (price k rows earlier)
      rolling_windows: window sizes (in rows) for the rolling mean/std/min/max features

    Assumptions:
      - Rows are consecutive observations; lags and windows are counted in rows
      - Data is already sorted chronologically
      - Model predicts the next row's price (t+1)
      - 60-day forecast achieved via recursive prediction

    Features built for row t:
      - `t`: integer position of the row
      - `lag_<k>`: price at row t-k
      - `roll_{mean,std,min,max}_<w>`: statistics over the w rows ending at t-1
      - `diff_1`, `pct_change_1`: change from row t-1 to row t
    Rows with any NaN (warm-up rows, and the final row, which has no target) are dropped.

    Train-test split:
      - Last `horizon` rows used as test set

    Returns:
      dict containing:
        X_train, y_train, X_test, y_test, feature_cols, df_features

    Raises:
      ValueError: if `currency` is not a column of `df`, if `horizon` < 1, or if
        no rows would be left for training after feature creation.
    """

    if currency not in df.columns:
        raise ValueError(f"Column '{currency}' not found in dataframe.")

    # Guard against horizon <= 0: `iloc[:-0]` is `iloc[:0]`, which would silently
    # produce an empty training set and a "test" set containing every row.
    if horizon < 1:
        raise ValueError("horizon must be a positive number of rows.")

    data = df[[currency]].copy().reset_index(drop=True)

    # Create simple integer time index
    data["t"] = np.arange(len(data))

    # Target: next-row price
    data["y"] = data[currency].shift(-1)

    # Lag features
    for lag in lags:
        data[f"lag_{lag}"] = data[currency].shift(lag)

    # Rolling statistics (past-only): shift by one row first so the window
    # for row t ends at t-1 and never includes the current price
    shifted = data[currency].shift(1)
    for w in rolling_windows:
        data[f"roll_mean_{w}"] = shifted.rolling(w).mean()
        data[f"roll_std_{w}"]  = shifted.rolling(w).std()
        data[f"roll_min_{w}"]  = shifted.rolling(w).min()
        data[f"roll_max_{w}"]  = shifted.rolling(w).max()

    # Momentum features
    data["diff_1"] = data[currency].diff(1)
    data["pct_change_1"] = data[currency].pct_change(1)

    # Feature columns (exclude raw price + target)
    exclude = {currency, "y"}
    feature_cols = [c for c in data.columns if c not in exclude]

    # Drop rows with NaNs caused by shifting/rolling
    data_model = data.dropna().copy()

    if len(data_model) <= horizon:
        raise ValueError("Not enough data after feature creation for the chosen horizon.")

    # Time-based split (no shuffling)
    train_df = data_model.iloc[:-horizon]
    test_df = data_model.iloc[-horizon:]

    return {
        "X_train": train_df[feature_cols],
        "y_train": train_df["y"],
        "X_test": test_df[feature_cols],
        "y_test": test_df["y"],
        "feature_cols": feature_cols,
        "df_features": data_model
    }

In [14]:
# Transform every currency dataframe with prepare_single_time_series_indexed_xgboost()
xgb_prep_settings = {
    "horizon": 60,
    "lags": (1, 2, 3, 5, 7, 14, 21, 28, 35, 42, 49, 56),
    "rolling_windows": (7, 14, 28, 56),
}

processed_xgb_data = {}
for currency, df in raw_gradient_data.items():
    # Each dataframe is laid out as [Date, <price column>], so the price column sits at position 1
    price_col = df.columns[1]

    # Build features and the train/test split for this one currency
    processed_xgb_data[currency] = prepare_single_time_series_indexed_xgboost(
        df=df,
        currency=price_col,
        **xgb_prep_settings,
    )

In [15]:
# Above cell as a function:
def process_all_xgboost(
    raw_gradient_data,
    horizon=60,
    lags=(1, 2, 3, 5, 7, 14, 21, 28, 35, 42, 49, 56),
    rolling_windows=(7, 14, 28, 56),
):
    """
    Apply prepare_single_time_series_indexed_xgboost() to a dictionary of dataframes covering all
    currencies (as created by create_data_dict_currency(), with structure {currency: df}).

    Parameters:
    raw_gradient_data (dict): {currency: df}; each df's second column (position 1) is taken
        as the price column
    horizon (int): Passed through as the test-set size in rows (default 60)
    lags (tuple): Passed through as the lag offsets for the lag features
    rolling_windows (tuple): Passed through as the rolling-statistic window sizes

    Returns:
    dict: {currency: dict returned by prepare_single_time_series_indexed_xgboost()}
    """
    processed_xgb_data = {}

    for currency, df in raw_gradient_data.items():
        # Get the currency column name (second column after Date)
        price_col = df.columns[1]

        # Apply the XGBoost preparation function (for a single currency)
        processed_xgb_data[currency] = prepare_single_time_series_indexed_xgboost(
            df=df,
            currency=price_col,
            horizon=horizon,
            lags=lags,
            rolling_windows=rolling_windows,
        )

    return processed_xgb_data


In [16]:
processed_xgb_data = process_all_xgboost(raw_gradient_data)

In [17]:
processed_xgb_data

{'EURO/US$': {'X_train':          t   lag_1   lag_2   lag_3   lag_5   lag_7  lag_14  lag_21  lag_28  \
  56      56  1.0408  1.0306  1.0299  1.0299  1.0369  1.0397  0.9983  1.0155   
  57      57  1.0319  1.0408  1.0306  1.0316  1.0314  1.0413  1.0069  1.0222   
  58      58  1.0284  1.0319  1.0408  1.0299  1.0299  1.0460  1.0243  1.0169   
  59      59  1.0368  1.0284  1.0319  1.0306  1.0316  1.0443  1.0342  1.0161   
  60      60  1.0401  1.0368  1.0284  1.0408  1.0299  1.0326  1.0370  1.0139   
  ...    ...     ...     ...     ...     ...     ...     ...     ...     ...   
  4953  4953  0.9092  0.9098  0.9088  0.9041  0.9086  0.9060  0.8999  0.9004   
  4954  4954  0.9133  0.9092  0.9098  0.9053  0.9039  0.9056  0.9014  0.9016   
  4955  4955  0.9142  0.9133  0.9092  0.9088  0.9041  0.9045  0.9022  0.9012   
  4956  4956  0.9139  0.9142  0.9133  0.9098  0.9053  0.9056  0.9042  0.9016   
  4957  4957  0.9170  0.9139  0.9142  0.9092  0.9088  0.9090  0.9100  0.9011   
  
        lag_35

In [18]:
# Looks good.

Model Training:

In [19]:
from xgboost import XGBRegressor

# Hyperparameters for a first trial model, collected in one place so they are easy to tweak
test_model_params = {
    "n_estimators": 500,
    "learning_rate": 0.05,
    "max_depth": 6,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "objective": "reg:squarederror",
}

test_model = XGBRegressor(**test_model_params)