In [1]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np

In [8]:
# Price columns of interest (not referenced by the other cells visible here).
columns = ["Date", "Close", "High", "Low", "Open", "Volume"]

# Load every ticker's raw price CSV in one pass: the i-th name on the left
# receives the frame read from the i-th path on the right.
(
    AAPL_prices,
    AMZN_prices,
    BA_prices,
    CAT_prices,
    CVX_prices,
    GS_prices,
    JNJ_prices,
    JPM_prices,
    MSFT_prices,
    NEE_prices,
    NVDA_prices,
    PFE_prices,
    TSLA_prices,
    XOM_prices,
) = map(
    pd.read_csv,
    (
        "../data/raw/yf_prices/AAPL.csv",
        "../data/raw/yf_prices/AMZN.csv",
        "../data/raw/yf_prices/BA.csv",
        "../data/raw/yf_prices/CAT.csv",
        "../data/raw/yf_prices/CVX.csv",
        "../data/raw/yf_prices/GS.csv",
        "../data/raw/yf_prices/JNJ.csv",
        "../data/raw/yf_prices/JPM.csv",
        "../data/raw/yf_prices/MSFT.csv",
        "../data/raw/yf_prices/NEE.csv",
        "../data/raw/yf_prices/NVDA.csv",
        "../data/raw/yf_prices/PFE.csv",
        "../data/raw/yf_prices/TSLA.csv",
        "../data/raw/yf_prices/XOM.csv",
    ),
)

In [11]:
def clean_prices_df(df):
    """Coerce the price/volume columns to numbers and drop rows without a close.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw price frame containing "Close", "High", "Low", "Open" and
        "Volume" columns (any dtype).

    Returns
    -------
    pandas.DataFrame
        A new frame in which the five columns above are numeric (values that
        cannot be parsed become NaN), rows whose "Close" is NaN are removed,
        and the index is renumbered from 0. The input frame is not modified.

    Raises
    ------
    KeyError
        If one of the five expected columns is missing.
    """
    numeric_cols = ["Close", "High", "Low", "Open", "Volume"]

    # `assign` builds a new frame, so the caller's DataFrame keeps its
    # original values; writing the converted columns into `df` directly would
    # silently alter the object the caller still holds.
    converted = df.assign(
        **{col: pd.to_numeric(df[col], errors="coerce") for col in numeric_cols}
    )

    # Drop rows where 'Close' is missing (including values that failed to
    # parse), then renumber the remaining rows.
    return converted.dropna(subset=["Close"]).reset_index(drop=True)

In [12]:
# Pass every raw ticker frame through clean_prices_df and rebind the same
# names to the cleaned results (positions on both sides line up one-to-one).
(
    AAPL_prices,
    AMZN_prices,
    BA_prices,
    CAT_prices,
    CVX_prices,
    GS_prices,
    JNJ_prices,
    JPM_prices,
    MSFT_prices,
    NEE_prices,
    NVDA_prices,
    PFE_prices,
    TSLA_prices,
    XOM_prices,
) = [
    clean_prices_df(raw_frame)
    for raw_frame in (
        AAPL_prices,
        AMZN_prices,
        BA_prices,
        CAT_prices,
        CVX_prices,
        GS_prices,
        JNJ_prices,
        JPM_prices,
        MSFT_prices,
        NEE_prices,
        NVDA_prices,
        PFE_prices,
        TSLA_prices,
        XOM_prices,
    )
]

In [13]:
# Preview the first rows of the cleaned AAPL frame as a quick sanity check.
AAPL_prices.head()

Unnamed: 0,Date,Close,High,Low,Open,Volume
0,2020-01-02,72.796021,72.856613,71.545387,71.799873,135480400.0
1,2020-01-03,72.088295,72.851761,71.862892,72.020432,146322800.0
2,2020-01-06,72.66272,72.7015,70.95401,71.206077,118387200.0
3,2020-01-07,72.320976,72.929322,72.100418,72.672409,108872000.0
4,2020-01-08,73.48436,73.787323,72.022865,72.022865,132079200.0


In [14]:
def feature_engineer_prices_df(df):
    """Add return, trend, volatility, volume and momentum features to a price frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with numeric "Close" and "Volume" columns. All windows and lags
        are row-based and the function never sorts, so rows are assumed to be
        in chronological order (TODO confirm against the loader).

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` (the input is left untouched) with these columns
        appended, in this order: Return_1d, Return_7d, MA_7, MA_14, MA_30,
        Volatility_7d, Volatility_14d, Volume_MA_7, Volume_Change_1d,
        Close_Lag_1, Close_Lag_7, RSI_14, EMA_12, EMA_26, MACD, MACD_Signal.
        Rows containing NaN in any column, or +/-inf in any numeric column,
        are removed and the index is renumbered from 0.
    """
    df = df.copy()
    close = df["Close"]
    volume = df["Volume"]

    # --- Basic price change features ---
    df["Return_1d"] = close.pct_change()
    df["Return_7d"] = close.pct_change(7)

    # --- Moving averages ---
    df["MA_7"] = close.rolling(window=7).mean()
    df["MA_14"] = close.rolling(window=14).mean()
    df["MA_30"] = close.rolling(window=30).mean()

    # --- Volatility (rolling std of daily returns) ---
    df["Volatility_7d"] = df["Return_1d"].rolling(window=7).std()
    df["Volatility_14d"] = df["Return_1d"].rolling(window=14).std()

    # --- Volume features ---
    df["Volume_MA_7"] = volume.rolling(window=7).mean()
    # A zero in the previous row makes this +/-inf; such rows are filtered
    # out at the end of the function.
    df["Volume_Change_1d"] = volume.pct_change()

    # --- Lag features (previous close prices) ---
    df["Close_Lag_1"] = close.shift(1)
    df["Close_Lag_7"] = close.shift(7)

    # --- RSI (Relative Strength Index), simple 14-row averages ---
    delta = close.diff(1)
    gain = delta.where(delta > 0, 0)
    loss = -delta.where(delta < 0, 0)

    avg_gain = gain.rolling(window=14).mean()
    avg_loss = loss.rolling(window=14).mean()

    # With no losses in the window rs is inf and RSI evaluates to 100; with
    # neither gains nor losses rs is NaN and the row is dropped below.
    rs = avg_gain / avg_loss
    df["RSI_14"] = 100 - (100 / (1 + rs))

    # --- MACD (Moving Average Convergence Divergence) ---
    # Fast and slow exponential moving averages
    df["EMA_12"] = close.ewm(span=12, adjust=False).mean()
    df["EMA_26"] = close.ewm(span=26, adjust=False).mean()

    # MACD line
    df["MACD"] = df["EMA_12"] - df["EMA_26"]

    # Signal line (9-row EMA of MACD)
    df["MACD_Signal"] = df["MACD"].ewm(span=9, adjust=False).mean()

    # --- Drop rows that cannot be used as model input ---
    # dropna() removes the warm-up rows of the rolling windows but does not
    # treat +/-inf as missing, so rows holding an infinite ratio are filtered
    # explicitly to keep non-finite values out of the returned features.
    inf_rows = np.isinf(
        df.select_dtypes(include="number").to_numpy(dtype="float64")
    ).any(axis=1)
    df = df.loc[~inf_rows].dropna().reset_index(drop=True)

    return df

In [15]:
# Build the feature columns for every ticker and rebind the same names to the
# engineered frames. Because the names are rebound, re-running this cell
# feeds already-engineered frames back into feature_engineer_prices_df.
(
    AAPL_prices,
    AMZN_prices,
    BA_prices,
    CAT_prices,
    CVX_prices,
    GS_prices,
    JNJ_prices,
    JPM_prices,
    MSFT_prices,
    NEE_prices,
    NVDA_prices,
    PFE_prices,
    TSLA_prices,
    XOM_prices,
) = [
    feature_engineer_prices_df(cleaned_frame)
    for cleaned_frame in (
        AAPL_prices,
        AMZN_prices,
        BA_prices,
        CAT_prices,
        CVX_prices,
        GS_prices,
        JNJ_prices,
        JPM_prices,
        MSFT_prices,
        NEE_prices,
        NVDA_prices,
        PFE_prices,
        TSLA_prices,
        XOM_prices,
    )
]

In [17]:
def prepare_data(df, target_col="Close", target_days=7):
    """Build a chronological train/test split for predicting a future value.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature frame that contains ``target_col``. Rows are assumed to be in
        chronological order; the function never sorts (TODO confirm).
    target_col : str, default "Close"
        Column whose value ``target_days`` rows ahead becomes the target.
    target_days : int, default 7
        Number of rows to look ahead when building the target.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``. The split is not shuffled, so
        the first 80% of rows form the training set and the last 20% the test
        set. The feature frames contain only numeric/bool columns; the input
        frame is not modified.
    """
    df = df.copy()

    # Define the target (value of `target_col` `target_days` rows ahead)
    df["Target"] = df[target_col].shift(-target_days)

    # Drop the trailing rows that have no future target value
    df = df.dropna(subset=["Target"])

    # Features are every column except the target and the current value of
    # `target_col`. Non-numeric columns such as the "Date" strings are left
    # out as well: XGBoost only accepts int/float/bool (or category) columns
    # and raises on object-dtype data. The row index is left as-is, so the
    # dropped columns can still be looked up in the source frame by label.
    features = df.drop(columns=["Target", target_col]).select_dtypes(
        include=["number", "bool"]
    )
    target = df["Target"]

    # shuffle=False keeps time order, so the test set is strictly later than
    # the training set.
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, shuffle=False
    )

    return X_train, X_test, y_train, y_test

In [None]:
def train_xgboost_model(X_train, y_train, X_test, y_test):
    """Fit an XGBoost regressor on the training split and print its test RMSE.

    Parameters
    ----------
    X_train, y_train
        Training features and targets passed to ``model.fit``.
    X_test, y_test
        Held-out features and targets used to compute the RMSE.

    NOTE(review): no value is returned in the lines visible here; confirm
    whether the fitted model (or the RMSE) should be returned to the caller.
    """
    # Initialize XGBoost model
    # Hyperparameters are hard-coded; `alpha` is presumably the L1
    # regularisation term (XGBoost alias of `reg_alpha`) -- TODO confirm.
    model = xgb.XGBRegressor(
        objective="reg:squarederror",
        colsample_bytree=0.3,
        learning_rate=0.1,
        max_depth=5,
        alpha=10,
        n_estimators=1000,
    )

    # Fit the model to the training data
    model.fit(X_train, y_train)

    # Predict on the test set
    y_pred = model.predict(X_test)

    # Evaluate model performance: RMSE is the square root of the mean squared
    # error between the true and predicted test targets.
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print(f"Root Mean Squared Error (RMSE): {rmse}")