In [12]:
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# ---------- Locate data ----------
# Probe the current working directory first and then its parent. The first
# base that contains the dataset is also remembered as the project root.
data_path = project_root = None
for base in (Path.cwd(), Path.cwd().parent):
    candidate = base.joinpath('src', 'data', 'datasets', 'Core_TimeSeries.csv')
    if not candidate.exists():
        continue
    data_path, project_root = candidate, base
    break
else:
    # The loop ended without a `break`: neither location holds the CSV.
    raise FileNotFoundError('Could not locate src/data/datasets/Core_TimeSeries.csv')

# Load the whole table; later steps select columns from it by name.
ts_df = pd.read_csv(data_path)

# ---------- Config ----------
# Columns of ts_df to analyse; every unordered pair of them is cross-correlated.
indicators = [
    "Open_Price",
    "Close_Price",
    "High_Price",
    "Low_Price",
    "Volume",
    "Daily_Return_Pct",
    "Volatility_Range",
    "Market_Cap",
    "SMA_20",
    "SMA_50",
    "RSI_14",
]

# Fail fast when the CSV lacks a configured column, so the run stops before
# any figure or metrics file is written rather than part-way through the loop.
missing_indicators = [col for col in indicators if col not in ts_df.columns]
if missing_indicators:
    raise KeyError(
        f"Indicator columns missing from {data_path.name}: {missing_indicators}"
    )

max_lag = 40  # window for lags: [-max_lag, max_lag]

# Output dirs
# Defensive fallback only: the locate step raises when the dataset is not
# found, so project_root is already set by the time this line runs.
project_root = project_root or Path.cwd()
figures_dir = project_root / "results" / "figures"
metrics_dir = project_root / "results" / "metrics"
# Create both directories (and any missing parents); re-running is harmless.
figures_dir.mkdir(parents=True, exist_ok=True)
metrics_dir.mkdir(parents=True, exist_ok=True)

# Accumulates one summary dict per indicator pair; reset on every run of the cell.
results = []

# ---------- Helper: cross-correlation ----------
def cross_correlation_stats(x, y, max_lag):
    """
    Normalised cross-correlation of x_t against y_{t+k}, k in [-max_lag, max_lag].

    A positive lag k pairs x at row t with y at row t + k, so a peak at k > 0
    means a movement in x is followed by a matching movement in y k rows later.

    Parameters
    ----------
    x, y : pandas.Series or array-like
        Paired observations. They are combined into one DataFrame (Series are
        aligned on their index) and every row where either value is NaN is
        dropped before anything is computed.
    max_lag : int
        Half-width of the lag window that is returned and searched.

    Returns
    -------
    (lags_window, cc_window, best_lag, best_corr)
        lags_window : integer array of the lags inside the window (never
            beyond +/-(n - 1), where n is the number of complete rows).
        cc_window   : correlation value for each entry of lags_window.
        best_lag    : lag whose correlation has the largest absolute value.
        best_corr   : the signed correlation at best_lag.
        All four are None when no complete rows remain or when the
        normaliser is zero or non-finite (e.g. a constant series).

    Raises
    ------
    ValueError
        If the lag window is empty (negative max_lag), from np.argmax.

    Notes
    -----
    * Every lag is divided by the same n * std(x) * std(y), so lag 0 equals
      the Pearson correlation while larger |k| (fewer overlapping rows) are
      shrunk towards zero.
    * Lags count positions in the NaN-free sequence. Rows dropped from the
      middle of the data therefore compress the time base.
    """
    df_pair = pd.DataFrame({"x": x, "y": y}).dropna()
    if len(df_pair) == 0:
        return None, None, None, None

    x_vals = df_pair["x"].to_numpy()
    y_vals = df_pair["y"].to_numpy()

    n = len(df_pair)
    x_vals = x_vals - x_vals.mean()
    y_vals = y_vals - y_vals.mean()

    # normalise to something like Pearson correlation
    denom = n * x_vals.std() * y_vals.std()
    if denom == 0 or not np.isfinite(denom):
        # Constant series, or inf values that poison the mean/std.
        return None, None, None, None

    # np.correlate(a, v, "full")[i] is sum_t a[t + k] * v[t] with k = i - (n - 1).
    # y must therefore be the FIRST operand to obtain sum_t x[t] * y[t + k],
    # i.e. the documented "x_t vs y_{t+k}" orientation. Passing x first yields
    # the mirror image (x_{t+k} vs y_t) and flips the sign of every lag.
    raw_cc = np.correlate(y_vals, x_vals, mode="full")  # length 2n-1
    lags = np.arange(-n + 1, n)
    cc = raw_cc / denom

    # restrict lags
    mask = (lags >= -max_lag) & (lags <= max_lag)
    lags_win = lags[mask]
    cc_win = cc[mask]

    # best lag by absolute correlation (first one wins on ties)
    idx = np.argmax(np.abs(cc_win))
    best_lag = int(lags_win[idx])
    best_corr = float(cc_win[idx])

    return lags_win, cc_win, best_lag, best_corr

# ---------- Loop over indicator pairs ----------
# Visit every unordered pair once (x always earlier in `indicators` than y),
# record its peak cross-correlation and write one stem plot per pair.
n_indicators = len(indicators)
pair_indices = [
    (i, j) for i in range(n_indicators) for j in range(i + 1, n_indicators)
]

for i, j in pair_indices:
    col_x, col_y = indicators[i], indicators[j]

    stats = cross_correlation_stats(ts_df[col_x], ts_df[col_y], max_lag=max_lag)
    lags_win, cc_win, best_lag, best_corr = stats

    # Skip pairs that couldn't be computed (e.g. degenerate variance)
    if lags_win is None:
        continue

    # Save numeric result
    record = {
        "var_x": col_x,
        "var_y": col_y,
        "max_lag_window": max_lag,
        "best_lag": best_lag,
        "best_corr": best_corr,
        "best_abs_corr": abs(best_corr),
    }
    results.append(record)

    # Plot and save figure
    fig, ax = plt.subplots(figsize=(10, 4))
    ax.stem(lags_win, cc_win)  # no use_line_collection for newer Matplotlib
    ax.axhline(0, linestyle="--")  # zero reference line
    ax.set(
        xlabel=f"Lag ({col_y} relative to {col_x})",
        ylabel="Cross-correlation",
        title=f"Cross-correlation: {col_x} vs {col_y}",
    )
    fig.tight_layout()

    # File name is built from the lower-cased column names.
    safe_x, safe_y = col_x.lower(), col_y.lower()
    fig_path = figures_dir / f"ccf_{safe_x}_vs_{safe_y}.png"
    fig.savefig(fig_path, dpi=150, bbox_inches="tight")
    # Close explicitly so figures do not pile up in memory across the loop.
    plt.close(fig)

    print(f"Saved CCF plot for {col_x} vs {col_y} -> {fig_path}")

# ---------- Save metrics ----------
# One row per pair appended to `results`; skipped pairs have no row.
metrics_path = metrics_dir / "cross_correlation_summary.csv"
results_df = pd.DataFrame(data=results)
# index=False keeps the positional row index out of the CSV.
results_df.to_csv(path_or_buf=metrics_path, index=False)
print(f"\nSaved cross-correlation metrics to {metrics_path}")


Saved CCF plot for Open_Price vs Close_Price -> /home/benja/SIT746---Research-Project/results/figures/ccf_open_price_vs_close_price.png
Saved CCF plot for Open_Price vs High_Price -> /home/benja/SIT746---Research-Project/results/figures/ccf_open_price_vs_high_price.png
Saved CCF plot for Open_Price vs Low_Price -> /home/benja/SIT746---Research-Project/results/figures/ccf_open_price_vs_low_price.png
Saved CCF plot for Open_Price vs Volume -> /home/benja/SIT746---Research-Project/results/figures/ccf_open_price_vs_volume.png
Saved CCF plot for Open_Price vs Daily_Return_Pct -> /home/benja/SIT746---Research-Project/results/figures/ccf_open_price_vs_daily_return_pct.png
Saved CCF plot for Open_Price vs Volatility_Range -> /home/benja/SIT746---Research-Project/results/figures/ccf_open_price_vs_volatility_range.png
Saved CCF plot for Open_Price vs Market_Cap -> /home/benja/SIT746---Research-Project/results/figures/ccf_open_price_vs_market_cap.png
Saved CCF plot for Open_Price vs SMA_20 -> /ho

In [13]:
# Lead check: correlate RSI_14 at row t with Daily_Return_Pct k rows later.
# NOTE(review): this rebinds the notebook-level `max_lag` (set to 40 in the
# cross-correlation cell) to 10 - confirm no later cell expects the old value.
# NOTE(review): "k rows later" only means "k periods later" if ts_df is in
# chronological order for a single instrument - TODO confirm.
max_lag = 10
for k in range(1, max_lag + 1):
    # shift(-k) moves each return up k rows, so row t holds the value of row t + k.
    future_return = ts_df["Daily_Return_Pct"].shift(-k)
    rho = future_return.corr(ts_df["RSI_14"])
    print(f"RSI_14(t) vs Return(t+{k}): {rho:.4f}")


RSI_14(t) vs Return(t+1): -0.0444
RSI_14(t) vs Return(t+2): -0.0371
RSI_14(t) vs Return(t+3): -0.0339
RSI_14(t) vs Return(t+4): -0.0307
RSI_14(t) vs Return(t+5): -0.0307
RSI_14(t) vs Return(t+6): -0.0279
RSI_14(t) vs Return(t+7): -0.0269
RSI_14(t) vs Return(t+8): -0.0272
RSI_14(t) vs Return(t+9): -0.0285
RSI_14(t) vs Return(t+10): -0.0266
