<a href="https://colab.research.google.com/github/alfredqbit/NU-DDS-8515/blob/main/sepulvedaADDS-8515-10.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Structural Equation Modeling with Holzinger–Swineford (1939)
#
This notebook implements the CFA and multi-group SEM analyses described in the LaTeX report.
It uses the Holzinger–Swineford dataset, a three-factor model (visual, textual, speed), and examines approximate measurement invariance across schools.

In [4]:
import pandas as pd
import numpy as np
from pathlib import Path

import matplotlib.pyplot as plt
import seaborn as sns

# !pip install --upgrade --force-reinstall semopy # Force reinstallation to fix ModuleNotFoundError
from semopy import Model
from semopy.stats import calc_stats
from semopy.scores import calc_scores # This import relies on semopy.scores existing
from semopy.multigroup import multigroup

# Ensure Figures directory exists
FIG_DIR = Path("Figures")
FIG_DIR.mkdir(exist_ok=True)

# A small utility for consistent plot saving
def save_fig(fig, name: str, dpi: int = 300):
    fig.tight_layout()
    fig.savefig(FIG_DIR / name, dpi=dpi, bbox_inches="tight")
    plt.close(fig)

ModuleNotFoundError: No module named 'semopy.scores'

 # 1. Data loading and preprocessing
#
We load a public CSV version of the Holzinger–Swineford dataset and align variable names with the standard `x1`–`x9` convention used in SEM examples.

In [None]:
def load_hs_data() -> pd.DataFrame:

    # Load the Holzinger-Swineford 1939 data from a public GitHub URL and
    # rename key columns to x1..x9 plus school and other background variables.

    # Returns
    # -------
    # pd.DataFrame
    #    Preprocessed dataframe.
    # """
    url = (
        "https://github.com/cddesja/epsy8266/raw/master/"
        "course_materials/data/HolzingerSwineford1939.csv"
    )
    df = pd.read_csv(url)

    # Rename to match the classic lavaan example as closely as possible
    rename_map = {
        "visual": "x1",
        "cubes": "x2",
        "paper": "x3",
        "paragrap": "x4",
        "sentence": "x5",
        "wordm": "x6",
        "addition": "x7",
        "counting": "x8",
        "straight": "x9",
        "agey": "ageyr",
        "gender": "sex"
    }
    df = df.rename(columns={k: v for k, v in rename_map.items() if k in df.columns})

    # Keep only variables of interest plus school and background
    cols = ["id", "sex", "ageyr", "school", "grade"] + [f"x{i}" for i in range(1, 10)]
    df = df[cols]

    # Drop rows with missing values on indicators
    df = df.dropna(subset=[f"x{i}" for i in range(1, 10)])

    # Ensure school is categorical
    df["school"] = df["school"].astype("category")

    return df

data = load_hs_data()
data.head()

# 2. Exploratory data analysis (EDA)
 We examine basic summaries and a correlation matrix for the nine indicators.

In [None]:
# Summary statistics
desc = data[[f"x{i}" for i in range(1, 10)]].describe().T
desc

# %%
# Correlation heatmap for x1..x9
corr = data[[f"x{i}" for i in range(1, 10)]].corr()

fig, ax = plt.subplots(figsize=(7, 6))
sns.heatmap(corr, annot=True, fmt=".2f", cmap="coolwarm", ax=ax)
ax.set_title("Correlation Matrix: Holzinger–Swineford Indicators")
save_fig(fig, "hs_corr_heatmap.png")

# %%
# Histograms of indicators
fig, axes = plt.subplots(3, 3, figsize=(9, 7))
axes = axes.flatten()
for i, ax in enumerate(axes, start=1):
    sns.histplot(data[f"x{i}"], bins=20, kde=False, ax=ax)
    ax.set_title(f"x{i}")
plt.suptitle("Distributions of Cognitive Test Scores", y=1.02)
save_fig(fig, "hs_histograms.png")

# 3. CFA model specification and single-group fit
We use a three-factor CFA model:
#
 ```text
 visual =~ x1 + x2 + x3
 textual =~ x4 + x5 + x6
 speed   =~ x7 + x8 + x9
#
 visual ~~ textual
 visual ~~ speed
 textual ~~ speed

In [None]:
cfa_model_desc = """
visual =~ x1 + x2 + x3
textual =~ x4 + x5 + x6
speed =~ x7 + x8 + x9

visual ~~ textual
visual ~~ speed
textual ~~ speed
"""

model = Model(cfa_model_desc)
res = model.fit(data[[f"x{i}" for i in range(1, 10)]])
estimates = model.inspect(std_est=True)
estimates.head()

# %%
# Fit indices
fit_stats = calc_stats(model)
fit_stats

# %%
# Export fit indices table as an image
fig, ax = plt.subplots(figsize=(4, 1.5))
ax.axis("off")
tbl = ax.table(
    cellText=np.round(fit_stats.values, 4),
    colLabels=fit_stats.columns,
    loc="center"
)
tbl.auto_set_font_size(False)
tbl.set_fontsize(8)
tbl.scale(1.2, 1.2)
save_fig(fig, "hs_fit_indices.png")

# 3.1 Standardized factor loadings

In [None]:
# Extract standardized loadings
loadings = estimates[
    (estimates["op"] == "~") &
    (estimates["rval"].isin(["visual", "textual", "speed"]))
].copy()

loadings = loadings[["lval", "rval", "Estimate", "Std. Err", "z-value", "p-value", "Est. Std"]]
loadings

# %%
# Plot standardized loadings by factor
fig, ax = plt.subplots(figsize=(7, 4))
for factor, group_df in loadings.groupby("rval"):
    ax.bar(
        group_df["lval"],
        group_df["Est. Std"],
        label=factor,
        alpha=0.7
    )

ax.set_ylabel("Standardized loading")
ax.set_title("Standardized Factor Loadings by Latent Ability")
ax.axhline(0, color="black", linewidth=0.8)
ax.legend()
save_fig(fig, "hs_factor_loadings.png")

# 3.2 Factor scores

In [None]:
# Predict factor scores
factor_scores = calc_scores(model, data[[f"x{i}" for i in range(1, 10)]]) # Call calc_scores directly
factor_scores["school"] = data["school"].values

# Scatter: visual vs textual
fig, ax = plt.subplots(figsize=(6, 5))
sns.scatterplot(
    data=factor_scores,
    x="visual",
    y="textual",
    hue="school",
    alpha=0.7,
    ax=ax
)
ax.set_title("Factor Scores: Visual vs Textual by School")
save_fig(fig, "hs_factor_scores.png")

# 4. Multi-group CFA by school
#
We examine configural invariance (same pattern of loadings) and approximate metric
invariance (similar loadings across schools) using group-specific fits.

In [None]:
schools = data["school"].cat.categories.tolist()
group_dfs = {g: data.loc[data["school"] == g, [f"x{i}" for i in range(1, 10)]] for g in schools}

# Fit same model separately by school
group_models = {}
group_loadings = []

for g, df_g in group_dfs.items():
    m_g = Model(cfa_model_desc)
    m_g.fit(df_g)
    est_g = m_g.inspect(std_est=True)
    group_models[g] = m_g

    l_g = est_g[
        (est_g["op"] == "~") &
        (est_g["rval"].isin(["visual", "textual", "speed"]))
    ].copy()
    l_g["school"] = g
    group_loadings.append(l_g)

group_loadings = pd.concat(group_loadings, ignore_index=True)

# %%
# Compare standardized loadings across schools
group_loadings_pivot = group_loadings.pivot_table(
    index=["lval", "rval"],
    columns="school",
    values="Std.Est"
)
group_loadings_pivot

# %%
# Plot multi-group loadings
fig, axes = plt.subplots(1, 3, figsize=(12, 4), sharey=True)
for ax, factor in zip(axes, ["visual", "textual", "speed"]):
    df_f = group_loadings[group_loadings["rval"] == factor]
    sns.barplot(
        data=df_f,
        x="lval",
        y="Std.Est",
        hue="school",
        ax=ax
    )
    ax.set_title(f"{factor.capitalize()} loadings")
    ax.set_ylim(0, 1.2)

fig.suptitle("Standardized Factor Loadings by School")
save_fig(fig, "hs_multigroup_loadings.png")

 5. Residual diagnostics and approximate modification guidance
#
We inspect residual covariances and standardized residuals to identify potential localized misfit (informally analogous to high modification indices).


In [None]:
# Residual covariance matrix
mx = model.inspect(mode="mx", what="est")
sigma_hat = mx["Sigma"]  # implied covariance matrix
S = np.cov(data[[f"x{i}" for i in range(1, 10)]].T, ddof=1)

resid = S - sigma_hat
resid_df = pd.DataFrame(
    resid,
    index=[f"x{i}" for i in range(1, 10)],
    columns=[f"x{i}" for i in range(1, 10)]
)
resid_df

# %%
# Heatmap of residual covariances
fig, ax = plt.subplots(figsize=(7, 6))
sns.heatmap(resid_df, annot=False, cmap="coolwarm", center=0, ax=ax)
ax.set_title("Residual Covariance Matrix")
save_fig(fig, "hs_residual_covariances.png")

We could flag the largest absolute residuals as candidates for model revision
(e.g., allowing correlated residuals between specific indicators), but we do not
modify the model further in order to avoid overfitting.

# 6. Summary
This notebook implements a three-factor CFA model for the Holzinger–Swineford data,evaluates global fit, inspects factor loadings and scores, and examines approximate measurement invariance across schools. All key figures are saved to the `Figures/` directory for inclusion in the LaTeX report.