In [None]:
from pathlib import Path
import numpy  as np
import pandas as pd
import pyreadstat

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from xgboost import XGBRegressor

# Relative locations of the raw SPSS (.sav) files read by prepare_country_df.
STU_PATH = Path("../data/raw/Student Data.sav") # path to the student data file
SCH_PATH = Path("../data/raw/School Data.sav") # path to the school data file


# Predictor list: column names selected from the merged student + school frame.
# The inline labels are the author's short descriptions of each variable.

PREDICTORS = [
    # Individual-level Predictors
    "ST004D01T", # Gender (recoded to 0/1 in prepare_country_df)
    "AGE", # Age
    "GRADE", # Grade
    "BSMJ", # Expected Occupational Status
    "JOYREAD", # Joy of Reading
    "SCREADCOMP", # Reading Self-Concept: Competence
    "SCREADDIFF", # Reading Self-Concept: Difficulty
    "COMPETE", # Competitiveness
    "WORKMAST", # Work Mastery Orientation
    "GFOFAIL", # General Fear of Failure
    "EUDMO", # Sense of Meaning in Life (Eudaimonia)
    "RESILIENCE", # Resilience
    "MASTGOAL", # Mastery Goal Orientation

    # Proximal-level Predictors
    "REPEAT", # Grade Repetition History (selected here, then dropped in prepare_country_df)
    "UNDREM", # Meta-cognition: Understanding & Remembering
    "METASUM", # Meta-cognition: Summarizing
    "METASPAM", # Meta-cognition: Assessing Credibility

    # Microsystem-level Factors (Family, Peers, & School Climate)
    "EMOSUPS", # Parental Emotional Support
    "DURECEC", # Duration in Early Childhood Education and Care
    "BELONG", # School Belonging
    "PERCOMP", # Perceived School Competitiveness
    "PERCOOP", # Perceived School Cooperation
    "ATTLNACT", # Attitudes Towards Learning Activities
    "DISCLIMA", # Disciplinary Climate (Language Lessons)
    "TEACHSUP", # Teacher Support (Language Lessons)
    "DIRINS", # Teacher-Directed Instruction
    "PERFEED", # Perceived Feedback from Teachers
    "STIMREAD", # Teacher's Stimulation of Reading Engagement
    "ADAPTIVITY", # Adaptation of Instruction
    "TEACHINT", # Perceived Teacher Interest

    # Macrosystem/Exosystem-level Predictors
    # (EDUSHORT, RATCMP1 and RATCMP2 are taken from the school file in prepare_country_df.)
    "ESCS", # Family Socioeconomic Status (Index)
    "EDUSHORT", # Shortage of Educational Resources
    "RATCMP1", # Number of Computers per Student
    "RATCMP2", # Percentage of Computers Connected to the Internet
]


# Raw questionnaire items the dependent variables are derived from; they are
# dropped from the frame once the binary flags below have been created.
RAW_ITEMS = ["ST038Q04NA", "ST038Q08NA"]   # verbal & relational items
# Names of the binary (0/1) dependent-variable columns, in the same order as RAW_ITEMS.
DV_FLAGS  = ["BULLY_VERBAL", "BULLY_RELATIONAL"]




In [None]:
def prepare_country_df(iso2: str, victim_codes=(2, 3)) -> pd.DataFrame:
    """Build the model-ready frame for one country.

    Reads the student and school SPSS files, keeps the rows whose ``CNT``
    equals ``iso2``, attaches the school-level predictors to each student,
    derives the two binary bullying flags, and median-imputes then
    standardises every predictor.

    Parameters
    ----------
    iso2 : str
        Value matched against the ``CNT`` column. The parameter name is kept
        for backward compatibility; the notebook passes three-letter codes
        such as ``"JPN"`` and ``"GBR"``.
    victim_codes : iterable, default (2, 3)
        Raw response codes of the two ``RAW_ITEMS`` that are flagged as 1.
        Every other value -- including a missing response -- is flagged 0.
        NOTE(review): the PISA 2018 questionnaire is understood to offer a
        fourth response (code 4, "once a week or more"); with the default it
        is coded 0. Confirm against the codebook and pass ``(2, 3, 4)`` if
        that is not intended.

    Returns
    -------
    pd.DataFrame
        The standardised predictors (``PREDICTORS`` without ``REPEAT``)
        followed by the ``DV_FLAGS`` columns. ``CNTSTUID`` is not returned.

    Raises
    ------
    ValueError
        If no student row matches ``iso2``.
    pandas.errors.MergeError
        If the school file holds more than one row per ``CNTSCHID`` for the
        requested country (which would silently duplicate students).
    """
    school_vars = ["RATCMP1", "RATCMP2", "EDUSHORT"]
    student_cols = (
        ["CNT", "CNTSCHID", "CNTSTUID"]
        + RAW_ITEMS
        + [col for col in PREDICTORS if col not in school_vars]
    )

    # Read only the columns used below instead of every variable in both files.
    stu, _ = pyreadstat.read_sav(
        STU_PATH, apply_value_formats=False, usecols=student_cols
    )
    sch, _ = pyreadstat.read_sav(
        SCH_PATH, apply_value_formats=False, usecols=["CNT", "CNTSCHID"] + school_vars
    )

    stu = stu[stu["CNT"] == iso2]
    if stu.empty:
        # Fail here with a clear message rather than later inside the imputer.
        raise ValueError(f"No student rows found for CNT == {iso2!r}")
    sch = sch.loc[sch["CNT"] == iso2, ["CNTSCHID"] + school_vars]

    # Left join keeps students whose school record is missing; their school
    # predictors become NaN and are imputed below.
    df = stu.merge(sch, on="CNTSCHID", how="left", validate="many_to_one")

    keep = PREDICTORS + RAW_ITEMS + ["CNTSTUID"]  # CNTSTUID = student-ID key
    df = df[keep].copy()

    codes = list(victim_codes)
    df["BULLY_VERBAL"] = df["ST038Q04NA"].isin(codes).astype(int)
    df["BULLY_RELATIONAL"] = df["ST038Q08NA"].isin(codes).astype(int)

    # The raw items would leak the outcome; REPEAT is excluded from modelling.
    df = df.drop(columns=RAW_ITEMS + ["REPEAT"])

    # Recode the two gender codes to 0/1 (any other value becomes NaN and is imputed).
    df["ST004D01T"] = df["ST004D01T"].map({1: 0, 2: 1})

    imputer = SimpleImputer(strategy="median")
    scaler = StandardScaler()

    features = df.drop(columns=DV_FLAGS + ["CNTSTUID"])
    X_proc = scaler.fit_transform(imputer.fit_transform(features))
    X_proc = pd.DataFrame(X_proc, columns=features.columns, index=features.index)

    return pd.concat([X_proc, df[DV_FLAGS]], axis=1)


In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
import numpy as np

def run_xgb(df: pd.DataFrame, dv_flag: str, country_name: str, test_size: float = 0.0):
    """Fit an XGBoost classifier for one country and DV and print a report.

    The report lists the ten largest ``feature_importances_`` values and the
    ROC AUC of the predicted probabilities.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing the predictors plus every column in ``DV_FLAGS``.
    dv_flag : str
        Name of the binary DV column to model.
    country_name : str
        Label used in the printed header.
    test_size : float, default 0.0
        Fraction of rows held out for the ROC AUC. With the default the AUC
        is computed on the same rows the model was trained on, which
        overstates out-of-sample performance; pass e.g. ``0.2`` to score a
        stratified hold-out instead.

    Raises
    ------
    ValueError
        If ``test_size`` is outside ``[0, 1)`` or ``dv_flag`` has fewer than
        two distinct values (ROC AUC is undefined in that case).
    """
    if not 0 <= test_size < 1:
        raise ValueError(f"test_size must be a fraction in [0, 1), got {test_size!r}")

    # Drop every DV column so the other bullying flag cannot act as a predictor.
    X = df.drop(columns=DV_FLAGS)
    y = df[dv_flag]
    if y.nunique() < 2:
        raise ValueError(
            f"{dv_flag} has a single class for {country_name}; ROC AUC is undefined"
        )

    if test_size:
        from sklearn.model_selection import train_test_split

        X_fit, X_eval, y_fit, y_eval = train_test_split(
            X, y, test_size=test_size, stratify=y, random_state=42
        )
        auc_label = "ROC AUC (held-out)"
    else:
        # Default: resubstitution AUC, scored on the training rows themselves.
        X_fit, X_eval, y_fit, y_eval = X, X, y, y
        auc_label = "ROC AUC"

    model = XGBClassifier(
        n_estimators=300,
        max_depth=4,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_lambda=4,
        reg_alpha=0.6,
        min_child_weight=5,
        objective="binary:logistic",
        gamma=0,
        random_state=42,
        verbosity=0
    ).fit(X_fit, y_fit)

    # Column 1 of predict_proba is the probability of the positive class.
    y_proba = model.predict_proba(X_eval)[:, 1]
    roc_auc = roc_auc_score(y_eval, y_proba)

    # Ten features with the largest importance, in descending order.
    imp = model.feature_importances_
    top = sorted(zip(X.columns, imp), key=lambda t: t[1], reverse=True)[:10]

    print(f"\n=== {country_name} – {dv_flag} ===")
    for rank, (feat, val) in enumerate(top, 1):
        print(f"{rank:>2}. {feat:<12s} : {val:0.4f}")
    print(f"{auc_label} ≈ {roc_auc:0.3f}")


In [5]:
from pathlib import Path
import numpy  as np
import pandas as pd
import pyreadstat

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from xgboost import XGBRegressor

# Relative locations of the raw SPSS (.sav) files read by prepare_country_df.
STU_PATH = Path("../data/raw/Student Data.sav") # path to the student data file
SCH_PATH = Path("../data/raw/School Data.sav") # path to the school data file


# Predictor list: column names selected from the merged student + school frame.
# The inline labels are the author's short descriptions of each variable.

PREDICTORS = [
    # Individual-level Predictors
    "ST004D01T", # Gender (recoded to 0/1 in prepare_country_df)
    "AGE", # Age
    "GRADE", # Grade
    "BSMJ", # Expected Occupational Status
    "JOYREAD", # Joy of Reading
    "SCREADCOMP", # Reading Self-Concept: Competence
    "SCREADDIFF", # Reading Self-Concept: Difficulty
    "COMPETE", # Competitiveness
    "WORKMAST", # Work Mastery Orientation
    "GFOFAIL", # General Fear of Failure
    "EUDMO", # Sense of Meaning in Life (Eudaimonia)
    "RESILIENCE", # Resilience
    "MASTGOAL", # Mastery Goal Orientation

    # Proximal-level Predictors
    "REPEAT", # Grade Repetition History (selected here, then dropped in prepare_country_df)
    "UNDREM", # Meta-cognition: Understanding & Remembering
    "METASUM", # Meta-cognition: Summarizing
    "METASPAM", # Meta-cognition: Assessing Credibility

    # Microsystem-level Factors (Family, Peers, & School Climate)
    "EMOSUPS", # Parental Emotional Support
    "DURECEC", # Duration in Early Childhood Education and Care
    "BELONG", # School Belonging
    "PERCOMP", # Perceived School Competitiveness
    "PERCOOP", # Perceived School Cooperation
    "ATTLNACT", # Attitudes Towards Learning Activities
    "DISCLIMA", # Disciplinary Climate (Language Lessons)
    "TEACHSUP", # Teacher Support (Language Lessons)
    "DIRINS", # Teacher-Directed Instruction
    "PERFEED", # Perceived Feedback from Teachers
    "STIMREAD", # Teacher's Stimulation of Reading Engagement
    "ADAPTIVITY", # Adaptation of Instruction
    "TEACHINT", # Perceived Teacher Interest

    # Macrosystem/Exosystem-level Predictors
    # (EDUSHORT, RATCMP1 and RATCMP2 are taken from the school file in prepare_country_df.)
    "ESCS", # Family Socioeconomic Status (Index)
    "EDUSHORT", # Shortage of Educational Resources
    "RATCMP1", # Number of Computers per Student
    "RATCMP2", # Percentage of Computers Connected to the Internet
]


# Raw questionnaire items the dependent variables are derived from; they are
# dropped from the frame once the binary flags below have been created.
RAW_ITEMS = ["ST038Q04NA", "ST038Q08NA"]   # verbal & relational items
# Names of the binary (0/1) dependent-variable columns, in the same order as RAW_ITEMS.
DV_FLAGS  = ["BULLY_VERBAL", "BULLY_RELATIONAL"]




from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
import numpy as np
import pandas as pd 

def prepare_country_df(iso2: str, victim_codes=(2, 3)) -> pd.DataFrame:
    """Build the model-ready frame for one country.

    Reads the student and school SPSS files, keeps the rows whose ``CNT``
    equals ``iso2``, attaches the school-level predictors to each student,
    derives the two binary bullying flags, and median-imputes then
    standardises every predictor.

    Parameters
    ----------
    iso2 : str
        Value matched against the ``CNT`` column. The parameter name is kept
        for backward compatibility; the notebook passes three-letter codes
        such as ``"JPN"`` and ``"GBR"``.
    victim_codes : iterable, default (2, 3)
        Raw response codes of the two ``RAW_ITEMS`` that are flagged as 1.
        Every other value -- including a missing response -- is flagged 0.
        NOTE(review): the PISA 2018 questionnaire is understood to offer a
        fourth response (code 4, "once a week or more"); with the default it
        is coded 0. Confirm against the codebook and pass ``(2, 3, 4)`` if
        that is not intended.

    Returns
    -------
    pd.DataFrame
        The standardised predictors (``PREDICTORS`` without ``REPEAT``)
        followed by the ``DV_FLAGS`` columns. ``CNTSTUID`` is not returned.

    Raises
    ------
    ValueError
        If no student row matches ``iso2``.
    pandas.errors.MergeError
        If the school file holds more than one row per ``CNTSCHID`` for the
        requested country (which would silently duplicate students).
    """
    school_vars = ["RATCMP1", "RATCMP2", "EDUSHORT"]
    student_cols = (
        ["CNT", "CNTSCHID", "CNTSTUID"]
        + RAW_ITEMS
        + [col for col in PREDICTORS if col not in school_vars]
    )

    # Read only the columns used below instead of every variable in both files.
    stu, _ = pyreadstat.read_sav(
        STU_PATH, apply_value_formats=False, usecols=student_cols
    )
    sch, _ = pyreadstat.read_sav(
        SCH_PATH, apply_value_formats=False, usecols=["CNT", "CNTSCHID"] + school_vars
    )

    stu = stu[stu["CNT"] == iso2]
    if stu.empty:
        # Fail here with a clear message rather than later inside the imputer.
        raise ValueError(f"No student rows found for CNT == {iso2!r}")
    sch = sch.loc[sch["CNT"] == iso2, ["CNTSCHID"] + school_vars]

    # Left join keeps students whose school record is missing; their school
    # predictors become NaN and are imputed below.
    df = stu.merge(sch, on="CNTSCHID", how="left", validate="many_to_one")

    keep = PREDICTORS + RAW_ITEMS + ["CNTSTUID"]  # CNTSTUID = student-ID key
    df = df[keep].copy()

    codes = list(victim_codes)
    df["BULLY_VERBAL"] = df["ST038Q04NA"].isin(codes).astype(int)
    df["BULLY_RELATIONAL"] = df["ST038Q08NA"].isin(codes).astype(int)

    # The raw items would leak the outcome; REPEAT is excluded from modelling.
    df = df.drop(columns=RAW_ITEMS + ["REPEAT"])

    # Recode the two gender codes to 0/1 (any other value becomes NaN and is imputed).
    df["ST004D01T"] = df["ST004D01T"].map({1: 0, 2: 1})

    imputer = SimpleImputer(strategy="median")
    scaler = StandardScaler()

    features = df.drop(columns=DV_FLAGS + ["CNTSTUID"])
    X_proc = scaler.fit_transform(imputer.fit_transform(features))
    X_proc = pd.DataFrame(X_proc, columns=features.columns, index=features.index)

    return pd.concat([X_proc, df[DV_FLAGS]], axis=1)


def run_xgb(df: pd.DataFrame, dv_flag: str, country_name: str, test_size: float = 0.0):
    """Fit an XGBoost classifier for one country and DV and print a report.

    The report lists the ten largest ``feature_importances_`` values and the
    ROC AUC of the predicted probabilities.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing the predictors plus every column in ``DV_FLAGS``.
    dv_flag : str
        Name of the binary DV column to model.
    country_name : str
        Label used in the printed header.
    test_size : float, default 0.0
        Fraction of rows held out for the ROC AUC. With the default the AUC
        is computed on the same rows the model was trained on, which
        overstates out-of-sample performance; pass e.g. ``0.2`` to score a
        stratified hold-out instead.

    Raises
    ------
    ValueError
        If ``test_size`` is outside ``[0, 1)`` or ``dv_flag`` has fewer than
        two distinct values (ROC AUC is undefined in that case).
    """
    if not 0 <= test_size < 1:
        raise ValueError(f"test_size must be a fraction in [0, 1), got {test_size!r}")

    # Drop every DV column so the other bullying flag cannot act as a predictor.
    X = df.drop(columns=DV_FLAGS)
    y = df[dv_flag]
    if y.nunique() < 2:
        raise ValueError(
            f"{dv_flag} has a single class for {country_name}; ROC AUC is undefined"
        )

    if test_size:
        from sklearn.model_selection import train_test_split

        X_fit, X_eval, y_fit, y_eval = train_test_split(
            X, y, test_size=test_size, stratify=y, random_state=42
        )
        auc_label = "ROC AUC (held-out)"
    else:
        # Default: resubstitution AUC, scored on the training rows themselves.
        X_fit, X_eval, y_fit, y_eval = X, X, y, y
        auc_label = "ROC AUC"

    model = XGBClassifier(
        n_estimators=300,
        max_depth=4,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_lambda=4,
        reg_alpha=0.6,
        min_child_weight=5,
        objective="binary:logistic",
        gamma=0,
        random_state=42,
        verbosity=0
    ).fit(X_fit, y_fit)

    # Column 1 of predict_proba is the probability of the positive class.
    y_proba = model.predict_proba(X_eval)[:, 1]
    roc_auc = roc_auc_score(y_eval, y_proba)

    # Ten features with the largest importance, in descending order.
    imp = model.feature_importances_
    top = sorted(zip(X.columns, imp), key=lambda t: t[1], reverse=True)[:10]

    print(f"\n=== {country_name} – {dv_flag} ===")
    for rank, (feat, val) in enumerate(top, 1):
        print(f"{rank:>2}. {feat:<12s} : {val:0.4f}")
    print(f"{auc_label} ≈ {roc_auc:0.3f}")



# Build one processed frame per country (values matched against the CNT column).
jpn = prepare_country_df("JPN")
uk = prepare_country_df("GBR")

# For each DV, report Japan first and the United Kingdom second.
country_frames = {"Japan": jpn, "United Kingdom": uk}
for dv in DV_FLAGS:
    for country_label, country_df in country_frames.items():
        run_xgb(country_df, dv, country_label)



=== Japan – BULLY_VERBAL ===
 1. BELONG       : 0.0766
 2. METASPAM     : 0.0468
 3. PERCOMP      : 0.0432
 4. ST004D01T    : 0.0391
 5. GFOFAIL      : 0.0369
 6. DISCLIMA     : 0.0361
 7. COMPETE      : 0.0317
 8. JOYREAD      : 0.0317
 9. UNDREM       : 0.0311
10. PERCOOP      : 0.0307
ROC AUC ≈ 0.917

=== United Kingdom – BULLY_VERBAL ===
 1. BELONG       : 0.1011
 2. PERCOOP      : 0.0650
 3. EMOSUPS      : 0.0518
 4. GFOFAIL      : 0.0487
 5. ST004D01T    : 0.0473
 6. PERCOMP      : 0.0444
 7. RESILIENCE   : 0.0404
 8. EUDMO        : 0.0402
 9. ESCS         : 0.0328
10. METASPAM     : 0.0327
ROC AUC ≈ 0.828

=== Japan – BULLY_RELATIONAL ===
 1. BELONG       : 0.0589
 2. GFOFAIL      : 0.0472
 3. DISCLIMA     : 0.0431
 4. PERCOOP      : 0.0387
 5. PERCOMP      : 0.0347
 6. ST004D01T    : 0.0341
 7. ESCS         : 0.0315
 8. BSMJ         : 0.0315
 9. TEACHSUP     : 0.0313
10. EUDMO        : 0.0304
ROC AUC ≈ 0.939

=== United Kingdom – BULLY_RELATIONAL ===
 1. BELONG       : 0.1238


In [6]:
# Show the column index of both country frames, Japan first, one per line.
print(jpn.columns, uk.columns, sep="\n")


Index(['ST004D01T', 'AGE', 'GRADE', 'BSMJ', 'JOYREAD', 'SCREADCOMP',
       'SCREADDIFF', 'COMPETE', 'WORKMAST', 'GFOFAIL', 'EUDMO', 'RESILIENCE',
       'MASTGOAL', 'UNDREM', 'METASUM', 'METASPAM', 'EMOSUPS', 'DURECEC',
       'BELONG', 'PERCOMP', 'PERCOOP', 'ATTLNACT', 'DISCLIMA', 'TEACHSUP',
       'DIRINS', 'PERFEED', 'STIMREAD', 'ADAPTIVITY', 'TEACHINT', 'ESCS',
       'EDUSHORT', 'RATCMP1', 'RATCMP2', 'BULLY_VERBAL', 'BULLY_RELATIONAL'],
      dtype='object')
Index(['ST004D01T', 'AGE', 'GRADE', 'BSMJ', 'JOYREAD', 'SCREADCOMP',
       'SCREADDIFF', 'COMPETE', 'WORKMAST', 'GFOFAIL', 'EUDMO', 'RESILIENCE',
       'MASTGOAL', 'UNDREM', 'METASUM', 'METASPAM', 'EMOSUPS', 'DURECEC',
       'BELONG', 'PERCOMP', 'PERCOOP', 'ATTLNACT', 'DISCLIMA', 'TEACHSUP',
       'DIRINS', 'PERFEED', 'STIMREAD', 'ADAPTIVITY', 'TEACHINT', 'ESCS',
       'EDUSHORT', 'RATCMP1', 'RATCMP2', 'BULLY_VERBAL', 'BULLY_RELATIONAL'],
      dtype='object')


In [None]:
from pathlib import Path
import numpy as np
import pandas as pd
import pyreadstat

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score

# Relative locations of the raw SPSS (.sav) files read by prepare_country_df.
STU_PATH = Path("..", "data", "raw", "Student Data.sav")  # student data file
SCH_PATH = Path("..", "data", "raw", "School Data.sav")   # school data file

# Columns selected from the merged student + school frame, grouped by level.
PREDICTORS = [
    # Individual level
    "ST004D01T",
    "AGE",
    "GRADE",
    "BSMJ",
    "JOYREAD",
    "SCREADCOMP",
    "SCREADDIFF",
    "COMPETE",
    "WORKMAST",
    "GFOFAIL",
    "EUDMO",
    "RESILIENCE",
    "MASTGOAL",
    # Proximal level (REPEAT is dropped again inside prepare_country_df)
    "REPEAT",
    "UNDREM",
    "METASUM",
    "METASPAM",
    # Microsystem level: family, peers and school climate
    "EMOSUPS",
    "DURECEC",
    "BELONG",
    "PERCOMP",
    "PERCOOP",
    "ATTLNACT",
    "DISCLIMA",
    "TEACHSUP",
    "DIRINS",
    "PERFEED",
    "STIMREAD",
    "ADAPTIVITY",
    "TEACHINT",
    # Macrosystem / exosystem level (the last three come from the school file)
    "ESCS",
    "EDUSHORT",
    "RATCMP1",
    "RATCMP2",
]

# Raw questionnaire items the two dependent variables are derived from.
RAW_ITEMS = [
    "ST038Q04NA",
    "ST038Q08NA",
]
# Binary (0/1) dependent-variable columns, in the same order as RAW_ITEMS.
DV_FLAGS = [
    "BULLY_VERBAL",
    "BULLY_RELATIONAL",
]

def prepare_country_df(iso2: str, victim_codes=(2, 3)) -> pd.DataFrame:
    """Build the model-ready frame for one country.

    Reads the student and school SPSS files, keeps the rows whose ``CNT``
    equals ``iso2``, attaches the school-level predictors to each student,
    derives the two binary bullying flags, and median-imputes then
    standardises every predictor.

    Parameters
    ----------
    iso2 : str
        Value matched against the ``CNT`` column. The parameter name is kept
        for backward compatibility; the notebook passes three-letter codes
        such as ``"JPN"`` and ``"GBR"``.
    victim_codes : iterable, default (2, 3)
        Raw response codes of the two ``RAW_ITEMS`` that are flagged as 1.
        Every other value -- including a missing response -- is flagged 0.
        NOTE(review): the PISA 2018 questionnaire is understood to offer a
        fourth response (code 4, "once a week or more"); with the default it
        is coded 0. Confirm against the codebook and pass ``(2, 3, 4)`` if
        that is not intended.

    Returns
    -------
    pd.DataFrame
        The standardised predictors (``PREDICTORS`` without ``REPEAT``)
        followed by the ``DV_FLAGS`` columns. ``CNTSTUID`` is not returned.

    Raises
    ------
    ValueError
        If no student row matches ``iso2``.
    pandas.errors.MergeError
        If the school file holds more than one row per ``CNTSCHID`` for the
        requested country (which would silently duplicate students).
    """
    school_vars = ["RATCMP1", "RATCMP2", "EDUSHORT"]
    student_cols = (
        ["CNT", "CNTSCHID", "CNTSTUID"]
        + RAW_ITEMS
        + [col for col in PREDICTORS if col not in school_vars]
    )

    # Load student & school data, reading only the columns used below
    # instead of every variable in both files.
    stu, _ = pyreadstat.read_sav(
        STU_PATH, apply_value_formats=False, usecols=student_cols
    )
    sch, _ = pyreadstat.read_sav(
        SCH_PATH, apply_value_formats=False, usecols=["CNT", "CNTSCHID"] + school_vars
    )

    # Filter for country
    stu = stu[stu["CNT"] == iso2]
    if stu.empty:
        # Fail here with a clear message rather than later inside the imputer.
        raise ValueError(f"No student rows found for CNT == {iso2!r}")
    sch = sch.loc[sch["CNT"] == iso2, ["CNTSCHID"] + school_vars]

    # Merge student + school. The left join keeps students whose school record
    # is missing; their school predictors become NaN and are imputed below.
    df = stu.merge(sch, on="CNTSCHID", how="left", validate="many_to_one")

    # Keep relevant columns
    keep = PREDICTORS + RAW_ITEMS + ["CNTSTUID"]
    df = df[keep].copy()

    # Create binary bullying variables
    codes = list(victim_codes)
    df["BULLY_VERBAL"] = df["ST038Q04NA"].isin(codes).astype(int)
    df["BULLY_RELATIONAL"] = df["ST038Q08NA"].isin(codes).astype(int)

    # Drop the raw items (they would leak the outcome) and the unused REPEAT column
    df = df.drop(columns=RAW_ITEMS + ["REPEAT"])

    # Recode gender to 0/1 (code 1 -> 0, code 2 -> 1; anything else -> NaN, imputed below).
    # NOTE(review): the PISA codebook is understood to define 1 = Female, 2 = Male,
    # so 1 here would be "male" -- confirm before interpreting the sign of this feature.
    df["ST004D01T"] = df["ST004D01T"].map({1: 0, 2: 1})

    # Impute + scale
    imputer = SimpleImputer(strategy="median")
    scaler = StandardScaler()

    features = df.drop(columns=DV_FLAGS + ["CNTSTUID"])
    X_proc = scaler.fit_transform(imputer.fit_transform(features))
    X_proc = pd.DataFrame(X_proc, columns=features.columns, index=features.index)

    # Combine processed features + DVs
    return pd.concat([X_proc, df[DV_FLAGS]], axis=1)

# -------------------------------
# XGBoost model runner
# -------------------------------
def run_xgb(df: pd.DataFrame, dv_flag: str, country_name: str, test_size: float = 0.0):
    """Fit an XGBoost classifier for one country and DV and print a report.

    The report lists the ten largest ``feature_importances_`` values and the
    ROC AUC of the predicted probabilities.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing the predictors plus every column in ``DV_FLAGS``.
    dv_flag : str
        Name of the binary DV column to model.
    country_name : str
        Label used in the printed header.
    test_size : float, default 0.0
        Fraction of rows held out for the ROC AUC. With the default the AUC
        is computed on the same rows the model was trained on, which
        overstates out-of-sample performance; pass e.g. ``0.2`` to score a
        stratified hold-out instead.

    Raises
    ------
    ValueError
        If ``test_size`` is outside ``[0, 1)`` or ``dv_flag`` has fewer than
        two distinct values (ROC AUC is undefined in that case).
    """
    if not 0 <= test_size < 1:
        raise ValueError(f"test_size must be a fraction in [0, 1), got {test_size!r}")

    # Drop every DV column so the other bullying flag cannot act as a predictor.
    X = df.drop(columns=DV_FLAGS)
    y = df[dv_flag]
    if y.nunique() < 2:
        raise ValueError(
            f"{dv_flag} has a single class for {country_name}; ROC AUC is undefined"
        )

    if test_size:
        from sklearn.model_selection import train_test_split

        X_fit, X_eval, y_fit, y_eval = train_test_split(
            X, y, test_size=test_size, stratify=y, random_state=42
        )
        auc_label = "ROC AUC (held-out)"
    else:
        # Default: resubstitution AUC, scored on the training rows themselves.
        X_fit, X_eval, y_fit, y_eval = X, X, y, y
        auc_label = "ROC AUC"

    # Train XGBoost classifier
    model = XGBClassifier(
        n_estimators=300,
        max_depth=4,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_lambda=4,
        reg_alpha=0.6,
        min_child_weight=5,
        objective="binary:logistic",
        gamma=0,
        random_state=42,
        verbosity=0
    ).fit(X_fit, y_fit)

    # Column 1 of predict_proba is the probability of the positive class.
    y_proba = model.predict_proba(X_eval)[:, 1]
    roc_auc = roc_auc_score(y_eval, y_proba)

    # Ten features with the largest importance, in descending order.
    imp = model.feature_importances_
    top = sorted(zip(X.columns, imp), key=lambda t: t[1], reverse=True)[:10]

    # Print results
    print(f"\n=== {country_name} – {dv_flag} ===")
    for rank, (feat, val) in enumerate(top, 1):
        print(f"{rank:>2}. {feat:<12s} : {val:0.4f}")
    print(f"{auc_label} ≈ {roc_auc:0.3f}")

# -------------------------------
# Run for relational bullying only
# -------------------------------
# Only the relational-bullying DV is modelled in this run.
TARGET_DV = "BULLY_RELATIONAL"

# Build one processed frame per country (values matched against the CNT column).
jpn = prepare_country_df("JPN")
uk = prepare_country_df("GBR")

# Fit and report Japan first, then the United Kingdom.
for country_label, country_df in (("Japan", jpn), ("United Kingdom", uk)):
    run_xgb(country_df, TARGET_DV, country_label)



=== Japan – BULLY_RELATIONAL ===
 1. BELONG       : 0.0589
 2. GFOFAIL      : 0.0472
 3. DISCLIMA     : 0.0431
 4. PERCOOP      : 0.0387
 5. PERCOMP      : 0.0347
 6. ST004D01T    : 0.0341
 7. ESCS         : 0.0315
 8. BSMJ         : 0.0315
 9. TEACHSUP     : 0.0313
10. EUDMO        : 0.0304
ROC AUC ≈ 0.939

=== United Kingdom – BULLY_RELATIONAL ===
 1. BELONG       : 0.1238
 2. GFOFAIL      : 0.0582
 3. PERCOOP      : 0.0550
 4. EMOSUPS      : 0.0428
 5. PERCOMP      : 0.0364
 6. RESILIENCE   : 0.0329
 7. ST004D01T    : 0.0322
 8. BSMJ         : 0.0297
 9. DISCLIMA     : 0.0295
10. EUDMO        : 0.0290
ROC AUC ≈ 0.842
