In [None]:
from pathlib import Path
import pandas as pd
import pyreadstat

# Raw PISA SPSS files, addressed relative to the notebook's working directory.
RAW_DATA_DIR = Path("../data/raw")
STU_PATH = RAW_DATA_DIR / "Student Data.sav"  # student questionnaire file
SCH_PATH = RAW_DATA_DIR / "School Data.sav"   # school questionnaire file

# Predictors are declared per conceptual level and concatenated below, so the
# final order of PREDICTORS is: individual, proximal, microsystem, macrosystem.

# Individual-level predictors
INDIVIDUAL_PREDICTORS = [
    "ST004D01T",   # Gender
    "AGE",         # Age
    "GRADE",       # Grade
    "BSMJ",        # Expected occupational status
    "JOYREAD",     # Joy of reading
    "SCREADCOMP",  # Reading self-concept: competence
    "SCREADDIFF",  # Reading self-concept: difficulty
    "COMPETE",     # Competitiveness
    "WORKMAST",    # Work mastery orientation
    "GFOFAIL",     # General fear of failure
    "EUDMO",       # Sense of meaning in life (eudaimonia)
    "RESILIENCE",  # Resilience
    "MASTGOAL",    # Mastery goal orientation
]

# Proximal-level predictors
PROXIMAL_PREDICTORS = [
    "REPEAT",    # Grade repetition history
    "UNDREM",    # Meta-cognition: understanding & remembering
    "METASUM",   # Meta-cognition: summarizing
    "METASPAM",  # Meta-cognition: assessing credibility
]

# Microsystem-level factors (family, peers, and school climate)
MICROSYSTEM_PREDICTORS = [
    "EMOSUPS",     # Parental emotional support
    "DURECEC",     # Duration in early childhood education and care
    "BELONG",      # School belonging
    "PERCOMP",     # Perceived school competitiveness
    "PERCOOP",     # Perceived school cooperation
    "ATTLNACT",    # Attitudes towards learning activities
    "DISCLIMA",    # Disciplinary climate (language lessons)
    "TEACHSUP",    # Teacher support (language lessons)
    "DIRINS",      # Teacher-directed instruction
    "PERFEED",     # Perceived feedback from teachers
    "STIMREAD",    # Teacher's stimulation of reading engagement
    "ADAPTIVITY",  # Adaptation of instruction
    "TEACHINT",    # Perceived teacher interest
]

# Macrosystem/exosystem-level predictors
MACROSYSTEM_PREDICTORS = [
    "ESCS",      # Family socioeconomic status (index)
    "EDUSHORT",  # Shortage of educational resources
    "RATCMP1",   # Number of computers per student
    "RATCMP2",   # Percentage of computers connected to the internet
]

PREDICTORS = [
    *INDIVIDUAL_PREDICTORS,
    *PROXIMAL_PREDICTORS,
    *MICROSYSTEM_PREDICTORS,
    *MACROSYSTEM_PREDICTORS,
]

# Build the Japan-only analysis table: load both SPSS files, attach the
# school-level indicators to every student row, keep only the modelling
# columns, report missingness, and pickle the result for the next cell.

DV = "BEINGBULLIED"  # dependent variable
SCHOOL_KEY = "CNTSCHID"  # column shared by both files, used as the merge key
SCHOOL_COLS = ["RATCMP1", "RATCMP2", "EDUSHORT"]  # predictors taken from the school file

print("Loading student data...")

# read_sav returns (DataFrame, metadata); the metadata is not needed here.
# NOTE(review): read_sav also accepts `usecols=`; it is not used so that
# `stu_df` / `sch_df` keep every column in case later cells rely on them.
stu_df, _ = pyreadstat.read_sav(STU_PATH, apply_value_formats=False)
stu_df = stu_df[stu_df["CNT"] == "JPN"]  # keep Japan rows only

print("Loading school data...")

sch_df, _ = pyreadstat.read_sav(SCH_PATH, apply_value_formats=False)
# Japan rows only, restricted to the merge key and the school-level predictors.
sch_jpn = sch_df.loc[sch_df["CNT"] == "JPN", [SCHOOL_KEY, *SCHOOL_COLS]]

print("Merging student + school data...")

# Left merge keeps every student. validate="many_to_one" makes pandas raise a
# MergeError if a school ID appears more than once in sch_jpn, instead of
# silently duplicating student rows.
df = stu_df.merge(sch_jpn, on=SCHOOL_KEY, how="left", validate="many_to_one")

# Keep the predictors, the dependent variable, and the school identifier.
df = df[PREDICTORS + [DV, SCHOOL_KEY]]

# Percentage of missing values per column, largest first.
missing = (df.isna().mean() * 100).sort_values(ascending=False)
print("\nTop 20 variables by % missing (Japan):")
print(missing.head(20).round(1))

out_path = Path("../data/processed/japan_clean.pkl")
# to_pickle does not create directories; make sure the target folder exists.
out_path.parent.mkdir(parents=True, exist_ok=True)
df.to_pickle(out_path)
print(f"\nSaved cleaned Japan data → {out_path} (rows: {len(df)})")

Loading student data...
Loading school data...
Merging student + school data...

Top 20 variables by % missing (Japan):
REPEAT          100.0
BSMJ             22.2
DURECEC          20.6
PERCOOP           4.2
BEINGBULLIED      3.2
PERCOMP           3.1
METASUM           3.0
METASPAM          3.0
UNDREM            2.8
EMOSUPS           2.4
BELONG            1.9
EUDMO             1.9
MASTGOAL          1.8
WORKMAST          1.8
SCREADCOMP        1.7
GFOFAIL           1.7
RESILIENCE        1.6
ADAPTIVITY        1.5
PERFEED           1.5
SCREADDIFF        1.5
dtype: float64

Saved cleaned Japan data → ..\data\processed\japan_clean.pkl (rows: 6109)


In [3]:
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Turn the Japan extract into a model-ready table: drop unusable columns,
# recode gender to 0/1, median-impute the predictors, standardize them, and
# save them together with the (unmodified) dependent variable.

df = pd.read_pickle("../data/processed/japan_clean.pkl")  # output of the previous cell

# REPEAT is reported as 100% missing for Japan, so it carries no information.
df = df.drop(columns=["REPEAT"])

DV = "BEINGBULLIED"
X = df.drop(columns=[DV, "CNTSCHID"])  # features only: no DV, no school ID
y = df[DV].copy()
# NOTE(review): rows with a missing DV (about 3% in the missingness report) are
# kept, so the saved frame contains NaN in BEINGBULLIED; downstream code must
# drop or otherwise handle those rows.

# Recode gender to a 0/1 indicator.
# NOTE(review): the PISA codebook lists ST004D01T as 1 = Female, 2 = Male, which
# makes this mapping 0 = female, 1 = male — confirm against the .sav value labels.
GENDER_COL = "ST004D01T"
gender_raw = X[GENDER_COL]
unexpected_codes = gender_raw[gender_raw.notna() & ~gender_raw.isin([1, 2])].unique()
if len(unexpected_codes) > 0:
    # .map() would silently turn these into NaN and the imputer would then
    # replace them with the median, so fail loudly instead.
    raise ValueError(
        f"Unexpected {GENDER_COL} codes (expected 1 or 2): {sorted(unexpected_codes)}"
    )
X[GENDER_COL] = gender_raw.map({1: 0, 2: 1})

# SimpleImputer discards columns that are entirely missing, which would break
# the column alignment below; remove any such column up front.
X = X.dropna(axis="columns", how="all")

num_cols = X.columns.tolist()  # feature names, reused to rebuild the DataFrames

# NOTE(review): the imputer and scaler are fitted on the full sample; if a
# train/test split happens later, fit them on the training part only.
imputer = SimpleImputer(strategy="median")  # fill gaps with each column's median
X_imputed = pd.DataFrame(imputer.fit_transform(X), columns=num_cols, index=X.index)

scaler = StandardScaler()  # rescale every feature to mean 0 and unit variance
X_scaled = pd.DataFrame(scaler.fit_transform(X_imputed), columns=num_cols, index=X.index)

# Re-attach the dependent variable; rows are in the same order as X.
jpn_final = X_scaled.copy()
jpn_final[DV] = y.values

out_final = Path("../data/processed/japan_clean_final.pkl")
jpn_final.to_pickle(out_final)
print(f"Saved final cleaned Japan data → {out_final} (shape: {jpn_final.shape})")



Saved final cleaned Japan data → ..\data\processed\japan_clean_final.pkl (shape: (6109, 34))
