In [1]:
import pandas as pd

In [2]:
# Location of the PISA 2022 UK extract loaded by this notebook.
# NOTE(review): this is an absolute, machine-specific path.
DATA_PATH = "/workspaces/mini_project_2/data/pisa_2022_uk_selected.csv"
df = pd.read_csv(DATA_PATH)

In [3]:
# Preview the first rows to sanity-check that the file loaded as expected.
df.head()

Unnamed: 0,PV1MATH,PV2MATH,PV3MATH,PV4MATH,PV5MATH,PV6MATH,PV7MATH,PV8MATH,PV9MATH,PV10MATH,...,ST270Q02JA,ST038Q05NA,ST265Q03JA,ST272Q01JA,ST034Q02TA,ST038Q03NA,ST038Q04NA,ST315Q04JA,ST315Q06JA,REGION
0,699.809,598.369,593.952,603.361,666.143,635.207,608.553,583.002,654.755,600.505,...,4.0,1.0,1.0,4.0,,1.0,1.0,,,82613.0
1,454.479,377.041,463.036,394.994,396.957,429.649,459.601,439.692,402.258,428.02,...,1.0,1.0,2.0,8.0,2.0,2.0,2.0,,,82612.0
2,566.143,616.946,556.976,656.532,652.062,611.077,657.387,645.288,646.714,565.086,...,1.0,1.0,1.0,10.0,2.0,1.0,1.0,4.0,3.0,82611.0
3,371.82,357.674,264.029,393.236,361.008,326.822,390.37,338.591,303.145,382.215,...,3.0,1.0,2.0,1.0,2.0,1.0,1.0,3.0,,82611.0
4,423.607,382.887,414.527,404.693,388.663,427.517,447.633,445.616,423.227,425.905,...,2.0,1.0,1.0,8.0,1.0,2.0,2.0,,4.0,82611.0


## 1. Recode variables 

In [4]:
# Items to reverse-code, keyed by their PISA variable name.
# Each value is the (lowest, highest) code of the item's original response scale.
SCALE_1_TO_5 = (1, 5)
SCALE_1_TO_4 = (1, 4)

reverse_vars = {
    # Items answered on a 1–5 scale
    "ST309Q02JA": SCALE_1_TO_5,  # not_distract
    "ST258Q01JA": SCALE_1_TO_5,  # food_sec
    # Items answered on a 1–4 scale
    "ST034Q03TA": SCALE_1_TO_4,  # schl_belong
    "ST270Q02JA": SCALE_1_TO_4,  # teacher_help
    "ST038Q05NA": SCALE_1_TO_4,  # safe_student
    "ST265Q03JA": SCALE_1_TO_4,  # safe_class
    "ST034Q02TA": SCALE_1_TO_4,  # make_friends
    "ST038Q03NA": SCALE_1_TO_4,  # feel_included
    "ST038Q04NA": SCALE_1_TO_4,  # no_mock
}

In [5]:
def reverse_code(df, reverse_map):
    """
    Return a copy of ``df`` in which the listed columns are flipped on their scale.

    A value ``x`` on a scale running from ``min`` to ``max`` becomes
    ``max + min - x``, so the lowest code turns into the highest and vice versa.
    Missing values stay missing. Names in ``reverse_map`` that are not columns
    of ``df`` are skipped without error, and the input frame is left untouched.

    Parameters:
    - df: pandas DataFrame holding the columns to flip
    - reverse_map: dict mapping column name -> (min, max) of that column's scale

    Returns:
    - a new DataFrame with the selected columns reverse-coded
    """
    flipped = df.copy()
    for column, bounds in reverse_map.items():
        low, high = bounds
        if column not in flipped.columns:
            continue
        scale_total = high + low
        flipped[column] = scale_total - flipped[column]
    return flipped

# Reverse-code the items listed in `reverse_vars` and rebind `df` to the returned copy.
# Caution: running this cell a second time flips the same columns back to their
# original coding, because max + min - x is its own inverse.
df = reverse_code(df, reverse_vars)

## 2. Create binary variables

In [6]:
# Binary recoding (columns keep their PISA names here; renaming happens later).
def _recode_binary(series, one_codes, zero_codes):
    """
    Recode a column to a nullable 0/1 integer flag.

    Parameters:
    - series: pandas Series holding the original response codes
    - one_codes: list of codes that become 1
    - zero_codes: list of codes that become 0

    Returns:
    - Series of dtype Int64 with 1, 0, or <NA> for any value (including
      missing ones) that appears in neither list
    """
    is_one = series.isin(one_codes)
    is_zero = series.isin(zero_codes)
    # Start from a 0/1 flag, then blank out every row that matched neither list.
    # Using the nullable Int64 dtype avoids object columns that mix Python ints
    # with pd.NA, which is what a row-wise apply(lambda ...) produces.
    return is_one.astype("Int64").where(is_one | is_zero, pd.NA)


df["ST004D01T"] = _recode_binary(df["ST004D01T"], [1], [2])  # gender: 1 = female, 0 = male
df["ST327Q06JA"] = _recode_binary(df["ST327Q06JA"], [1], [2])  # expt_bach: code 3 becomes missing here; drop later
df["ST250Q02JA"] = _recode_binary(df["ST250Q02JA"], [1], [2])  # has_computer
# NOTE(review): the OECD PISA 2022 codebook appears to define ST255Q01JA code 1 as
# "There are no books", which would make this flag the inverse of its name
# (1 = no books). The mapping is kept exactly as before; confirm against the
# codebook before using has_books in a model.
df["ST255Q01JA"] = _recode_binary(df["ST255Q01JA"], [1], [2, 3, 4, 5, 6, 7])  # has_books
In [7]:
# Spot-check the four recoded binary columns.
df[["ST004D01T", "ST327Q06JA", "ST250Q02JA", "ST255Q01JA"]].head()

Unnamed: 0,ST004D01T,ST327Q06JA,ST250Q02JA,ST255Q01JA
0,0,1.0,1,0
1,0,0.0,1,0
2,0,1.0,1,0
3,1,,0,0
4,0,,1,0


## 3. Rename variables 

In [8]:
# Mapping from PISA variable codes to readable column names,
# assembled one thematic group at a time.
rename_dict = {}

# Individual
rename_dict.update({
    "ST004D01T": "gender",
    "ST016Q01NA": "life_sat",
    "ST313Q01JA": "emo_control",
    "ST309Q02JA": "not_distract",
    "ST301Q01JA": "curiosity",
    "ST305Q01JA": "comfort_lead",
})

# Academic
rename_dict.update({
    "ST296Q01JA": "math_hwork",
    "ST293Q03JA": "math_effort",
    "ST292Q01JA": "math_conf",
    "ST327Q06JA": "expt_bach",
    "ST355Q05JA": "conf_self_mot",
})

# Family
rename_dict.update({
    "ESCS": "SES",
    "ST230Q01JA": "num_sib",
    "ST258Q01JA": "food_sec",
    "ST259Q01JA": "family_stat",
    "ST300Q01JA": "parent_talk_schl",
    "ST300Q02JA": "parent_eat_with",
    "ST250Q02JA": "has_computer",
    "ST255Q01JA": "has_books",
})

# School
rename_dict.update({
    "ST034Q03TA": "schl_belong",
    "ST267Q01JA": "teach_respect",
    "ST267Q05JA": "teac_interest",
    "ST273Q01JA": "listen_teacher",
    "ST285Q04JA": "teacher_explain",
    "ST270Q02JA": "teacher_help",
    "ST038Q05NA": "safe_student",
    "ST265Q03JA": "safe_class",
    "ST272Q01JA": "qual_math_instruct",
})

# Peer/Social
rename_dict.update({
    "ST034Q02TA": "make_friends",
    "ST038Q03NA": "feel_included",
    "ST038Q04NA": "no_mock",
    "ST315Q04JA": "trust_friends",
    "ST315Q06JA": "trust_gen",
})

# Contextual
rename_dict.update({
    "REGION": "region",
})

In [9]:
# Register the plausible-value columns as well:
# PV1MATH → math_1, PV1READ → read_1, PV1SCIE → scie_1, ... up to draw 10.
rename_dict.update(
    {
        f"PV{draw}{subject}": f"{subject.lower()}_{draw}"
        for subject in ("MATH", "READ", "SCIE")
        for draw in range(1, 11)
    }
)

In [10]:
# Apply the readable names; codes without an entry in rename_dict keep their name.
df = df.rename(columns=rename_dict)

In [11]:
def _pv_names(prefix):
    """Return the ten plausible-value column names ``prefix_1`` ... ``prefix_10``."""
    return [f"{prefix}_{draw}" for draw in range(1, 11)]


# Renamed columns organised by role: outcome scores first, then predictor groups.
grouped_vars = {
    "target": _pv_names("math"),
    "reading_scores": _pv_names("read"),
    "science_scores": _pv_names("scie"),
    "individual": [
        "gender", "life_sat", "emo_control", "not_distract", "curiosity", "comfort_lead",
    ],
    "academic": [
        "math_hwork", "math_effort", "math_conf", "expt_bach", "conf_self_mot",
    ],
    "family": [
        "SES", "num_sib", "food_sec", "family_stat",
        "parent_talk_schl", "parent_eat_with", "has_computer", "has_books",
    ],
    "school": [
        "schl_belong", "teach_respect", "teac_interest", "listen_teacher", "teacher_explain",
        "teacher_help", "safe_student", "safe_class", "qual_math_instruct",
    ],
    "peer_social": [
        "make_friends", "feel_included", "no_mock", "trust_friends", "trust_gen",
    ],
    "context": ["region"],
}

In [12]:
# Shortcut to the individual-level variable names. This is the same list object
# stored in grouped_vars, not a copy, so mutating one mutates the other.
individual_features = grouped_vars["individual"]

In [13]:
# Confirm the rename took effect by listing every column name now in df.
print("Renamed columns:", df.columns.tolist())

Renamed columns: ['math_1', 'math_2', 'math_3', 'math_4', 'math_5', 'math_6', 'math_7', 'math_8', 'math_9', 'math_10', 'read_1', 'read_2', 'read_3', 'read_4', 'read_5', 'read_6', 'read_7', 'read_8', 'read_9', 'read_10', 'scie_1', 'scie_2', 'scie_3', 'scie_4', 'scie_5', 'scie_6', 'scie_7', 'scie_8', 'scie_9', 'scie_10', 'gender', 'life_sat', 'emo_control', 'not_distract', 'curiosity', 'comfort_lead', 'math_hwork', 'math_effort', 'math_conf', 'expt_bach', 'conf_self_mot', 'SES', 'num_sib', 'food_sec', 'family_stat', 'parent_talk_schl', 'parent_eat_with', 'has_computer', 'has_books', 'schl_belong', 'teach_respect', 'teac_interest', 'listen_teacher', 'teacher_explain', 'teacher_help', 'safe_student', 'safe_class', 'qual_math_instruct', 'make_friends', 'feel_included', 'no_mock', 'trust_friends', 'trust_gen', 'region']


In [14]:
# Show the names grouped under the "academic" key of grouped_vars.
print("Academic variables:", grouped_vars["academic"])

Academic variables: ['math_hwork', 'math_effort', 'math_conf', 'expt_bach', 'conf_self_mot']


## 4. Check missing values

In [15]:
# Count the NaNs in every column ...
missing_values = df.isna().sum()

# ... and list only the affected columns, worst first.
missing_values.loc[missing_values.gt(0)].sort_values(ascending=False)

conf_self_mot         8268
parent_talk_schl      8035
parent_eat_with       8024
comfort_lead          7637
not_distract          7559
trust_friends         7555
trust_gen             7546
emo_control           7491
curiosity             7455
teacher_explain       7387
math_effort           7355
expt_bach             7068
teach_respect         5976
teac_interest         5901
listen_teacher        5277
math_conf             4588
schl_belong           3673
make_friends          3626
math_hwork            2452
teacher_help          2366
qual_math_instruct    2129
family_stat           1934
SES                   1888
no_mock               1842
safe_student          1837
feel_included         1823
safe_class            1708
life_sat              1702
food_sec              1702
has_computer          1635
has_books             1553
num_sib               1530
dtype: int64

In [16]:
# Share of missing entries per column, expressed as a percentage;
# only columns with at least one missing value are shown, worst first.
missing_percent = df.isna().mean().mul(100)
missing_percent.loc[missing_percent.gt(0)].sort_values(ascending=False)


conf_self_mot         63.737280
parent_talk_schl      61.941104
parent_eat_with       61.856306
comfort_lead          58.872957
not_distract          58.271662
trust_friends         58.240826
trust_gen             58.171446
emo_control           57.747456
curiosity             57.469935
teacher_explain       56.945729
math_effort           56.699044
expt_bach             54.486586
teach_respect         46.068455
teac_interest         45.490287
listen_teacher        40.679926
math_conf             35.368486
schl_belong           28.314832
make_friends          27.952513
math_hwork            18.902251
teacher_help          18.239285
qual_math_instruct    16.412273
family_stat           14.909035
SES                   14.554425
no_mock               14.199815
safe_student          14.161270
feel_included         14.053346
safe_class            13.166821
life_sat              13.120567
food_sec              13.120567
has_computer          12.604070
has_books             11.971940
num_sib               11.794635
dtype: float64

In [19]:
# How many rows would survive if every row containing any missing value were dropped?
df_dropped = df.dropna()
rows_before, rows_after = len(df), len(df_dropped)
print(f"Rows before dropping: {rows_before}")
print(f"Rows after dropping:  {rows_after}")
print(f"Percentage kept:      {rows_after / rows_before * 100:.2f}%")

Rows before dropping: 12972
Rows after dropping:  0
Percentage kept:      0.00%


In [20]:
# Drop any column with more than 30% missing values

In [21]:
# Drop every column whose share of missing values exceeds this threshold (in percent).
MAX_MISSING_PERCENT = 30

# Measure missingness on the current frame instead of reusing `missing_percent`
# from an earlier cell. With the stale series, re-running this cell after the
# drop raised a KeyError because the listed columns were already gone; now a
# re-run simply finds nothing left to drop.
current_missing_percent = df.isna().mean() * 100
columns_to_drop = current_missing_percent[
    current_missing_percent > MAX_MISSING_PERCENT
].index.tolist()
df = df.drop(columns=columns_to_drop)

# Summarise what was removed and how many columns are left.
print(f"Dropped columns (over {MAX_MISSING_PERCENT}% missing):")
print(columns_to_drop)
print(f"\nRemaining columns: {df.shape[1]}")

Dropped columns (over 30% missing):
['emo_control', 'not_distract', 'curiosity', 'comfort_lead', 'math_effort', 'math_conf', 'expt_bach', 'conf_self_mot', 'parent_talk_schl', 'parent_eat_with', 'teach_respect', 'teac_interest', 'listen_teacher', 'teacher_explain', 'trust_friends', 'trust_gen']

Remaining columns: 48


In [24]:
# Original (pre-drop) variable names grouped by category, used to report
# which variables in each category are still present in df.
categories = {
    "Individual": [
        "gender", "life_sat", "emo_control", "not_distract", "curiosity", "comfort_lead"
    ],
    "Academic": [
        "math_hwork", "math_effort", "math_conf", "expt_bach", "conf_self_mot"
    ],
    "Family": [
        "SES", "num_sib", "food_sec", "family_stat", "parent_talk_schl", "parent_eat_with", "has_computer", "has_books"
    ],
    "School": [
        "schl_belong", "teach_respect", "teac_interest", "listen_teacher",
        "teacher_explain", "teacher_help", "safe_student", "safe_class", "qual_math_instruct"
    ],
    "Peer/Social": [
        "make_friends", "feel_included", "no_mock", "trust_friends", "trust_gen"
    ],
    "Contextual": [
        "region"
    ]
}

# Print, per category, only the variables that remain in the dataframe.
# The loop variable is named `members` rather than `vars` so the builtin
# vars() is not shadowed for the rest of the kernel session.
print("\n✅ Remaining columns by category:\n")
for category, members in categories.items():
    kept = [name for name in members if name in df.columns]
    if kept:  # categories that lost every variable are omitted entirely
        print(f"📂 {category}:")
        for col in kept:
            print(f"   - {col}")
        print()


✅ Remaining columns by category:

📂 Individual:
   - gender
   - life_sat

📂 Academic:
   - math_hwork

📂 Family:
   - SES
   - num_sib
   - food_sec
   - family_stat
   - has_computer
   - has_books

📂 School:
   - schl_belong
   - teacher_help
   - safe_student
   - safe_class
   - qual_math_instruct

📂 Peer/Social:
   - make_friends
   - feel_included
   - no_mock

📂 Contextual:
   - region

