In [9]:
import pandas as pd

In [10]:
df = pd.read_csv('/workspaces/mini_project_2/data/uk_pisa_2022.csv')

In [11]:
df.shape

(12972, 1278)

In [12]:
df.columns

Index(['CNT', 'CNTRYID', 'CNTSCHID', 'CNTSTUID', 'CYC', 'NatCen', 'STRATUM',
       'SUBNATIO', 'REGION', 'OECD',
       ...
       'PV3MPRE', 'PV4MPRE', 'PV5MPRE', 'PV6MPRE', 'PV7MPRE', 'PV8MPRE',
       'PV9MPRE', 'PV10MPRE', 'SENWT', 'VER_DAT'],
      dtype='object', length=1278)

In [14]:
df["W_FSTUWT"].value_counts

<bound method IndexOpsMixin.value_counts of 0          4.77923
1          8.23968
2        136.31080
3         84.42297
4        110.62140
           ...    
12967     14.72061
12968     15.33406
12969     20.97463
12970     16.30511
12971     15.15709
Name: W_FSTUWT, Length: 12972, dtype: float64>

In [17]:
df["BODYIMA"].value_counts

<bound method IndexOpsMixin.value_counts of 0       NaN
1       NaN
2       NaN
3       NaN
4       NaN
         ..
12967   NaN
12968   NaN
12969   NaN
12970   NaN
12971   NaN
Name: BODYIMA, Length: 12972, dtype: float64>

In [5]:
# Select relevant variables
df = df[['CNTSTUID', 'CNTSCHID','PV1MATH', 'PV1READ', 'PV1SCIE', 'ESCS', 'ST004D01T', 'IMMIG', 'REGION']]
df = df.dropna()

In [5]:
# Map region codes to region names
region_map = {
    82611.0: 'England',
    82612.0: 'Northern Ireland',
    82613.0: 'Wales',
    82620.0: 'Scotland'
}
df['Region_Name'] = df['REGION'].map(region_map)


In [6]:
# Map gender codes to labels
gender_map = {
    1: 'Male',
    2: 'Female'
}
df['Gender_Label'] = df['ST004D01T'].map(gender_map)

In [7]:
# Map immigrant status codes to labels
immig_map = {
    1: 'Native',
    2: 'Second-Generation Immigrant',
    3: 'First-Generation Immigrant'
}
df['Immigrant_Status'] = df['IMMIG'].map(immig_map)

In [8]:
# Add SES group using quartiles
df['SES_Group'] = pd.qcut(df['ESCS'], q=4, labels=['Low', 'Lower-Mid', 'Upper-Mid', 'High'])

In [9]:
# Compute average academic score
df['avg_score'] = df[['PV1MATH', 'PV1READ', 'PV1SCIE']].mean(axis=1)

In [10]:
# Calculate medians for performance thresholding
median_math = df['PV1MATH'].median()
median_read = df['PV1READ'].median()
median_scie = df['PV1SCIE'].median()
median_score = df['avg_score'].median()

In [11]:
# Define At-Risk: underperforming in all three subjects
df['At_Risk'] = (
    (df['PV1MATH'] < median_math) &
    (df['PV1READ'] < median_read) &
    (df['PV1SCIE'] < median_scie)
).astype(int)

In [12]:
# Define binary label for modeling: low SES + avg_score > median (this is our target variable)
df['is_resilient'] = ((df['SES_Group'] == 'Low') & (df['avg_score'] > median_score)).astype(int)

In [13]:
# For summary statistics only: Resilient students with high performance in ALL subjects
df['Resilient_All_Subjects'] = (
    (df['SES_Group'] == 'Low') &
    (df['PV1MATH'] > median_math) &
    (df['PV1READ'] > median_read) &
    (df['PV1SCIE'] > median_scie)
).astype(int)

In [14]:
# Create regional summary table for descriptive stats
df_compare = df[['CNTSTUID', 'SES_Group', 'Region_Name', 'At_Risk', 'Resilient_All_Subjects']].copy()
summary = df_compare.groupby(['SES_Group', 'Region_Name'], observed=True).agg(
    At_Risk_Students=('At_Risk', 'sum'),
    Resilient_Students=('Resilient_All_Subjects', 'sum'),
    Total_Students=('CNTSTUID', 'count')
).reset_index()

In [15]:
# Calculate percentages
summary['At_Risk_%'] = (summary['At_Risk_Students'] / summary['Total_Students'] * 100).round(1)
summary['Resilient_%'] = (summary['Resilient_Students'] / summary['Total_Students'] * 100).round(1)

In [16]:
# Display the summary table
print("Summary Statistics by SES Group and Region:")
print(summary)

Summary Statistics by SES Group and Region:
    SES_Group       Region_Name  At_Risk_Students  Resilient_Students  \
0         Low           England               353                 212   
1         Low  Northern Ireland               263                  98   
2         Low          Scotland               441                 145   
3         Low             Wales               321                  93   
4   Lower-Mid           England               301                   0   
5   Lower-Mid  Northern Ireland               205                   0   
6   Lower-Mid          Scotland               316                   0   
7   Lower-Mid             Wales               285                   0   
8   Upper-Mid           England               278                   0   
9   Upper-Mid  Northern Ireland               151                   0   
10  Upper-Mid          Scotland               212                   0   
11  Upper-Mid             Wales               233                   0   
12     

In [17]:
# Display basic info about the target variable
print(f"\nTarget Variable Info:")
print(f"Total low-SES students: {len(df[df['SES_Group'] == 'Low'])}")
print(f"Resilient students (low-SES + above median avg score): {df['is_resilient'].sum()}")
print(f"Resilience rate: {(df['is_resilient'].sum() / len(df[df['SES_Group'] == 'Low']) * 100):.1f}%")



Target Variable Info:
Total low-SES students: 2672
Resilient students (low-SES + above median avg score): 885
Resilience rate: 33.1%


In [18]:
# --- MODELING PREP ---

In [19]:
# Create a copy for modeling
df_model = df.copy()

In [20]:
# One-hot encode categorical variables using labeled columns (more interpretable)
df_model = pd.get_dummies(df_model, columns=['Gender_Label', 'Immigrant_Status', 'Region_Name'], drop_first=True)

In [21]:
# Convert SES to numeric ordinal (preserve ordering)
ses_numeric_map = {'Low': 0, 'Lower-Mid': 1, 'Upper-Mid': 2, 'High': 3}
df_model['SES_Group_Numeric'] = df_model['SES_Group'].map(ses_numeric_map)

In [23]:
# Drop unnecessary columns for modeling (keep both At_Risk and is_resilient as targets)
columns_to_drop = [
    'CNTSTUID', 'CNTSCHID', 'REGION', 'ST004D01T', 'IMMIG',  # IDs and original numeric codes
    'SES_Group',  # Keep numeric version instead
    'Resilient_All_Subjects'  # Only used for summary stats
]

df_model = df_model.drop(columns=columns_to_drop)

In [24]:
# Display model-ready dataframe info
print(f"\nModel-ready dataset shape: {df_model.shape}")
print(f"Target variables: At_Risk, is_resilient")
print(f"Features for modeling: {[col for col in df_model.columns if col not in ['At_Risk', 'is_resilient']]}")
print(f"\nTarget variable distributions:")
print(f"At-Risk students: {df_model['At_Risk'].sum()} ({df_model['At_Risk'].mean()*100:.1f}%)")
print(f"Resilient students: {df_model['is_resilient'].sum()} ({df_model['is_resilient'].mean()*100:.1f}%)")


Model-ready dataset shape: (10685, 14)
Target variables: At_Risk, is_resilient
Features for modeling: ['PV1MATH', 'PV1READ', 'PV1SCIE', 'ESCS', 'avg_score', 'Gender_Label_Male', 'Immigrant_Status_Native', 'Immigrant_Status_Second-Generation Immigrant', 'Region_Name_Northern Ireland', 'Region_Name_Scotland', 'Region_Name_Wales', 'SES_Group_Numeric']

Target variable distributions:
At-Risk students: 3809 (35.6%)
Resilient students: 885 (8.3%)


In [26]:
# --- PREPARE DATA FOR DUAL MODELING ---
# Define feature columns (everything except target variables)
feature_columns = [col for col in df_model.columns if col not in ['At_Risk', 'is_resilient']]
X_features = df_model[feature_columns]

print(f"\nDual Modeling Setup:")
print(f"Feature columns ({len(feature_columns)}): {feature_columns}")


Dual Modeling Setup:
Feature columns (12): ['PV1MATH', 'PV1READ', 'PV1SCIE', 'ESCS', 'avg_score', 'Gender_Label_Male', 'Immigrant_Status_Native', 'Immigrant_Status_Second-Generation Immigrant', 'Region_Name_Northern Ireland', 'Region_Name_Scotland', 'Region_Name_Wales', 'SES_Group_Numeric']


In [27]:
# MODEL 1: At-Risk Classification (All Students)
X_risk = X_features.copy()
y_risk = df_model['At_Risk']
print(f"\nAt-Risk Model:")
print(f"  - Training samples: {len(X_risk)}")
print(f"  - At-risk cases: {y_risk.sum()} ({y_risk.mean()*100:.1f}%)")
print(f"  - Class balance: {y_risk.value_counts().to_dict()}")


At-Risk Model:
  - Training samples: 10685
  - At-risk cases: 3809 (35.6%)
  - Class balance: {0: 6876, 1: 3809}


In [28]:
# MODEL 2: Resilience Classification (Low-SES Students Only)
low_ses_mask = df_model['SES_Group_Numeric'] == 0
X_resilience = X_features[low_ses_mask].copy()
y_resilience = df_model[low_ses_mask]['is_resilient']
print(f"\nResilience Model:")
print(f"  - Training samples: {len(X_resilience)} (low-SES only)")
print(f"  - Resilient cases: {y_resilience.sum()} ({y_resilience.mean()*100:.1f}%)")
print(f"  - Class balance: {y_resilience.value_counts().to_dict()}")

print(f"\nFirst few rows of model dataset:")
print(df_model.head())


Resilience Model:
  - Training samples: 2672 (low-SES only)
  - Resilient cases: 885 (33.1%)
  - Class balance: {0: 1787, 1: 885}

First few rows of model dataset:
   PV1MATH  PV1READ  PV1SCIE    ESCS   avg_score  At_Risk  is_resilient  \
0  699.809  650.831  660.872 -0.2845  670.504000        0             0   
1  454.479  482.339  461.361 -0.6117  466.059667        1             0   
2  566.143  519.729  619.621  1.2878  568.497667        0             0   
3  371.820  319.932  286.866 -0.5213  326.206000        1             0   
4  423.607  350.200  386.375  0.6203  386.727333        1             0   

   Gender_Label_Male  Immigrant_Status_Native  \
0              False                     True   
1              False                     True   
2              False                     True   
3               True                     True   
4              False                    False   

   Immigrant_Status_Second-Generation Immigrant  Region_Name_Northern Ireland  \
0       

In [29]:
# Save datasets for both models
df_model.to_csv('/workspaces/mini_project_2/data/uk_pisa_model_ready.csv', index=False)

In [31]:
# Save separate datasets for convenience
df_risk_model = pd.concat([X_risk, y_risk], axis=1)
df_resilience_model = pd.concat([X_resilience, y_resilience], axis=1)

df_risk_model.to_csv('/workspaces/mini_project_2/data/uk_pisa_risk_model.csv', index=False)
df_resilience_model.to_csv('/workspaces/mini_project_2/data/uk_pisa_resilience_model.csv', index=False)

print(f"\nDatasets saved:")
print(f"  - Full dataset: /workspaces/mini_project_2/data/uk_pisa_model_ready.csv")
print(f"  - Risk model data: /workspaces/mini_project_2/data/uk_pisa_risk_model.csv")
print(f"  - Resilience model data: /workspaces/mini_project_2/data/uk_pisa_resilience_model.csv")

print(f"\n🎯 Ready for Model Training!")
print(f"Next steps:")
print(f"  1. Train At-Risk models using X_risk, y_risk")
print(f"  2. Train Resilience models using X_resilience, y_resilience") 
print(f"  3. Compare algorithm performance for each problem")
print(f"  4. Analyze feature importance differences")


Datasets saved:
  - Full dataset: /workspaces/mini_project_2/data/uk_pisa_model_ready.csv
  - Risk model data: /workspaces/mini_project_2/data/uk_pisa_risk_model.csv
  - Resilience model data: /workspaces/mini_project_2/data/uk_pisa_resilience_model.csv

🎯 Ready for Model Training!
Next steps:
  1. Train At-Risk models using X_risk, y_risk
  2. Train Resilience models using X_resilience, y_resilience
  3. Compare algorithm performance for each problem
  4. Analyze feature importance differences
