# Students Mental Health Classification Notebook
This notebook loads the processed dataset and prepares the students' records for analysis.

In [None]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
print('Notebook loaded successfully')

In [None]:
# Read the saved data.
# The path lives in one place so the error message can never drift from the
# file that is actually requested.
DATA_FILE = 'processed_dataset2.csv'

print("=" * 70)
print("READING SAVED DATA")
print("=" * 70)

try:
    # Only the call that can raise FileNotFoundError stays inside the try.
    df = pd.read_csv(DATA_FILE)
except FileNotFoundError:
    print(f" File '{DATA_FILE}' not found.")
    # exit() is an interactive helper added by the site module; raising
    # SystemExit directly always works and reports a non-zero status.
    raise SystemExit(1)
else:
    print("✅ Dataset loaded successfully!")
    print(f" Shape: {df.shape}")
    print(f" Columns: {len(df.columns)}")
    print(f" Total records: {len(df)}")

In [None]:
# =============================================================================
# STUDENTS DATA PROCESSING
# =============================================================================
print(f"\n{'=' * 70}")
print("PROCESSING STUDENTS DATA")
print("=" * 70)

# Rows whose "service_year_teacher" value is null make up students_df.
no_service_year = df["service_year_teacher"].isnull()
students_df = df[no_service_year]
print(f"Initial students dataset shape: {students_df.shape}")

#----------------------------------------------------------------------
# STEP 1: REMOVE NOT RELEVANT VARIABLES FOR STUDENTS
#----------------------------------------------------------------------
print(f"\n{'=' * 70}")
print("STEP 1: REMOVING NOT RELEVANT VARIABLES FOR STUDENTS")
print("=" * 70)

columns_to_drop = ["service_year_teacher", "OSLO1", "OSLO2", "OSLO3"]
# Keep only the candidates that are really present so drop() cannot fail.
existing_drop_cols = [name for name in columns_to_drop if name in students_df.columns]

if existing_drop_cols:
    dropped_names = ', '.join(existing_drop_cols)
    print(f"Removed columns: {dropped_names}")
    students_df = students_df.drop(existing_drop_cols, axis=1)
    print(f"✅ Dropped {len(existing_drop_cols)} irrelevant columns")

print(f"Dataset shape: {students_df.shape}")

In [None]:
#----------------------------------------------------------------------
# STEP 2: FIRST CHECK NULL VALUES
#----------------------------------------------------------------------
# Summarises the missing values of students_df per column.
# null_df gets one row per column that contains nulls and is always defined
# (with zero rows when nothing is missing), so code that reads it afterwards
# never depends on which branch ran.
print("\n" + "=" * 70)
print("STEP 2: COMPREHENSIVE NULL VALUE CHECK")
print("=" * 70)

null_summary = students_df.isnull().sum()
total_null_columns = (null_summary > 0).sum()
total_null_values = null_summary.sum()

print(f"Total columns with null values: {total_null_columns}")
print(f"Total null values in dataset: {total_null_values}")

# Reuse the counts computed above instead of scanning every column again.
columns_with_nulls = null_summary[null_summary > 0]
null_columns_list = list(columns_with_nulls.index)
null_df = pd.DataFrame({
    'Column': null_columns_list,
    'Null_Count': columns_with_nulls.to_numpy(),
    'Null_Percentage': ((columns_with_nulls / len(students_df)) * 100).to_numpy(),
    'Data_Type': [students_df[col].dtype for col in null_columns_list],
})
# List-of-dicts view of the same rows, kept under its original name.
null_analysis = null_df.to_dict('records')

if null_columns_list:
    print(f"Columns with null values: {', '.join(null_columns_list)}")
else:
    print("✅ No null values found")

#----------------------------------------------------------------------
# STEP 3: REMOVE HIGH NULL VALUE COLUMNS
#----------------------------------------------------------------------
# Drops a fixed list of Alcohol/tobaco/khat item columns plus every column
# whose share of nulls is currently above the threshold.
print("\n" + "=" * 70)
print("STEP 3: REMOVING HIGH NULL VALUE COLUMNS")
print("=" * 70)

NULL_PERCENTAGE_THRESHOLD = 50  # columns with a larger null percentage are dropped

# NOTE(review): 'alcohol_7' is lower-case unlike 'Alcohol_2'..'Alcohol_6' —
# confirm it matches the dataset; a name that does not exist is silently
# skipped by the membership filter below.
high_missing_columns = ['Alcohol_2', 'Alcohol_3', 'Alcohol_4', 'Alcohol_5', 'Alcohol_6', 'alcohol_7',
                       'tobaco_2', 'tobaco_3', 'tobaco_4', 'tobaco_5', 'tobaco_6', 'tobaco_7',
                       'khat_2', 'khat_3', 'khat_4', 'khat_5', 'khat_6', 'khat_7']

# Measure the null share on the frame as it is right now rather than probing
# locals() for a summary that may be missing or left over from an earlier run.
null_percentages = students_df.isnull().sum() / len(students_df) * 100
current_high_null_cols = null_percentages[null_percentages > NULL_PERCENTAGE_THRESHOLD].index.tolist()

# dict.fromkeys removes duplicates while keeping a stable order, so the
# printed column list is the same on every run (a set would shuffle it).
all_high_null_cols = list(dict.fromkeys(high_missing_columns + current_high_null_cols))

existing_high_null_cols = [col for col in all_high_null_cols if col in students_df.columns]

if existing_high_null_cols:
    print(f"Columns to remove: {', '.join(existing_high_null_cols)}")
    students_df = students_df.drop(columns=existing_high_null_cols)
    print(f"✅ Removed {len(existing_high_null_cols)} high null columns")
else:
    print("✅ No high null columns to remove")

print(f"Current dataset shape: {students_df.shape}")

In [None]:
#----------------------------------------------------------------------
# STEP 4: HANDLE REMAINING MISSING VALUES
#----------------------------------------------------------------------
def _fill_missing(frame, column):
    """Fill the nulls of ``frame[column]`` in place.

    Numeric (non-boolean) columns are filled with their median. Every other
    column is filled with its first mode, or with 'Unknown' when the column
    has no mode at all (i.e. no non-null value).
    """
    values = frame[column]
    if pd.api.types.is_numeric_dtype(values) and not pd.api.types.is_bool_dtype(values):
        replacement = values.median()
    else:
        modes = values.mode()
        replacement = modes.iloc[0] if not modes.empty else 'Unknown'
    frame[column] = values.fillna(replacement)


print("\n" + "=" * 70)
print("STEP 4: HANDLING REMAINING MISSING VALUES")
print("=" * 70)

remaining_null_summary = students_df.isnull().sum()
remaining_null_summary = remaining_null_summary[remaining_null_summary > 0]

if not remaining_null_summary.empty:
    remaining_cols = list(remaining_null_summary.index)
    print(f"Columns with null values: {', '.join(remaining_cols)}")

    academic_cols = ['average', 'rank']
    substance_cols = ['Alcohol_1', 'tobaco_1', 'khat_1']

    # remaining_cols already lists exactly the columns that exist and still
    # contain nulls, so membership in it replaces the per-column re-check.
    academic_cols_to_fill = [col for col in academic_cols if col in remaining_cols]
    substance_cols_to_fill = [col for col in substance_cols if col in remaining_cols]
    other_null_cols = [col for col in remaining_cols if col not in academic_cols + substance_cols]

    # All three groups share one fill rule; a non-numeric academic or
    # substance column now falls back to its mode instead of failing in median().
    fill_groups = [
        ("Filling academic columns", academic_cols_to_fill),
        ("Filling substance use columns", substance_cols_to_fill),
        ("Other columns with null values", other_null_cols),
    ]
    for group_label, group_cols in fill_groups:
        if not group_cols:
            continue
        print(f"{group_label}: {', '.join(group_cols)}")
        for col in group_cols:
            _fill_missing(students_df, col)
else:
    print("✅ No remaining null values to handle")

In [None]:
#----------------------------------------------------------------------
# STEP 5: CREATE AGGREGATED FEATURES AND REMOVE INDIVIDUAL ITEMS
#----------------------------------------------------------------------
# For each questionnaire the item columns that are present are summed row-wise
# into one total column and then dropped. The total is only built when none of
# those item columns contains a null.
print(f"\n{'=' * 70}")
print("STEP 5: CREATING AGGREGATED FEATURES")
print("=" * 70)

# --- SRQ1 .. SRQ20 -> SRQ_total ---
print("Creating SRQ_total (Mental Health Assessment)...")
srq_columns = ['SRQ' + str(item_no) for item_no in range(1, 21)]
existing_srq_cols = [name for name in srq_columns if name in students_df.columns]
if not existing_srq_cols:
    print(" No SRQ columns found")
else:
    srq_nulls = students_df[existing_srq_cols].isnull().sum().sum()
    if srq_nulls != 0:
        print(f" SRQ columns have {srq_nulls} null values")
    else:
        students_df['SRQ_total'] = students_df[existing_srq_cols].sum(axis=1)
        srq_scores = students_df['SRQ_total']
        print(f"✅ Created SRQ_total from {len(existing_srq_cols)} SRQ items")
        print(f" SRQ_total stats: Min={srq_scores.min()}, Max={srq_scores.max()}, Mean={srq_scores.mean():.2f}")

        # The individual items are no longer needed once the total exists.
        students_df = students_df.drop(columns=existing_srq_cols)
        print(" Removed individual SRQ columns")

# --- MPSS1 .. MPSS12 -> MPSS_total ---
print("\nCreating MPSS_total (Social Support Assessment)...")
mpss_columns = ['MPSS' + str(item_no) for item_no in range(1, 13)]
existing_mpss_cols = [name for name in mpss_columns if name in students_df.columns]
if not existing_mpss_cols:
    print(" No MPSS columns found")
else:
    mpss_nulls = students_df[existing_mpss_cols].isnull().sum().sum()
    if mpss_nulls != 0:
        print(f" MPSS columns have {mpss_nulls} null values")
    else:
        students_df['MPSS_total'] = students_df[existing_mpss_cols].sum(axis=1)
        mpss_scores = students_df['MPSS_total']
        print(f"✅ Created MPSS_total from {len(existing_mpss_cols)} MPSS items")
        print(f" MPSS_total stats: Min={mpss_scores.min()}, Max={mpss_scores.max()}, Mean={mpss_scores.mean():.2f}")

        # The individual items are no longer needed once the total exists.
        students_df = students_df.drop(columns=existing_mpss_cols)
        print(" Removed individual MPSS columns")

In [None]:
#----------------------------------------------------------------------
# FINAL VERIFICATION
#----------------------------------------------------------------------
# Reports how many nulls are left in students_df and prints a short summary
# and a three-row preview of the processed frame.
print(f"\n{'=' * 70}")
print("FINAL VERIFICATION")
print("=" * 70)

nulls_per_column = students_df.isnull().sum()
final_null_count = nulls_per_column.sum()
final_null_columns = (nulls_per_column > 0).sum()

print(f"Total null values: {final_null_count}")
print(f"Columns with null values: {final_null_columns}")

if final_null_count != 0:
    # Name the columns that still hold nulls.
    remaining_nulls = nulls_per_column[nulls_per_column > 0]
    remaining_cols = remaining_nulls.index.tolist()
    print(f"Remaining null columns: {', '.join(remaining_cols)}")
else:
    print("✅ Dataset is completely clean")

print("\nFINAL DATASET SUMMARY:")
print(f"Shape: {students_df.shape}")
print(f"Total records: {len(students_df)}")
print(f"Total features: {len(students_df.columns)}")

print(f"\n{'=' * 70}")
print("STUDENTS DATA PROCESSING COMPLETED! ✅")
print("=" * 70)

print("\nSAMPLE OF FINAL STUDENTS DATASET:")
print(students_df.head(3))

In [None]:
# =============================================================================
# CATEGORICAL ENCODING AND FEATURE SELECTION
# =============================================================================
# Works on a copy (students_encoded) so students_df stays untouched.
print("=" * 70)
print("CATEGORICAL ENCODING AND FEATURE SELECTION")
print("=" * 70)

print("Initial dataset shape:", students_df.shape)

#----------------------------------------------------------------------
# STEP 1: ENCODING CATEGORICAL VARIABLES
#----------------------------------------------------------------------
print("\n" + "=" * 70)
print("STEP 1: ENCODING CATEGORICAL VARIABLES")
print("=" * 70)

from sklearn.preprocessing import LabelEncoder
import pandas as pd

students_encoded = students_df.copy()
label_encoders = {}  # column name -> fitted LabelEncoder, kept for later decoding

# One-Hot Encoding for school (nominal variable)
print("One-Hot Encoding for school:")
if 'school' in students_encoded.columns:
    school_dummies = pd.get_dummies(students_encoded['school'], prefix='school')
    # Dropping first and appending the dummies keeps them at the end of the frame.
    students_encoded = pd.concat(
        [students_encoded.drop(columns='school'), school_dummies], axis=1
    )
    print(f"Created {school_dummies.shape[1]} school dummy variables")
else:
    # Same tolerance as the label-encoding loop below: a missing column is
    # reported and skipped instead of raising KeyError.
    print("'school' column not found - skipping one-hot encoding")

# Label Encoding for binary and ordinal variables
# NOTE(review): LabelEncoder numbers the classes in sorted order of their
# string form, which need not be the intended ordinal order — confirm the
# printed mapping, especially for 'Education'.
categorical_columns = ['sex', 'Education', 'Alcohol_1', 'tobaco_1', 'khat_1']

for col in categorical_columns:
    if col in students_encoded.columns:
        le = LabelEncoder()
        students_encoded[col] = le.fit_transform(students_encoded[col].astype(str))
        label_encoders[col] = le
        print(f"{col}: {dict(zip(le.classes_, le.transform(le.classes_)))}")

print(f"Encoded dataset shape: {students_encoded.shape}")

#----------------------------------------------------------------------
# STEP 2: TARGET VARIABLE PREPARATION
#----------------------------------------------------------------------
# SRQ_total is used as the regression target; SRQ_total_binary (1 when the
# score is strictly above the sample median, else 0) as the classification target.
print("\n" + "=" * 70)
print("STEP 2: TARGET VARIABLE PREPARATION")
print("=" * 70)

if 'SRQ_total' not in students_encoded.columns:
    # Fail with an explanation instead of a bare KeyError further down.
    raise KeyError("'SRQ_total' is missing from students_encoded; it must be created before the targets can be prepared")

# Regression target
print("REGRESSION TARGET: SRQ_total")

# Classification target
srq_median = students_encoded['SRQ_total'].median()
students_encoded['SRQ_total_binary'] = (students_encoded['SRQ_total'] > srq_median).astype(int)

print("CLASSIFICATION TARGET: SRQ_total_binary")
print(f"Threshold: SRQ_total > {srq_median}")
class_dist = students_encoded['SRQ_total_binary'].value_counts()
# value_counts() omits labels that never occur (e.g. no score above the
# median), so look the counts up with a default of 0 instead of indexing.
print(f"Class 0: {class_dist.get(0, 0)}, Class 1: {class_dist.get(1, 0)}")

#----------------------------------------------------------------------
# STEP 3: FEATURE SELECTION FOR REGRESSION
#----------------------------------------------------------------------
print(f"\n{'=' * 70}")
print("STEP 3: FEATURE SELECTION FOR REGRESSION")
print("=" * 70)

from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression
from sklearn.ensemble import RandomForestRegressor

# 'ID' and both target columns are kept out of the predictor matrix.
non_feature_cols = ('ID', 'SRQ_total', 'SRQ_total_binary')
regression_features = [
    name for name in students_encoded.columns if name not in non_feature_cols
]
y_reg = students_encoded['SRQ_total']
X_reg = students_encoded[regression_features]