# Students Mental Health Classification Notebook
This notebook loads the processed dataset, selects the students subset, and prepares it for analysis.

In [None]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
print('Notebook loaded successfully')

In [None]:
# Load the preprocessed dataset that every later cell works on (as `df`).
print("=" * 70)
print("READING SAVED DATA")
print("=" * 70)

# Single source of truth for the input path, so the "not found" message below
# always names the file that was actually requested.
DATA_PATH = 'processed_dataset2.csv'

try:
    # Keep the guarded region to the one call that can raise FileNotFoundError.
    df = pd.read_csv(DATA_PATH)
except FileNotFoundError:
    print(f" File '{DATA_PATH}' not found.")
    # `exit()` is an interactive-shell helper injected by the `site` module;
    # raising SystemExit directly is the program-safe equivalent, and the
    # non-zero status marks this as a failure rather than a clean finish.
    raise SystemExit(1)
else:
    print("✅ Dataset loaded successfully!")
    print(f" Shape: {df.shape}")
    print(f" Columns: {len(df.columns)}")
    print(f" Total records: {len(df)}")

In [None]:
# =============================================================================
# STUDENTS SUBSET: SELECTION AND FIRST CLEAN-UP
# =============================================================================
print("\n" + "=" * 70)
print("PROCESSING STUDENTS DATA")
print("=" * 70)

# Rows with no `service_year_teacher` value are the ones kept as students.
# NOTE(review): presumably this field is only filled in for teachers - confirm.
is_student_row = df["service_year_teacher"].isna()
students_df = df.loc[is_student_row]
print(f"Initial students dataset shape: {students_df.shape}")

#----------------------------------------------------------------------
# STEP 1: DROP VARIABLES THAT ARE NOT RELEVANT FOR STUDENTS
#----------------------------------------------------------------------
print("\n" + "=" * 70)
print("STEP 1: REMOVING NOT RELEVANT VARIABLES FOR STUDENTS")
print("=" * 70)

columns_to_drop = ["service_year_teacher", "OSLO1", "OSLO2", "OSLO3"]

# Only ask pandas to drop names that are really present, so a missing column
# is skipped instead of raising.
present_columns = set(students_df.columns)
existing_drop_cols = [name for name in columns_to_drop if name in present_columns]

if len(existing_drop_cols) > 0:
    removed_names = ', '.join(existing_drop_cols)
    print(f"Removed columns: {removed_names}")
    students_df = students_df.drop(columns=existing_drop_cols)
    dropped_count = len(existing_drop_cols)
    print(f"✅ Dropped {dropped_count} irrelevant columns")

print(f"Dataset shape: {students_df.shape}")

In [None]:
#----------------------------------------------------------------------
# STEP 2: FIRST NULL-VALUE CHECK
#----------------------------------------------------------------------
# Reports how many values are missing overall and builds `null_df`, a
# per-column summary (name, null count, null percentage, dtype) restricted to
# the columns that contain at least one null.
print("\n" + "=" * 70)
print("STEP 2: COMPREHENSIVE NULL VALUE CHECK")
print("=" * 70)

null_summary = students_df.isnull().sum()
total_null_columns = (null_summary > 0).sum()
total_null_values = null_summary.sum()

print(f"Total columns with null values: {total_null_columns}")
print(f"Total null values in dataset: {total_null_values}")

# Reuse the counts already held in `null_summary` instead of rescanning every
# column a second time.
row_count = len(students_df)
counts_with_nulls = null_summary[null_summary > 0]
null_columns_list = counts_with_nulls.index.tolist()
null_analysis = [
    {
        'Column': col,
        'Null_Count': null_count,
        'Null_Percentage': (null_count / row_count) * 100,
        'Data_Type': students_df[col].dtype,
    }
    for col, null_count in counts_with_nulls.items()
]

# Always (re)define `null_df`, with a fixed schema even when it is empty.
# Otherwise a clean dataset would leave it undefined, or, on a notebook
# re-run, leave a stale frame from an earlier execution in scope.
null_df = pd.DataFrame(
    null_analysis,
    columns=['Column', 'Null_Count', 'Null_Percentage', 'Data_Type'],
)

if null_columns_list:
    print(f"Columns with null values: {', '.join(null_columns_list)}")
else:
    print("✅ No null values found")

#----------------------------------------------------------------------
# STEP 3: REMOVE HIGH NULL VALUE COLUMNS
#----------------------------------------------------------------------
# Drops (a) a fixed list of substance-use follow-up columns and (b) any column
# whose share of missing values is above HIGH_NULL_THRESHOLD_PCT.
print("\n" + "=" * 70)
print("STEP 3: REMOVING HIGH NULL VALUE COLUMNS")
print("=" * 70)

# Columns with more than this percentage of nulls are removed.
HIGH_NULL_THRESHOLD_PCT = 50

# NOTE(review): 'alcohol_7' is lower-case while its siblings are 'Alcohol_N'.
# Confirm against the dataset header; a name that does not match is silently
# skipped by the existence filter further down.
high_missing_columns = ['Alcohol_2', 'Alcohol_3', 'Alcohol_4', 'Alcohol_5', 'Alcohol_6', 'alcohol_7',
                       'tobaco_2', 'tobaco_3', 'tobaco_4', 'tobaco_5', 'tobaco_6', 'tobaco_7',
                       'khat_2', 'khat_3', 'khat_4', 'khat_5', 'khat_6', 'khat_7']

# Compute the null percentages directly from the current `students_df`
# instead of probing `'null_df' in locals()`: in a notebook that check can
# pick up a stale frame left behind by an earlier run.
null_percentages = students_df.isnull().sum() / len(students_df) * 100
current_high_null_cols = null_percentages[
    null_percentages > HIGH_NULL_THRESHOLD_PCT
].index.tolist()

# De-duplicate while keeping a stable order (fixed list first), so the printed
# column list is the same on every run; `list(set(...))` order is arbitrary.
all_high_null_cols = list(dict.fromkeys(high_missing_columns + current_high_null_cols))

existing_high_null_cols = [col for col in all_high_null_cols if col in students_df.columns]

if existing_high_null_cols:
    print(f"Columns to remove: {', '.join(existing_high_null_cols)}")
    # Before/after column counts are not used in this step; they are kept in
    # case later cells read them.
    initial_cols = len(students_df.columns)
    students_df = students_df.drop(columns=existing_high_null_cols)
    final_cols = len(students_df.columns)
    print(f"✅ Removed {len(existing_high_null_cols)} high null columns")
else:
    print("✅ No high null columns to remove")

print(f"Current dataset shape: {students_df.shape}")
