In [6]:
!pip3 install kagglehub

Defaulting to user installation because normal site-packages is not writeable
Collecting kagglehub
  Downloading kagglehub-0.3.10-py3-none-any.whl (63 kB)
[K     |████████████████████████████████| 63 kB 2.9 MB/s eta 0:00:01
[?25hCollecting pyyaml
  Downloading PyYAML-6.0.2-cp39-cp39-macosx_11_0_arm64.whl (172 kB)
[K     |████████████████████████████████| 172 kB 7.6 MB/s eta 0:00:01
Installing collected packages: pyyaml, kagglehub
Successfully installed kagglehub-0.3.10 pyyaml-6.0.2
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m


In [9]:
#!/usr/bin/env python
# coding: utf-8

import os
import pandas as pd
import numpy as np

# Ensure ../data exists up front; the cleaned parquet file is written there later.
os.makedirs("../data", exist_ok=True)

# Load the raw dataset from the raw-data directory.
df = pd.read_csv("../data/raw/student_depression_dataset.csv")

print(f"Dataset loaded with {df.shape[0]} rows and {df.shape[1]} columns")

print("\nDataset Overview:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that emitted a stray "None" line).
df.info()

print("\nSummary Statistics:")
print(df.describe())

print("\nMissing Values:")
print(df.isnull().sum())

def clean_data(df):
    """Return a cleaned copy of ``df``; the input frame is left untouched.

    Steps, in order:

    1. Numeric columns (``int64`` / ``float64``): missing values are filled
       with the column median.
    2. Text (``object``) columns: every string value is stripped of
       surrounding whitespace and lower-cased, then missing values are filled
       with the most frequent *normalized* value. Non-string values are kept
       as they are. A column with no non-missing values is left unfilled.
    3. ``Gender`` (if present and not already numeric) is encoded as
       ``'male' -> 0`` and ``'female' -> 1``. Any other value becomes NaN.
    4. ``Total Pressure`` is added as ``Academic Pressure + Work Pressure``
       when both columns exist.
    5. Fully duplicated rows are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw input data.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned DataFrame.
    """
    cleaned_df = df.copy()

    # Column groups are determined once, from the dtypes of the raw input.
    numerical_cols = cleaned_df.select_dtypes(include=['int64', 'float64']).columns
    categorical_cols = cleaned_df.select_dtypes(include=['object']).columns

    for col in numerical_cols:
        cleaned_df[col] = cleaned_df[col].fillna(cleaned_df[col].median())

    for col in categorical_cols:
        # Normalize BEFORE imputing so that variants such as "Male", " male "
        # and "male" are counted as one value when picking the mode.
        # Non-string entries (including missing values) pass through as-is
        # instead of being turned into NaN by the .str accessor.
        normalized = cleaned_df[col].map(
            lambda value: value.strip().lower() if isinstance(value, str) else value
        )
        modes = normalized.mode()
        # mode() is empty when the column has no non-missing values; in that
        # case there is nothing sensible to fill with, so leave it as-is.
        if not modes.empty:
            normalized = normalized.fillna(modes.iloc[0])
        cleaned_df[col] = normalized

    # Only encode Gender when it still holds labels. Mapping an already
    # numeric column through the label dict would wipe it to all-NaN, which
    # made the function destructive when run on previously cleaned data.
    if 'Gender' in cleaned_df.columns and not pd.api.types.is_numeric_dtype(cleaned_df['Gender']):
        cleaned_df['Gender'] = cleaned_df['Gender'].map({'male': 0, 'female': 1})

    if 'Academic Pressure' in cleaned_df.columns and 'Work Pressure' in cleaned_df.columns:
        cleaned_df['Total Pressure'] = cleaned_df['Academic Pressure'] + cleaned_df['Work Pressure']

    cleaned_df = cleaned_df.drop_duplicates()

    return cleaned_df

# --- Run the cleaning step and summarize the result -------------------------
print("\nCleaning data...")
cleaned_df = clean_data(df)

print(f"Cleaned dataset has {cleaned_df.shape[0]} rows and {cleaned_df.shape[1]} columns")
# Header and per-column null counts, one per line.
print("\nMissing values after cleaning:", cleaned_df.isnull().sum(), sep="\n")

# --- Persist the cleaned frame as Parquet (row index is not written) --------
output_path = "../data/student_depression_cleaned.parquet"
cleaned_df.to_parquet(output_path, index=False)
print(f"\nCleaned dataset saved as: {output_path}")

# --- Quick look at the result ------------------------------------------------
print("\nSample of cleaned dataset:", cleaned_df.head(), sep="\n")

# One line per column with its dtype.
print("\nColumn descriptions:")
for col in cleaned_df.columns:
    print(f"- {col}: {cleaned_df[col].dtype}")

print("\nData preparation complete!")

Dataset loaded with 27901 rows and 18 columns

Dataset Overview:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27901 entries, 0 to 27900
Data columns (total 18 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   id                                     27901 non-null  int64  
 1   Gender                                 27901 non-null  object 
 2   Age                                    27901 non-null  float64
 3   City                                   27901 non-null  object 
 4   Profession                             27901 non-null  object 
 5   Academic Pressure                      27901 non-null  float64
 6   Work Pressure                          27901 non-null  float64
 7   CGPA                                   27901 non-null  float64
 8   Study Satisfaction                     27901 non-null  float64
 9   Job Satisfaction                       27901 non-null  float64
 10  Sleep