In [77]:
import numpy as np
import pandas as pd

pd.set_option('display.max_columns', None)

In [78]:
df=pd.read_csv('../datasets/survey_data_after_EDA.csv')
df.head()

Unnamed: 0,Gender,Age,Sleep Duration,Working Professional or Student,Profession,Study Degree,CGPA,Work/Study Hours,Academic/Work Pressure,Study/Job Satisfaction,Financial Stress,Dietary Habits,Suicidal Thoughts,Family Mental Illness History,Marital Status,Religious Person,Living with Family,Living Environment Satisfaction,Night/Day Sleep Preference,Physical Activity Frequency,Smoking,Alcohol Consumption,Social Media Usage Hours,Body Image Satisfaction,Self-Comparison,Loneliness Frequency,Depression,Age_Group
0,Male,20,More than 8 hours,Student,Student,Master’s Degree,7.0,11,4,4,3,Moderate,No,yes,Single,yes,no,yes,At night,Sometimes,no,no,Less than 1 hour,5,No,Never,No,20-22
1,Male,23,5-6 hours,Student,Student,Bachelor’s Degree,8.0,5,3,4,4,Healthy,No,no,Single,yes,no,yes,At night,Regularly,no,no,1-2 hours,2,Yes,Sometimes,No,23-25
2,Male,22,5-6 hours,Working Professional,Unknown,PhD,7.0,3,3,2,4,Moderate,No,no,Single,yes,yes,no,At night,Sometimes,yes,no,1-2 hours,4,No,Never,No,20-22
3,Male,18,5-6 hours,Working Professional,E-commerce,High School,7.0,6,3,4,1,Moderate,No,yes,Single,yes,yes,no,At night,Never,no,no,2-4 hours,3,Yes,Sometimes,No,17-19
4,Male,17,Less than 5 hours,Student,Student,High School,6.0,6,2,2,4,Unhealthy,No,no,Single,yes,yes,no,At night,Never,no,no,1-2 hours,4,No,Sometimes,No,17-19


In [79]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120 entries, 0 to 119
Data columns (total 28 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Gender                           120 non-null    object 
 1   Age                              120 non-null    int64  
 2   Sleep Duration                   120 non-null    object 
 3   Working Professional or Student  120 non-null    object 
 4   Profession                       120 non-null    object 
 5   Study Degree                     120 non-null    object 
 6   CGPA                             120 non-null    float64
 7   Work/Study Hours                 120 non-null    int64  
 8   Academic/Work Pressure           120 non-null    int64  
 9   Study/Job Satisfaction           120 non-null    int64  
 10  Financial Stress                 120 non-null    int64  
 11  Dietary Habits                   120 non-null    object 
 12  Suicidal Thoughts     

In [80]:
nominal_low = [
    'Gender', 'Working Professional or Student', 'Suicidal Thoughts', 
    'Family Mental Illness History', 'Study Degree', 'Marital Status', 'Religious Person', 
    'Living with Family', 'Living Environment Satisfaction', 
    'Night/Day Sleep Preference', 'Smoking', 'Alcohol Consumption','Self-Comparison' 
]
nominal_high = ['Profession']
nominal = nominal_low + nominal_high

In [81]:
ordinal = ['Sleep Duration', 'Dietary Habits', 'Age_Group', 'Social Media Usage Hours', 'Physical Activity Frequency','Loneliness Frequency']
ordinal_number = ['Academic/Work Pressure', 'Study/Job Satisfaction', 'Financial Stress','Body Image Satisfaction']

In [82]:
number = ['Age', 'CGPA', 'Work/Study Hours']

In [83]:
df['Loneliness Frequency'].unique()

array(['Never', 'Sometimes', 'Frequently'], dtype=object)

In [84]:
def update_object(df):
    for column in df.columns:
        if df[column].dtype == 'object':
            df[column] = df[column].astype('category')
            
update_object(df)

In [85]:
def preprocess_category(df): 
    for column in ordinal_number:
        df[column] = pd.Categorical(df[column], categories=[0, 1, 2, 3, 4, 5], ordered=True)
    df['Sleep Duration'] = pd.Categorical(df['Sleep Duration'], categories = ['Less than 5 hours','5-6 hours','7-8 hours','More than 8 hours'],ordered = True)
    df['Dietary Habits'] = pd.Categorical(df['Dietary Habits'], categories = ['Unhealthy','Moderate','Healthy'],ordered = True)
    df['Social Media Usage Hours'] = pd.Categorical(df['Social Media Usage Hours'], categories = ['Less than 1 hour','1-2 hours','2-4 hours', '4-6 hours', 'More than 6 hours'],ordered = True)
    df['Physical Activity Frequency'] = pd.Categorical(df['Physical Activity Frequency'], categories = ['Never','Sometimes','Regularly'],ordered = True)
    df['Loneliness Frequency'] = pd.Categorical(df['Loneliness Frequency'], categories = ['Never','Sometimes','Frequently'],ordered = True)
preprocess_category(df)

In [86]:
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder(sparse_output=False, drop='first')

# Encode nominal columns in the df
encoded_data_train = encoder.fit_transform(df[nominal])
encoded_columns_train = encoder.get_feature_names_out(nominal)
encoded_df_train = pd.DataFrame(encoded_data_train, columns=encoded_columns_train)
encoded_df_train = encoded_df_train.astype(int)
encoded_df_train.index = df.index
df_final = pd.concat([df.drop(columns=nominal), encoded_df_train], axis=1)

In [87]:
from sklearn.preprocessing import LabelEncoder

def label_encode_ordinal_columns(data, ordinal):
    label_encoders = {}
    
    # Apply label encoding to the 'combine' (train) DataFrame for ordinal columns
    for col in ordinal:
        le = LabelEncoder()
        data[col] = le.fit_transform(data[col])
        label_encoders[col] = le
    
    return data

df_final = label_encode_ordinal_columns(df_final, ordinal)


In [88]:
label_encoder = LabelEncoder()
df_final['Depression'] = label_encoder.fit_transform(df_final['Depression'])

# To verify the mapping
mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
print("Mapping:", mapping)

Mapping: {'No': 0, 'Yes': 1}


In [89]:
df_final.head()

Unnamed: 0,Age,Sleep Duration,CGPA,Work/Study Hours,Academic/Work Pressure,Study/Job Satisfaction,Financial Stress,Dietary Habits,Physical Activity Frequency,Social Media Usage Hours,Body Image Satisfaction,Loneliness Frequency,Depression,Age_Group,Gender_Male,Working Professional or Student_Working Professional,Suicidal Thoughts_Yes,Family Mental Illness History_yes,Study Degree_Engineering Diploma,Study Degree_High School,Study Degree_Master’s Degree,Study Degree_PhD,Study Degree_Preparatory Classes,Marital Status_Single,Religious Person_yes,Living with Family_yes,Living Environment Satisfaction_yes,Night/Day Sleep Preference_During the day,Smoking_yes,Alcohol Consumption_yes,Self-Comparison_Yes,Profession_Craftsman,Profession_Decorator,Profession_E-commerce,Profession_Student,Profession_Unknown,Profession_Warehouse Worker
0,20,3,7.0,11,4,4,3,1,2,3,5,1,0,1,1,0,0,1,0,0,1,0,0,1,1,0,1,0,0,0,0,0,0,0,1,0,0
1,23,0,8.0,5,3,4,4,0,1,0,2,2,0,2,1,0,0,0,0,0,0,0,0,1,1,0,1,0,0,0,1,0,0,0,1,0,0
2,22,0,7.0,3,3,2,4,1,2,0,4,1,0,1,1,1,0,0,0,0,0,1,0,1,1,1,0,0,1,0,0,0,0,0,0,1,0
3,18,0,7.0,6,3,4,1,1,0,1,3,2,0,0,1,1,0,1,0,1,0,0,0,1,1,1,0,0,0,0,1,0,0,1,0,0,0
4,17,2,6.0,6,2,2,4,2,0,0,4,2,0,0,1,0,0,0,0,1,0,0,0,1,1,1,0,0,0,0,0,0,0,0,1,0,0


In [90]:
# Rename the column
df_final.rename(columns={"Family Mental Illness History_yes": "Family Mental Illness History_Yes"}, inplace=True)

In [91]:
df_final.to_csv('../datasets/survey_data_transformed.csv', index=False)