# Data Exploration and Cleanup
### Methodology
1. Explore the data types of each column
2. Convert the columns that need it to the correct data type

In [1]:
import ast
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Load the survey dataset from the project's relative data directory.
# A forward slash is used as the separator so the path is portable across
# operating systems and contains no backslash escape sequences.
df = pd.read_csv('data/post_pandemic_remote_work_health_impact_2025.csv')


In [2]:
# Preview the top five rows of the raw dataset
df.head(n=5)

Unnamed: 0,Survey_Date,Age,Gender,Region,Industry,Job_Role,Work_Arrangement,Hours_Per_Week,Mental_Health_Status,Burnout_Level,Work_Life_Balance_Score,Physical_Health_Issues,Social_Isolation_Score,Salary_Range
0,2025-06-01,27,Female,Asia,Professional Services,Data Analyst,Onsite,64,Stress Disorder,High,3,Shoulder Pain; Neck Pain,2,$40K-60K
1,2025-06-01,37,Female,Asia,Professional Services,Data Analyst,Onsite,37,Stress Disorder,High,4,Back Pain,2,$80K-100K
2,2025-06-01,32,Female,Africa,Education,Business Analyst,Onsite,36,ADHD,High,3,Shoulder Pain; Eye Strain,2,$80K-100K
3,2025-06-01,40,Female,Europe,Education,Data Analyst,Onsite,63,ADHD,Medium,1,Shoulder Pain; Eye Strain,2,$60K-80K
4,2025-06-01,30,Male,South America,Manufacturing,DevOps Engineer,Hybrid,65,,Medium,5,,4,$60K-80K


In [3]:
# Inspect the data type of each column in the raw DataFrame
df.dtypes

Survey_Date                object
Age                         int64
Gender                     object
Region                     object
Industry                   object
Job_Role                   object
Work_Arrangement           object
Hours_Per_Week              int64
Mental_Health_Status       object
Burnout_Level              object
Work_Life_Balance_Score     int64
Physical_Health_Issues     object
Social_Isolation_Score      int64
Salary_Range               object
dtype: object

In [4]:
# Work on an independent (deep) copy so the raw DataFrame `df` stays untouched
df_survey = df.copy(deep=True)

In [5]:
# Parse the 'Survey_Date' column into datetime values using the 'YYYY-MM-DD' format
df_survey['Survey_Date'] = df_survey['Survey_Date'].pipe(pd.to_datetime, format='%Y-%m-%d')
# Display the column's dtype to confirm the conversion
df_survey['Survey_Date'].dtype

dtype('<M8[ns]')

In [6]:
# Count the distinct values per column (missing values are excluded from the count)
df_survey.nunique(axis=0, dropna=True)

Survey_Date                26
Age                        44
Gender                      4
Region                      6
Industry                    9
Job_Role                   24
Work_Arrangement            3
Hours_Per_Week             31
Mental_Health_Status        6
Burnout_Level               3
Work_Life_Balance_Score     5
Physical_Health_Issues     31
Social_Isolation_Score      5
Salary_Range                5
dtype: int64

In [7]:
# Count the missing values in each column
df_survey.isna().sum()


Survey_Date                  0
Age                          0
Gender                       0
Region                       0
Industry                     0
Job_Role                     0
Work_Arrangement             0
Hours_Per_Week               0
Mental_Health_Status       799
Burnout_Level                0
Work_Life_Balance_Score      0
Physical_Health_Issues     280
Social_Isolation_Score       0
Salary_Range                 0
dtype: int64

In [8]:
# Split 'Physical_Health_Issues' on ';' into a list of issues, stripping the
# leading and trailing whitespace from each item.
# Only string cells are split: missing values (NaN) are left as they are, and
# cells that already hold a list pass through unchanged, so this cell can be
# re-run without raising.
df_survey['Physical_Health_Issues'] = df_survey['Physical_Health_Issues'].apply(
    lambda x: [item.strip() for item in x.split(';')] if isinstance(x, str) else x
)


In [9]:
# Re-inspect the column data types of the working copy after the transformations above
df_survey.dtypes

Survey_Date                datetime64[ns]
Age                                 int64
Gender                             object
Region                             object
Industry                           object
Job_Role                           object
Work_Arrangement                   object
Hours_Per_Week                      int64
Mental_Health_Status               object
Burnout_Level                      object
Work_Life_Balance_Score             int64
Physical_Health_Issues             object
Social_Isolation_Score              int64
Salary_Range                       object
dtype: object

In [10]:
# Replace the NaN values in Mental_Health_Status with the label 'Non-Diagnosis'
df_survey['Mental_Health_Status'] = df_survey['Mental_Health_Status'].fillna('Non-Diagnosis')

In [11]:
# Recount the missing values per column to see what is still unfilled
df_survey.isna().sum()

Survey_Date                  0
Age                          0
Gender                       0
Region                       0
Industry                     0
Job_Role                     0
Work_Arrangement             0
Hours_Per_Week               0
Mental_Health_Status         0
Burnout_Level                0
Work_Life_Balance_Score      0
Physical_Health_Issues     280
Social_Isolation_Score       0
Salary_Range                 0
dtype: int64