In [4]:
import pandas as pd

In [6]:
# Load the dataset
file_path = '../data/Mental Health Dataset.csv'
df = pd.read_csv(file_path)


In [8]:
# Display basic information
df.head()
df.info()
df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 292364 entries, 0 to 292363
Data columns (total 17 columns):
 #   Column                   Non-Null Count   Dtype 
---  ------                   --------------   ----- 
 0   Timestamp                292364 non-null  object
 1   Gender                   292364 non-null  object
 2   Country                  292364 non-null  object
 3   Occupation               292364 non-null  object
 4   self_employed            287162 non-null  object
 5   family_history           292364 non-null  object
 6   treatment                292364 non-null  object
 7   Days_Indoors             292364 non-null  object
 8   Growing_Stress           292364 non-null  object
 9   Changes_Habits           292364 non-null  object
 10  Mental_Health_History    292364 non-null  object
 11  Mood_Swings              292364 non-null  object
 12  Coping_Struggles         292364 non-null  object
 13  Work_Interest            292364 non-null  object
 14  Social_Weakness     

Unnamed: 0,Timestamp,Gender,Country,Occupation,self_employed,family_history,treatment,Days_Indoors,Growing_Stress,Changes_Habits,Mental_Health_History,Mood_Swings,Coping_Struggles,Work_Interest,Social_Weakness,mental_health_interview,care_options
count,292364,292364,292364,292364,287162,292364,292364,292364,292364,292364,292364,292364,292364,292364,292364,292364,292364
unique,580,2,35,5,2,2,2,5,3,3,3,3,2,3,3,3,3
top,8/27/2014 11:43,Male,United States,Housewife,No,No,Yes,1-14 days,Maybe,Yes,No,Medium,No,No,Maybe,No,No
freq,2384,239850,171308,66351,257994,176832,147606,63548,99985,109523,104018,101064,154328,105843,103393,232166,118886


In [10]:
# Check for missing values in each column
missing_values = df.isnull().sum()
missing_values = missing_values[missing_values > 0]
print("Columns with missing values:\n", missing_values)

Columns with missing values:
 self_employed    5202
dtype: int64


In [12]:
# Check the distribution of values in the 'self_employed' column
df['self_employed'].value_counts()

self_employed
No     257994
Yes     29168
Name: count, dtype: int64

In [14]:
# Fill missing values in the 'self_employed' column with 'No'
df['self_employed'].fillna('No', inplace=True)

# Verify that there are no more missing values in the 'self_employed' column
missing_values = df['self_employed'].isnull().sum()
print(f"Missing values in 'self_employed' column after filling: {missing_values}")

Missing values in 'self_employed' column after filling: 0


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['self_employed'].fillna('No', inplace=True)


In [16]:
# Display data types of all columns
df.dtypes

Timestamp                  object
Gender                     object
Country                    object
Occupation                 object
self_employed              object
family_history             object
treatment                  object
Days_Indoors               object
Growing_Stress             object
Changes_Habits             object
Mental_Health_History      object
Mood_Swings                object
Coping_Struggles           object
Work_Interest              object
Social_Weakness            object
mental_health_interview    object
care_options               object
dtype: object

In [18]:
# Convert 'Timestamp' column to datetime format
df['Timestamp'] = pd.to_datetime(df['Timestamp'], errors='coerce')

# Verify the conversion
print(df['Timestamp'].dtypes)

datetime64[ns]


In [20]:
df

Unnamed: 0,Timestamp,Gender,Country,Occupation,self_employed,family_history,treatment,Days_Indoors,Growing_Stress,Changes_Habits,Mental_Health_History,Mood_Swings,Coping_Struggles,Work_Interest,Social_Weakness,mental_health_interview,care_options
0,2014-08-27 11:29:00,Female,United States,Corporate,No,No,Yes,1-14 days,Yes,No,Yes,Medium,No,No,Yes,No,Not sure
1,2014-08-27 11:31:00,Female,United States,Corporate,No,Yes,Yes,1-14 days,Yes,No,Yes,Medium,No,No,Yes,No,No
2,2014-08-27 11:32:00,Female,United States,Corporate,No,Yes,Yes,1-14 days,Yes,No,Yes,Medium,No,No,Yes,No,Yes
3,2014-08-27 11:37:00,Female,United States,Corporate,No,Yes,Yes,1-14 days,Yes,No,Yes,Medium,No,No,Yes,Maybe,Yes
4,2014-08-27 11:43:00,Female,United States,Corporate,No,Yes,Yes,1-14 days,Yes,No,Yes,Medium,No,No,Yes,No,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
292359,2015-07-27 23:25:00,Male,United States,Business,Yes,Yes,Yes,15-30 days,No,Maybe,No,Low,Yes,No,Maybe,Maybe,Not sure
292360,2015-08-17 09:38:00,Male,South Africa,Business,No,Yes,Yes,15-30 days,No,Maybe,No,Low,Yes,No,Maybe,No,Yes
292361,2015-08-25 19:59:00,Male,United States,Business,No,Yes,No,15-30 days,No,Maybe,No,Low,Yes,No,Maybe,No,No
292362,2015-09-26 01:07:00,Male,United States,Business,No,Yes,Yes,15-30 days,No,Maybe,No,Low,Yes,No,Maybe,No,Yes


In [22]:
# Display unique values in 'Days_Indoors' to check its data type
print(df['Days_Indoors'].unique())

['1-14 days' 'Go out Every day' 'More than 2 months' '15-30 days'
 '31-60 days']


In [24]:
# Map categorical ranges to representative numeric values
days_map = {
    '1-14 days': 7,
    '15-30 days': 22,
    '31-60 days': 45,
    'More than 2 months': 60,
    'Go out Every day': 0
}

# Apply the mapping to 'Days_Indoors' column
df['Days_Indoors'] = df['Days_Indoors'].map(days_map)

# Verify the conversion
print(df['Days_Indoors'].unique())

[ 7  0 60 22 45]


    • 1-14 days' is mapped to 7, representing an average of 7 days indoors.
	•	'15-30 days' is mapped to 22, representing an average of 22 days indoors.
	•	'31-60 days' is mapped to 45, representing an average of 45 days indoors.
	•	'More than 2 months' is mapped to 60, indicating the maximum of this range.
	•	'Go out Every day' is mapped to 0, indicating that the person does not stay indoors at all.

In [26]:
# Check for duplicate rows
duplicates = df.duplicated().sum()
print(f"Number of duplicate rows: {duplicates}")

# Remove duplicate rows if any exist
df.drop_duplicates(inplace=True)

Number of duplicate rows: 2313


In [28]:
# Remove duplicate rows
df.drop_duplicates(inplace=True)

# Confirm that duplicates are removed
duplicates = df.duplicated().sum()
print(f"Number of duplicate rows after removal: {duplicates}")

Number of duplicate rows after removal: 0


In [30]:
df

Unnamed: 0,Timestamp,Gender,Country,Occupation,self_employed,family_history,treatment,Days_Indoors,Growing_Stress,Changes_Habits,Mental_Health_History,Mood_Swings,Coping_Struggles,Work_Interest,Social_Weakness,mental_health_interview,care_options
0,2014-08-27 11:29:00,Female,United States,Corporate,No,No,Yes,7,Yes,No,Yes,Medium,No,No,Yes,No,Not sure
1,2014-08-27 11:31:00,Female,United States,Corporate,No,Yes,Yes,7,Yes,No,Yes,Medium,No,No,Yes,No,No
2,2014-08-27 11:32:00,Female,United States,Corporate,No,Yes,Yes,7,Yes,No,Yes,Medium,No,No,Yes,No,Yes
3,2014-08-27 11:37:00,Female,United States,Corporate,No,Yes,Yes,7,Yes,No,Yes,Medium,No,No,Yes,Maybe,Yes
4,2014-08-27 11:43:00,Female,United States,Corporate,No,Yes,Yes,7,Yes,No,Yes,Medium,No,No,Yes,No,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
292359,2015-07-27 23:25:00,Male,United States,Business,Yes,Yes,Yes,22,No,Maybe,No,Low,Yes,No,Maybe,Maybe,Not sure
292360,2015-08-17 09:38:00,Male,South Africa,Business,No,Yes,Yes,22,No,Maybe,No,Low,Yes,No,Maybe,No,Yes
292361,2015-08-25 19:59:00,Male,United States,Business,No,Yes,No,22,No,Maybe,No,Low,Yes,No,Maybe,No,No
292362,2015-09-26 01:07:00,Male,United States,Business,No,Yes,Yes,22,No,Maybe,No,Low,Yes,No,Maybe,No,Yes


Standardize Categorical Data
To standardize the text entries in categorical columns, we'll convert all text to lowercase and strip any extra whitespace

In [32]:
# Standardize categorical columns by making text lowercase and stripping extra spaces
categorical_columns = [
    'Gender', 'Country', 'Occupation', 'self_employed', 'family_history',
    'treatment', 'Growing_Stress', 'Changes_Habits', 'Mental_Health_History',
    'Mood_Swings', 'Coping_Struggles', 'Work_Interest', 'Social_Weakness',
    'mental_health_interview', 'care_options'
]

for col in categorical_columns:
    df[col] = df[col].str.lower().str.strip()

# Display unique values of a few columns to verify standardization
print(df['Gender'].unique())
print(df['Country'].unique())

['female' 'male']
['united states' 'poland' 'australia' 'canada' 'united kingdom'
 'south africa' 'sweden' 'new zealand' 'netherlands' 'india' 'belgium'
 'ireland' 'france' 'portugal' 'brazil' 'costa rica' 'russia' 'germany'
 'switzerland' 'finland' 'israel' 'italy' 'bosnia and herzegovina'
 'singapore' 'nigeria' 'croatia' 'thailand' 'denmark' 'mexico' 'greece'
 'moldova' 'colombia' 'georgia' 'czech republic' 'philippines']


In [34]:
df

Unnamed: 0,Timestamp,Gender,Country,Occupation,self_employed,family_history,treatment,Days_Indoors,Growing_Stress,Changes_Habits,Mental_Health_History,Mood_Swings,Coping_Struggles,Work_Interest,Social_Weakness,mental_health_interview,care_options
0,2014-08-27 11:29:00,female,united states,corporate,no,no,yes,7,yes,no,yes,medium,no,no,yes,no,not sure
1,2014-08-27 11:31:00,female,united states,corporate,no,yes,yes,7,yes,no,yes,medium,no,no,yes,no,no
2,2014-08-27 11:32:00,female,united states,corporate,no,yes,yes,7,yes,no,yes,medium,no,no,yes,no,yes
3,2014-08-27 11:37:00,female,united states,corporate,no,yes,yes,7,yes,no,yes,medium,no,no,yes,maybe,yes
4,2014-08-27 11:43:00,female,united states,corporate,no,yes,yes,7,yes,no,yes,medium,no,no,yes,no,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
292359,2015-07-27 23:25:00,male,united states,business,yes,yes,yes,22,no,maybe,no,low,yes,no,maybe,maybe,not sure
292360,2015-08-17 09:38:00,male,south africa,business,no,yes,yes,22,no,maybe,no,low,yes,no,maybe,no,yes
292361,2015-08-25 19:59:00,male,united states,business,no,yes,no,22,no,maybe,no,low,yes,no,maybe,no,no
292362,2015-09-26 01:07:00,male,united states,business,no,yes,yes,22,no,maybe,no,low,yes,no,maybe,no,yes


In [36]:
# Save the cleaned dataset to a CSV file
df.to_csv('../data/cleaned_mental_health_data_TS.csv', index=False)

print("Cleaned dataset saved successfully!")

Cleaned dataset saved successfully!


We’ll use the Interquartile Range (IQR) method to detect outliers in numerical columns, such as Days_Indoors. The IQR method is a common way to identify values that are unusually high or low compared to the majority of the data. 

- Purpose of the IQR Method
The IQR method is a standard approach for identifying outliers because it focuses on the spread of the middle 50% of the data. It is more robust to skewed data than other methods, like using the mean and standard deviation, which can be affected by extreme values.


In [38]:
# Calculate Q1 (25th percentile) and Q3 (75th percentile) for the 'Days_Indoors' column
Q1 = df['Days_Indoors'].quantile(0.25)
Q3 = df['Days_Indoors'].quantile(0.75)
IQR = Q3 - Q1

# Determine the lower and upper bounds for outliers
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Identify outliers
outliers = df[(df['Days_Indoors'] < lower_bound) | (df['Days_Indoors'] > upper_bound)]

print(f"Number of outliers in 'Days_Indoors': {len(outliers)}")

Number of outliers in 'Days_Indoors': 0


    •	Q1 (the 25th percentile) is the value below which 25% of the data in the Days_Indoors column falls.
	•	Q3 (the 75th percentile) is the value below which 75% of the data in the Days_Indoors column falls.
These percentiles divide the data into quarters and are used to measure the spread of the data.

The Interquartile Range (IQR) is the difference between the 75th percentile and the 25th percentile. It measures the spread of the middle 50% of the data. This value is often used to detect outliers in a dataset.

These bounds help determine what values are considered outliers:
	•	Lower bound: Any data point below this value is considered an outlier.
	•	Upper bound: Any data point above this value is considered an outlier.
The factor 1.5 is commonly used in statistics to identify mild outliers. It means that any data point that is more than 1.5 times the IQR below Q1 or above Q3 is considered an outlier.

This line creates a subset of the DataFrame containing only the rows where the value in Days_Indoors is either:
	•	Less than the lower bound (potential low-end outliers).
	•	Greater than the upper bound (potential high-end outliers).