## 1. Import Dataset

In [31]:
# TODO: Import data set
import numpy as np
import pandas as pd
mh_df = pd.read_csv("./data/survey.csv")
mh_df.head()

Unnamed: 0,Timestamp,Age,Gender,Country,state,self_employed,family_history,treatment,work_interfere,no_employees,...,leave,mental_health_consequence,phys_health_consequence,coworkers,supervisor,mental_health_interview,phys_health_interview,mental_vs_physical,obs_consequence,comments
0,2014-08-27 11:29:31,37,Female,United States,IL,,No,Yes,Often,6-25,...,Somewhat easy,No,No,Some of them,Yes,No,Maybe,Yes,No,
1,2014-08-27 11:29:37,44,M,United States,IN,,No,No,Rarely,More than 1000,...,Don't know,Maybe,No,No,No,No,No,Don't know,No,
2,2014-08-27 11:29:44,32,Male,Canada,,,No,No,Rarely,6-25,...,Somewhat difficult,No,No,Yes,Yes,Yes,Yes,No,No,
3,2014-08-27 11:29:46,31,Male,United Kingdom,,,Yes,Yes,Often,26-100,...,Somewhat difficult,Yes,Yes,Some of them,No,Maybe,Maybe,No,Yes,
4,2014-08-27 11:30:22,31,Male,United States,TX,,No,No,Never,100-500,...,Don't know,No,No,Some of them,Yes,Yes,Yes,Don't know,No,


In [32]:
# Check the nubmer of rows and columns
mh_df.shape

(1259, 27)

In [33]:
# Check the data types
mh_df.dtypes

Timestamp                    object
Age                           int64
Gender                       object
Country                      object
state                        object
self_employed                object
family_history               object
treatment                    object
work_interfere               object
no_employees                 object
remote_work                  object
tech_company                 object
benefits                     object
care_options                 object
wellness_program             object
seek_help                    object
anonymity                    object
leave                        object
mental_health_consequence    object
phys_health_consequence      object
coworkers                    object
supervisor                   object
mental_health_interview      object
phys_health_interview        object
mental_vs_physical           object
obs_consequence              object
comments                     object
dtype: object

## 2. Data cleaning

1) Change Gender values

In [34]:
# TODO: Cleaning: Male, female string matching
# Check Gender values
gender = mh_df["Gender"].value_counts()
print(gender)

Male                                              615
male                                              206
Female                                            121
M                                                 116
female                                             62
F                                                  38
m                                                  34
f                                                  15
Make                                                4
Male                                                3
Woman                                               3
Cis Male                                            2
Man                                                 2
Female (trans)                                      2
Female                                              2
Trans woman                                         1
msle                                                1
male leaning androgynous                            1
Neuter                      

In [35]:
# Change Gender names -> Male/Female/Others
others = ['A little about you', 'Agender', 'All', 'Androgyne',
          'Female (trans)', 'Genderqueer', 'Guy (-ish) ^_^','Male-ish',
          'Nah', 'Neuter', 'Trans woman', 'Trans-female','fluid',
          'male leaning androgynous','non-binary','ostensibly male', 
          'unsure what that really means', 'p', 'queer','queer/she/they', 'something kinda male?']
female = ['Cis Female','F', 'Femake', 'Female', 'Female ',
          'Female (cis)','Woman','cis-female/femme', 'f', 
          'femail', 'female','woman']
male = ['M','Mail', 'Make', 'Mal', 'Male', 'Male ', 'Male (CIS)',
       'Malr', 'Man','cis male','m','maile', 'male','msle']

# Replace gender values with Male/Female/Others
mh_df.loc[mh_df["Gender"].isin(others)==True, "new_gender"] = "Others"
mh_df.loc[mh_df["Gender"].isin(female)==True, "new_gender"] = "Female"
mh_df.loc[mh_df["Gender"].isin(male)==True, "new_gender"] = "Male"

print(mh_df["new_gender"].value_counts())

# Drop the original"Gender" row
mh_df = mh_df.drop("Gender", axis=1)
mh_df.head()

Male      987
Female    247
Others     20
Name: new_gender, dtype: int64


Unnamed: 0,Timestamp,Age,Country,state,self_employed,family_history,treatment,work_interfere,no_employees,remote_work,...,mental_health_consequence,phys_health_consequence,coworkers,supervisor,mental_health_interview,phys_health_interview,mental_vs_physical,obs_consequence,comments,new_gender
0,2014-08-27 11:29:31,37,United States,IL,,No,Yes,Often,6-25,No,...,No,No,Some of them,Yes,No,Maybe,Yes,No,,Female
1,2014-08-27 11:29:37,44,United States,IN,,No,No,Rarely,More than 1000,No,...,Maybe,No,No,No,No,No,Don't know,No,,Male
2,2014-08-27 11:29:44,32,Canada,,,No,No,Rarely,6-25,No,...,No,No,Yes,Yes,Yes,Yes,No,No,,Male
3,2014-08-27 11:29:46,31,United Kingdom,,,Yes,Yes,Often,26-100,No,...,Yes,Yes,Some of them,No,Maybe,Maybe,No,Yes,,Male
4,2014-08-27 11:30:22,31,United States,TX,,No,No,Never,100-500,Yes,...,No,No,Some of them,Yes,Yes,Yes,Don't know,No,,Male


2) Check "Age" values

In [48]:
print(mh_df["Age"].describe())

# Check the number of rows with negative age values
print((mh_df["Age"]<0).sum())
      
# Check the number of rows with >120 age values
print((mh_df["Age"] >120).sum())

# Drop the rows with negative or >120 age values 
mh_df = mh_df[(mh_df["Age"] >= 0)&(mh_df["Age"]<=120)]
mh_df["Age"].describe()

count    1.259000e+03
mean     7.942815e+07
std      2.818299e+09
min     -1.726000e+03
25%      2.700000e+01
50%      3.100000e+01
75%      3.600000e+01
max      1.000000e+11
Name: Age, dtype: float64
3
2


count    1254.000000
mean       32.019139
std         7.375005
min         5.000000
25%        27.000000
50%        31.000000
75%        36.000000
max        72.000000
Name: Age, dtype: float64

3) Handle missing values

In [37]:
# TODO: Cleaning: Handle missing values
# Check if missing values exist in each column
mh_df.isnull().any()

Timestamp                    False
Age                          False
Country                      False
state                         True
self_employed                 True
family_history               False
treatment                    False
work_interfere                True
no_employees                 False
remote_work                  False
tech_company                 False
benefits                     False
care_options                 False
wellness_program             False
seek_help                    False
anonymity                    False
leave                        False
mental_health_consequence    False
phys_health_consequence      False
coworkers                    False
supervisor                   False
mental_health_interview      False
phys_health_interview        False
mental_vs_physical           False
obs_consequence              False
comments                      True
new_gender                    True
dtype: bool

In [56]:
# Check the number of missing values in each column
print(mh_df["self_employed"].isnull().sum())
print(mh_df["work_interfere"].isnull().sum())
print(mh_df["comments"].isnull().sum())

18
263
1091


In [None]:
# TODO: Num rows/columns, feature names on mental health dataset

In [None]:
# TODO: Distribution of the target variable

In [None]:
# TODO: Correlation(relationship) between the target variable and the other features