In [63]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [64]:
# Load the raw survey export, report how many responses it holds and
# preview the first few rows.
df = pd.read_csv('survey_results_public.csv')
print(df.shape[0])
df.head()

73268


Unnamed: 0,ResponseId,MainBranch,Employment,RemoteWork,CodingActivities,EdLevel,LearnCode,LearnCodeOnline,LearnCodeCoursesCert,YearsCode,...,TimeSearching,TimeAnswering,Onboarding,ProfessionalTech,TrueFalse_1,TrueFalse_2,TrueFalse_3,SurveyLength,SurveyEase,ConvertedCompYearly
0,1,None of these,,,,,,,,,...,,,,,,,,,,
1,2,I am a developer by profession,"Employed, full-time",Fully remote,Hobby;Contribute to open-source projects,,,,,,...,,,,,,,,Too long,Difficult,
2,3,"I am not primarily a developer, but I write co...","Employed, full-time","Hybrid (some remote, some in-person)",Hobby,"Master’s degree (M.A., M.S., M.Eng., MBA, etc.)",Books / Physical media;Friend or family member...,Technical documentation;Blogs;Programming Game...,,14.0,...,,,,,,,,Appropriate in length,Neither easy nor difficult,40205.0
3,4,I am a developer by profession,"Employed, full-time",Fully remote,I don’t code outside of work,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)","Books / Physical media;School (i.e., Universit...",,,20.0,...,,,,,,,,Appropriate in length,Easy,215232.0
4,5,I am a developer by profession,"Employed, full-time","Hybrid (some remote, some in-person)",Hobby,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)","Other online resources (e.g., videos, blogs, f...",Technical documentation;Blogs;Stack Overflow;O...,,8.0,...,,,,,,,,Too long,Easy,


In [73]:
# Filter the data to only keep professional developers.
# `na=False` makes a missing MainBranch answer count as "not a match"; without
# it the mask would contain NaN and boolean indexing would raise.
# `regex=False` matches the phrase literally instead of as a regular expression.
print(len(df))
is_professional = df['MainBranch'].str.contains(
    "I am a developer by profession", regex=False, na=False
)
df_filtered = df[is_professional]
print(len(df_filtered))

# Also filter to keep only the relevant countries
def filter(df, column, filters):
    """Return the rows of `df` whose value in `column` is one of `filters`.

    Parameters:
        df: the DataFrame to select rows from.
        column: label of the column to test.
        filters: collection of accepted values for that column.

    Returns:
        A DataFrame containing only the matching rows, with their original
        index labels.

    Note: this definition shadows Python's built-in `filter` wherever it is
    in scope.
    """
    keep = df[column].isin(filters)
    return df.loc[keep]

# Countries kept for the analysis. Each entry must match the survey's
# 'Country' value exactly.
# The original list contained 'Italy' twice; the duplicate is removed
# (membership tests give the same result either way).
countries = [
    'United States of America',
    'Switzerland',
    'United Kingdom of Great Britain and Northern Ireland',
    'France',
    'Germany',
    'Netherlands',
    'Spain',
    'Italy',
    'Austria',
    'Canada',
    'Ireland',
    'Sweden',
    'Norway',
    'Australia',
    'Denmark'
]

# Restrict the professional developers to the selected countries.
df_filtered = filter(df_filtered, 'Country', countries)

# Columns carried forward into the feature table.
selectedColumns = [
    'MentalHealth', 'DevType', 'OrgSize', 'Country', 'RemoteWork',
    'CompTotal', 'CompFreq',
]

# Drop respondents missing a compensation amount, a compensation frequency
# or a mental-health answer; the other columns may still contain NaN.
required_columns = ['CompTotal', 'CompFreq', 'MentalHealth']
df_features = df_filtered[selectedColumns].dropna(subset=required_columns)

# Annualise the compensation, then drop the raw CompTotal and CompFreq columns.
# Previously only "Monthly" amounts were scaled, so "Weekly" amounts were
# carried over as if they were yearly. Any frequency not listed below is left
# unscaled (factor 1), which is what the original did for every non-monthly row.
# NOTE(review): CompTotal appears to be expressed in each respondent's local
# currency (a Norwegian row shows 1,000,000), so values are not directly
# comparable across countries. The survey also has a ConvertedCompYearly
# column that may be the better source. Confirm before comparing salaries.
periods_per_year = {'Weekly': 52, 'Monthly': 12, 'Yearly': 1}
yearly_factor = df_features['CompFreq'].map(periods_per_year).fillna(1)
df_features['YearlySalary'] = df_features['CompTotal'].astype(float) * yearly_factor
df_features = df_features.drop(['CompTotal', 'CompFreq'], axis=1)

# Remove salary outliers: keep rows whose yearly salary is strictly above the
# lower bound and strictly below the upper bound.
min_yearly_salary = 0
max_yearly_salary = 1e7
salary = df_features['YearlySalary']
within_bounds = (salary > min_yearly_salary) & (salary < max_yearly_salary)
df_features = df_features.loc[within_bounds]

print(len(df_features))

df_features

73268
53507
20261


Unnamed: 0,MentalHealth,DevType,OrgSize,Country,RemoteWork,YearlySalary
8,"Or, in your own words:","Developer, back-end",I don’t know,Netherlands,"Hybrid (some remote, some in-person)",46000.0
10,None of the above,"Developer, full-stack;Developer, back-end",100 to 499 employees,United Kingdom of Great Britain and Northern I...,"Hybrid (some remote, some in-person)",48000.0
12,None of the above,"Developer, full-stack",2 to 9 employees,United States of America,"Hybrid (some remote, some in-person)",65000.0
14,None of the above,"Developer, full-stack;Academic researcher;DevO...","5,000 to 9,999 employees",United States of America,Fully remote,110000.0
17,None of the above,"Engineer, data","1,000 to 4,999 employees",Austria,Fully remote,190000.0
...,...,...,...,...,...,...
73251,None of the above,"Developer, full-stack",20 to 99 employees,France,"Hybrid (some remote, some in-person)",65000.0
73253,None of the above,"Developer, back-end",500 to 999 employees,Australia,"Hybrid (some remote, some in-person)",121000.0
73261,None of the above,"Developer, full-stack","10,000 or more employees",France,"Hybrid (some remote, some in-person)",36000.0
73264,None of the above,Data scientist or machine learning specialist,I don’t know,United States of America,Full in-person,107000.0


In [74]:
# MentalHealth answers are ';'-separated; split them and give every selected
# option its own row (the other columns are repeated on each row).
df_features = (
    df_features
    .assign(MentalHealth=df_features['MentalHealth'].str.split(';'))
    .explode(['MentalHealth'])
)
display(df_features['MentalHealth'].unique())

# Keep only the rows reporting an anxiety disorder or a mood/emotional
# disorder (e.g. depression).
mental_illnesses = [
    'I have an anxiety disorder',
    'I have a mood or emotional disorder (e.g., depression, bipolar disorder, etc.)',
]
df_features = filter(df_features, 'MentalHealth', mental_illnesses)

array(['Or, in your own words:', 'None of the above',
       'I have a mood or emotional disorder (e.g., depression, bipolar disorder, etc.)',
       'I have an anxiety disorder',
       'I have learning differences (e.g., Dyslexic, Dyslexia, etc.)',
       'Prefer not to say',
       'I have a concentration and/or memory disorder (e.g., ADHD, etc.)',
       "I have autism / an autism spectrum disorder (e.g. Asperger's, etc.)"],
      dtype=object)

In [75]:
# DevType answers are ';'-separated as well: split them and emit one row per
# developer type.
df_features = (
    df_features
    .assign(DevType=df_features['DevType'].str.split(';'))
    .explode(['DevType'])
)

In [76]:
# Show the table after both explode steps: a respondent appears once per
# (mental-health answer, developer type) combination, so index labels repeat.
df_features

Unnamed: 0,MentalHealth,DevType,OrgSize,Country,RemoteWork,YearlySalary
31,"I have a mood or emotional disorder (e.g., dep...","Developer, full-stack","10,000 or more employees",United States of America,Fully remote,102000.0
31,I have an anxiety disorder,"Developer, full-stack","10,000 or more employees",United States of America,Fully remote,102000.0
47,I have an anxiety disorder,"Developer, full-stack","1,000 to 4,999 employees",United States of America,Fully remote,135000.0
47,I have an anxiety disorder,"Developer, back-end","1,000 to 4,999 employees",United States of America,Fully remote,135000.0
54,I have an anxiety disorder,"Developer, back-end",20 to 99 employees,Norway,"Hybrid (some remote, some in-person)",1000000.0
...,...,...,...,...,...,...
73220,I have an anxiety disorder,"Developer, full-stack",20 to 99 employees,United States of America,"Hybrid (some remote, some in-person)",210000.0
73220,I have an anxiety disorder,"Developer, desktop or enterprise applications",20 to 99 employees,United States of America,"Hybrid (some remote, some in-person)",210000.0
73220,I have an anxiety disorder,Engineering manager,20 to 99 employees,United States of America,"Hybrid (some remote, some in-person)",210000.0
73226,"I have a mood or emotional disorder (e.g., dep...","Developer, full-stack","1,000 to 4,999 employees",United States of America,Fully remote,130000.0


In [77]:
# Write the prepared table to disk as comma-separated UTF-8 text.
# The index is written too, so rows from the same respondent share a label.
df_features.to_csv(
    path_or_buf='mental_health.csv',
    sep=',',
    encoding='utf-8',
)