Potential resource(s):
http://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_digits.html#sphx-glr-auto-examples-cluster-plot-kmeans-digits-py

# Mental Health in Tech Project

## Data Sets

[OSMI Survey on Mental Health in the Tech Workplace in 2014](https://www.kaggle.com/osmi/mental-health-in-tech-survey) 

["Ongoing" OSMI survey from 2016](https://data.world/kittybot/osmi-mental-health-tech-2016)


## Questions

What factors are most signficant in influencing whether or not a person believes disclosing a mental health issue would have negative consequences?

Can we predict, based on publicly available features of a person and company, whether that person is likely to beleive disclosing a mental health issue would be harmful for their career?

## Exploring and Cleaning 2014 Data

In [2]:
import pandas as pd

In [3]:
df14 = pd.read_csv("./datasets/2014/clean-no-dummies-2014.csv", index_col=0)
df14['year'] = '2014'
print df14.shape
df14.head(3)

(1259, 29)


Unnamed: 0,timestamp,age,gender,country,state,self_employed,family_history,treatment,work_interfere,num_employees,...,phys_health_consequence,coworkers,supervisor,mental_health_interview,phys_health_interview,mental_vs_physical,obs_consequence,comments,gender_category,year
0,2014-08-27 11:29:31,37.0,Female,United States,IL,,0,1,often,6-25,...,no,some_of_them,yes,no,maybe,yes,0,,female,2014
1,2014-08-27 11:29:37,44.0,M,United States,IN,,0,0,rarely,1000+,...,no,no,no,no,no,dont_know,0,,male,2014
2,2014-08-27 11:29:44,32.0,Male,Canada,,,0,0,rarely,6-25,...,no,yes,yes,yes,yes,no,0,,male,2014


In [4]:
df16 = pd.read_csv("./datasets/2016/clean-no-dummies-2016.csv", index_col=0)
df16['year'] = '2016'
print df16.shape
df16.head(3)

(1433, 65)


Unnamed: 0,self_employed,num_employees,tech_company,tech_role,benefits,care_options,wellness_program,seek_help,anonymity,leave,...,age,gender,live_in_country,live_in_state,work_in_country,work_in_state,position,remote_work,gender_category,year
0,0,26-100,1.0,,doesnt_apply,,no,no,dont_know,very_easy,...,39.0,Male,united_kingdom,,united_kingdom,,Back-end Developer,sometimes,male,2016
1,0,6-25,1.0,,no,yes,yes,yes,yes,somewhat_easy,...,29.0,male,united_states,illinois,united_states_of_america,illinois,Back-end Developer|Front-end Developer,never,male,2016
2,0,6-25,1.0,,no,,no,no,dont_know,neither_easy_nor_difficult,...,38.0,Male,united_kingdom,,united_kingdom,,Back-end Developer,always,male,2016


In [6]:
print df14.columns
print df16.columns
colset14 = set(df14.columns)
colset16 = set(df16.columns)
# print colset14 - colset16
# print colset16 - colset14

Index([u'timestamp', u'age', u'gender', u'country', u'state', u'self_employed',
       u'family_history', u'treatment', u'work_interfere', u'num_employees',
       u'remote_work', u'tech_company', u'benefits', u'care_options',
       u'wellness_program', u'seek_help', u'anonymity', u'leave',
       u'mental_health_consequence', u'phys_health_consequence', u'coworkers',
       u'supervisor', u'mental_health_interview', u'phys_health_interview',
       u'mental_vs_physical', u'obs_consequence', u'comments',
       u'gender_category', u'year'],
      dtype='object')
Index([u'self_employed', u'num_employees', u'tech_company', u'tech_role',
       u'benefits', u'care_options', u'wellness_program', u'seek_help',
       u'anonymity', u'leave', u'mental_health_consequence',
       u'phys_health_consequence', u'coworkers', u'supervisor',
       u'mental_vs_physical', u'obs_consequence', u'insurance',
       u'know_resources', u'revealed_contacts',
       u'revealed_contacts_consequence', u'reve

#### NaN Check


In [7]:
# print '2014 data'
# counts = df14.count()
# numrows = df14.shape[0]
# for col in df14.columns:
#     if counts[col] != numrows:
#         print "{0} has {1} NaNs".format(col, numrows-counts[col])
        
# print '\n2016 data'
# counts = df16.count()
# numrows = df16.shape[0]
# for col in df16.columns:
#     if counts[col] != numrows:
#         print "{0} has {1} NaNs".format(col, numrows-counts[col])

In [8]:
df16.head(1)

Unnamed: 0,self_employed,num_employees,tech_company,tech_role,benefits,care_options,wellness_program,seek_help,anonymity,leave,...,age,gender,live_in_country,live_in_state,work_in_country,work_in_state,position,remote_work,gender_category,year
0,0,26-100,1.0,,doesnt_apply,,no,no,dont_know,very_easy,...,39.0,Male,united_kingdom,,united_kingdom,,Back-end Developer,sometimes,male,2016


In [9]:
df14.head(1)

Unnamed: 0,timestamp,age,gender,country,state,self_employed,family_history,treatment,work_interfere,num_employees,...,phys_health_consequence,coworkers,supervisor,mental_health_interview,phys_health_interview,mental_vs_physical,obs_consequence,comments,gender_category,year
0,2014-08-27 11:29:31,37.0,Female,United States,IL,,0,1,often,6-25,...,no,some_of_them,yes,no,maybe,yes,0,,female,2014


In [10]:
df14[df14['country']=="United States"].shape[0]

751

In [11]:
usdf16 = df16[df16['work_in_country']=="united_states_of_america"]
usdf16 = usdf16[usdf16['live_in_country']=="united_states"]
# usdf16['live_in_country'].value_counts()
usdf16.shape

(837, 65)

In [12]:
usdf16.rename(columns={'work_in_country': 'country'}, inplace=True)
usdf16.shape

(837, 65)

In [13]:
usdf14 = df14[df14['country']=="United States"]
usdf14.shape

(751, 29)

In [17]:
usdf = pd.concat([usdf14, usdf16], axis=0)
usdf['country'] = "United States"
usdf['country'].value_counts(dropna=False)
print usdf.shape
usdf.head(2)


(1588, 69)


Unnamed: 0,age,anonymity,believed_conditions,benefits,care_options,comments,country,coworkers,current_disorder,diagnosed_conditions,...,tech_role,timestamp,treatment,viewed_negatively,wellness_program,work_in_state,work_interfere,work_interfere_treated,work_interfere_untreated,year
0,37.0,yes,,yes,not_sure,,United States,some_of_them,,,...,,2014-08-27 11:29:31,1,,no,,often,,,2014
1,44.0,dont_know,,dont_know,no,,United States,no,,,...,,2014-08-27 11:29:37,0,,dont_know,,rarely,,,2014


In [20]:
def work_interfere_category (row):
    if row['work_interfere'] == 'never' or row['work_interfere_treated'] == 'never' or row['work_interfere_untreated'] == 'never'  :
        return 0
    elif row['work_interfere'] == 'doesnt_apply' or row['work_interfere_treated'] == 'doesnt_apply' or row['work_interfere_untreated'] == 'doesnt_apply':
        return float('NaN')
    return 1
    

In [21]:
df = usdf.copy()
df.apply(work_interfere_category, axis=1).value_counts(dropna=False)

 1.0    1096
NaN      291
 0.0     201
dtype: int64