#### Import packages

In [1]:
import warnings
warnings.filterwarnings('ignore')
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'
import pandas as pd
import numpy as np
import pickle

#### Read data

In [2]:
# Strings that both survey exports use for "no answer"; they are parsed as NaN.
na_markers = ['','N/A','Not eligible for coverage / N/A','NA']
data2014 = pd.read_csv('Data/Data2014.csv',sep=',',na_values=na_markers)
data2016 = pd.read_csv('Data/Data2016.csv',sep=',',na_values=na_markers)

#### Concatenate data

In [3]:
# Short column names of the 2014 survey that are kept (features plus the 'treatment' target).
features2014 = ['Age','anonymity','benefits','care_options','coworkers','family_history','Gender','leave',
                'mental_health_consequence','mental_health_interview','mental_vs_physical','no_employees','obs_consequence',
                'phys_health_consequence','phys_health_interview','remote_work','seek_help','self_employed','supervisor',
                'tech_company','treatment','wellness_program','work_interfere']
# Full question texts of the 2016 survey, listed in the SAME ORDER as features2014:
# the entry at position i receives the name features2014[i] when the 2016 columns are renamed.
# Keep both lists aligned when adding or removing a feature.
features2016 = ['What is your age?',
                'Is your anonymity protected if you choose to take advantage of mental health or substance abuse treatment resources provided by your employer?',
                'Does your employer provide mental health benefits as part of healthcare coverage?',
                'Do you know the options for mental health care available under your employer-provided coverage?',
                'Would you feel comfortable discussing a mental health disorder with your coworkers?',
                'Do you have a family history of mental illness?',
                'What is your gender?',
                'If a mental health issue prompted you to request a medical leave from work, asking for that leave would be:',
                'Do you think that discussing a mental health disorder with your employer would have negative consequences?',
                'Would you bring up a mental health issue with a potential employer in an interview?',
                'Do you feel that your employer takes mental health as seriously as physical health?',
                'How many employees does your company or organization have?',
                'Have you heard of or observed negative consequences for co-workers who have been open about mental health issues in your workplace?',
                'Do you think that discussing a physical health issue with your employer would have negative consequences?',
                'Would you be willing to bring up a physical health issue with a potential employer in an interview?',
                'Do you work remotely?',
                'Do you know local or online resources to seek help for a mental health disorder?',
                'Are you self-employed?',
                'Would you feel comfortable discussing a mental health disorder with your direct supervisor(s)?',
                'Is your employer primarily a tech company/organization?',
                'Have you ever sought treatment for a mental health issue from a mental health professional?',
                'Has your employer ever formally discussed mental health (for example, as part of a wellness campaign or other official communication)?',
                'If you have a mental health issue, do you feel that it interferes with your work when being treated effectively?']
# Keep only the selected questions of each survey.
data2014 = data2014[features2014]
# Give the 2016 questions the short 2014 names (the two lists are aligned by position).
question_to_name = dict(zip(features2016,features2014))
data2016 = data2016[features2016].rename(columns=question_to_name)
# Stack both surveys row-wise and renumber the rows from 0.
data = pd.concat([data2014,data2016],axis=0,ignore_index=True)

In [4]:
# Both bare expressions are rendered because ast_node_interactivity is set to 'all' in the imports cell.
data.shape
# Transposed view: one row per feature, one column per respondent.
data.T

(2692, 23)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2682,2683,2684,2685,2686,2687,2688,2689,2690,2691
Age,37,44,32,31,31,33,35,39,42,23,...,24,22,24,26,38,34,56,52,30,25
anonymity,Yes,Don't know,Don't know,No,Don't know,Don't know,No,Yes,No,Don't know,...,I don't know,I don't know,Yes,I don't know,Yes,,,I don't know,I don't know,I don't know
benefits,Yes,Don't know,No,No,Yes,Yes,No,No,Yes,Don't know,...,Yes,,Yes,I don't know,Yes,,,Yes,I don't know,Yes
care_options,Not sure,No,No,Yes,No,Not sure,No,Yes,Yes,No,...,I am not sure,,I am not sure,I am not sure,No,,,Yes,I am not sure,No
coworkers,Some of them,No,Yes,Some of them,Some of them,Yes,Some of them,No,Yes,Yes,...,No,Maybe,Maybe,Maybe,Yes,,,Yes,Maybe,Maybe
family_history,No,No,No,Yes,No,Yes,Yes,No,Yes,No,...,No,I don't know,I don't know,I don't know,Yes,Yes,Yes,Yes,Yes,I don't know
Gender,Female,M,Male,Male,Male,Male,Female,M,Female,Male,...,Male,female,Male,Female,female,Female,MALE,Male,Female,non-binary
leave,Somewhat easy,Don't know,Somewhat difficult,Somewhat difficult,Don't know,Don't know,Somewhat difficult,Don't know,Very difficult,Don't know,...,Somewhat easy,Somewhat difficult,Somewhat easy,Somewhat easy,Somewhat easy,,,Somewhat difficult,Somewhat difficult,Very difficult
mental_health_consequence,No,Maybe,No,Yes,No,No,Maybe,No,Maybe,No,...,No,Maybe,No,Maybe,No,,,Maybe,Maybe,Maybe
mental_health_interview,No,No,Yes,Maybe,Yes,No,No,No,No,Maybe,...,No,Maybe,No,Maybe,No,No,No,No,No,No


#### Number of unique values by features

In [5]:
# Number of distinct non-missing values per column; high counts (e.g. Gender) point to free-text answers that need grouping.
data.nunique()

Age                          64
anonymity                     4
benefits                      4
care_options                  4
coworkers                     4
family_history                3
Gender                       99
leave                         7
mental_health_consequence     3
mental_health_interview       3
mental_vs_physical            4
no_employees                  6
obs_consequence               2
phys_health_consequence       3
phys_health_interview         3
remote_work                   5
seek_help                     6
self_employed                 4
supervisor                    4
tech_company                  4
treatment                     4
wellness_program              4
work_interfere                5
dtype: int64

#### Unique values by features

In [6]:
# List the distinct values of every column to spot outliers and spellings that need grouping.
for column_name in data.columns:
    distinct_values = data[column_name].unique()
    print(column_name,distinct_values,'\n')

Age [         37          44          32          31          33          35
          39          42          23          29          36          27
          46          41          34          30          40          38
          50          24          18          28          26          22
          19          25          45          21         -29          43
          56          60          54         329          55 99999999999
          48          20          57          58          47          62
          51          65          49       -1726           5          53
          61           8          11          -1          72          52
          17          63          99         323           3          66
          59          15          74          70] 

anonymity ['Yes' "Don't know" 'No' "I don't know" nan] 

benefits ['Yes' "Don't know" 'No' nan "I don't know"] 

care_options ['Not sure' 'No' 'Yes' nan 'I am not sure'] 

coworkers ['Some of them' 'No' 'Yes' 'Mayb

#### Initial features preprocessing
- Numerical features: *Age* values below 15 or above 75 years are treated as missing values
- Categorical features: grouping of values

In [7]:
def group_levels(frame,column,levels,label):
    """Relabel in place every value of ``frame[column]`` that appears in ``levels`` as ``label``."""
    frame.loc[frame[column].isin(levels),column] = label

# Age: values below 15 or above 75 are treated as missing
data.loc[(data['Age']<15)|(data['Age']>75),'Age'] = np.nan

# Gender: free-text answers grouped into three levels
female_levels = ['Female','Woman','female','Trans-female','Cis Female','F','f','queer/she/they','Femake','woman','Female ',
                 'cis-female/femme','Trans woman','Female (trans)','Female (cis)','p','femail','I identify as female.',
                 'female ','Female assigned at birth ','fm','Cis female ','Genderfluid (born female)',
                 'Female or Multi-Gender Femme','female/woman','Cisgender Female','genderqueer woman','fem',
                 'Female (props for making this a freeform field, though)',' Female','Cis-woman','Genderflux demi-girl',
                 'female-bodied; no feelings about gender']
male_levels = ['M','MALE','Male','male','m','Male-ish','maile','something kinda male?','Cis Male','Mal','Male (CIS)',
               'Guy (-ish) ^_^','male leaning androgynous','Male ','Man','msle','Mail','cis male','Malr','Cis Man',
               'ostensibly male, unsure what that really means','man','Cis male','Male.','male 9:1 female, roughly','Male (cis)',
               'nb masculine','Sex is male',"I'm a man why didn't you make this a drop down question. You should of asked sex? And I would of answered yes please. Seriously how much text can this take? ",
               'mail','M|','Male/genderqueer','male ','Male (trans, FtM)','cis man','Dude','cisdude']
# NOTE(review): this list holds non-binary / other / unclassifiable answers, which is a gender
# category, not a sexual orientation; the name and the 'Homosexual' label are misleading.
# They are kept unchanged here because the label becomes part of the one-hot column names and of
# the saved preprocessing dictionaries - rename consistently across notebooks if changed.
# NOTE(review): 'Make' (probably a typo of 'Male') and 'p' look misfiled - confirm.
homosexual_levels = ['non-binary','Make','Nah','All','fluid','Enby','Genderqueer','Androgyne','Agender','Neuter','queer',
                     'A little about you','Bigender','Transitioned, M2F','Other/Transfeminine','Androgynous','Other',
                     'none of your business','genderqueer','Human','Genderfluid','mtf','Queer','Fluid','Nonbinary','human',
                     'Unicorn','AFAB','Transgender woman']

# Categorical features: (column, values to merge, merged label).
# Entries are applied in order; entries of the same column must keep their relative order.
groupings = [
    ('anonymity',["I don't know","Don't know"],'Unknown'),
    ('benefits',["I don't know","Don't know"],'Unknown'),
    ('care_options',['I am not sure','Not sure'],'Unknown'),
    ('coworkers',['Some of them','Yes'],'Yes'),
    ('family_history',["I don't know"],'Unknown'),
    ('Gender',female_levels,'Female'),
    ('Gender',male_levels,'Male'),
    ('Gender',homosexual_levels,'Homosexual'),
    ('leave',["I don't know","Don't know",'Neither easy nor difficult'],'Unknown'),
    ('leave',['Somewhat easy','Very easy'],'Easy'),
    ('leave',['Somewhat difficult','Very difficult'],'Difficult'),
    ('mental_vs_physical',["I don't know","Don't know"],'Unknown'),
    ('no_employees',['6-25','1-5'],'1-25'),
    ('no_employees',['100-500','500-1000'],'100-1000'),
    ('remote_work',['Sometimes','Yes','Always'],'Yes'),
    ('remote_work',['No','Never'],'No'),
    ('seek_help',["No, I don't know any",'No'],'No'),
    ('seek_help',["Don't know"],'Unknown'),
    ('seek_help',['Yes','Yes, I know several','I know some'],'Yes'),
    ('supervisor',['Yes','Some of them'],'Yes'),
    ('treatment',['No'],0),
    ('treatment',['Yes'],1),
    ('wellness_program',["I don't know","Don't know"],'Unknown'),
    ('work_interfere',['Not applicable to me'],'Unknown'),
    ('work_interfere',['Often','Sometimes'],'Yes'),
    ('work_interfere',['Rarely','Never'],'No'),
]
for column,levels,label in groupings:
    group_levels(data,column,levels,label)

# self_employed / tech_company: numeric 0/1 answers are aligned with the 'No'/'Yes' answers
for flag_column in ['self_employed','tech_company']:
    data.loc[data[flag_column]==0,flag_column] = 'No'
    data.loc[data[flag_column]==1,flag_column] = 'Yes'

# treatment (target): after the mapping above every value is 0 or 1, but the column can still have
# object dtype (it presumably mixed 'Yes'/'No' strings with 0/1 numbers). Cast explicitly so the
# target is int64 regardless of the pandas version; this fails loudly on any unmapped value.
data['treatment'] = data['treatment'].astype('int64')

#### Missing values
- Numerical features: replace by mean
- Categorical features: replace by 'Missing'

*Note*: Save a dictionary with the imputation value of each feature.

In [8]:
# Number of missing values per column (sum over rows).
data.isnull().sum(axis=0)

Age                            11
anonymity                     287
benefits                      370
care_options                  420
coworkers                     287
family_history                  0
Gender                          3
leave                         287
mental_health_consequence     287
mental_health_interview         0
mental_vs_physical            287
no_employees                  287
obs_consequence               287
phys_health_consequence       287
phys_health_interview           0
remote_work                     0
seek_help                    1146
self_employed                  18
supervisor                    287
tech_company                  287
treatment                       0
wellness_program              287
work_interfere                264
dtype: int64

In [9]:
target = 'treatment'
# Imputation value used for each feature: the column mean for numerical features,
# the literal 'Missing' level for everything else. The target is left untouched.
missings = {}
for feature in data.columns:
    if feature==target:
        continue
    is_numerical = data[feature].dtypes in ['int','int32','int64','float','float32','float64']
    missings[feature] = data[feature].mean() if is_numerical else 'Missing'
    data[feature] = data[feature].replace(np.nan,missings[feature])

#### Convert categorical features to category type

In [10]:
# Column dtypes after imputation; non-numerical columns are still plain `object` at this point.
data.dtypes

Age                          float64
anonymity                     object
benefits                      object
care_options                  object
coworkers                     object
family_history                object
Gender                        object
leave                         object
mental_health_consequence     object
mental_health_interview       object
mental_vs_physical            object
no_employees                  object
obs_consequence               object
phys_health_consequence       object
phys_health_interview         object
remote_work                   object
seek_help                     object
self_employed                 object
supervisor                    object
tech_company                  object
treatment                      int64
wellness_program              object
work_interfere                object
dtype: object

In [11]:
# Every non-numerical feature (the target excluded) becomes a pandas categorical.
categorical_features = [
    name for name in data.columns
    if name!=target and data[name].dtypes not in ['int','int32','int64','float','float32','float64']
]
for feature in categorical_features:
    data[feature] = data[feature].astype('category')

#### Save preprocessed data
*Note*: The preprocessed data is saved before normalization and one-hot encoding because it is reused in the notebook *5. Predicting a new instance*.

In [12]:
# Use a context manager so the file is flushed and closed as soon as the dump finishes
# (the original left the handle returned by open() unclosed).
with open('Data/data.dat','wb') as data_file:
    pickle.dump([data],data_file)

#### Normalization of numerical features

*Note:* Save a dictionary with the mean and standard deviation used for the normalization of each feature.

In [13]:
# Standardize every numerical feature (target excluded) and remember the (mean, std) pair used.
normalization = {}
numerical_features = [
    name for name in data.columns
    if name!=target and data[name].dtypes in ['int','int32','int64','float','float32','float64']
]
for feature in numerical_features:
    mean = data[feature].mean()
    std = data[feature].std()
    normalization[feature] = (mean,std)
    data[feature] = (data[feature]-mean)/std

#### One hot encoding of categorical features

Note: Save a dictionary with the levels or values of each feature.

In [14]:
# Replace each categorical feature by one 0/1 indicator column per level, named '<feature>_<level>',
# and remember the levels of every encoded feature.
one_hot_encoding = {}
categorical_columns = [
    name for name in data.columns
    if name!=target and data[name].dtype.name=='category'
]
for feature in categorical_columns:
    levels = data[feature].cat.categories
    one_hot_encoding[feature] = levels
    for level in levels:
        indicator = data[feature]==level
        data[feature+'_'+level] = indicator.astype('int')
    # The original column is no longer needed once its indicators exist.
    data = data.drop(columns=feature)

In [15]:
# Transposed view of the encoded frame: one row per (normalized or one-hot) column.
data.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2682,2683,2684,2685,2686,2687,2688,2689,2690,2691
Age,0.49662,1.396067,-0.145842,-0.274334,-0.274334,-0.01735,0.239635,0.753605,1.139082,-1.302274,...,-1.173781,-1.430766,-1.173781,-0.916797,0.625113,0.111143,2.937976,2.424007,-0.402827,-1.045289
treatment,1.00000,0.000000,0.000000,1.000000,0.000000,0.00000,1.000000,0.000000,1.000000,0.000000,...,0.000000,1.000000,0.000000,1.000000,1.000000,1.000000,0.000000,1.000000,0.000000,0.000000
anonymity_Missing,0.00000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,1.000000,0.000000,0.000000,0.000000
anonymity_No,0.00000,0.000000,0.000000,1.000000,0.000000,0.00000,1.000000,0.000000,1.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
anonymity_Unknown,0.00000,1.000000,1.000000,0.000000,1.000000,1.00000,0.000000,0.000000,0.000000,1.000000,...,1.000000,1.000000,0.000000,1.000000,0.000000,0.000000,0.000000,1.000000,1.000000,1.000000
anonymity_Yes,1.00000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,1.000000,0.000000,0.000000,...,0.000000,0.000000,1.000000,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000
benefits_Missing,0.00000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,...,0.000000,1.000000,0.000000,0.000000,0.000000,1.000000,1.000000,0.000000,0.000000,0.000000
benefits_No,0.00000,0.000000,1.000000,1.000000,0.000000,0.00000,1.000000,1.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
benefits_Unknown,0.00000,1.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,1.000000,...,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000
benefits_Yes,1.00000,0.000000,0.000000,0.000000,1.000000,1.00000,0.000000,0.000000,1.000000,0.000000,...,1.000000,0.000000,1.000000,0.000000,1.000000,0.000000,0.000000,1.000000,0.000000,1.000000


#### Save preprocessing dictionaries
- missings
- normalization
- one_hot_encoding

In [16]:
# Use a context manager so the file is flushed and closed as soon as the dump finishes
# (the original left the handle returned by open() unclosed).
with open('Data/preprocessing_dictionaries.dat','wb') as dictionaries_file:
    pickle.dump([missings,normalization,one_hot_encoding],dictionaries_file)

#### Split data
- 50% in *train*, for fitting the 1st stacking layer models
- 25% in *validation*, for validating the 1st stacking layer models and for fitting the 2nd stacking layer models, which use the predictions of the 1st layer as features
- 25% in *test*, for validating the 2nd stacking layer models

*Note:* We use a balanced (stratified) split so that each split has the same binary target distribution (same % of 1).

In [17]:
def split_balanced(indexes,target,fractions=[0.5,0.25,0.25],seed=1):
    """Split ``indexes`` into train/validation/test parts, separately for each target label.

    Every distinct value of ``target`` is shuffled and cut on its own, so each
    part receives (up to rounding) the same share of every label.

    Parameters
    ----------
    indexes : array-like
        Row labels to distribute; must have the same length as ``target``.
    target : array-like
        Label of each entry of ``indexes`` (positionally aligned).
    fractions : sequence of float
        Shares of train, validation and test. Only the first two are used for
        the cuts; the test part receives whatever remains.
    seed : int
        Seed of the shuffle. The same seed is reused for every label.

    Returns
    -------
    tuple of numpy.ndarray
        ``(train_index, validation_index, test_index)``, with the dtype of
        ``indexes`` preserved.
    """
    indexes = np.asarray(indexes)
    target = np.asarray(target)
    cuts = np.cumsum(fractions)
    train_parts,validation_parts,test_parts = [],[],[]
    for label in np.unique(target):
        # A local generator yields the same permutation as np.random.seed(seed) followed by
        # np.random.permutation, without resetting the caller's global random state.
        generator = np.random.RandomState(seed)
        label_indexes = generator.permutation(indexes[target==label])
        n = len(label_indexes)
        n_train = int(n*cuts[0])
        n_validation = int(n*cuts[1])
        train_parts.append(label_indexes[:n_train])
        validation_parts.append(label_indexes[n_train:n_validation])
        test_parts.append(label_indexes[n_validation:])
    if not train_parts:
        # No labels at all: return three empty arrays with the dtype of the input.
        return(indexes[:0],indexes[:0].copy(),indexes[:0].copy())
    # Concatenating the per-label slices keeps the dtype of the input; the original seeded the
    # result with a float64 empty array, which turned integer row labels into floats.
    return(np.concatenate(train_parts),np.concatenate(validation_parts),np.concatenate(test_parts))

# 50% / 25% / 25% split of the row labels, balanced on the binary target.
train_index,validation_index,test_index = split_balanced(
    indexes=data.index,
    target=data['treatment'],
    fractions=[0.5,0.25,0.25],
    seed=1,
)
train = data.loc[train_index]
validation = data.loc[validation_index]
test = data.loc[test_index]

#### Save preprocessed data splits

In [18]:
# Use a context manager so the file is flushed and closed as soon as the dump finishes
# (the original left the handle returned by open() unclosed).
with open('Data/data_splits.dat','wb') as splits_file:
    pickle.dump([train,validation,test],splits_file)