## Data wrangling - 'master' dataset

In [1]:
# import numpy as np
import pandas as pd
from pandas import DataFrame

import warnings
warnings.filterwarnings("ignore")

#### Read dataset from file

In [7]:
# Load the survey responses from the CSV file and report (rows, columns).
master: DataFrame = pd.read_csv(
    './data/survey_results_public-2018.csv',
)
print(master.shape)

(98855, 129)


#### Define dictionaries that map verbose survey answers to concise codes

In [8]:
# Each mapping below pairs a verbose survey answer with the short code that
# replaces it.

# Student status.
student_dict = dict([
    ('No', 'NO'),
    ('Yes, full-time', 'FULL-TIME'),
    ('Yes, part-time', 'PART-TIME'),
])

# Employment status.
employment_dict = dict([
    ('Employed full-time', 'FULL-TIME'),
    ('Independent contractor, freelancer, or self-employed', 'CONTRACTOR'),
    ('Not employed, but looking for work', 'LOOKING'),
    ('Employed part-time', 'PART-TIME'),
    ('Not employed, and not looking for work', 'NOT-LOOKING'),
    ('Retired', 'RETIRED'),
])

# Highest level of education. Both the "I never ..." and the "They never ..."
# wordings are present and collapse to the same code.
education_dict = dict([
    ('Bachelor’s degree (BA, BS, B.Eng., etc.)', 'BACHELORS'),
    ('Master’s degree (MA, MS, M.Eng., MBA, etc.)', 'MASTERS'),
    ('Some college/university study without earning a degree', 'NO-DEGREE'),
    ('Secondary school (e.g. American high school, German Realschule or Gymnasium, etc.)', 'SECONDARY'),
    ('Associate degree', 'ASSOCIATE'),
    ('Other doctoral degree (Ph.D, Ed.D., etc.)', 'DOCTORAL'),
    ('Primary/elementary school', 'PRIMARY'),
    ('Professional degree (JD, MD, etc.)', 'PROFESSIONAL'),
    ('I never completed any formal education', 'NONE'),
    ('They never completed any formal education', 'NONE'),
])

# Company head-count brackets -> compact range labels.
company_size_dict = dict([
    ('Fewer than 10 employees', '<10'),
    ('10 to 19 employees', '10-19'),
    ('20 to 99 employees', '20-99'),
    ('100 to 499 employees', '100-499'),
    ('500 to 999 employees', '500-999'),
    ('1,000 to 4,999 employees', '1000-4999'),
    ('5,000 to 9,999 employees', '5000-9999'),
    ('10,000 or more employees', '>10000'),
])

# Seven-point satisfaction scale -> '1' (extremely dissatisfied) through
# '7' (extremely satisfied); the codes are strings, not integers.
job_satisfaction_dict = {
    answer: str(rank)
    for rank, answer in enumerate(
        [
            'Extremely dissatisfied',
            'Moderately dissatisfied',
            'Slightly dissatisfied',
            'Neither satisfied nor dissatisfied',
            'Slightly satisfied',
            'Moderately satisfied',
            'Extremely satisfied',
        ],
        start=1,
    )
}

# Time since the last job change -> range in years.
last_new_job_dict = dict([
    ("I've never had a job", '0'),
    ('Less than a year ago', '<1'),
    ('Between 1 and 2 years ago', '1-2'),
    ('Between 2 and 4 years ago', '2-4'),
    ('More than 4 years ago', '>4'),
])

# Hours at a computer -> range in hours.
hours_computer_dict = dict([
    ('Less than 1 hour', '<1'),
    ('1 - 4 hours', '1-4'),
    ('5 - 8 hours', '5-8'),
    ('9 - 12 hours', '9-12'),
    ('Over 12 hours', '>12'),
])

# Time outside -> range in hours (the minute-based answers become fractions).
hours_outside_dict = dict([
    ('Less than 30 minutes', '<0.5'),
    ('30 - 59 minutes', '0.5-1'),
    ('1 - 2 hours', '1-2'),
    ('3 - 4 hours', '3-4'),
    ('Over 4 hours', '>4'),
])

# Aspect of AI named by the respondent -> short code.
ai_dangerous_dict = dict([
    ('Algorithms making important decisions', 'IMP-DECISIONS'),
    ('Artificial intelligence surpassing human intelligence ("the singularity")', 'SINGULARITY'),
    ('Evolving definitions of "fairness" in algorithmic versus human decisions', 'FAIRNESS-DEFINITION'),
    ('Increasing automation of jobs', 'AUTOMATION'),
])

# Party named as responsible for AI -> short code.
ai_responsible_dict = dict([
    ('The developers or the people creating the AI', 'DEVELOPERS'),
    ('A governmental or other regulatory body', 'GOVT'),
    ('Prominent industry leaders', 'LEADERS'),
    ('Nobody', 'NOBODY'),
])

# Overall outlook on the future of AI -> short code.
ai_future_dict = dict([
    ("I'm excited about the possibilities more than worried about the dangers.", 'EXCITED'),
    ("I'm worried about the dangers more than I'm excited about the possibilities.", 'WORRIED'),
    ("I don't care about it, or I haven't thought about it.", 'NO-COMMENTS'),
])

#### Define function to identify if the undergrad course was related to computer science

In [9]:
# Lower-case keywords that mark an undergraduate major as software-related.
courses = ['computer', 'software', 'information']


def has_software(_undergrad: str) -> bool:
    """Return True when the undergraduate major mentions a software-related keyword.

    The match is a case-insensitive substring test against every entry in
    ``courses``.

    Args:
        _undergrad: Free-text name of the undergraduate major. Anything that
            is not a string (``None``, or the float NaN that pandas uses for
            missing values) is treated as "no major given".

    Returns:
        True if any keyword occurs in the major, otherwise False.
    """
    # A missing value in a pandas column is a float NaN, not None, so guard on
    # the type instead of only on None; otherwise .lower() would raise.
    if not isinstance(_undergrad, str):
        return False
    major = _undergrad.lower()  # lower-case once, not once per keyword
    return any(keyword in major for keyword in courses)

#### Extract the important columns into a separate dataset

In [6]:
# Survey columns kept for the analysis.
imp_cols = [
    'Student', 'FormalEducation', 'EducationParents', 'UndergradMajor',
    'Employment', 'CompanySize', 'JobSatisfaction', 'CareerSatisfaction', 'LastNewJob',
    'HoursComputer', 'HoursOutside',
    'AIDangerous', 'AIInteresting', 'AIResponsible', 'AIFuture',
]
# Take an explicit copy: the frame is modified in later cells, and a bare
# column selection of `master` would make those writes ambiguous
# (SettingWithCopyWarning) instead of acting on an independent frame.
master_imp: DataFrame = master[imp_cols].copy()

In [10]:
# Number of missing values in each selected column, largest first.
null_counts = master_imp.isna().sum().sort_values(ascending=False)
null_counts

EducationParents      37042
AIDangerous           35740
AIInteresting         33488
AIResponsible         33302
JobSatisfaction       29579
AIFuture              29127
CompanySize           27324
HoursOutside          26831
HoursComputer         26722
CareerSatisfaction    22351
LastNewJob            19966
UndergradMajor        19819
FormalEducation        4152
Student                3954
Employment             3534
dtype: int64

#### Handle missing values
- if a column has more than 10,000 nulls, impute them with a new placeholder value 'x'
- otherwise (10,000 nulls or fewer), impute them with the mode of that column.

In [11]:
# Columns with more nulls than this get a placeholder category; the rest are
# filled with their most frequent value.
NULL_COUNT_THRESHOLD = 10000
MISSING_PLACEHOLDER = 'x'

for col in imp_cols:
    if master_imp[col].isnull().sum() > NULL_COUNT_THRESHOLD:
        fill_value = MISSING_PLACEHOLDER
    else:
        # mode() returns a Series (ties are possible); take its first entry.
        fill_value = master_imp[col].mode()[0]
    # Assign the filled column back instead of calling fillna(inplace=True) on
    # master_imp[col]: that mutates a temporary Series, which is not guaranteed
    # to write through to the frame (it does not under pandas Copy-on-Write).
    master_imp[col] = master_imp[col].fillna(fill_value)

#### Replace the verbose definitions with the concise data dictionary values

In [12]:
# Apply the per-column answer -> code dictionaries, derive the boolean
# 'Studied_Software' flag from the free-text major, then drop the major itself.
master_imp = (
    master_imp
    .replace({
        'Student': student_dict,
        'Employment': employment_dict,
        'FormalEducation': education_dict,
        'CompanySize': company_size_dict,
        'JobSatisfaction': job_satisfaction_dict,
        'CareerSatisfaction': job_satisfaction_dict,
        'LastNewJob': last_new_job_dict,
        'HoursComputer': hours_computer_dict,
        'HoursOutside': hours_outside_dict,
        'EducationParents': education_dict,
        'AIDangerous': ai_dangerous_dict,
        'AIInteresting': ai_dangerous_dict,
        'AIResponsible': ai_responsible_dict,
        'AIFuture': ai_future_dict,
    })
    .assign(Studied_Software=lambda frame: frame['UndergradMajor'].map(has_software))
    .drop(columns=['UndergradMajor'])
)

#### Datatype conversion

In [13]:
# Columns to treat as categorical: the selected columns with 'UndergradMajor'
# swapped out for the derived 'Studied_Software' flag.
categorical_cols = [name for name in imp_cols if name != 'UndergradMajor'] + ['Studied_Software']

# Convert all of them to the pandas 'category' dtype in one call.
master_imp = master_imp.astype({name: 'category' for name in categorical_cols})

master_imp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 98855 entries, 0 to 98854
Data columns (total 15 columns):
Student               98855 non-null category
FormalEducation       98855 non-null category
EducationParents      98855 non-null category
Employment            98855 non-null category
CompanySize           98855 non-null category
JobSatisfaction       98855 non-null category
CareerSatisfaction    98855 non-null category
LastNewJob            98855 non-null category
HoursComputer         98855 non-null category
HoursOutside          98855 non-null category
AIDangerous           98855 non-null category
AIInteresting         98855 non-null category
AIResponsible         98855 non-null category
AIFuture              98855 non-null category
Studied_Software      98855 non-null category
dtypes: category(15)
memory usage: 1.4 MB


#### Dummy coding

In [14]:
# One-hot encode every categorical column in a single call (indicator columns
# are named '<column>_<category>') and append them to the right of the frame.
dummy_cols = pd.get_dummies(master_imp[categorical_cols])
master_imp = pd.concat([master_imp, dummy_cols], axis=1)

In [15]:
# Print the full column listing with dtypes (verbose=True prevents truncation
# of the long column list).
master_imp.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 98855 entries, 0 to 98854
Data columns (total 107 columns):
Student                              category
FormalEducation                      category
EducationParents                     category
Employment                           category
CompanySize                          category
JobSatisfaction                      category
CareerSatisfaction                   category
LastNewJob                           category
HoursComputer                        category
HoursOutside                         category
AIDangerous                          category
AIInteresting                        category
AIResponsible                        category
AIFuture                             category
Studied_Software                     category
Student_FULL-TIME                    uint8
Student_NO                           uint8
Student_PART-TIME                    uint8
FormalEducation_ASSOCIATE            uint8
FormalEducation_BACHELORS   

In [16]:
# Write the wrangled frame to disk as a pickle file and confirm on stdout.
master_imp.to_pickle('./data/master.pickle')
print('DataFrame stored in pickle file...')


DataFrame stored in pickle file...
