## Data wrangling - 'master' dataset

In [84]:
# import numpy as np
import pandas as pd
from pandas import DataFrame
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor  
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB

import warnings
warnings.filterwarnings("ignore")

#### Read dataset from file

In [85]:
# Load the raw survey export and print its (rows, columns) shape.
survey_csv_path = './data/survey_results_public-2018.csv'
master: DataFrame = pd.read_csv(survey_csv_path)
print(master.shape)

(98855, 129)


#### Define dictionaries that map verbose survey answers to concise labels

In [86]:
# Lookup tables that shorten verbose survey answers to compact labels.
# Keys have to match the raw answer text exactly, including the typographic
# apostrophes used in the education answers.

# Answers of the 'Student' column.
student_dict = {
    'No': 'NO',
    'Yes, full-time': 'FULL-TIME',
    'Yes, part-time': 'PART-TIME',
}

# Answers of the 'Employment' column.
employment_dict = {
    'Employed full-time': 'FULL-TIME',
    'Independent contractor, freelancer, or self-employed': 'CONTRACTOR',
    'Not employed, but looking for work': 'LOOKING',
    'Employed part-time': 'PART-TIME',
    'Not employed, and not looking for work': 'NOT-LOOKING',
    'Retired': 'RETIRED',
}

# Shared by 'FormalEducation' and 'EducationParents'; the two "never
# completed" phrasings ("I ..." / "They ...") collapse to the same label.
education_dict = {
    'Bachelor’s degree (BA, BS, B.Eng., etc.)': 'BACHELORS',
    'Master’s degree (MA, MS, M.Eng., MBA, etc.)': 'MASTERS',
    'Some college/university study without earning a degree': 'NO-DEGREE',
    'Secondary school (e.g. American high school, German Realschule or Gymnasium, etc.)': 'SECONDARY',
    'Associate degree': 'ASSOCIATE',
    'Other doctoral degree (Ph.D, Ed.D., etc.)': 'DOCTORAL',
    'Primary/elementary school': 'PRIMARY',
    'Professional degree (JD, MD, etc.)': 'PROFESSIONAL',
    'I never completed any formal education': 'NONE',
    'They never completed any formal education': 'NONE',
}

# Answers of the 'CompanySize' column, as head-count ranges.
company_size_dict = {
    'Fewer than 10 employees': '<10',
    '10 to 19 employees': '10-19',
    '20 to 99 employees': '20-99',
    '100 to 499 employees': '100-499',
    '500 to 999 employees': '500-999',
    '1,000 to 4,999 employees': '1000-4999',
    '5,000 to 9,999 employees': '5000-9999',
    '10,000 or more employees': '>10000',
}

# Shared by 'JobSatisfaction' and 'CareerSatisfaction': the seven answers,
# listed from least to most satisfied, are numbered '1'..'7' (as strings).
job_satisfaction_dict = {
    answer: str(rank)
    for rank, answer in enumerate(
        [
            'Extremely dissatisfied',
            'Moderately dissatisfied',
            'Slightly dissatisfied',
            'Neither satisfied nor dissatisfied',
            'Slightly satisfied',
            'Moderately satisfied',
            'Extremely satisfied',
        ],
        start=1,
    )
}

# Answers of the 'LastNewJob' column, in years.
last_new_job_dict = {
    "I've never had a job": '0',
    'Less than a year ago': '<1',
    'Between 1 and 2 years ago': '1-2',
    'Between 2 and 4 years ago': '2-4',
    'More than 4 years ago': '>4',
}

# Answers of the 'HoursComputer' column, in hours.
hours_computer_dict = {
    'Less than 1 hour': '<1',
    '1 - 4 hours': '1-4',
    '5 - 8 hours': '5-8',
    '9 - 12 hours': '9-12',
    'Over 12 hours': '>12',
}

# Answers of the 'HoursOutside' column, in hours.
hours_outside_dict = {
    'Less than 30 minutes': '<0.5',
    '30 - 59 minutes': '0.5-1',
    '1 - 2 hours': '1-2',
    '3 - 4 hours': '3-4',
    'Over 4 hours': '>4',
}

# Shared by 'AIDangerous' and 'AIInteresting'.
ai_dangerous_dict = {
    'Algorithms making important decisions': 'IMP-DECISIONS',
    'Artificial intelligence surpassing human intelligence ("the singularity")': 'SINGULARITY',
    'Evolving definitions of "fairness" in algorithmic versus human decisions': 'FAIRNESS-DEFINITION',
    'Increasing automation of jobs': 'AUTOMATION',
}

# Answers of the 'AIResponsible' column.
ai_responsible_dict = {
    'The developers or the people creating the AI': 'DEVELOPERS',
    'A governmental or other regulatory body': 'GOVT',
    'Prominent industry leaders': 'LEADERS',
    'Nobody': 'NOBODY',
}

# Answers of the 'AIFuture' column.
ai_future_dict = {
    "I'm excited about the possibilities more than worried about the dangers.": 'EXCITED',
    "I'm worried about the dangers more than I'm excited about the possibilities.": 'WORRIED',
    "I don't care about it, or I haven't thought about it.": 'NO-COMMENTS',
}

# Answers of the 'Exercise' column; each label is the upper bound of the
# stated weekly frequency, kept as a string.
exercise_dict = {
    '1 - 2 times per week': '2',
    '3 - 4 times per week': '4',
    'Daily or almost every day': '7',
    "I don't typically exercise": '0',
}

#### Define a function to identify whether the undergraduate major was computing-related (its name mentions "computer", "software" or "information")

In [87]:
# Lower-case keywords that mark an undergraduate major as computing-related.
courses = ['computer', 'software', 'information']
def has_software(_undergrad: str) -> bool:
    """Tell whether an undergraduate-major answer mentions a computing keyword.

    Parameters
    ----------
    _undergrad : str
        Free-text major description. Non-string inputs (``None``, or the
        float ``NaN`` pandas uses for missing values) are accepted and
        treated as "no match".

    Returns
    -------
    bool
        ``True`` if any entry of ``courses`` occurs, case-insensitively, as a
        substring of ``_undergrad``; ``False`` otherwise.
    """
    # Missing values arrive from pandas as float NaN rather than None, and
    # calling .lower() on them would raise AttributeError, so reject every
    # non-string input up front.
    if not isinstance(_undergrad, str):
        return False
    # Lower-case once instead of once per keyword.
    lowered = _undergrad.lower()
    return any(keyword in lowered for keyword in courses)

#### Extract the important columns into a separate dataset

In [88]:
# Survey columns kept for the analysis; 'JobSatisfaction' is later used as
# the prediction target and 'ConvertedSalary' is the only numeric column.
imp_cols = ['Student', 'FormalEducation', 'EducationParents', 'UndergradMajor',
        'Employment', 'CompanySize', 'JobSatisfaction', 'CareerSatisfaction', 'LastNewJob',
        'HoursComputer', 'HoursOutside',
        'AIDangerous', 'AIInteresting', 'AIResponsible', 'AIFuture','Exercise','ConvertedSalary']
# Take an explicit copy: the frame is modified in the following cells, and an
# independent copy keeps those modifications from being treated by pandas as
# writes to a selection of `master` (chained-assignment warnings).
master_imp: DataFrame = master[imp_cols].copy()

In [89]:
# Show per-column non-null counts and dtypes of the selected columns.
master_imp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 98855 entries, 0 to 98854
Data columns (total 17 columns):
Student               94901 non-null object
FormalEducation       94703 non-null object
EducationParents      61813 non-null object
UndergradMajor        79036 non-null object
Employment            95321 non-null object
CompanySize           71531 non-null object
JobSatisfaction       69276 non-null object
CareerSatisfaction    76504 non-null object
LastNewJob            78889 non-null object
HoursComputer         72133 non-null object
HoursOutside          72024 non-null object
AIDangerous           63115 non-null object
AIInteresting         65367 non-null object
AIResponsible         65553 non-null object
AIFuture              69728 non-null object
Exercise              72108 non-null object
ConvertedSalary       47702 non-null float64
dtypes: float64(1), object(16)
memory usage: 6.8+ MB


In [90]:
# Count the missing values in every column, largest count first.
missing_per_column = master_imp.isna().sum()
null_counts = missing_per_column.sort_values(ascending=False)
null_counts

ConvertedSalary       51153
EducationParents      37042
AIDangerous           35740
AIInteresting         33488
AIResponsible         33302
JobSatisfaction       29579
AIFuture              29127
CompanySize           27324
HoursOutside          26831
Exercise              26747
HoursComputer         26722
CareerSatisfaction    22351
LastNewJob            19966
UndergradMajor        19819
FormalEducation        4152
Student                3954
Employment             3534
dtype: int64

#### Handle missing values
- if the column is numeric, impute with the mean of that column.
- if the column is non-numeric and has more than 10,000 nulls, impute with a new placeholder value `'x'`.
- otherwise (non-numeric with at most 10,000 nulls), impute with the mode of that column.

In [91]:
# Non-numeric columns with more nulls than this get the placeholder 'x'
# instead of the column mode.
NULL_PLACEHOLDER_THRESHOLD = 10000

# 'number' selects every numeric dtype regardless of bit width or platform.
num_cols = master_imp.select_dtypes(include='number').columns

# Decide one fill value per column:
#   numeric                         -> column mean
#   non-numeric, nulls > threshold  -> placeholder 'x'
#   non-numeric, otherwise          -> column mode
fill_values = {}
for col in master_imp.columns:
    if col in num_cols:
        fill_values[col] = master_imp[col].mean()
    elif master_imp[col].isnull().sum() > NULL_PLACEHOLDER_THRESHOLD:
        fill_values[col] = 'x'
    else:
        fill_values[col] = master_imp[col].mode()[0]

# Fill all columns in a single call and rebind the result. Calling
# `master_imp[col].fillna(..., inplace=True)` operates on a temporary Series:
# it raises chained-assignment warnings and leaves the frame unchanged when
# pandas Copy-on-Write is active.
master_imp = master_imp.fillna(value=fill_values)

#### Replace the verbose definitions with the concise data dictionary values

In [92]:
# Column name -> lookup table used to shorten that column's answers.
# Some tables are shared: education_dict (FormalEducation, EducationParents),
# job_satisfaction_dict (JobSatisfaction, CareerSatisfaction) and
# ai_dangerous_dict (AIDangerous, AIInteresting).
concise_labels = {
    'Student': student_dict,
    'Employment': employment_dict,
    'FormalEducation': education_dict,
    'CompanySize': company_size_dict,
    'JobSatisfaction': job_satisfaction_dict,
    'CareerSatisfaction': job_satisfaction_dict,
    'LastNewJob': last_new_job_dict,
    'HoursComputer': hours_computer_dict,
    'HoursOutside': hours_outside_dict,
    'EducationParents': education_dict,
    'AIDangerous': ai_dangerous_dict,
    'AIInteresting': ai_dangerous_dict,
    'AIResponsible': ai_responsible_dict,
    'AIFuture': ai_future_dict,
    'Exercise': exercise_dict,
}

master_imp = (
    master_imp
    # Swap the verbose answers for their concise labels, column by column.
    .replace(concise_labels)
    # Derive a flag from the free-text major, appended as the last column ...
    .assign(Studied_Software=lambda frame: frame['UndergradMajor'].map(has_software))
    # ... then drop the raw text column it was derived from.
    .drop(columns=['UndergradMajor'])
)

#### Datatype conversion

In [93]:
# Every object-typed column, plus the derived 'Studied_Software' flag, is
# treated as categorical.
categorical_cols = [
    *master_imp.select_dtypes(include=['object']).columns,
    'Studied_Software',
]

# Convert all of them to pandas' category dtype in one pass.
master_imp = master_imp.astype({name: 'category' for name in categorical_cols})

master_imp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 98855 entries, 0 to 98854
Data columns (total 17 columns):
Student               98855 non-null category
FormalEducation       98855 non-null category
EducationParents      98855 non-null category
Employment            98855 non-null category
CompanySize           98855 non-null category
JobSatisfaction       98855 non-null category
CareerSatisfaction    98855 non-null category
LastNewJob            98855 non-null category
HoursComputer         98855 non-null category
HoursOutside          98855 non-null category
AIDangerous           98855 non-null category
AIInteresting         98855 non-null category
AIResponsible         98855 non-null category
AIFuture              98855 non-null category
Exercise              98855 non-null category
ConvertedSalary       98855 non-null float64
Studied_Software      98855 non-null category
dtypes: category(16), float64(1)
memory usage: 2.3 MB


#### Dummy coding

In [94]:
# Target column(s); left as a single categorical column, not dummy-coded.
y_col = ['JobSatisfaction']
# All remaining categorical columns are expanded into indicator columns.
x_col = [name for name in categorical_cols if name not in y_col]

master_imp = pd.get_dummies(master_imp, columns=x_col)
master_imp.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 98855 entries, 0 to 98854
Data columns (total 91 columns):
JobSatisfaction                      98855 non-null category
ConvertedSalary                      98855 non-null float64
Student_FULL-TIME                    98855 non-null uint8
Student_NO                           98855 non-null uint8
Student_PART-TIME                    98855 non-null uint8
FormalEducation_ASSOCIATE            98855 non-null uint8
FormalEducation_BACHELORS            98855 non-null uint8
FormalEducation_DOCTORAL             98855 non-null uint8
FormalEducation_MASTERS              98855 non-null uint8
FormalEducation_NO-DEGREE            98855 non-null uint8
FormalEducation_NONE                 98855 non-null uint8
FormalEducation_PRIMARY              98855 non-null uint8
FormalEducation_PROFESSIONAL         98855 non-null uint8
FormalEducation_SECONDARY            98855 non-null uint8
EducationParents_ASSOCIATE           98855 non-null uint8
EducationParents

#### Store as pickle file

In [95]:
# Persist the prepared frame so the modelling cells can reload it from disk.
master_pickle_path = './data/master.pickle'
master_imp.to_pickle(master_pickle_path)
print('DataFrame stored in pickle file...')

DataFrame stored in pickle file...


In [96]:
# Reload the prepared frame from the pickle file and print its shape.
prepared_pickle_path = './data/master.pickle'
master_2: DataFrame = pd.read_pickle(prepared_pickle_path)
print(master_2.shape)

(98855, 91)


In [98]:
# Features: every column except the target.
X = master_2.drop(columns=['JobSatisfaction'])
# Target: kept as a one-column DataFrame (not a Series).
y = master_2.loc[:, ['JobSatisfaction']]

In [99]:
from sklearn import tree

# Hold out 33% of the rows for evaluation; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# Alternative classifier kept for reference (uses the `tree` import above):
# model = tree.DecisionTreeClassifier()
# model.fit(X_train, y_train)
# y_pred = model.predict(X_test)

# NOTE(review): BernoulliNB binarises every feature at its `binarize`
# threshold (0.0 by default), so the continuous 'ConvertedSalary' column is
# presumably reduced to "salary > 0" here - confirm this is intended.
model_nb = BernoulliNB()
# `y_train` is a one-column DataFrame; flatten it to the 1-D target that
# `fit` expects instead of relying on scikit-learn's implicit conversion,
# whose DataConversionWarning is hidden by the global warnings filter.
model_nb.fit(X_train, y_train.values.ravel())
y_pred = model_nb.predict(X_test)
