In [257]:
import warnings
# NOTE(review): this suppresses every warning category for the whole session,
# so any warnings raised by the pandas operations in later cells are hidden too.
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import math
from plotnine import *

from sklearn.preprocessing import StandardScaler

from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.mixture import GaussianMixture

import scipy.cluster.hierarchy as sch
from sklearn.metrics import silhouette_score
from sklearn.neighbors import NearestNeighbors

In [258]:
# Survey columns used in this analysis; only these are read from the CSV.
fields = [
    'GenderSelect',
    'Country',
    'Age',
    'EmploymentStatus',
    'CodeWriter',
    'StudentStatus',
    'CurrentJobTitleSelect',
    'LanguageRecommendationSelect',
    'LearningDataScienceTime',
    'TimeSpentStudying',
    'FormalEducation',
    'CompensationAmount',
    'JobHuntTime',
    'EmployerSearchMethod',
]

# The file is decoded as latin-1 rather than the default UTF-8.
df = pd.read_csv(
    "Datasets/multipleChoiceResponses.csv",
    usecols=fields,
    encoding='latin-1',
)

df.head()

Unnamed: 0,GenderSelect,Country,Age,EmploymentStatus,StudentStatus,CodeWriter,CurrentJobTitleSelect,LanguageRecommendationSelect,LearningDataScienceTime,TimeSpentStudying,FormalEducation,EmployerSearchMethod,CompensationAmount,JobHuntTime
0,"Non-binary, genderqueer, or gender non-conforming",,,Employed full-time,,Yes,DBA/Database Engineer,F#,,,Bachelor's degree,I visited the company's Web site and found a j...,,
1,Female,United States,30.0,"Not employed, but looking for work",,,,Python,1-2 years,2 - 10 hours,Master's degree,,,
2,Male,Canada,28.0,"Not employed, but looking for work",,,,R,1-2 years,2 - 10 hours,Master's degree,,,1-2
3,Male,United States,56.0,"Independent contractor, freelancer, or self-em...",,Yes,Operations Research Practitioner,Python,,,Master's degree,,250000.0,
4,Male,Taiwan,38.0,Employed full-time,,Yes,Computer Scientist,Python,,,Doctoral degree,A tech-specific job board,,


In [259]:
# Size check of the freshly loaded frame: (number of rows, number of columns).
df.shape

(16716, 14)

In [260]:
# Data Cleanup
#
# Restricts the respondents to Male/Female respondents from the United States,
# derives 0/1 indicator columns, and keeps only rows that either hold a
# data/software-related job title or write code.
#
# NOTE: this cell rewrites StudentStatus and CodeWriter in place, so it must be
# run exactly once on the freshly loaded `df`; running it a second time would
# turn both columns into all zeros.

# Simplify data in GenderSelect and keep only United States respondents.
# .copy() gives an independent frame so the column assignments below do not
# write into a view of the original data.
df = df[df['GenderSelect'].isin(['Male', 'Female'])]
df = df[df['Country'] == 'United States'].copy()

# Simplify EmploymentStatus, if employed: 1, else: 0.
# na=False makes a missing EmploymentStatus count as "not employed" instead of
# raising a TypeError on the NaN value.
df['isEmployed'] = (
    df['EmploymentStatus']
    .str.contains('full-time|freelancer', na=False)
    .astype(int)
)

# StudentStatus: 1 when the answer contains 'Yes', 0 otherwise (including nulls)
df['StudentStatus'] = (
    df['StudentStatus']
    .astype(str)
    .str.contains('Yes', regex=False)
    .astype(int)
)

# CodeWriter: 1 when the answer is exactly 'Yes', 0 otherwise (including nulls)
df['CodeWriter'] = (df['CodeWriter'] == 'Yes').astype(int)

# Only show those who have a job in analyzing data, software or programming.
# jobTypes holds keyword fragments, so they are matched as substrings of the
# job title (an exact isin() match would never match a fragment such as 'Data'
# against a full title). The fragments contain no regex metacharacters, so
# they can be joined directly into an alternation pattern.
jobTypes = ['Data', 'Software', 'Computer', 'Database', 'Business Analyst', 'Machine Learning', 'Programmer']
has_relevant_title = df['CurrentJobTitleSelect'].str.contains('|'.join(jobTypes), na=False)

# Parentheses are required: `|` binds more tightly than `==` in Python.
df = df[has_relevant_title | (df['CodeWriter'] == 1)].copy()

In [262]:
# Integer-encode the categorical columns. pd.factorize returns (codes, uniques);
# missing values receive the sentinel code -1 and do not appear in uniques.
# Each column gets its own label table so that codes can be mapped back to the
# original text; a single shared name would be overwritten by the second call.

# Factorize FormalEducation
df['FormalEducation'], education_uniques = pd.factorize(df['FormalEducation'])

# Factorize EmployerSearchMethod
df['EmployerSearchMethod'], search_method_uniques = pd.factorize(df['EmployerSearchMethod'])

# Kept for backward compatibility: `uniques` holds the EmployerSearchMethod labels.
uniques = search_method_uniques

In [263]:
# Preview the first 30 rows of the current `df` to inspect the derived columns.
df.head(30)

Unnamed: 0,GenderSelect,Country,Age,EmploymentStatus,StudentStatus,CodeWriter,CurrentJobTitleSelect,LanguageRecommendationSelect,LearningDataScienceTime,TimeSpentStudying,FormalEducation,EmployerSearchMethod,CompensationAmount,JobHuntTime,isEmployed
3,Male,United States,56.0,"Independent contractor, freelancer, or self-em...",0,1,Operations Research Practitioner,Python,,,0,,250000.0,,1
6,Male,United States,35.0,Employed full-time,0,1,Computer Scientist,R,,,0,A career fair or on-campus recruiting event,,,1
15,Male,United States,58.0,"Independent contractor, freelancer, or self-em...",0,1,DBA/Database Engineer,R,,,0,,120000.0,,1
21,Male,United States,25.0,Employed part-time,0,1,Researcher,Python,,,1,Some other way,20000.0,,0
22,Male,United States,33.0,Employed full-time,0,1,Scientist/Researcher,Matlab,,,2,"A friend, family member, or former colleague t...",100000.0,,1
24,Male,United States,,Employed full-time,0,1,Software Developer/Software Engineer,Matlab,,,0,An external recruiter or headhunter,,,1
34,Male,United States,35.0,Employed full-time,0,1,Engineer,Python,,,2,"A friend, family member, or former colleague t...",133000.0,,1
39,Male,United States,43.0,Retired,0,1,Software Developer/Software Engineer,Python,,,2,,,,0
40,Male,United States,33.0,Employed full-time,0,1,Operations Research Practitioner,,,,2,I was contacted directly by someone at the com...,,,1
41,Male,United States,30.0,Employed full-time,0,1,Data Scientist,Python,,,0,An external recruiter or headhunter,,,1
