Outlier-removal function (IQR method)

In [None]:
import numpy as np

def remove_outliers_iqr(data, factor=1.5):
    """Return ``data`` without the values that fall outside the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, where Q1
    and Q3 are the 25th and 75th percentiles of ``data`` and ``IQR = Q3 - Q1``.
    Values exactly on a fence are kept.

    Parameters
    ----------
    data : array-like
        A NumPy array, a pandas Series, or a plain list/tuple of numbers.
        Lists and tuples are converted to a NumPy array first because they
        cannot be indexed with a boolean mask.
    factor : float, default 1.5
        Multiplier applied to the IQR when computing the fences.

    Returns
    -------
    array-like
        The elements of ``data`` that lie within the fences. List/tuple input
        is returned as a NumPy array; other inputs are filtered with their own
        boolean-mask indexing.
    """
    if isinstance(data, (list, tuple)):
        data = np.asarray(data)

    # Nothing to filter, and percentiles of an empty input are not meaningful.
    if np.size(data) == 0:
        return data

    # First and third quartiles, and the interquartile range between them.
    q1, q3 = np.percentile(data, [25, 75])
    iqr = q3 - q1

    # Fences beyond which a value counts as an outlier.
    lower_bound = q1 - factor * iqr
    upper_bound = q3 + factor * iqr

    # Boolean mask selecting the non-outlier values.
    mask = (data >= lower_bound) & (data <= upper_bound)

    return data[mask]


Load Job Dataset

In [1]:
import pandas as pd
# Read the job dataset, using the first CSV column as the row index.
job = pd.read_csv("job.csv", index_col=0)
# Preview the first 10 rows.
job.head(10)

Unnamed: 0,Age,Accessibility,EdLevel,Employment,Gender,MentalHealth,MainBranch,YearsCode,YearsCodePro,Country,PreviousSalary,HaveWorkedWith,ComputerSkills,Employed
0,<35,No,Master,1,Man,No,Dev,7,4,Sweden,51552.0,C++;Python;Git;PostgreSQL,4,0
1,<35,No,Undergraduate,1,Man,No,Dev,12,5,Spain,46482.0,Bash/Shell;HTML/CSS;JavaScript;Node.js;SQL;Typ...,12,1
2,<35,No,Master,1,Man,No,Dev,15,6,Germany,77290.0,C;C++;Java;Perl;Ruby;Git;Ruby on Rails,7,0
3,<35,No,Undergraduate,1,Man,No,Dev,9,6,Canada,46135.0,Bash/Shell;HTML/CSS;JavaScript;PHP;Ruby;SQL;Gi...,13,0
4,>35,No,PhD,0,Man,No,NotDev,40,30,Singapore,160932.0,C++;Python,2,0
5,<35,No,Master,1,Man,No,Dev,9,2,France,38915.0,JavaScript;Python;Docker;Git;MySQL,5,0
6,>35,No,Master,1,Man,No,Dev,26,18,Germany,77831.0,C++;HTML/CSS;Java;JavaScript;Kotlin;Node.js;Ty...,17,1
7,<35,No,Master,1,Man,No,NotDev,14,5,Switzerland,81319.0,C++;Python;Docker;Git,4,0
8,>35,No,Undergraduate,1,Man,No,Dev,39,21,United Kingdom of Great Britain and Northern I...,68507.0,Python;Git;PostgreSQL,3,0
9,>35,No,Master,1,Man,No,Dev,20,16,Russian Federation,37752.0,Delphi;Java;SQL;Docker;Git;PostgreSQL,6,0


Drop the columns that are not used for training

In [2]:
# Columns that are not used as training features.
columns_to_drop = ["Employment", "Gender", "MainBranch", "Country", "PreviousSalary", "HaveWorkedWith"]

# Rebind instead of mutating in place; `columns=` already implies axis=1.
# errors="ignore" keeps the cell re-runnable once the columns are gone.
job = job.drop(columns=columns_to_drop, errors="ignore")
job.head(10)

Unnamed: 0,Age,Accessibility,EdLevel,MentalHealth,YearsCode,YearsCodePro,ComputerSkills,Employed
0,<35,No,Master,No,7,4,4,0
1,<35,No,Undergraduate,No,12,5,12,1
2,<35,No,Master,No,15,6,7,0
3,<35,No,Undergraduate,No,9,6,13,0
4,>35,No,PhD,No,40,30,2,0
5,<35,No,Master,No,9,2,5,0
6,>35,No,Master,No,26,18,17,1
7,<35,No,Master,No,14,5,4,0
8,>35,No,Undergraduate,No,39,21,3,0
9,>35,No,Master,No,20,16,6,0


Separate the features (X) from the label (y)

In [3]:
# Select the feature columns directly so each column keeps its own dtype.
# Building a frame from a list of Series and transposing it turns every
# column, including the numeric ones, into `object`.
feature_columns = [
    "Age", "Accessibility", "EdLevel", "MentalHealth",
    "YearsCode", "YearsCodePro", "ComputerSkills",
]
# .copy() so later edits to the features never write back into `job`.
df_feature_x = job[feature_columns].copy()
df_feature_x.head(10)

Unnamed: 0,Age,Accessibility,EdLevel,MentalHealth,YearsCode,YearsCodePro,ComputerSkills
0,<35,No,Master,No,7,4,4
1,<35,No,Undergraduate,No,12,5,12
2,<35,No,Master,No,15,6,7
3,<35,No,Undergraduate,No,9,6,13
4,>35,No,PhD,No,40,30,2
5,<35,No,Master,No,9,2,5
6,>35,No,Master,No,26,18,17
7,<35,No,Master,No,14,5,4
8,>35,No,Undergraduate,No,39,21,3
9,>35,No,Master,No,20,16,6


In [4]:
# Single-column label frame holding the "Employed" target.
df_feature_y = pd.DataFrame({"Employed": job["Employed"]})
df_feature_y.head(10)

Unnamed: 0,Employed
0,0
1,1
2,0
3,0
4,0
5,0
6,1
7,0
8,0
9,0


One-hot encode the categorical columns

In [5]:
# Categorical features that are expanded into indicator columns.
categorical_columns = ["Age", "Accessibility", "EdLevel", "MentalHealth"]

# One indicator frame per categorical column, prefixed with the column name.
df_onehot_age, df_onehot_access, df_onehot_edlevel, df_onehot_mental = (
    pd.get_dummies(df_feature_x[name], prefix=name) for name in categorical_columns
)

# Indicator columns come first, followed by the remaining features with the
# raw categorical columns removed.
df_feature_x = pd.concat(
    [
        df_onehot_age,
        df_onehot_access,
        df_onehot_edlevel,
        df_onehot_mental,
        df_feature_x.drop(columns=categorical_columns),
    ],
    axis=1,
)
df_feature_x.head(10)

Unnamed: 0,Age_<35,Age_>35,Accessibility_No,Accessibility_Yes,EdLevel_Master,EdLevel_NoHigherEd,EdLevel_Other,EdLevel_PhD,EdLevel_Undergraduate,MentalHealth_No,MentalHealth_Yes,YearsCode,YearsCodePro,ComputerSkills
0,1,0,1,0,1,0,0,0,0,1,0,7,4,4
1,1,0,1,0,0,0,0,0,1,1,0,12,5,12
2,1,0,1,0,1,0,0,0,0,1,0,15,6,7
3,1,0,1,0,0,0,0,0,1,1,0,9,6,13
4,0,1,1,0,0,0,0,1,0,1,0,40,30,2
5,1,0,1,0,1,0,0,0,0,1,0,9,2,5
6,0,1,1,0,1,0,0,0,0,1,0,26,18,17
7,1,0,1,0,1,0,0,0,0,1,0,14,5,4
8,0,1,1,0,0,0,0,0,1,1,0,39,21,3
9,0,1,1,0,1,0,0,0,0,1,0,20,16,6


Convert the one-hot columns to float (True becomes 1.0, False becomes 0.0) so they can be used with TensorFlow

In [6]:
# Indicator columns produced by the one-hot encoding step.
columns_to_float = [
    "Age_<35", "Age_>35",
    "Accessibility_No", "Accessibility_Yes",
    "EdLevel_Master", "EdLevel_NoHigherEd", "EdLevel_Other", "EdLevel_PhD", "EdLevel_Undergraduate",
    "MentalHealth_No", "MentalHealth_Yes",
]

# Cast every indicator column to float in one call; other columns are untouched.
df_feature_x = df_feature_x.astype(dict.fromkeys(columns_to_float, float))

df_feature_x.head(10)

Unnamed: 0,Age_<35,Age_>35,Accessibility_No,Accessibility_Yes,EdLevel_Master,EdLevel_NoHigherEd,EdLevel_Other,EdLevel_PhD,EdLevel_Undergraduate,MentalHealth_No,MentalHealth_Yes,YearsCode,YearsCodePro,ComputerSkills
0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,7,4,4
1,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,12,5,12
2,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,15,6,7
3,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,9,6,13
4,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,40,30,2
5,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,9,2,5
6,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,26,18,17
7,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,14,5,4
8,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,39,21,3
9,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,20,16,6


In [7]:
# Cast the label column to float, matching the float-typed indicator features.
df_feature_y = df_feature_y.astype({"Employed": float})
df_feature_y.head(10)

Unnamed: 0,Employed
0,0.0
1,1.0
2,0.0
3,0.0
4,0.0
5,0.0
6,1.0
7,0.0
8,0.0
9,0.0


In [8]:
# Write the preprocessed feature frame to preprocess.csv (row index included).
df_feature_x.to_csv("preprocess.csv")

In [9]:
# Write the label frame to label.csv (row index included).
df_feature_y.to_csv("label.csv")