In [1]:
# set up jupyter
from IPython.display import display, Markdown # pretty display
from IPython.core.interactiveshell import InteractiveShell # multi output

# Echo the value of every top-level expression in a cell, not only the last
# one. The bare `Markdown(...)` headings placed mid-cell further down are
# rendered only because of this setting.
InteractiveShell.ast_node_interactivity = 'all' # type: ignore

In [2]:
# import packages
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

In [3]:
# read raw data file
file_name = "./dataset/raw.csv"
data = pd.read_csv(file_name)
# Strip surrounding whitespace from the header names so that the exact-name
# column lookups in the following cells cannot miss because of padding.
data.columns = data.columns.str.strip()
Markdown("# Raw data")
# DataFrame.info() prints its summary itself and returns None; wrapping it in
# display() only added a spurious "None" line underneath the summary.
data.info()

# Raw data

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 35 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       1470 non-null   int64 
 1   Attrition                 1470 non-null   object
 2   BusinessTravel            1470 non-null   object
 3   DailyRate                 1470 non-null   int64 
 4   Department                1470 non-null   object
 5   DistanceFromHome          1470 non-null   int64 
 6   Education                 1470 non-null   int64 
 7   EducationField            1470 non-null   object
 8   EmployeeCount             1470 non-null   int64 
 9   EmployeeNumber            1470 non-null   int64 
 10  EnvironmentSatisfaction   1470 non-null   int64 
 11  Gender                    1470 non-null   object
 12  HourlyRate                1470 non-null   int64 
 13  JobInvolvement            1470 non-null   int64 
 14  JobLevel                

None

In [4]:
# binary encoding: "Yes" becomes 1, every other value (missing included) becomes 0
data["Attrition"] = data["Attrition"].eq("Yes").astype("int64")
data["OverTime"] = data["OverTime"].eq("Yes").astype("int64")

# one hot encoding: swap the single JobLevel column for one 0/1 indicator
# column per level, appended after the remaining columns
experience = pd.get_dummies(data["JobLevel"], prefix="JobLevel", dtype="int")
data = pd.concat([data.drop(columns="JobLevel"), experience], axis=1)
display(data)

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,JobLevel_1,JobLevel_2,JobLevel_3,JobLevel_4,JobLevel_5
0,41,1,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,6,4,0,5,0,1,0,0,0
1,49,0,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,3,10,7,1,7,0,1,0,0,0
2,37,1,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,3,0,0,0,0,1,0,0,0,0
3,33,0,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,8,7,3,0,1,0,0,0,0
4,27,0,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,3,2,2,2,2,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,36,0,Travel_Frequently,884,Research & Development,23,2,Medical,1,2061,...,3,5,2,0,3,0,1,0,0,0
1466,39,0,Travel_Rarely,613,Research & Development,6,1,Medical,1,2062,...,3,7,7,1,7,0,0,1,0,0
1467,27,0,Travel_Rarely,155,Research & Development,4,3,Life Sciences,1,2064,...,3,6,2,0,3,0,1,0,0,0
1468,49,0,Travel_Frequently,1023,Sales,2,3,Medical,1,2065,...,2,9,6,0,8,0,1,0,0,0


In [5]:
# select columns of interest: the model inputs first, the Attrition target last
data = data.loc[
    :,
    [
        "Age", "MonthlyIncome", "OverTime", "DistanceFromHome",
        "JobLevel_1", "JobLevel_2", "JobLevel_3", "JobLevel_4", "JobLevel_5",
        "Attrition",
    ],
]

Markdown("# Feature selection")
display(data.columns)

# Feature selection

Index(['Age', 'MonthlyIncome', 'OverTime', 'DistanceFromHome', 'JobLevel_1',
       'JobLevel_2', 'JobLevel_3', 'JobLevel_4', 'JobLevel_5', 'Attrition'],
      dtype='object')

In [6]:
# normalize values: rescale every column to the [0, 1] range
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_values = scaler.fit_transform(data)
# fit_transform returns a plain array, so re-attach the column labels
data = pd.DataFrame(scaled_values, columns=data.columns)
Markdown("# Data normalization")
display(data)

# Data normalization

Unnamed: 0,Age,MonthlyIncome,OverTime,DistanceFromHome,JobLevel_1,JobLevel_2,JobLevel_3,JobLevel_4,JobLevel_5,Attrition
0,0.547619,0.262454,1.0,0.000000,0.0,1.0,0.0,0.0,0.0,1.0
1,0.738095,0.217009,0.0,0.250000,0.0,1.0,0.0,0.0,0.0,0.0
2,0.452381,0.056925,1.0,0.035714,1.0,0.0,0.0,0.0,0.0,1.0
3,0.357143,0.100053,1.0,0.071429,1.0,0.0,0.0,0.0,0.0,0.0
4,0.214286,0.129489,0.0,0.035714,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
1465,0.428571,0.082254,0.0,0.785714,0.0,1.0,0.0,0.0,0.0,0.0
1466,0.500000,0.472986,0.0,0.178571,0.0,0.0,1.0,0.0,0.0,0.0
1467,0.214286,0.270300,1.0,0.107143,0.0,1.0,0.0,0.0,0.0,0.0
1468,0.738095,0.230700,0.0,0.035714,0.0,1.0,0.0,0.0,0.0,0.0


In [7]:
# save to file: write the preprocessed frame as CSV, leaving out the row index
file_name = "./dataset/preprocessed.csv"
data.to_csv(path_or_buf=file_name, index=False)
# report where the file was written
Markdown(f"Saved to file `{file_name}`")

Saved to file `./dataset/preprocessed.csv`