# Import libraries

In [9]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Reading Data

In [2]:
df = pd.read_csv('data_cleaned.csv')
df.head().T

Unnamed: 0,0,1,2,3,4
Age,31,59,24,36,56
Gender,Male,Female,Female,Female,Male
Monthly Income,5390,5534,8159,3989,4821
Work-Life Balance,Excellent,Poor,Good,Good,Fair
Job Satisfaction,Medium,High,High,High,Very High
Performance Rating,Average,Low,Low,High,Average
Number of Promotions,low,high,low,low,low
Overtime,No,No,No,No,Yes
Distance from Home,22,21,11,27,71
Education Level,Associate Degree,Master’s Degree,Bachelor’s Degree,High School,High School


In [3]:
pd.options.display.float_format = '{:.01f}'.format
df.describe()

Unnamed: 0,Age,Monthly Income,Distance from Home,Attrition
count,59275.0,59275.0,59275.0,59275.0
mean,38.5,7296.8,50.0,0.5
std,12.0,2143.2,28.5,0.5
min,18.0,1316.0,1.0,0.0
25%,28.0,5656.0,25.0,0.0
50%,39.0,7352.0,50.0,0.0
75%,49.0,8878.0,75.0,1.0
max,59.0,13713.0,99.0,1.0


In [4]:
df.columns.to_list()

['Age',
 'Gender',
 'Monthly Income',
 'Work-Life Balance',
 'Job Satisfaction',
 'Performance Rating',
 'Number of Promotions',
 'Overtime',
 'Distance from Home',
 'Education Level',
 'Marital Status',
 'Number of Dependents',
 'Job Level',
 'Remote Work',
 'Company Reputation',
 'Attrition']

In [5]:
num_cols = ['Age', 'Monthly Income', 'Distance from Home']
cat_cols = list(set(df.columns.to_list()).difference(set(num_cols)). difference(set(['Attrition'])))

print('Numerical Columns:', num_cols)
print('Categorical Columns:', cat_cols)

Numerical Columns: ['Age', 'Monthly Income', 'Distance from Home']
Categorical Columns: ['Work-Life Balance', 'Overtime', 'Company Reputation', 'Remote Work', 'Education Level', 'Number of Promotions', 'Job Level', 'Job Satisfaction', 'Marital Status', 'Number of Dependents', 'Performance Rating', 'Gender']


# Data Splitting

In [6]:
pd.options.display.float_format = '{:.3f}'.format
df['Attrition'].value_counts(normalize=True)

Attrition
0   0.524
1   0.476
Name: proportion, dtype: float64

In [7]:
X = df.drop('Attrition', axis=1)
y = df['Attrition']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(X_train.shape, X_test.shape)

(47420, 15) (11855, 15)


# Preprocessing

In [14]:
scaler = StandardScaler()
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore', drop='first')

In [16]:
prep = ColumnTransformer([('num', scaler, num_cols),
                          ('cat', ohe, cat_cols)
                         ]).set_output(transform='pandas')

In [18]:
X_train_prep = prep.fit_transform(X_train)
X_test_prep = prep.transform(X_test)

In [19]:
X_train_prep

Unnamed: 0,num__Age,num__Monthly Income,num__Distance from Home,cat__Work-Life Balance_Fair,cat__Work-Life Balance_Good,cat__Work-Life Balance_Poor,cat__Overtime_Yes,cat__Company Reputation_Fair,cat__Company Reputation_Good,cat__Company Reputation_Poor,...,cat__Job Satisfaction_Low,cat__Job Satisfaction_Medium,cat__Job Satisfaction_Very High,cat__Marital Status_Married,cat__Marital Status_Single,cat__Number of Dependents_low,cat__Performance Rating_Below Average,cat__Performance Rating_High,cat__Performance Rating_Low,cat__Gender_Male
38442,-1.288,2.294,-0.809,0.000,1.000,0.000,0.000,0.000,1.000,0.000,...,0.000,0.000,0.000,1.000,0.000,1.000,0.000,0.000,0.000,0.000
41479,-0.540,-0.442,0.420,0.000,1.000,0.000,1.000,0.000,1.000,0.000,...,0.000,0.000,1.000,1.000,0.000,1.000,1.000,0.000,0.000,1.000
26595,0.374,0.975,-0.282,1.000,0.000,0.000,0.000,0.000,1.000,0.000,...,0.000,0.000,1.000,1.000,0.000,1.000,0.000,0.000,0.000,1.000
26161,-1.205,0.965,0.279,0.000,0.000,1.000,0.000,0.000,0.000,1.000,...,0.000,0.000,1.000,0.000,1.000,1.000,0.000,1.000,0.000,1.000
7544,-1.454,-0.531,-0.914,1.000,0.000,0.000,0.000,1.000,0.000,0.000,...,0.000,0.000,1.000,0.000,1.000,1.000,0.000,0.000,0.000,1.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54343,0.374,0.909,-1.019,0.000,1.000,0.000,0.000,0.000,0.000,1.000,...,0.000,1.000,0.000,1.000,0.000,1.000,0.000,1.000,0.000,0.000
38158,0.291,-1.080,-0.212,0.000,0.000,1.000,1.000,0.000,1.000,0.000,...,0.000,0.000,0.000,0.000,1.000,1.000,0.000,0.000,0.000,1.000
860,-1.537,0.593,0.490,1.000,0.000,0.000,0.000,0.000,0.000,0.000,...,0.000,1.000,0.000,0.000,1.000,1.000,0.000,0.000,0.000,1.000
15795,-1.039,-1.391,-1.160,0.000,0.000,1.000,0.000,0.000,0.000,0.000,...,0.000,0.000,1.000,1.000,0.000,1.000,0.000,1.000,0.000,1.000


In [21]:
train_data = pd.concat([X_train_prep, y_train], axis=1)
train_data.to_csv('train_data_prep.csv', index=False)

test_data = pd.concat([X_test_prep, y_test], axis=1)
test_data.to_csv('test_data_prep.csv', index=False)


# Save Preprocessor

In [22]:
import pickle

pickle.dump(prep, open('preprocessor.pkl', 'wb'))