# 1.0 Loading Libraries

In [54]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings("ignore")

# 2.0 Loading Dataset


In [48]:
df_raw = pd.read_csv("dataset.csv")
df_raw = df_raw.rename(columns={"Attrition": "Resignation"})
df_raw.head()

Unnamed: 0,Age,Resignation,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


# 3.0 Split into train and test data

I'm splitting the dataset at this point because that way the test data will simulate the production data, that is, never seen data.

In [50]:
X = df_raw.drop('Resignation', axis = 1)
y = df_raw[['Resignation']]

In [55]:
test_size = 0.3
seed = 42

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = test_size, random_state = seed)

In [56]:
print(f'X_train: {X_train.shape}')
print(f'y_train: {y_train.shape}')
print(f'X_test: {X_test.shape}')
print(f'y_test: {y_test.shape}')

X_train: (1029, 34)
y_train: (1029, 1)
X_test: (441, 34)
y_test: (441, 1)


In [58]:
df_raw = pd.concat([X_train, y_train], axis=1)

# 4.0 Descriptive Analisys

In [35]:
dataset.shape

(1470, 35)

In [39]:
dataset.iloc[:, 0:17].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 17 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Age                      1470 non-null   int64 
 1   Attrition                1470 non-null   object
 2   BusinessTravel           1470 non-null   object
 3   DailyRate                1470 non-null   int64 
 4   Department               1470 non-null   object
 5   DistanceFromHome         1470 non-null   int64 
 6   Education                1470 non-null   int64 
 7   EducationField           1470 non-null   object
 8   EmployeeCount            1470 non-null   int64 
 9   EmployeeNumber           1470 non-null   int64 
 10  EnvironmentSatisfaction  1470 non-null   int64 
 11  Gender                   1470 non-null   object
 12  HourlyRate               1470 non-null   int64 
 13  JobInvolvement           1470 non-null   int64 
 14  JobLevel                 1470 non-null  

In [40]:
dataset.iloc[:, 17:].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 18 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   MaritalStatus             1470 non-null   object
 1   MonthlyIncome             1470 non-null   int64 
 2   MonthlyRate               1470 non-null   int64 
 3   NumCompaniesWorked        1470 non-null   int64 
 4   Over18                    1470 non-null   object
 5   OverTime                  1470 non-null   object
 6   PercentSalaryHike         1470 non-null   int64 
 7   PerformanceRating         1470 non-null   int64 
 8   RelationshipSatisfaction  1470 non-null   int64 
 9   StandardHours             1470 non-null   int64 
 10  StockOptionLevel          1470 non-null   int64 
 11  TotalWorkingYears         1470 non-null   int64 
 12  TrainingTimesLastYear     1470 non-null   int64 
 13  WorkLifeBalance           1470 non-null   int64 
 14  YearsAtCompany          

In [59]:
df_raw

Unnamed: 0,Age,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,...,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,Resignation
714,50,Travel_Rarely,1126,Research & Development,1,2,Medical,1,997,4,...,80,1,32,1,2,5,4,1,3,No
135,36,Travel_Rarely,216,Research & Development,6,2,Medical,1,178,2,...,80,2,7,0,3,3,2,0,1,No
1271,21,Travel_Rarely,337,Sales,7,1,Marketing,1,1780,2,...,80,0,1,3,3,1,0,1,0,Yes
477,50,Travel_Frequently,1246,Human Resources,3,3,Medical,1,644,1,...,80,1,32,2,3,32,5,10,7,No
806,52,Travel_Rarely,994,Research & Development,7,4,Life Sciences,1,1118,2,...,80,0,18,4,3,8,6,4,0,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1130,35,Travel_Rarely,750,Research & Development,28,3,Life Sciences,1,1596,2,...,80,2,10,3,2,10,9,6,8,No
1294,41,Travel_Rarely,447,Research & Development,5,3,Life Sciences,1,1814,2,...,80,0,11,3,1,3,2,1,2,No
860,22,Travel_Frequently,1256,Research & Development,3,4,Life Sciences,1,1203,3,...,80,1,1,5,3,0,0,0,0,Yes
1459,29,Travel_Rarely,1378,Research & Development,13,2,Other,1,2053,4,...,80,1,10,2,3,4,3,0,3,No
