# Data Split and Sampling Methods

## Data Split

In [3]:
# Import packages
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [4]:
# Load the normalized dataset.
# Forward slashes are used in the path because they resolve on Windows, Linux
# and macOS alike; a backslash-separated path only works on Windows.
df_normalized = pd.read_csv('../data/interim/normalized_data.csv')

# Move the target column to the last position: pop() removes it from the frame
# and returns it, insert(location, column_name, column_value) puts it back at
# the end.
column_to_move = df_normalized.pop("Attrition")
df_normalized.insert(len(df_normalized.columns), "Attrition", column_to_move)
print(df_normalized.shape)

(1470, 56)


In [5]:
# Rank every feature by the absolute value of its correlation with 'Attrition'
# (pandas' default correlation method), strongest first.
# NOTE(review): the correlations are computed on every row of df_normalized; if
# part of this frame is later held out as a test set, the ranking has already
# seen those rows - consider ranking on the training partition only.
attrition_corr = df_normalized.corr()["Attrition"]
feature_corr_strength = attrition_corr.drop("Attrition").abs()
correlation_with_attrition = feature_corr_strength.sort_values(ascending=False)

# Display the sorted correlations
correlation_with_attrition

OverTime                             0.246118
MaritalStatus_Single                 0.175419
TotalWorkingYears                    0.171063
JobLevel                             0.169105
DepartmentSalaryRatio                0.165432
YearsInCurrentRole                   0.160545
MonthlyIncome                        0.159840
Age                                  0.159205
EducationReturnRatio                 0.157245
JobRole_Sales Representative         0.157234
YearsWithCurrManager                 0.156199
StockOptionLevel                     0.137145
YearsAtCompany                       0.134392
JobInvolvement                       0.130016
BusinessTravel_Travel_Frequently     0.115143
JobSatisfaction                      0.103481
EnvironmentSatisfaction              0.103369
JobRole_Laboratory Technician        0.098290
MaritalStatus_Married                0.090984
JobRole_Research Director            0.088870
MaritalStatus_Divorced               0.087716
Department_Research & Development 

In [6]:
# Keep the names of the 30 features with the largest absolute correlation
strongest_correlations = correlation_with_attrition.nlargest(30)
top_30_features = list(strongest_correlations.index)

top_30_features

['OverTime',
 'MaritalStatus_Single',
 'TotalWorkingYears',
 'JobLevel',
 'DepartmentSalaryRatio',
 'YearsInCurrentRole',
 'MonthlyIncome',
 'Age',
 'EducationReturnRatio',
 'JobRole_Sales Representative',
 'YearsWithCurrManager',
 'StockOptionLevel',
 'YearsAtCompany',
 'JobInvolvement',
 'BusinessTravel_Travel_Frequently',
 'JobSatisfaction',
 'EnvironmentSatisfaction',
 'JobRole_Laboratory Technician',
 'MaritalStatus_Married',
 'JobRole_Research Director',
 'MaritalStatus_Divorced',
 'Department_Research & Development',
 'JobRole_Manager',
 'ManagementStabilityRatio',
 'JobRole_Manufacturing Director',
 'Department_Sales',
 'JobRole_Healthcare Representative',
 'DistanceFromHome',
 'PositionStagnationRatio',
 'BusinessTravel_Non-Travel']

In [7]:
# Build the feature-selected frame: target column first, then the chosen features
target_column = df_normalized["Attrition"]
selected_features = df_normalized[top_30_features]
df_fs = pd.concat([target_column, selected_features], axis=1)

df_fs

Unnamed: 0,Attrition,OverTime,MaritalStatus_Single,TotalWorkingYears,JobLevel,DepartmentSalaryRatio,YearsInCurrentRole,MonthlyIncome,Age,EducationReturnRatio,...,MaritalStatus_Divorced,Department_Research & Development,JobRole_Manager,ManagementStabilityRatio,JobRole_Manufacturing Director,Department_Sales,JobRole_Healthcare Representative,DistanceFromHome,PositionStagnationRatio,BusinessTravel_Non-Travel
0,1,1,1,-0.421499,2,0.861166,-0.063274,-0.108313,0.446199,0.962477,...,0,0,0,0.833333,0,1,0,-1.010565,0.000000,0
1,0,0,0,-0.164455,2,0.816716,0.764737,-0.291619,1.321915,0.909482,...,0,1,0,0.700000,0,0,0,-0.147100,0.100000,0
2,1,1,1,-0.550021,1,0.332736,-1.167290,-0.937335,0.008340,0.335654,...,0,1,0,1.000000,0,0,0,-0.887213,1.000000,0
3,0,1,0,-0.421499,1,0.463124,0.764737,-0.763374,-0.429518,0.425765,...,0,1,0,0.000000,0,0,0,-0.763861,0.375000,0
4,0,0,0,-0.678543,1,0.552119,-0.615282,-0.644639,-1.086306,0.614831,...,0,1,0,1.000000,0,0,0,-0.887213,1.000000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,0,0,0,0.735197,2,0.409313,-0.615282,-0.835167,-0.101124,0.412903,...,0,1,0,0.600000,0,0,0,1.703184,0.000000,0
1466,0,0,0,-0.292977,3,1.590606,0.764737,0.740888,0.227269,1.771275,...,0,1,0,1.000000,0,0,1,-0.393804,0.142857,0
1467,0,1,0,-0.678543,2,0.977830,-0.615282,-0.076664,-1.086306,0.942420,...,0,1,0,0.500000,1,0,0,-0.640509,0.000000,0
1468,0,0,0,0.735197,2,0.774517,0.488733,-0.236394,1.321915,0.827034,...,0,0,0,0.888889,0,1,0,-0.887213,0.000000,0


In [8]:
# Hold out 30% of the rows as the test set. The fixed random_state makes the
# split reproducible. Stratifying on the target keeps the proportion of each
# 'Attrition' class approximately equal in the train and test partitions, so
# the test set is not skewed by an unlucky draw.
train_set, test_set = train_test_split(
    df_fs,
    test_size=0.3,
    random_state=123,
    stratify=df_fs["Attrition"],
)
print(train_set.shape)
print(test_set.shape)

(1029, 31)
(441, 31)


In [9]:
# Persist both partitions without the index column.
# Forward slashes are used because they resolve on every OS; with backslashes a
# POSIX system would write a file whose *name* contains the backslashes into
# the current directory instead of into ../data/processed/.
train_set.to_csv('../data/processed/train_set_fs.csv', index=False)
test_set.to_csv('../data/processed/test_set_fs.csv', index=False)