### Data Preprocessing exercise

In [75]:
import pandas as pd
import numpy as np 
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder , StandardScaler

In [76]:
# Load the employee attrition dataset from the CSV file next to the notebook.
df = pd.read_csv(filepath_or_buffer='employeeAttrition.csv')

# Preview the first five rows as the cell output.
df.head(n=5)

Unnamed: 0,EmpID,Age,Gender,Department,Education,MonthlyIncome,YearsAtCompany,JobSatisfaction,WorkLifeBalance,OverTime,DistanceFromHome,Attrition
0,E001,32.0,Male,Sales,Bachelor,45000.0,5.0,High,Good,Yes,10.0,No
1,E002,28.0,Female,Technology,Master,62000.0,3.0,Medium,Good,No,5.0,No
2,E003,45.0,Male,HR,Bachelor,38000.0,12.0,Low,Poor,Yes,25.0,Yes
3,E004,36.0,,Technology,PhD,85000.0,8.0,High,Good,No,8.0,No
4,E005,24.0,Female,Sales,Bachelor,32000.0,1.0,Low,Poor,Yes,,Yes


In [77]:
# Separate the target from the features.
# y is the 'Attrition' label column.
y = df['Attrition']
# X is every remaining column except the 'EmpID' identifier and the label itself.
X = df.drop(columns=['EmpID', 'Attrition'])

In [78]:
# Columns holding string categories; these are mode-imputed and one-hot encoded below.
categorical_columns = [
    'Gender',
    'Department',
    'Education',
    'JobSatisfaction',
    'WorkLifeBalance',
    'OverTime',
]

# Columns holding numeric values; these are mean-imputed and standardised below.
numerical_columns = [
    'Age',
    'MonthlyIncome',
    'YearsAtCompany',
    'DistanceFromHome',
]

In [79]:
# Handle the missing values.
# First report how many entries are missing in each column of the raw frame.
missing_data = df.isnull().sum()
print(missing_data)

# Categorical gaps are filled with each column's most frequent value.
impute = SimpleImputer(strategy='most_frequent', missing_values=np.nan)
categorical_filled = impute.fit_transform(X[categorical_columns])
X[categorical_columns] = categorical_filled

# Numerical gaps are filled with each column's mean.
impute_mean = SimpleImputer(strategy='mean', missing_values=np.nan)
numerical_filled = impute_mean.fit_transform(X[numerical_columns])
X[numerical_columns] = numerical_filled

# NOTE(review): both imputers are fitted on every row of X, i.e. before the
# train/test split performed later in the notebook, so the fill values also
# reflect rows that end up in the test set. Consider fitting on the training
# rows only.


EmpID               0
Age                 2
Gender              2
Department          0
Education           1
MonthlyIncome       3
YearsAtCompany      1
JobSatisfaction     1
WorkLifeBalance     2
OverTime            0
DistanceFromHome    3
Attrition           0
dtype: int64


In [80]:
# Encode the feature and target variables.
#
# The categorical columns are one-hot encoded; the remaining columns of X are
# passed through untouched and placed after the encoded columns.
#
# sparse_threshold=0 forces ColumnTransformer to return a dense array. With the
# default threshold (0.3) a low-density result comes back as a SciPy sparse
# matrix, and np.array() would wrap that in a 0-d object array, breaking the
# column slicing done in the scaling step.
ct = ColumnTransformer(
    transformers=[('encode', OneHotEncoder(), categorical_columns)],
    remainder='passthrough',
    sparse_threshold=0,
)
X = np.array(ct.fit_transform(X))

# Convert the target labels to integers. LabelEncoder numbers the classes in
# sorted order; inspect le.classes_ to see which label maps to which integer.
le = LabelEncoder()
y = le.fit_transform(y)

In [81]:
# Splitting the data into train and test sets:
# 20% of the rows are held out for testing; random_state fixes the shuffle so
# the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=8,
)


In [82]:
# Feature Scaling
#
# Only the numerical columns are standardised; the one-hot columns are left as
# 0/1 indicators. The numerical columns are the passthrough columns at the
# right-hand end of the encoded matrix, so their position is derived from
# numerical_columns instead of a hard-coded count.
n_numeric = len(numerical_columns)
numeric_slice = slice(X_train.shape[1] - n_numeric, None)

# Fit the scaler on the training rows only, then reuse the same mean/std for
# the test rows so no test-set statistics enter the scaling.
st = StandardScaler()
X_train[:, numeric_slice] = st.fit_transform(X_train[:, numeric_slice])

X_test[:, numeric_slice] = st.transform(X_test[:, numeric_slice])

In [83]:
# Print the final preprocessed arrays, one section per split.
# Each section except the last carries a trailing newline so that a blank line
# separates consecutive sections in the output.
final_report = (
    f'X_train set: {X_train}\n',
    f'X_test set: {X_test}\n',
    f'y_train set: {y_train}\n',
    f'y_test set: {y_test}',
)
for section in final_report:
    print(section)

X_train set: [[ 0.          1.          0.          0.          1.          0.
   1.          0.          1.          0.          0.          1.
   0.          1.          0.          1.1954986   1.35345866  0.94930602
  -1.31542166]
 [ 1.          0.          0.          1.          0.          1.
   0.          0.          0.          1.          0.          0.
   1.          0.          1.         -1.33695511 -1.26697711 -1.14799798
   0.09302982]
 [ 0.          1.          1.          0.          0.          1.
   0.          0.          0.          0.          1.          0.
   1.          1.          0.          2.07635207 -0.29193124  0.02207688
   0.86127608]
 [ 0.          1.          1.          0.          0.          1.
   0.          0.          0.          1.          0.          0.
   1.          0.          1.          0.97528524 -0.90133491  1.15903642
   1.54149413]
 [ 1.          0.          0.          0.          1.          0.
   0.          1.          1.        