# Section 2: Feature Engineering

# 1. import dependencies

In [26]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns 

# 2. Load the Dataset

In [27]:
# Read the HR churn CSV; `df` keeps the data exactly as loaded.
df = pd.read_csv(filepath_or_buffer="hr_employee_churn_data.csv")
# The steps below work on `dataset`, an independent (deep) copy of `df`.
dataset = df.copy(deep=True)
# Report (rows, columns), then preview the first five records.
print(df.shape)
df.head(5)

(14999, 10)


Unnamed: 0,empid,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,salary,left
0,1,0.38,0.53,2,157,3,0,0,low,1
1,2,0.8,0.86,5,262,6,0,0,medium,1
2,3,0.11,0.88,7,272,4,0,0,medium,1
3,4,0.72,0.87,5,223,5,0,0,low,1
4,5,0.37,0.52,2,159,3,0,0,low,1


In [28]:
# DataFrame.info() writes its report to stdout itself and returns None,
# so it is called bare; wrapping it in print() only appends a stray "None".
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14999 entries, 0 to 14998
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   empid                  14999 non-null  int64  
 1   satisfaction_level     14997 non-null  float64
 2   last_evaluation        14999 non-null  float64
 3   number_project         14999 non-null  int64  
 4   average_montly_hours   14999 non-null  int64  
 5   time_spend_company     14999 non-null  int64  
 6   Work_accident          14999 non-null  int64  
 7   promotion_last_5years  14999 non-null  int64  
 8   salary                 14999 non-null  object 
 9   left                   14999 non-null  int64  
dtypes: float64(2), int64(7), object(1)
memory usage: 1.1+ MB
None


# 3. Removing non-relevant columns

In [29]:
# `empid` is dropped because it holds no valuable data for predicting 'left'.
# The result is reassigned rather than mutating the frame in place.
dataset = dataset.drop(columns=['empid'])

# 4. Handling categorical features

Based on the EDA (phase 1), we saw that the categorical features are:

    * 'left' (the actual target)
    * 'salary' - a true categorical feature
    * 'promotion_last_5years' - already encoded as 0/1
    * 'Work_accident' - already encoded as 0/1

Since 'salary' is the only truly categorical feature and it has only 3 categories, we use one-hot encoding.

We will use get_dummies with the flag drop_first=True to encode n categories into n-1 new features.
- This drops a redundant (perfectly collinear) dummy column and keeps the number of features as low as possible, which helps avoid the curse of dimensionality.

In [30]:
# Shape before encoding, for comparison with the shape printed afterwards.
print("dataset before oneHot_encode: ",dataset.shape)

# One-hot encode the categorical column(s); drop_first=True keeps n-1 dummies per feature.
dataset = pd.get_dummies(dataset, drop_first=True)
print("dataset after oneHot_encode: ",dataset.shape)
dataset.head(n=2)

dataset before oneHot_encode:  (14999, 9)
dataset after oneHot_encode:  (14999, 10)


Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,left,salary_low,salary_medium
0,0.38,0.53,2,157,3,0,0,1,1,0
1,0.8,0.86,5,262,6,0,0,1,0,1


# 5. Search & Resolve NaN values

- First, we will explore which data is missing. 
- Second, we will fill the NaN values in an appropriate way.

In [31]:
# Count the missing entries per column (features and the `left` target alike).
dataset.isna().sum()

satisfaction_level       2
last_evaluation          0
number_project           0
average_montly_hours     0
time_spend_company       0
Work_accident            0
promotion_last_5years    0
left                     0
salary_low               0
salary_medium            0
dtype: int64

As we can see, only satisfaction_level (a numerical feature) contains missing data.

We will explore this feature: 

In [32]:
# Summary statistics of the only column that contains missing values.
dataset.loc[:, 'satisfaction_level'].describe()

count    14997.000000
mean         0.612863
std          0.248634
min          0.090000
25%          0.440000
50%          0.640000
75%          0.820000
max          1.000000
Name: satisfaction_level, dtype: float64

## 5.1 Explore different imputing strategies:
**There are different approaches for handling missing values:**


* univariate approach (looking only at the feature's own values)
    - mean
    - mode
    - median
    
* multivariate approach (taking multiple features into account)
    - knn imputer 
        1. find the k rows most similar to the row that contains the missing value. 
        2. fill the NaN with the average of that feature's values across the k similar rows.
    - iterative imputer 
         1. treat the feature with missing values as the target (y) and the other features as X, then fit a regression model on the rows where y is not missing.
             * we can select the regression model.
         2. use the fitted model to predict y for the rows where it is missing.
         3. replace the NaNs with the predicted values.
           

In [33]:
# Working copy used below to locate the missing rows and to supply column names.
# get_dummies was only ever called with its default options, so they are left implicit.
df_temp = pd.get_dummies(dataset.copy())
df_temp.head(n=2)

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,left,salary_low,salary_medium
0,0.38,0.53,2,157,3,0,0,1,1,0
1,0.8,0.86,5,262,6,0,0,1,0,1


In [34]:
# Index labels of the rows whose satisfaction_level is missing.
missing_satisfaction = df_temp['satisfaction_level'].isna()
df_null_list = dataset.loc[missing_satisfaction].index.tolist()

def get_imputed_values(imputer_table, null_indices=None, column=0):
    """Collect (and print) the imputed values of one column for the missing rows.

    Parameters
    ----------
    imputer_table : 2-D indexable
        Completed table, read as ``imputer_table[row][column]`` (e.g. the
        result of an imputer's ``fit_transform``).
    null_indices : iterable of int, optional
        Row positions to read. When omitted, the notebook-level
        ``df_null_list`` is used, which matches the original behaviour.
    column : int, default 0
        Position of the column to read. The default 0 matches the original
        hard-coded column.

    Returns
    -------
    list
        The value found at each requested row, in the order given.
    """
    if null_indices is None:
        # Reading a module-level name needs no `global` statement.
        null_indices = df_null_list

    imputed_values = []
    for ind in null_indices:
        value = imputer_table[ind][column]
        imputed_values.append(value)
        print(f"imputed value for row {ind} is {round(value, 4)}")
    return imputed_values

##### iterative imputer

In [35]:
# iterative imputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Multivariate imputation using IterativeImputer with its default settings.
# NOTE(review): `dataset` still includes the target column `left` here — confirm that is intended.
impute_it = IterativeImputer()
impute_table_iter = impute_it.fit_transform(X=dataset)

In [36]:
# Print and keep the values the iterative imputer produced for the missing rows.
imputed_values_iterative = get_imputed_values(imputer_table=impute_table_iter)

imputed value for row 5 is 0.4572
imputed value for row 15 is 0.465


##### knn imputer

In [37]:
# knn imputer
from sklearn.impute import KNNImputer

# Fill each gap from the 3 most similar rows of the encoded frame.
# NOTE(review): the columns are passed unscaled — confirm the neighbour distances are meaningful.
impute_knn = KNNImputer(n_neighbors=3)
impute_table_knn = impute_knn.fit_transform(X=dataset)

In [38]:
# Print and keep the values the KNN imputer produced for the missing rows.
imputed_values_knn = get_imputed_values(imputer_table=impute_table_knn)

imputed value for row 5 is 0.4
imputed value for row 15 is 0.3767


##### mean/median imputer

In [39]:
# Univariate candidates, for comparison with the model-based imputers above.
mean = dataset['satisfaction_level'].mean()
median = dataset['satisfaction_level'].median()
# The second value is the median, so it is labelled as such (it was printed as "mode").
print(f'mean {mean}\nmedian {median}')

mean 0.6128625725145038
mode 0.64


#### conclusion: 
we will use the knn imputer for filling our missing values in satisfaction_level feature:

In [40]:
# Rebuild the frame from the KNN-imputed table with the original column names
# and index, then cast back to the original dtypes so the integer/binary
# columns (including the target `left`) are not silently left as floats.
dataset = pd.DataFrame(
    impute_table_knn, columns=df_temp.columns, index=df_temp.index
).astype(df_temp.dtypes.to_dict())

In [43]:
# Confirmation message, followed by the per-column count of remaining NaNs.
print("We Imputed the Nans values succesfully!")
dataset.isna().sum()

We Imputed the Nans values succesfully!


satisfaction_level       0
last_evaluation          0
number_project           0
average_montly_hours     0
time_spend_company       0
Work_accident            0
promotion_last_5years    0
left                     0
salary_low               0
salary_medium            0
dtype: int64

# 6. Save the ready to train dataset

In [45]:
# Persist the prepared frame; index=False keeps the row index out of the file.
dataset.to_csv(path_or_buf="dataset.csv", index=False)