### 1. Attrition Prediction Project

In [521]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [522]:
import os
import kagglehub
import zipfile
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pandas import DataFrame
from typing import  Optional
from scipy.stats import zscore
from sklearn.model_selection import train_test_split
import feature_engine.selection  as fs
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, LabelEncoder
from scipy.stats import chi2_contingency


#### 2. Data Preparation

In [523]:
def load_data(file_path: str, sep: Optional[str] = None) -> Optional[DataFrame]:
    '''
    Load a delimited text file into a pandas DataFrame.

    Args:
        - file_path: str: path to the file
        - sep: Optional[str]: separator used in the file. When None, the
          separator is auto-detected by pandas' Python parsing engine.
    Returns:
        - Optional[DataFrame]: the loaded dataset, or None if reading failed
          (the error is printed instead of being raised)
    '''
    # sep=None is only supported by the "python" engine. Requesting that engine
    # explicitly avoids the ParserWarning pandas emits when it has to fall back
    # from the default C engine.
    read_options = {"sep": sep}
    if sep is None:
        read_options["engine"] = "python"

    try:
        return pd.read_csv(file_path, **read_options)
    except Exception as e:
        # Best-effort loader: report the problem and signal failure with None.
        print(f"Error: {e}")
        return None

In [524]:
data = load_data('../data/WA_Fn-UseC_-HR-Employee-Attrition.csv',sep=',')

In [525]:
data.head().T

Unnamed: 0,0,1,2,3,4
Age,41,49,37,33,27
Attrition,Yes,No,Yes,No,No
BusinessTravel,Travel_Rarely,Travel_Frequently,Travel_Rarely,Travel_Frequently,Travel_Rarely
DailyRate,1102,279,1373,1392,591
Department,Sales,Research & Development,Research & Development,Research & Development,Research & Development
DistanceFromHome,1,8,2,3,2
Education,2,1,2,4,1
EducationField,Life Sciences,Life Sciences,Other,Life Sciences,Medical
EmployeeCount,1,1,1,1,1
EmployeeNumber,1,2,4,5,7


In [526]:
data.shape

(1470, 35)

In [527]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 35 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       1470 non-null   int64 
 1   Attrition                 1470 non-null   object
 2   BusinessTravel            1470 non-null   object
 3   DailyRate                 1470 non-null   int64 
 4   Department                1470 non-null   object
 5   DistanceFromHome          1470 non-null   int64 
 6   Education                 1470 non-null   int64 
 7   EducationField            1470 non-null   object
 8   EmployeeCount             1470 non-null   int64 
 9   EmployeeNumber            1470 non-null   int64 
 10  EnvironmentSatisfaction   1470 non-null   int64 
 11  Gender                    1470 non-null   object
 12  HourlyRate                1470 non-null   int64 
 13  JobInvolvement            1470 non-null   int64 
 14  JobLevel                

In [528]:
data.columns

Index(['Age', 'Attrition', 'BusinessTravel', 'DailyRate', 'Department',
       'DistanceFromHome', 'Education', 'EducationField', 'EmployeeCount',
       'EmployeeNumber', 'EnvironmentSatisfaction', 'Gender', 'HourlyRate',
       'JobInvolvement', 'JobLevel', 'JobRole', 'JobSatisfaction',
       'MaritalStatus', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked',
       'Over18', 'OverTime', 'PercentSalaryHike', 'PerformanceRating',
       'RelationshipSatisfaction', 'StandardHours', 'StockOptionLevel',
       'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance',
       'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
       'YearsWithCurrManager'],
      dtype='object')

In [529]:
def standardizing_column_names(data: DataFrame) -> DataFrame:
    '''Normalise the column labels of a DataFrame.

    Every label is lower-cased and each space in it is replaced by an
    underscore. The labels are rewritten on the frame that was passed in.

    Args:
        data: DataFrame whose column labels should be normalised
    Returns:
        DataFrame: the same object, now carrying the normalised labels
    '''
    lowered_labels = data.columns.str.lower()
    data.columns = lowered_labels.str.replace(' ', '_')
    return data

In [530]:
# Apply the column-name standardisation and inspect the resulting labels
data = standardizing_column_names(data)
data.columns

Index(['age', 'attrition', 'businesstravel', 'dailyrate', 'department',
       'distancefromhome', 'education', 'educationfield', 'employeecount',
       'employeenumber', 'environmentsatisfaction', 'gender', 'hourlyrate',
       'jobinvolvement', 'joblevel', 'jobrole', 'jobsatisfaction',
       'maritalstatus', 'monthlyincome', 'monthlyrate', 'numcompaniesworked',
       'over18', 'overtime', 'percentsalaryhike', 'performancerating',
       'relationshipsatisfaction', 'standardhours', 'stockoptionlevel',
       'totalworkingyears', 'trainingtimeslastyear', 'worklifebalance',
       'yearsatcompany', 'yearsincurrentrole', 'yearssincelastpromotion',
       'yearswithcurrmanager'],
      dtype='object')

In [531]:
def rename_observations(data: DataFrame) -> DataFrame:
    '''
    Normalise the text stored in the object-typed columns of a DataFrame.

    For each such column the values are lower-cased, stripped of leading and
    trailing whitespace, and then every remaining space is replaced by an
    underscore. The columns are overwritten on the frame that was passed in.
    Args:
        data: DataFrame
    Returns:
        DataFrame: the same object with its object-typed columns rewritten
    '''
    text_columns = data.select_dtypes(include=['object']).columns

    for name in text_columns:
        cleaned = data[name].str.lower()
        # Strip before substituting so padding does not become underscores.
        cleaned = cleaned.str.strip()
        data[name] = cleaned.str.replace(' ', '_')
    return data

In [532]:
# Normalise the string values of the categorical columns and preview the result
data = rename_observations(data)
data.head().T

Unnamed: 0,0,1,2,3,4
age,41,49,37,33,27
attrition,yes,no,yes,no,no
businesstravel,travel_rarely,travel_frequently,travel_rarely,travel_frequently,travel_rarely
dailyrate,1102,279,1373,1392,591
department,sales,research_&_development,research_&_development,research_&_development,research_&_development
distancefromhome,1,8,2,3,2
education,2,1,2,4,1
educationfield,life_sciences,life_sciences,other,life_sciences,medical
employeecount,1,1,1,1,1
employeenumber,1,2,4,5,7


In [533]:
# check for missing values
data.isna().sum()

age                         0
attrition                   0
businesstravel              0
dailyrate                   0
department                  0
distancefromhome            0
education                   0
educationfield              0
employeecount               0
employeenumber              0
environmentsatisfaction     0
gender                      0
hourlyrate                  0
jobinvolvement              0
joblevel                    0
jobrole                     0
jobsatisfaction             0
maritalstatus               0
monthlyincome               0
monthlyrate                 0
numcompaniesworked          0
over18                      0
overtime                    0
percentsalaryhike           0
performancerating           0
relationshipsatisfaction    0
standardhours               0
stockoptionlevel            0
totalworkingyears           0
trainingtimeslastyear       0
worklifebalance             0
yearsatcompany              0
yearsincurrentrole          0
yearssince

In [534]:
# check for duplicated rows
data.duplicated().sum()

0

### Removing redundant features 

In [535]:
# Select the categorical columns (nominal and ordinal), i.e. those with object dtype
categorical_features = data.select_dtypes(include=['object'])
categorical_features.head()

Unnamed: 0,attrition,businesstravel,department,educationfield,gender,jobrole,maritalstatus,over18,overtime
0,yes,travel_rarely,sales,life_sciences,female,sales_executive,single,y,yes
1,no,travel_frequently,research_&_development,life_sciences,male,research_scientist,married,y,no
2,yes,travel_rarely,research_&_development,other,male,laboratory_technician,single,y,yes
3,no,travel_frequently,research_&_development,life_sciences,female,research_scientist,married,y,yes
4,no,travel_rarely,research_&_development,medical,male,laboratory_technician,married,y,no


In [536]:
categorical_features.nunique()

attrition         2
businesstravel    3
department        3
educationfield    6
gender            2
jobrole           9
maritalstatus     3
over18            1
overtime          2
dtype: int64

In [553]:
def remove_high_cardinality_columns(data:DataFrame, threshold:int=10) -> list:
    """
    Find the high-cardinality columns of a DataFrame.

    Despite its name, this function does not drop anything: it only reports
    which columns exceed the cardinality threshold so the caller can decide
    what to do with them.

    Args:
        - data: The DataFrame containing categorical features.
        - threshold (int): The maximum number of unique values a column can have
          without being reported.

    Returns:
        - list: Names of the columns whose number of unique values is greater
          than `threshold`, in column order.
    """
    flagged = []
    for name in data.columns:
        distinct_values = data[name].nunique()
        if distinct_values > threshold:
            flagged.append(name)
    return flagged

In [554]:
# List the categorical columns with more than 10 distinct values
cardinal_columns = remove_high_cardinality_columns(categorical_features, threshold=10)
cardinal_columns

[]

In [555]:
categorical_features.head()

Unnamed: 0,attrition,businesstravel,department,educationfield,gender,jobrole,maritalstatus,over18,overtime
0,yes,travel_rarely,sales,life_sciences,female,sales_executive,single,y,yes
1,no,travel_frequently,research_&_development,life_sciences,male,research_scientist,married,y,no
2,yes,travel_rarely,research_&_development,other,male,laboratory_technician,single,y,yes
3,no,travel_frequently,research_&_development,life_sciences,female,research_scientist,married,y,yes
4,no,travel_rarely,research_&_development,medical,male,laboratory_technician,married,y,no


In [556]:

def remove_low_variance_columns(data:DataFrame, threshold:float=0.95) -> list:
    """
    Find the low-variance (quasi-constant) columns of a DataFrame.

    A column is reported when the share of its most frequent non-null value is
    greater than `threshold`. A column with no non-null values at all is also
    reported, since it carries no information. Despite the function name and
    the printed message, nothing is dropped here: the caller receives the
    column names and performs the drop.

    Args:
        - data: The input DataFrame.
        - threshold (float): The share of the dominant value above which a
          feature is considered low variance.

    Returns:
        - list: Names of the low-variance columns, in column order.

    Raises:
        - TypeError: If `data` is not a pandas DataFrame.
    """
    if not isinstance(data, pd.DataFrame):
        raise TypeError("Input data must be a pandas DataFrame")

    low_variance = []
    for col in data.columns:
        # value_counts sorts by frequency (descending) and ignores NaN, so the
        # first entry is the share of the dominant non-null value.
        shares = data[col].value_counts(normalize=True)
        # An all-NaN column yields an empty result; indexing it would raise
        # IndexError, so treat it as low variance explicitly.
        if shares.empty or shares.iloc[0] > threshold:
            low_variance.append(col)

    print(f"Removed {len(low_variance)} low-variance columns: {low_variance}")
    return low_variance

In [None]:
isinstance(data, pd.DataFrame)

True

In [557]:
# Identify the categorical columns dominated by a single value (share > 0.95)
low_variance_col = remove_low_variance_columns(categorical_features, threshold=0.95)


Removed 1 low-variance columns: ['over18']


In [558]:
categorical_features = categorical_features.drop(columns=low_variance_col)

##### Remove the `over18` column (it has a single unique value)

In [559]:
categorical_features.head()

Unnamed: 0,attrition,businesstravel,department,educationfield,gender,jobrole,maritalstatus,overtime
0,yes,travel_rarely,sales,life_sciences,female,sales_executive,single,yes
1,no,travel_frequently,research_&_development,life_sciences,male,research_scientist,married,no
2,yes,travel_rarely,research_&_development,other,male,laboratory_technician,single,yes
3,no,travel_frequently,research_&_development,life_sciences,female,research_scientist,married,yes
4,no,travel_rarely,research_&_development,medical,male,laboratory_technician,married,no


In [None]:
def find_highly_correlated_categorical(data:DataFrame, threshold:float=0.8) -> list:
    """
    Identifies highly correlated categorical features in a DataFrame using Cramér's V.

    Only object-typed columns are compared. Every ordered pair is evaluated, so
    a strongly associated pair is reported twice: once as (a, b) and once as
    (b, a). The pairs are also printed.

    Parameters:
    - data: The input DataFrame.
    - threshold (float): The Cramér's V threshold above which columns are considered highly correlated.

    Returns:
    - list: A list of (column_1, column_2, cramers_v) tuples for every ordered
      pair whose Cramér's V is greater than `threshold`.
    """
    # Function to calculate Cramér's V
    def cramers_v(x, y):
        confusion_matrix = pd.crosstab(x, y)
        smaller_dim = min(confusion_matrix.shape)
        # With fewer than two observed levels on either side (a constant column
        # or no rows at all) the statistic is undefined: the denominator below
        # would be zero. Return NaN directly instead of dividing by zero; NaN
        # never compares greater than the threshold, so the pair is not reported.
        if smaller_dim < 2:
            return np.nan
        # NOTE: chi2_contingency is called with its defaults; per the SciPy
        # documentation this applies Yates' continuity correction to tables
        # with one degree of freedom (2x2).
        chi2 = chi2_contingency(confusion_matrix)[0]
        n = confusion_matrix.sum().sum()
        return np.sqrt(chi2 / (n * (smaller_dim - 1)))

    categorical_cols = data.select_dtypes(include="object").columns
    redundant_pairs = []

    for col1 in categorical_cols:
        for col2 in categorical_cols:
            if col1 != col2:
                cramer_value = cramers_v(data[col1], data[col2])
                if cramer_value > threshold:  # High correlation threshold
                    redundant_pairs.append((col1, col2, cramer_value))
    print("\nHighly Correlated Categorical Features:")
    for pair in redundant_pairs:
        print(f"{pair[0]} and {pair[1]}: Cramér's V = {pair[2]:.2f}")

    return redundant_pairs

In [560]:
high_corr_cate = find_highly_correlated_categorical(categorical_features, threshold=0.8)


Highly Correlated Categorical Features:
department and jobrole: Cramér's V = 0.94
jobrole and department: Cramér's V = 0.94


In [561]:
# Drop one column of the highly correlated pair (department / jobrole).
# Reassign instead of mutating in place, and ignore an already-missing column,
# so this cell can be re-run without raising KeyError.
categorical_features = categorical_features.drop(columns=['jobrole'], errors='ignore')


In [562]:
categorical_features.head()

Unnamed: 0,attrition,businesstravel,department,educationfield,gender,maritalstatus,overtime
0,yes,travel_rarely,sales,life_sciences,female,single,yes
1,no,travel_frequently,research_&_development,life_sciences,male,married,no
2,yes,travel_rarely,research_&_development,other,male,single,yes
3,no,travel_frequently,research_&_development,life_sciences,female,married,yes
4,no,travel_rarely,research_&_development,medical,male,married,no


### Data encoding: one-hot encoding for nominal and label encoding for ordinal categorical data, respectively

In [570]:
def label_encode_categorical(data:DataFrame):
    """
    Replace every object-typed column with integer codes from LabelEncoder.

    Each column is fitted on its own values and overwritten on the DataFrame
    that was passed in. The fitted encoders are not kept.

    Args:
        - data: The input DataFrame.

    Returns:
        - pd.DataFrame: The same DataFrame with its object-typed columns encoded as numerical values.
    """
    object_columns = data.select_dtypes(include='object').columns
    for name in object_columns:
        # fit_transform refits from scratch, so a fresh encoder per column is
        # equivalent to reusing one instance.
        data[name] = LabelEncoder().fit_transform(data[name])
    return data

In [571]:
categorical_features = label_encode_categorical(categorical_features)

In [575]:
df_categorical = categorical_features.copy()
df_categorical.head()

Unnamed: 0,attrition,businesstravel,department,educationfield,gender,maritalstatus,overtime
0,1,2,2,1,0,2,1
1,0,1,1,1,1,1,0
2,1,2,1,4,1,2,1
3,0,1,1,1,0,1,1
4,0,2,1,3,1,1,0


In [576]:
df_categorical.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   attrition       1470 non-null   int64
 1   businesstravel  1470 non-null   int64
 2   department      1470 non-null   int64
 3   educationfield  1470 non-null   int64
 4   gender          1470 non-null   int64
 5   maritalstatus   1470 non-null   int64
 6   overtime        1470 non-null   int64
dtypes: int64(7)
memory usage: 80.5 KB


In [582]:
df_categorical.to_csv('../data/categorical_data.csv', index=False)


In [583]:
numerical_features = data.select_dtypes(include=[np.number])
numerical_features.head()

Unnamed: 0,age,dailyrate,distancefromhome,education,employeecount,employeenumber,environmentsatisfaction,hourlyrate,jobinvolvement,joblevel,...,relationshipsatisfaction,standardhours,stockoptionlevel,totalworkingyears,trainingtimeslastyear,worklifebalance,yearsatcompany,yearsincurrentrole,yearssincelastpromotion,yearswithcurrmanager
0,41,1102,1,2,1,1,2,94,3,2,...,1,80,0,8,0,1,6,4,0,5
1,49,279,8,1,1,2,3,61,2,2,...,4,80,1,10,3,3,10,7,1,7
2,37,1373,2,2,1,4,4,92,2,1,...,2,80,0,7,3,3,0,0,0,0
3,33,1392,3,4,1,5,4,56,3,1,...,3,80,0,8,3,3,8,7,3,0
4,27,591,2,1,1,7,1,40,3,1,...,4,80,1,6,3,3,2,2,2,2


In [584]:
len(numerical_features.columns)

26

In [585]:
numerical_features.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,1470.0,36.92381,9.135373,18.0,30.0,36.0,43.0,60.0
dailyrate,1470.0,802.485714,403.5091,102.0,465.0,802.0,1157.0,1499.0
distancefromhome,1470.0,9.192517,8.106864,1.0,2.0,7.0,14.0,29.0
education,1470.0,2.912925,1.024165,1.0,2.0,3.0,4.0,5.0
employeecount,1470.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
employeenumber,1470.0,1024.865306,602.024335,1.0,491.25,1020.5,1555.75,2068.0
environmentsatisfaction,1470.0,2.721769,1.093082,1.0,2.0,3.0,4.0,4.0
hourlyrate,1470.0,65.891156,20.329428,30.0,48.0,66.0,83.75,100.0
jobinvolvement,1470.0,2.729932,0.711561,1.0,2.0,3.0,3.0,4.0
joblevel,1470.0,2.063946,1.10694,1.0,1.0,2.0,3.0,5.0


In [586]:
corr_matrix = numerical_features.corr()
corr_matrix

Unnamed: 0,age,dailyrate,distancefromhome,education,employeecount,employeenumber,environmentsatisfaction,hourlyrate,jobinvolvement,joblevel,...,relationshipsatisfaction,standardhours,stockoptionlevel,totalworkingyears,trainingtimeslastyear,worklifebalance,yearsatcompany,yearsincurrentrole,yearssincelastpromotion,yearswithcurrmanager
age,1.0,0.010661,-0.001686,0.208034,,-0.010145,0.010146,0.024287,0.02982,0.509604,...,0.053535,,0.03751,0.680381,-0.019621,-0.02149,0.311309,0.212901,0.216513,0.202089
dailyrate,0.010661,1.0,-0.004985,-0.016806,,-0.05099,0.018355,0.023381,0.046135,0.002966,...,0.007846,,0.042143,0.014515,0.002453,-0.037848,-0.034055,0.009932,-0.033229,-0.026363
distancefromhome,-0.001686,-0.004985,1.0,0.021042,,0.032916,-0.016075,0.031131,0.008783,0.005303,...,0.006557,,0.044872,0.004628,-0.036942,-0.026556,0.009508,0.018845,0.010029,0.014406
education,0.208034,-0.016806,0.021042,1.0,,0.04207,-0.027128,0.016775,0.042438,0.101589,...,-0.009118,,0.018422,0.14828,-0.0251,0.009819,0.069114,0.060236,0.054254,0.069065
employeecount,,,,,,,,,,,...,,,,,,,,,,
employeenumber,-0.010145,-0.05099,0.032916,0.04207,,1.0,0.017621,0.035179,-0.006888,-0.018519,...,-0.069861,,0.062227,-0.014365,0.023603,0.010309,-0.01124,-0.008416,-0.009019,-0.009197
environmentsatisfaction,0.010146,0.018355,-0.016075,-0.027128,,0.017621,1.0,-0.049857,-0.008278,0.001212,...,0.007665,,0.003432,-0.002693,-0.019359,0.027627,0.001458,0.018007,0.016194,-0.004999
hourlyrate,0.024287,0.023381,0.031131,0.016775,,0.035179,-0.049857,1.0,0.042861,-0.027853,...,0.00133,,0.050263,-0.002334,-0.008548,-0.004607,-0.019582,-0.024106,-0.026716,-0.020123
jobinvolvement,0.02982,0.046135,0.008783,0.042438,,-0.006888,-0.008278,0.042861,1.0,-0.01263,...,0.034297,,0.021523,-0.005533,-0.015338,-0.014617,-0.021355,0.008717,-0.024184,0.025976
joblevel,0.509604,0.002966,0.005303,0.101589,,-0.018519,0.001212,-0.027853,-0.01263,1.0,...,0.021642,,0.013984,0.782208,-0.018191,0.037818,0.534739,0.389447,0.353885,0.375281


In [593]:

def remove_highly_correlated_features(data:DataFrame, threshold:float=0.8):
    """
    Find the features of a numerical DataFrame that are highly correlated with an earlier column.

    Despite its name, this function does not drop anything: it returns the
    names of the columns to drop and leaves the removal to the caller.

    Args:
        data (DataFrame): The input numerical DataFrame.
        threshold (float): The absolute correlation above which a feature is reported.

    Returns:
        list: Names of the columns whose absolute correlation with at least one
        column positioned before them is greater than `threshold`.

    Raises:
        TypeError: If `data` is not a pandas DataFrame.
    """
    if not isinstance(data, pd.DataFrame):
        raise TypeError("Input data must be a pandas DataFrame.")

    correlations = data.corr()

    # Keep only the entries strictly above the diagonal so every pair of
    # columns is examined once and self-correlations are ignored.
    above_diagonal = np.triu(np.ones(correlations.shape), k=1).astype(bool)
    upper = correlations.where(above_diagonal)

    flagged = []
    for name in upper.columns:
        exceeds = upper[name].abs() > threshold
        if any(exceeds):
            flagged.append(name)

    return flagged

In [594]:
high_corr_num = remove_highly_correlated_features(numerical_features, threshold=0.8)
high_corr_num

['monthlyincome']

In [595]:
numerical_features = numerical_features.drop(columns=high_corr_num)

In [602]:
# Identify the numerical columns dominated by a single value (share > 0.95)
low_variance_num_col = remove_low_variance_columns(numerical_features, threshold=0.95)


Removed 2 low-variance columns: ['employeecount', 'standardhours']


In [603]:
numerical_features = numerical_features.drop(columns=low_variance_num_col)

In [604]:
numerical_features.to_csv('../data/numerical_data.csv', index=False)

In [605]:
numerical_features.head()

Unnamed: 0,age,dailyrate,distancefromhome,education,employeenumber,environmentsatisfaction,hourlyrate,jobinvolvement,joblevel,jobsatisfaction,...,performancerating,relationshipsatisfaction,stockoptionlevel,totalworkingyears,trainingtimeslastyear,worklifebalance,yearsatcompany,yearsincurrentrole,yearssincelastpromotion,yearswithcurrmanager
0,41,1102,1,2,1,2,94,3,2,4,...,3,1,0,8,0,1,6,4,0,5
1,49,279,8,1,2,3,61,2,2,2,...,4,4,1,10,3,3,10,7,1,7
2,37,1373,2,2,4,4,92,2,1,3,...,3,2,0,7,3,3,0,0,0,0
3,33,1392,3,4,5,4,56,3,1,3,...,3,3,0,8,3,3,8,7,3,0
4,27,591,2,1,7,1,40,3,1,2,...,3,4,1,6,3,3,2,2,2,2
