#Data Loading and Exploration

In [55]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

df = pd.read_csv('employee_data.csv')
df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [56]:
#shape of data
df.shape

(1470, 35)

In [57]:
#setting target
target = df['Attrition']

#listing all the columns less attrition
feature = df.drop(columns=['Attrition'])

print(target)
print(feature)

0       Yes
1        No
2       Yes
3        No
4        No
       ... 
1465     No
1466     No
1467     No
1468     No
1469     No
Name: Attrition, Length: 1470, dtype: object
      Age     BusinessTravel  DailyRate              Department  \
0      41      Travel_Rarely       1102                   Sales   
1      49  Travel_Frequently        279  Research & Development   
2      37      Travel_Rarely       1373  Research & Development   
3      33  Travel_Frequently       1392  Research & Development   
4      27      Travel_Rarely        591  Research & Development   
...   ...                ...        ...                     ...   
1465   36  Travel_Frequently        884  Research & Development   
1466   39      Travel_Rarely        613  Research & Development   
1467   27      Travel_Rarely        155  Research & Development   
1468   49  Travel_Frequently       1023                   Sales   
1469   34      Travel_Rarely        628  Research & Development   

      DistanceFrom

In [9]:
df.head(30)

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2
5,32,No,Travel_Frequently,1005,Research & Development,2,2,Life Sciences,1,8,...,3,80,0,8,2,2,7,7,3,6
6,59,No,Travel_Rarely,1324,Research & Development,3,3,Medical,1,10,...,1,80,3,12,3,2,1,0,0,0
7,30,No,Travel_Rarely,1358,Research & Development,24,1,Life Sciences,1,11,...,2,80,1,1,2,3,1,0,0,0
8,38,No,Travel_Frequently,216,Research & Development,23,3,Life Sciences,1,12,...,2,80,0,10,2,3,9,7,1,8
9,36,No,Travel_Rarely,1299,Research & Development,27,3,Medical,1,13,...,2,80,2,17,3,2,7,7,7,7


#Data Preprocessing

In [58]:
#iterating through each column and replacing null values with the mode value as some columns are not numerical
for i in df.columns:
    mode = df[i].mode()
    df[i] = df[i].fillna(mode)

In [25]:
#creating a copy of the dataset to be able to manipulate post-null val handling

encoded_df = df.copy()
lb=LabelEncoder()

#executing categorical encoding
for column in encoded_df.columns:
    encoded_df[column] = lb.fit_transform(encoded_df[column])

encoded_df_one_hot = pd.get_dummies(encoded_df)
encoded_df.head(20)


Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,23,1,2,624,2,0,1,1,0,0,...,0,0,0,8,0,0,6,4,0,5
1,31,0,1,113,1,7,0,1,0,1,...,3,0,1,10,3,2,10,7,1,7
2,19,1,2,805,1,1,1,4,0,2,...,1,0,0,7,3,2,0,0,0,0
3,15,0,1,820,1,2,3,1,0,3,...,2,0,0,8,3,2,8,7,3,0
4,9,0,2,312,1,1,0,3,0,4,...,3,0,1,6,3,2,2,2,2,2
5,14,0,1,572,1,1,1,1,0,5,...,2,0,0,8,2,1,7,7,3,6
6,41,0,2,770,1,2,2,3,0,6,...,0,0,3,12,3,1,1,0,0,0
7,12,0,2,794,1,23,0,1,0,7,...,1,0,1,1,2,2,1,0,0,0
8,20,0,1,78,1,22,2,1,0,8,...,1,0,0,10,2,2,9,7,1,8
9,18,0,2,751,1,26,2,3,0,9,...,1,0,2,17,3,1,7,7,7,7


In [59]:
#creating new variables for copied df
copy_features = encoded_df.drop('Attrition', axis=1)
copy_target = encoded_df['Attrition']

#set random state for consistency
feature_train, feature_test, target_train, target_test = train_test_split(copy_features, copy_target, test_size=0.2, random_state=42)

print("Feature Train", feature_train)
print('Feature Test', feature_test)

Feature Train       Age  BusinessTravel  DailyRate  Department  DistanceFromHome  Education  \
1097    6               2        159           1                20          1   
727     0               0        118           1                 4          1   
254    11               2        718           2                19          1   
1175   21               2        245           1                11          2   
1341   13               2        135           1                19          2   
...   ...             ...        ...         ...               ...        ...   
1130   17               2        418           1                27          2   
1294   23               2        216           1                 4          2   
860     4               1        725           1                 2          3   
1459   11               2        809           1                12          1   
1126   32               2        105           2                 8          2   

      Educati

#Feature Engineering

In [60]:
# from sklearn.linear_model import LinearRegression
# lm = LinearRegression()

df = pd.read_csv('employee_data.csv')

#education bins
education_binned = pd.cut(df['Education'], bins = [0,1,2,3,4,5], labels=['Below College', 'College', 'Bachelor', 'Master', 'Doctor'])

df['EducationBinned'] = education_binned

#envi satisfaction bins
env_sat_bined = pd.cut(df['EnvironmentSatisfaction'], bins = [0,1,2,3,4], labels=['Low', 'Medium', 'High', 'Very High'])

df['Envi_Satisaction'] = env_sat_bined

#job involvement bins 
job_involvement_binned = pd.cut(df['JobInvolvement'], bins = [0,1,2,3,4], labels=['Low', 'Medium', 'High', 'Very High'])

df['Job_Involvement_Bin'] = job_involvement_binned

#job satisfaction bins
job_satisfaction_binned = pd.cut(df['JobSatisfaction'], bins = [0,1,2,3,4], labels=['Low', 'Medium', 'High', 'Very High'])

df['Job_Satisfaction_Bin'] = job_satisfaction_binned

#performance bins
performance_rating_binned = pd.cut(df['PerformanceRating'], bins = [0,1,2,3,4], labels=['Low', 'Good', 'Excellent', 'Outstanding'])

df['Performance_Rating_Bin'] = performance_rating_binned

#relationship bins
relationship_satisfaction_binned = pd.cut(df['RelationshipSatisfaction'], bins = [0,1,2,3,4], labels=['Low', 'Medium', 'High', 'Very High'])

df['RelationshipSatisfactionBin'] = relationship_satisfaction_binned

#balance bins
worklife_balance = pd.cut(df['WorkLifeBalance'], bins = [0,1,2,3,4], labels=['Bad', 'Good', 'Better', 'Best'])

df['WorkLifeBalance'] = worklife_balance



##Correlations##


In [61]:

#looking to see how these columns correlate to attrition. Experimental
columns = ['Education', 'EnvironmentSatisfaction', 'JobInvolvement', 'PerformanceRating', 'RelationshipSatisfaction', 'WorkLifeBalance']

df['Attrition'] = df['Attrition'].map({'Yes': 1, 'No': 0})

correlations = {}
for column in columns:
    correlation = df[column].corr(df['Attrition'])
    correlations[column] = correlation
    print(f'Correlation {column} and Attrition: {correlation:.4f}')


Correlation Education and Attrition: -0.0314
Correlation EnvironmentSatisfaction and Attrition: -0.1034
Correlation JobInvolvement and Attrition: -0.1300
Correlation PerformanceRating and Attrition: 0.0029
Correlation RelationshipSatisfaction and Attrition: -0.0459


ValueError: could not convert string to float: 'Bad'

#Model Training & Evaluation

In [72]:
#Started all over 

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

df = pd.read_csv('employee_data.csv')

df['Attrition'] = df['Attrition'].map({'Yes': 1, 'No': 0})

features = ['Age', 'BusinessTravel', 'DailyRate', 'Department', 'DistanceFromHome', 'Education','EducationField', 'EnvironmentSatisfaction', 'JobInvolvement', 'PerformanceRating', 'RelationshipSatisfaction', 'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear','WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager']

X = df[features]
y = df['Attrition']

# quantifing data
label_encoders = {}
for column in X.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    X.loc[:, column] = le.fit_transform(X[column])
    label_encoders[column] = le

scaler = StandardScaler()
X = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

reg = LogisticRegression(max_iter=1000)
reg.fit(X_train, y_train)

y_hat = reg.predict(X_test)

acc = accuracy_score(y_test, y_hat)
conf_matrix = confusion_matrix(y_test, y_hat)
class_report = classification_report(y_test, y_hat)

print("Accuracy:", acc)
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", class_report)

importance = reg.coef_[0]
feat_importance = pd.Series(importance, index=features).sort_values(ascending=False)
print(feat_importance)



Accuracy: 0.8809523809523809
Confusion Matrix:
 [[253   2]
 [ 33   6]]
Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.99      0.94       255
           1       0.75      0.15      0.26        39

    accuracy                           0.88       294
   macro avg       0.82      0.57      0.60       294
weighted avg       0.87      0.88      0.85       294

YearsSinceLastPromotion     0.401172
YearsAtCompany              0.271641
DistanceFromHome            0.209005
Department                  0.191578
EducationField              0.099270
Education                   0.052781
BusinessTravel              0.012878
PerformanceRating          -0.027008
RelationshipSatisfaction   -0.068464
DailyRate                  -0.084922
Age                        -0.161053
WorkLifeBalance            -0.182278
TrainingTimesLastYear      -0.186486
EnvironmentSatisfaction    -0.244333
JobInvolvement             -0.298212
YearsWithCurrManager   