Employee

In [1]:
import pandas as pd
import numpy as np
import matplotlib .pyplot as plt
import seaborn as sns

In [2]:
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
import joblib

In [3]:
data = pd.read_csv('Data/employee_promotion.csv')

In [4]:
data

Unnamed: 0,employee_id,department,performance_score,years_experience,trainings_completed,previous_promotions,attendance_rate,promoted
0,1000,Sales,68,7,10,3,0.97,0
1,1001,Operations,58,8,7,1,0.81,1
2,1002,Operations,80,10,2,0,0.79,1
3,1003,HR,76,15,7,1,0.94,1
4,1004,HR,50,1,8,0,0.85,1
...,...,...,...,...,...,...,...,...
195,1195,IT,62,14,7,2,0.74,1
196,1196,Marketing,83,10,9,2,0.78,1
197,1197,Sales,65,9,10,0,0.94,0
198,1198,IT,70,4,3,0,0.85,0


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 8 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   employee_id          200 non-null    int64  
 1   department           200 non-null    object 
 2   performance_score    200 non-null    int64  
 3   years_experience     200 non-null    int64  
 4   trainings_completed  200 non-null    int64  
 5   previous_promotions  200 non-null    int64  
 6   attendance_rate      200 non-null    float64
 7   promoted             200 non-null    int64  
dtypes: float64(1), int64(6), object(1)
memory usage: 12.6+ KB


In [6]:
data.shape

(200, 8)

In [7]:
data.columns

Index(['employee_id', 'department', 'performance_score', 'years_experience',
       'trainings_completed', 'previous_promotions', 'attendance_rate',
       'promoted'],
      dtype='object')

In [8]:
data.describe()

Unnamed: 0,employee_id,performance_score,years_experience,trainings_completed,previous_promotions,attendance_rate,promoted
count,200.0,200.0,200.0,200.0,200.0,200.0,200.0
mean,1099.5,75.305,7.87,5.19,1.425,0.79965,0.525
std,57.879185,14.67408,4.294802,3.173999,1.131715,0.117505,0.500628
min,1000.0,50.0,1.0,0.0,0.0,0.6,0.0
25%,1049.75,63.0,4.0,3.0,0.0,0.7,0.0
50%,1099.5,76.0,8.0,5.0,1.0,0.78,1.0
75%,1149.25,88.0,11.0,8.0,2.0,0.9,1.0
max,1199.0,100.0,15.0,10.0,3.0,1.0,1.0


In [9]:
data.describe(include='O').T

Unnamed: 0,count,unique,top,freq
department,200,6,Sales,40


In [10]:
data.head(10)

Unnamed: 0,employee_id,department,performance_score,years_experience,trainings_completed,previous_promotions,attendance_rate,promoted
0,1000,Sales,68,7,10,3,0.97,0
1,1001,Operations,58,8,7,1,0.81,1
2,1002,Operations,80,10,2,0,0.79,1
3,1003,HR,76,15,7,1,0.94,1
4,1004,HR,50,1,8,0,0.85,1
5,1005,Sales,70,5,5,2,0.67,0
6,1006,Operations,58,2,7,0,1.0,0
7,1007,IT,89,2,7,0,0.76,0
8,1008,Sales,55,12,6,3,0.93,0
9,1009,Marketing,88,5,9,2,0.84,1


In [11]:
data['employee_id']=range(1,len(data)+1)

In [12]:
data.head()

Unnamed: 0,employee_id,department,performance_score,years_experience,trainings_completed,previous_promotions,attendance_rate,promoted
0,1,Sales,68,7,10,3,0.97,0
1,2,Operations,58,8,7,1,0.81,1
2,3,Operations,80,10,2,0,0.79,1
3,4,HR,76,15,7,1,0.94,1
4,5,HR,50,1,8,0,0.85,1


In [13]:
X=data.drop(['employee_id','promoted'],axis=1)
y=data['promoted']

In [14]:
categorical=['department']
numeric=['performance_score','years_experience','trainings_completed','previous_promotions','attendance_rate']

In [15]:
preprocessor = ColumnTransformer(transformers=[
    ('cat',OneHotEncoder(drop='first'),categorical),
('num',StandardScaler(),numeric)
])

In [16]:
model=Pipeline(steps=[
    ('preprocessor',preprocessor),
    ('classifier',
    RandomForestClassifier(n_estimators=100,random_state=42))
])

In [17]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

In [18]:
model.fit(X_train,y_train)

In [19]:
y_pred=model.predict(X_test)

In [20]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.40      0.42      0.41        19
           1       0.45      0.43      0.44        21

    accuracy                           0.42        40
   macro avg       0.43      0.42      0.42        40
weighted avg       0.43      0.42      0.43        40



In [21]:
data['promotion_probability']=model.predict_proba(X)[:,1]

In [22]:
print(data[['employee_id','promotion_probability','promoted']])

     employee_id  promotion_probability  promoted
0              1                   0.12         0
1              2                   0.71         1
2              3                   0.76         1
3              4                   0.88         1
4              5                   0.79         1
..           ...                    ...       ...
195          196                   0.79         1
196          197                   0.73         1
197          198                   0.27         0
198          199                   0.25         0
199          200                   0.64         1

[200 rows x 3 columns]


In [23]:
promotable=data[data['promotion_probability']>=0.5]

In [24]:
data.head()

Unnamed: 0,employee_id,department,performance_score,years_experience,trainings_completed,previous_promotions,attendance_rate,promoted,promotion_probability
0,1,Sales,68,7,10,3,0.97,0,0.12
1,2,Operations,58,8,7,1,0.81,1,0.71
2,3,Operations,80,10,2,0,0.79,1,0.76
3,4,HR,76,15,7,1,0.94,1,0.88
4,5,HR,50,1,8,0,0.85,1,0.79


In [25]:
print(promotable[['employee_id','department','performance_score','years_experience','trainings_completed',
                 'previous_promotions','attendance_rate','promotion_probability']])

     employee_id  department  performance_score  years_experience  \
1              2  Operations                 58                 8   
2              3  Operations                 80                10   
3              4          HR                 76                15   
4              5          HR                 50                 1   
11            12     Finance                 52                 3   
..           ...         ...                ...               ...   
193          194          IT                 94                 9   
194          195          HR                 84                 3   
195          196          IT                 62                14   
196          197   Marketing                 83                10   
199          200          IT                 84                 3   

     trainings_completed  previous_promotions  attendance_rate  \
1                      7                    1             0.81   
2                      2               

In [26]:
data.columns

Index(['employee_id', 'department', 'performance_score', 'years_experience',
       'trainings_completed', 'previous_promotions', 'attendance_rate',
       'promoted', 'promotion_probability'],
      dtype='object')

In [27]:
print(promotable.head())

    employee_id  department  performance_score  years_experience  \
1             2  Operations                 58                 8   
2             3  Operations                 80                10   
3             4          HR                 76                15   
4             5          HR                 50                 1   
11           12     Finance                 52                 3   

    trainings_completed  previous_promotions  attendance_rate  promoted  \
1                     7                    1             0.81         1   
2                     2                    0             0.79         1   
3                     7                    1             0.94         1   
4                     8                    0             0.85         1   
11                    6                    3             0.78         1   

    promotion_probability  
1                    0.71  
2                    0.76  
3                    0.88  
4                    0.79  


In [28]:
promotable.reset_index(inplace=True)
print(promotable[['employee_id','department','promotion_probability']])

     employee_id  department  promotion_probability
0              2  Operations                   0.71
1              3  Operations                   0.76
2              4          HR                   0.88
3              5          HR                   0.79
4             12     Finance                   0.76
..           ...         ...                    ...
99           194          IT                   0.78
100          195          HR                   0.82
101          196          IT                   0.79
102          197   Marketing                   0.73
103          200          IT                   0.64

[104 rows x 3 columns]


In [29]:
data.columns.tolist()

['employee_id',
 'department',
 'performance_score',
 'years_experience',
 'trainings_completed',
 'previous_promotions',
 'attendance_rate',
 'promoted',
 'promotion_probability']

In [30]:
final_promotion = promotable[promotable['promotion_probability']<=0.75]

In [31]:
final_promotion =final_promotion.sort_values(by='promotion_probability',ascending=False)

In [32]:
final_promotion.reset_index(drop=True,inplace=True)

In [33]:
print('Final Promotion Eligible Employees')
print(final_promotion)

Final Promotion Eligible Employees
    index  employee_id  department  performance_score  years_experience  \
0      92           93   Marketing                 57                 2   
1     153          154   Marketing                 82                 8   
2     127          128     Finance                 93                 2   
3     182          183          HR                 84                 8   
4     173          174       Sales                 94                 1   
5     196          197   Marketing                 83                10   
6     169          170  Operations                 51                14   
7     116          117       Sales                 81                 9   
8     189          190   Marketing                 76                 7   
9       1            2  Operations                 58                 8   
10    122          123     Finance                 73                 5   
11     65           66       Sales                 66            

In [34]:
final_promotion.to_csv("final_promotions.csv",index=False)

In [35]:
promotable.reset_index(inplace=True)
print(promotable[['employee_id','department','promotion_probability']])

     employee_id  department  promotion_probability
0              2  Operations                   0.71
1              3  Operations                   0.76
2              4          HR                   0.88
3              5          HR                   0.79
4             12     Finance                   0.76
..           ...         ...                    ...
99           194          IT                   0.78
100          195          HR                   0.82
101          196          IT                   0.79
102          197   Marketing                   0.73
103          200          IT                   0.64

[104 rows x 3 columns]
