In [1]:
import pandas as pd
import numpy as np

In [2]:
df= pd.read_csv("train.csv")
df.head(2)

Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,awards_won?,avg_training_score,is_promoted
0,65438,Sales & Marketing,region_7,Master's & above,f,sourcing,1,35,5.0,8,0,49,0
1,65141,Operations,region_22,Bachelor's,m,other,1,30,5.0,4,0,60,0


In [3]:
df= df.drop(["employee_id","recruitment_channel","region"],axis=1)

In [5]:
df.shape

(54808, 10)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54808 entries, 0 to 54807
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   department            54808 non-null  object 
 1   education             52399 non-null  object 
 2   gender                54808 non-null  object 
 3   no_of_trainings       54808 non-null  int64  
 4   age                   54808 non-null  int64  
 5   previous_year_rating  50684 non-null  float64
 6   length_of_service     54808 non-null  int64  
 7   awards_won?           54808 non-null  int64  
 8   avg_training_score    54808 non-null  int64  
 9   is_promoted           54808 non-null  int64  
dtypes: float64(1), int64(6), object(3)
memory usage: 4.2+ MB


## Dealing with Missing values


In [7]:
df.isna().sum()

department                 0
education               2409
gender                     0
no_of_trainings            0
age                        0
previous_year_rating    4124
length_of_service          0
awards_won?                0
avg_training_score         0
is_promoted                0
dtype: int64

In [18]:
df['education']=df['education'].fillna("Bachelor's")

In [28]:
df['previous_year_rating'].median() 

3.0

In [30]:
df['previous_year_rating']= df['previous_year_rating'].fillna(3)

In [31]:
df.isna().sum()

department              0
education               0
gender                  0
no_of_trainings         0
age                     0
previous_year_rating    0
length_of_service       0
awards_won?             0
avg_training_score      0
is_promoted             0
dtype: int64

## Segregation of Data

In [33]:
df.describe()

Unnamed: 0,no_of_trainings,age,previous_year_rating,length_of_service,awards_won?,avg_training_score,is_promoted
count,54808.0,54808.0,54808.0,54808.0,54808.0,54808.0,54808.0
mean,1.253011,34.803915,3.304481,5.865512,0.023172,63.38675,0.08517
std,0.609264,7.660169,1.21477,4.265094,0.15045,13.371559,0.279137
min,1.0,20.0,1.0,1.0,0.0,39.0,0.0
25%,1.0,29.0,3.0,3.0,0.0,51.0,0.0
50%,1.0,33.0,3.0,5.0,0.0,60.0,0.0
75%,1.0,39.0,4.0,7.0,0.0,76.0,0.0
max,10.0,60.0,5.0,37.0,1.0,99.0,1.0


In [38]:
X= df.drop("is_promoted", axis=1)
Y= df['is_promoted']


In [40]:
#Train test split

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test= train_test_split(X,Y,test_size=0.2, random_state=0)
x_train.shape

(43846, 9)

In [41]:
x_train.columns

Index(['department', 'education', 'gender', 'no_of_trainings', 'age',
       'previous_year_rating', 'length_of_service', 'awards_won?',
       'avg_training_score'],
      dtype='object')

In [44]:
x_train.dtypes

department               object
education                object
gender                   object
no_of_trainings           int64
age                       int64
previous_year_rating    float64
length_of_service         int64
awards_won?               int64
avg_training_score        int64
dtype: object

## Transforming columns (Data Preprocessing)

In [45]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression

cf= ColumnTransformer([('trf',OneHotEncoder(sparse=False,drop='first'), 
['department',"education", "gender"])],remainder='passthrough')

In [79]:
from sklearn.neighbors import KNeighborsClassifier #best
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier  #best
from sklearn.ensemble import  GradientBoostingClassifier 

from sklearn.pipeline import Pipeline

In [80]:
pipe= Pipeline(steps=[
    ('step1',cf),
    ('step2', GradientBoostingClassifier())
])

In [81]:
pipe.fit(x_train,y_train)

Pipeline(steps=[('step1',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('trf',
                                                  OneHotEncoder(drop='first',
                                                                sparse=False),
                                                  ['department', 'education',
                                                   'gender'])])),
                ('step2', GradientBoostingClassifier())])

In [82]:
y_pred= pipe.predict(x_test)

In [95]:
from sklearn.metrics import accuracy_score
print("Accuracy: ",accuracy_score(y_pred, y_test)*100,"%")

Accuracy:  94.21638387155629 %


In [96]:
from sklearn.metrics import precision_score

# Assuming you have true labels (y_true) and predicted labels (y_pred)
precision = precision_score( y_test, y_pred)
print("Precision:", precision)


Precision: 0.9704918032786886


In [149]:
import pickle
pickle.dump(pipe, open("classifier.pkl", 'wb'))

In [148]:
pipe.predict(pd.DataFrame({"department":['Procurement'],"education":["Bachelor's"],"gender":['m'],"no_of_trainings":[1],
             "age":[25],"previous_year_rating":[4.6],"length_of_service":[3],"awards_won?":[0],"avg_training_score":[89]}))

array([1], dtype=int64)

In [97]:
x_train.columns

Index(['department', 'education', 'gender', 'no_of_trainings', 'age',
       'previous_year_rating', 'length_of_service', 'awards_won?',
       'avg_training_score'],
      dtype='object')

In [114]:
df[df['is_promoted']==1].tail(10)

Unnamed: 0,department,education,gender,no_of_trainings,age,previous_year_rating,length_of_service,awards_won?,avg_training_score,is_promoted
54695,Operations,Bachelor's,f,2,52,5.0,18,0,56,1
54713,Procurement,Bachelor's,m,1,30,4.0,4,0,67,1
54720,Analytics,Bachelor's,m,1,29,2.0,3,0,88,1
54722,Procurement,Master's & above,m,1,34,5.0,2,0,72,1
54730,Sales & Marketing,Bachelor's,m,1,29,5.0,4,0,58,1
54734,Operations,Bachelor's,m,1,31,3.0,1,0,56,1
54757,Technology,Master's & above,m,1,54,4.0,7,0,81,1
54761,Procurement,Bachelor's,f,1,30,4.0,2,0,86,1
54792,Sales & Marketing,Bachelor's,m,1,59,3.0,11,0,65,1
54796,Sales & Marketing,Master's & above,m,1,34,3.0,7,0,60,1
