In [25]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.model_selection import train_test_split

In [26]:
# Read the employee dataset from the working directory and preview the first rows.
df = pd.read_csv('Employee.csv')
df.head()

Unnamed: 0,Education,JoiningYear,City,PaymentTier,Age,Gender,EverBenched,ExperienceInCurrentDomain,LeaveOrNot
0,Bachelors,2017,Bangalore,3,34,Male,No,0,0
1,Bachelors,2013,Pune,1,28,Female,No,3,1
2,Bachelors,2014,New Delhi,3,38,Female,No,2,0
3,Masters,2016,Bangalore,3,27,Male,No,5,1
4,Masters,2017,Pune,3,24,Male,Yes,2,1


In [27]:
# (rows, columns) of the frame as loaded.
df.shape

(4653, 9)

In [28]:
# Column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4653 entries, 0 to 4652
Data columns (total 9 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   Education                  4653 non-null   object
 1   JoiningYear                4653 non-null   int64 
 2   City                       4653 non-null   object
 3   PaymentTier                4653 non-null   int64 
 4   Age                        4653 non-null   int64 
 5   Gender                     4653 non-null   object
 6   EverBenched                4653 non-null   object
 7   ExperienceInCurrentDomain  4653 non-null   int64 
 8   LeaveOrNot                 4653 non-null   int64 
dtypes: int64(5), object(4)
memory usage: 327.3+ KB


In [29]:
# Number of missing values in each column.
df.isnull().sum()

Education                    0
JoiningYear                  0
City                         0
PaymentTier                  0
Age                          0
Gender                       0
EverBenched                  0
ExperienceInCurrentDomain    0
LeaveOrNot                   0
dtype: int64

In [30]:
# Number of rows that exactly repeat an earlier row (first occurrences are not counted).
df.duplicated().sum()

1889

In [31]:
# Remove exact duplicate rows by rebinding `df` rather than mutating the frame in place.
# NOTE(review): the columns listed by df.info() contain no employee identifier, so identical
# rows may describe different people — confirm that de-duplication is really intended.
df = df.drop_duplicates()

In [32]:
# (rows, columns) after duplicate rows were removed.
df.shape

(2764, 9)

In [33]:
# Summary statistics for the numeric columns.
df.describe()

Unnamed: 0,JoiningYear,PaymentTier,Age,ExperienceInCurrentDomain,LeaveOrNot
count,2764.0,2764.0,2764.0,2764.0,2764.0
mean,2015.090449,2.636035,30.952967,2.644356,0.393632
std,1.885943,0.624001,5.108872,1.61061,0.488643
min,2012.0,1.0,22.0,0.0,0.0
25%,2013.0,2.0,27.0,1.0,0.0
50%,2015.0,3.0,30.0,2.0,0.0
75%,2017.0,3.0,35.0,4.0,1.0
max,2018.0,3.0,41.0,7.0,1.0


In [34]:
# Work on a copy so that later changes to `data` do not touch `df`.
data = df.copy()

In [35]:
# Partition column names by dtype: object columns are categorical, all others numeric.
categorical_columns = [name for name in data.columns if data[name].dtype == 'object']
numerical_columns = [name for name in data.columns if not (data[name].dtype == 'object')]

In [36]:
# Print the level frequencies of every categorical column, one block per column.
for name in categorical_columns:
    counts = data[name].value_counts()
    print(f'{name} : {counts}')
    print('---------------------------------------------')

Education : Education
Bachelors    1971
Masters       637
PHD           156
Name: count, dtype: int64
---------------------------------------------
City : City
Bangalore    1171
Pune          801
New Delhi     792
Name: count, dtype: int64
---------------------------------------------
Gender : Gender
Male      1529
Female    1235
Name: count, dtype: int64
---------------------------------------------
EverBenched : EverBenched
No     2403
Yes     361
Name: count, dtype: int64
---------------------------------------------


In [37]:
# One-hot encode the categorical columns, dropping the first level of each as the baseline.
# Encoding from `df` (the frame `data` was copied from) keeps this cell re-runnable: feeding
# `data` back into itself raises on a second run because the source columns no longer exist.
data = pd.get_dummies(data = df, columns = categorical_columns, drop_first = True, dtype = 'int')

In [38]:
# Preview the frame after dummy encoding.
data.head()

Unnamed: 0,JoiningYear,PaymentTier,Age,ExperienceInCurrentDomain,LeaveOrNot,Education_Masters,Education_PHD,City_New Delhi,City_Pune,Gender_Male,EverBenched_Yes
0,2017,3,34,0,0,0,0,0,0,1,0
1,2013,1,28,3,1,0,0,0,1,0,0
2,2014,3,38,2,0,0,0,1,0,0,0
3,2016,3,27,5,1,1,0,0,0,1,0
4,2017,3,24,2,1,1,0,0,1,1,1


In [39]:
# (rows, columns) after dummy encoding.
data.shape

(2764, 11)

In [40]:
# Column names after dummy encoding.
data.columns

Index(['JoiningYear', 'PaymentTier', 'Age', 'ExperienceInCurrentDomain',
       'LeaveOrNot', 'Education_Masters', 'Education_PHD', 'City_New Delhi',
       'City_Pune', 'Gender_Male', 'EverBenched_Yes'],
      dtype='object')

In [41]:
# Class balance of the target column.
data['LeaveOrNot'].value_counts()

LeaveOrNot
0    1676
1    1088
Name: count, dtype: int64

In [42]:
# Separate the binary target from the predictor columns.
y = data['LeaveOrNot']
X = data.drop(columns='LeaveOrNot')

In [43]:
# Stratified 80/20 split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    stratify=y,
    random_state=101,
)

In [44]:
# Intercept-only design: one column of ones with a row per training observation.
x_s = np.ones(len(X_train))

In [45]:
# Null model: logistic regression with an intercept only, used as a baseline.
model = sm.Logit(endog=y_train, exog=x_s).fit()
model.summary()

Optimization terminated successfully.
         Current function value: 0.670282
         Iterations 4


0,1,2,3
Dep. Variable:,LeaveOrNot,No. Observations:,2211.0
Model:,Logit,Df Residuals:,2210.0
Method:,MLE,Df Model:,0.0
Date:,"Tue, 24 Sep 2024",Pseudo R-squ.:,1.72e-11
Time:,01:47:55,Log-Likelihood:,-1482.0
converged:,True,LL-Null:,-1482.0
Covariance Type:,nonrobust,LLR p-value:,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-0.4327,0.044,-9.939,0.000,-0.518,-0.347


In [46]:
# Model 1: intercept plus JoiningYear.
x1 = X_train.loc[:, ['JoiningYear']]
x1_1 = sm.add_constant(x1)
model1 = sm.Logit(endog=y_train, exog=x1_1).fit()
model1.summary()

Optimization terminated successfully.
         Current function value: 0.659206
         Iterations 6


0,1,2,3
Dep. Variable:,LeaveOrNot,No. Observations:,2211.0
Model:,Logit,Df Residuals:,2209.0
Method:,MLE,Df Model:,1.0
Date:,"Tue, 24 Sep 2024",Pseudo R-squ.:,0.01652
Time:,01:47:55,Log-Likelihood:,-1457.5
converged:,True,LL-Null:,-1482.0
Covariance Type:,nonrobust,LLR p-value:,2.589e-12

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-328.5572,47.458,-6.923,0.000,-421.573,-235.541
JoiningYear,0.1628,0.024,6.914,0.000,0.117,0.209


In [47]:
# Model 2: intercept plus JoiningYear and PaymentTier.
x2 = X_train.loc[:, ['JoiningYear', 'PaymentTier']]
x2_1 = sm.add_constant(x2)
model2 = sm.Logit(endog=y_train, exog=x2_1).fit()
model2.summary()

Optimization terminated successfully.
         Current function value: 0.652810
         Iterations 6


0,1,2,3
Dep. Variable:,LeaveOrNot,No. Observations:,2211.0
Model:,Logit,Df Residuals:,2208.0
Method:,MLE,Df Model:,2.0
Date:,"Tue, 24 Sep 2024",Pseudo R-squ.:,0.02607
Time:,01:47:55,Log-Likelihood:,-1443.4
converged:,True,LL-Null:,-1482.0
Covariance Type:,nonrobust,LLR p-value:,1.668e-17

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-316.7138,47.800,-6.626,0.000,-410.400,-223.028
JoiningYear,0.1574,0.024,6.638,0.000,0.111,0.204
PaymentTier,-0.3702,0.070,-5.323,0.000,-0.506,-0.234


In [48]:
# Model 3: intercept plus JoiningYear, PaymentTier and Age.
x3 = X_train.loc[:, ['JoiningYear', 'PaymentTier', 'Age']]
x3_1 = sm.add_constant(x3)
model3 = sm.Logit(endog=y_train, exog=x3_1).fit()
model3.summary()

Optimization terminated successfully.
         Current function value: 0.646499
         Iterations 6


0,1,2,3
Dep. Variable:,LeaveOrNot,No. Observations:,2211.0
Model:,Logit,Df Residuals:,2207.0
Method:,MLE,Df Model:,3.0
Date:,"Tue, 24 Sep 2024",Pseudo R-squ.:,0.03548
Time:,01:47:56,Log-Likelihood:,-1429.4
converged:,True,LL-Null:,-1482.0
Covariance Type:,nonrobust,LLR p-value:,1.202e-22

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-327.6193,48.180,-6.800,0.000,-422.049,-233.189
JoiningYear,0.1635,0.024,6.839,0.000,0.117,0.210
PaymentTier,-0.3521,0.070,-5.024,0.000,-0.489,-0.215
Age,-0.0467,0.009,-5.232,0.000,-0.064,-0.029


In [49]:
# Model 4: intercept plus the four original numeric predictors.
x4 = X_train.loc[:, ['JoiningYear', 'PaymentTier', 'Age', 'ExperienceInCurrentDomain']]
x4_1 = sm.add_constant(x4)
model4 = sm.Logit(endog=y_train, exog=x4_1).fit()
model4.summary()

Optimization terminated successfully.
         Current function value: 0.646295
         Iterations 6


0,1,2,3
Dep. Variable:,LeaveOrNot,No. Observations:,2211.0
Model:,Logit,Df Residuals:,2206.0
Method:,MLE,Df Model:,4.0
Date:,"Tue, 24 Sep 2024",Pseudo R-squ.:,0.03579
Time:,01:50:52,Log-Likelihood:,-1429.0
converged:,True,LL-Null:,-1482.0
Covariance Type:,nonrobust,LLR p-value:,5.001e-22

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-326.2162,48.207,-6.767,0.000,-420.700,-231.733
JoiningYear,0.1629,0.024,6.808,0.000,0.116,0.210
PaymentTier,-0.3513,0.070,-5.012,0.000,-0.489,-0.214
Age,-0.0474,0.009,-5.284,0.000,-0.065,-0.030
ExperienceInCurrentDomain,-0.0264,0.028,-0.951,0.342,-0.081,0.028


In [50]:
# Model 5: intercept plus every column of X_train (numeric predictors and dummies).
x5 = sm.add_constant(X_train)
model5 = sm.Logit(endog=y_train, exog=x5).fit()
model5.summary()

Optimization terminated successfully.
         Current function value: 0.610791
         Iterations 6


0,1,2,3
Dep. Variable:,LeaveOrNot,No. Observations:,2211.0
Model:,Logit,Df Residuals:,2200.0
Method:,MLE,Df Model:,10.0
Date:,"Tue, 24 Sep 2024",Pseudo R-squ.:,0.08876
Time:,01:52:45,Log-Likelihood:,-1350.5
converged:,True,LL-Null:,-1482.0
Covariance Type:,nonrobust,LLR p-value:,9.636e-51

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-339.5568,50.815,-6.682,0.000,-439.153,-239.961
JoiningYear,0.1695,0.025,6.719,0.000,0.120,0.219
PaymentTier,-0.1758,0.076,-2.326,0.020,-0.324,-0.028
Age,-0.0492,0.009,-5.228,0.000,-0.068,-0.031
ExperienceInCurrentDomain,-0.0305,0.029,-1.058,0.290,-0.087,0.026
Education_Masters,0.5077,0.120,4.217,0.000,0.272,0.744
Education_PHD,-0.3042,0.229,-1.330,0.184,-0.753,0.144
City_New Delhi,-0.5140,0.126,-4.077,0.000,-0.761,-0.267
City_Pune,0.4361,0.113,3.862,0.000,0.215,0.657


In [51]:
# Render floats with two decimals in NumPy output from here on (the setting is global),
# then show the fitted in-sample probabilities from model5.
np.set_printoptions(formatter={'float': '{0: 0.2f}'.format})
model5.predict()

array([ 0.33,  0.41,  0.32, ...,  0.54,  0.28,  0.58])

In [53]:
# Observed training labels as an array, for comparison with the fitted probabilities.
np.array(y_train)

array([0, 1, 0, ..., 1, 1, 1], dtype=int64)

In [54]:
# In-sample prediction table for model5 at a 0.5 probability cut-off.
model5.pred_table(threshold = 0.5)

array([[ 1134.00,  207.00],
       [ 492.00,  378.00]])

In [55]:
# Wrap the in-sample prediction table in a labelled frame (rows: actual, columns: predicted).
cm_df = pd.DataFrame(
    model5.pred_table(threshold=0.5),
    index=['Actual 0', 'Actual 1'],
    columns=['Predicted 0', 'Predicted 1'],
)
cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,1134.0,207.0
Actual 1,492.0,378.0


In [56]:
# Training accuracy: correctly classified rows (the diagonal) divided by all rows.
cm = cm_df.to_numpy()
train_accuracy = np.trace(cm) / cm.sum()
train_accuracy

0.683853459972863

In [57]:
def confusion_matrix(test_data, actual_values, model, threshold=0.5):
    """Build a 2x2 confusion matrix and accuracy for a fitted binary model.

    Parameters
    ----------
    test_data : array-like
        Design matrix handed unchanged to ``model.predict``.
    actual_values : array-like
        Observed 0/1 labels, one per prediction.
    model : object
        Anything exposing ``predict(test_data)`` that returns probabilities in [0, 1].
    threshold : float, default 0.5
        Probabilities at or above this value are counted as class 1.

    Returns
    -------
    cm : numpy.ndarray of shape (2, 2)
        ``cm[i, j]`` is the number of rows with actual class ``i`` and predicted class ``j``.
    accuracy : float
        Share of correctly classified rows, rounded to four decimals (NaN when no rows
        were counted).

    Raises
    ------
    ValueError
        If ``threshold`` is not strictly between 0 and 1.
    """
    if not 0 < threshold < 1:
        raise ValueError('threshold must lie strictly between 0 and 1')
    pred_values = model.predict(test_data)
    # Labels are always split at 0.5; only the probability axis follows the cut-off.
    label_edges = np.array([0, 0.5, 1])
    prob_edges = np.array([0, threshold, 1])
    cm = np.histogram2d(actual_values, pred_values, bins=[label_edges, prob_edges])[0]
    total = cm.sum()
    # Guard the empty case so it yields NaN without a divide-by-zero warning.
    accuracy = round(np.trace(cm) / total, 4) if total else float('nan')
    return cm, accuracy

In [59]:
# Always prepend the intercept column. With the default has_constant='skip', add_constant
# returns the data unchanged when it detects an existing constant column, which would leave
# the test design one column short of what model5 was fitted on.
X_test_t = sm.add_constant(X_test, has_constant='add')

In [60]:
# Evaluate model5 on the held-out rows; `cm` is a (confusion matrix, accuracy) tuple.
cm = confusion_matrix(X_test_t, y_test, model5)
cm

(array([[ 265.00,  70.00],
        [ 112.00,  106.00]]),
 0.6709)

In [61]:
# Label the matrix part of the (matrix, accuracy) tuple (rows: actual, columns: predicted).
cm_df = pd.DataFrame(
    cm[0],
    index=['Actual 0', 'Actual 1'],
    columns=['Predicted 0', 'Predicted 1'],
)
cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,265.0,70.0
Actual 1,112.0,106.0


In [62]:
# Test-set misclassification rate: off-diagonal counts divided by all counted rows.
cm_df_np = np.array(cm_df)
print('Misclassification rate:', (cm_df_np[0, 1] + cm_df_np[1, 0]) / cm_df_np.sum())

Missclassificatio rate: 0.3291139240506329
