In [None]:
import pandas as pd

In [None]:
# Load the HR dataset; the path is relative to the current working directory.
csv_path = 'HR_comma_sep.csv'
df = pd.read_csv(csv_path)

In [None]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(14999, 10)

In [None]:
# Column labels available in the raw data.
df.columns

Index(['satisfaction_level', 'last_evaluation', 'number_project',
       'average_montly_hours', 'time_spend_company', 'Work_accident', 'left',
       'promotion_last_5years', 'Department', 'salary'],
      dtype='object')

In [None]:
# Per-column dtypes; object columns hold strings and need encoding before modelling.
df.dtypes

satisfaction_level       float64
last_evaluation          float64
number_project             int64
average_montly_hours       int64
time_spend_company         int64
Work_accident              int64
left                       int64
promotion_last_5years      int64
Department                object
salary                    object
dtype: object

# Target column -> Salary
- Label encode salary column

In [None]:
# Distinct values of the target column (salary).
df['salary'].unique()

array(['low', 'medium', 'high'], dtype=object)

In [None]:
# Distinct values of the Department column.
df['Department'].unique()

array(['sales', 'accounting', 'hr', 'technical', 'support', 'management',
       'IT', 'product_mng', 'marketing', 'RandD'], dtype=object)

In [None]:
# Count missing values per column (isnull is an alias of isna).
df.isnull().sum()

satisfaction_level       0
last_evaluation          0
number_project           0
average_montly_hours     0
time_spend_company       0
Work_accident            0
left                     0
promotion_last_5years    0
Department               0
salary                   0
dtype: int64

In [None]:
# Integer codes for the salary levels, assigned in list order: low=0, medium=1, high=2.
# The outer key is the column name, which is the nested form DataFrame.replace expects.
salary_levels = ['low', 'medium', 'high']
label_map = {"salary": {level: code for code, level in enumerate(salary_levels)}}

In [None]:
# Encode the salary strings as integers using label_map.
# - The result is assigned back instead of mutating with inplace=True.
# - The explicit cast pins `salary` to int64: pandas 2.2 deprecates the silent
#   object -> int downcast inside replace(), and without it the column can stay
#   object and be one-hot encoded by a later get_dummies call.
df = df.replace(label_map).astype({'salary': 'int64'})

In [None]:
# Re-check dtypes after encoding: salary should now be an integer column.
df.dtypes

satisfaction_level       float64
last_evaluation          float64
number_project             int64
average_montly_hours       int64
time_spend_company         int64
Work_accident              int64
left                       int64
promotion_last_5years      int64
Department                object
salary                     int64
dtype: object

In [None]:
# One-hot encode only the Department column.
# - Naming the column explicitly keeps the target (`salary`) from being expanded
#   if its dtype ever ends up non-numeric.
# - dtype=int keeps the dummy columns numeric; recent pandas defaults to bool,
#   which DataFrame.describe() leaves out of its default numeric summary.
df_ohe = pd.get_dummies(df, columns=['Department'], dtype=int)

In [None]:
# Shape after encoding: Department is replaced by one indicator column per department.
df_ohe.shape

(14999, 19)

In [None]:
# Column labels after one-hot encoding (Department_* indicators appended at the end).
df_ohe.columns

Index(['satisfaction_level', 'last_evaluation', 'number_project',
       'average_montly_hours', 'time_spend_company', 'Work_accident', 'left',
       'promotion_last_5years', 'salary', 'Department_IT', 'Department_RandD',
       'Department_accounting', 'Department_hr', 'Department_management',
       'Department_marketing', 'Department_product_mng', 'Department_sales',
       'Department_support', 'Department_technical'],
      dtype='object')

# X & Y split

In [None]:
# Target is the encoded salary; every other column is a predictor.
Y = df_ohe['salary']
X = df_ohe.drop(columns='salary')

In [None]:
# Shapes of the feature matrix and the target vector.
tuple(part.shape for part in (X, Y))

((14999, 18), (14999,))

# Train - Test split


In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# 70/30 split with a fixed seed; stratify=Y splits in a stratified fashion
# using the target as class labels.
split_options = dict(test_size=0.3, random_state=34, stratify=Y)
X_train, X_test, y_train, y_test = train_test_split(X, Y, **split_options)

In [None]:
# Shapes of the four split pieces, in order: X_train, X_test, y_train, y_test.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((10499, 18), (4500, 18), (10499,), (4500,))

In [None]:
# Summary statistics of the training features.
X_train.describe()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,Department_IT,Department_RandD,Department_accounting,Department_hr,Department_management,Department_marketing,Department_product_mng,Department_sales,Department_support,Department_technical
count,10499.0,10499.0,10499.0,10499.0,10499.0,10499.0,10499.0,10499.0,10499.0,10499.0,10499.0,10499.0,10499.0,10499.0,10499.0,10499.0,10499.0,10499.0
mean,0.613078,0.716288,3.804267,201.292599,3.496523,0.142585,0.237165,0.020859,0.081341,0.055148,0.050195,0.050481,0.042385,0.056196,0.061149,0.274026,0.145347,0.183732
std,0.249904,0.170354,1.234728,49.972005,1.461991,0.349666,0.425365,0.142919,0.273371,0.22828,0.218358,0.218946,0.201475,0.23031,0.239614,0.446043,0.352467,0.387284
min,0.09,0.36,2.0,96.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.44,0.56,3.0,156.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.65,0.72,4.0,201.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.82,0.87,5.0,245.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
max,1.0,1.0,7.0,310.0,10.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


# Apply random forest

In [None]:
# Class proportions in the training target; normalize=True returns fractions
# instead of raw counts.
y_train.value_counts(normalize = True)

0    0.487761
1    0.429755
2    0.082484
Name: salary, dtype: float64

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Using a RandomForest classifier (not a regressor) because the target variable is categorical.

In [None]:
# Baseline forest: 200 trees, balanced class weights, out-of-bag scoring
# enabled, fixed seed.
rf_params = {
    'n_estimators': 200,
    'oob_score': True,
    'class_weight': 'balanced',
    'random_state': 34,
}
rf = RandomForestClassifier(**rf_params)

In [None]:
# Fit the baseline forest on the training split.
rf.fit(X_train,y_train)

In [None]:
# Predict salary classes for the held-out test features.
# Note: later cells reuse the name y_pred for other models' predictions.
y_pred = rf.predict(X_test)

In [None]:
from sklearn.metrics import classification_report

In [None]:
# Per-class precision / recall / F1 of the predictions against the true test labels.
print(classification_report(y_test,y_pred))


              precision    recall  f1-score   support

           0       0.62      0.71      0.66      2195
           1       0.60      0.57      0.59      1934
           2       0.76      0.33      0.46       371

    accuracy                           0.62      4500
   macro avg       0.66      0.53      0.57      4500
weighted avg       0.62      0.62      0.61      4500



In [None]:
from xgboost import XGBClassifier

In [None]:
# XGBoost model with the same tree count and seed as the baseline forest.
xgb = XGBClassifier(
    n_estimators=200,
    random_state=34,
)

In [None]:
# Fit XGBoost on the one-hot encoded training split.
xgb.fit(X_train,y_train)

In [None]:
# Predict on the test split; this overwrites the RandomForest predictions held in y_pred.
y_pred = xgb.predict(X_test)

In [None]:
# Classification report for whatever model last wrote y_pred (XGBoost when run in order).
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.61      0.67      0.64      2195
           1       0.57      0.57      0.57      1934
           2       0.63      0.26      0.37       371

    accuracy                           0.59      4500
   macro avg       0.60      0.50      0.53      4500
weighted avg       0.60      0.59      0.59      4500



In [None]:
!pip install catboost



In [None]:
import catboost

In [None]:
from catboost import CatBoostClassifier

In [None]:
# CatBoost on the one-hot encoded features: 200 boosting iterations, fixed seed.
# verbose=False silences the per-iteration training log that otherwise floods
# the cell output.
# NOTE(review): unlike the RandomForest baseline, no class weighting is applied.
# If it is wanted, CatBoost's option is auto_class_weights='Balanced'
# (class_weights expects explicit per-class values) - confirm before enabling.
cb = CatBoostClassifier(iterations=200,
                        random_seed=34,
                        verbose=False)

In [None]:
# Fit CatBoost on the one-hot encoded training split.
cb.fit(X_train,y_train)

Learning rate set to 0.347561
0:	learn: 1.0036557	total: 5.72ms	remaining: 1.14s
1:	learn: 0.9539722	total: 9.25ms	remaining: 916ms
2:	learn: 0.9217084	total: 14.5ms	remaining: 953ms
3:	learn: 0.9054096	total: 23.5ms	remaining: 1.15s
4:	learn: 0.8933176	total: 29.9ms	remaining: 1.16s
5:	learn: 0.8853531	total: 35.7ms	remaining: 1.15s
6:	learn: 0.8801713	total: 40.6ms	remaining: 1.12s
7:	learn: 0.8766862	total: 45.8ms	remaining: 1.1s
8:	learn: 0.8749274	total: 51.2ms	remaining: 1.09s
9:	learn: 0.8728102	total: 56.6ms	remaining: 1.08s
10:	learn: 0.8714153	total: 61.9ms	remaining: 1.06s
11:	learn: 0.8694469	total: 69.9ms	remaining: 1.09s
12:	learn: 0.8678239	total: 75.8ms	remaining: 1.09s
13:	learn: 0.8677965	total: 79.6ms	remaining: 1.06s
14:	learn: 0.8660872	total: 85.3ms	remaining: 1.05s
15:	learn: 0.8632901	total: 91ms	remaining: 1.05s
16:	learn: 0.8610091	total: 96.7ms	remaining: 1.04s
17:	learn: 0.8600526	total: 102ms	remaining: 1.03s
18:	learn: 0.8582094	total: 107ms	remaining: 1.0

<catboost.core.CatBoostClassifier at 0x7b9511f32d10>

In [None]:
# Predict on the test split; this overwrites the previous model's predictions in y_pred.
y_pred = cb.predict(X_test)

In [None]:
# Classification report for whatever model last wrote y_pred (CatBoost when run in order).
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.56      0.67      0.61      2195
           1       0.52      0.47      0.49      1934
           2       0.52      0.12      0.20       371

    accuracy                           0.54      4500
   macro avg       0.53      0.42      0.43      4500
weighted avg       0.54      0.54      0.53      4500



# CatBoost without one-hot encoding (Department passed as a native categorical feature; salary stays label-encoded)

In [None]:
# Columns of the non-one-hot frame: Department is still a single string column.
df.columns

Index(['satisfaction_level', 'last_evaluation', 'number_project',
       'average_montly_hours', 'time_spend_company', 'Work_accident', 'left',
       'promotion_last_5years', 'Department', 'salary'],
      dtype='object')

In [None]:
# Peek at the target column of df to confirm it holds the integer codes.
df['salary'].head()

0    0
1    1
2    1
3    0
4    0
Name: salary, dtype: int64

In [None]:
# Same target/predictor separation as before, but taken from the frame
# that was not one-hot encoded.
Y1 = df['salary']
X1 = df.drop(columns='salary')

In [None]:
# Shapes of the raw-feature matrix and its target vector.
tuple(part.shape for part in (X1, Y1))

((14999, 9), (14999,))

In [None]:
# Same 70/30 split settings and seed as the one-hot split, applied to the
# raw features and stratified on Y1.
raw_split_options = dict(test_size=0.3, random_state=34, stratify=Y1)
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, Y1, **raw_split_options)

In [None]:
# Shapes of the four raw-feature split pieces, in order:
# X1_train, X1_test, y1_train, y1_test.
tuple(part.shape for part in (X1_train, X1_test, y1_train, y1_test))

((10499, 9), (4500, 9), (10499,), (4500,))

In [None]:
# CatBoost on the non-one-hot frame: Department is declared as a categorical
# feature so CatBoost encodes it itself.
# verbose=False silences the per-iteration training log that otherwise floods
# the cell output.
cbc_raw = CatBoostClassifier(
    iterations=200,
    random_seed=34,
    cat_features=['Department'],
    verbose=False,
)

In [None]:
# Fit CatBoost on the raw-feature training split (Department still a string column).
cbc_raw.fit(X1_train,y1_train)

Learning rate set to 0.347561
0:	learn: 1.0041301	total: 34.3ms	remaining: 6.83s
1:	learn: 0.9532949	total: 64.2ms	remaining: 6.35s
2:	learn: 0.9247607	total: 90.1ms	remaining: 5.92s
3:	learn: 0.9074776	total: 103ms	remaining: 5.05s
4:	learn: 0.9000673	total: 109ms	remaining: 4.26s
5:	learn: 0.8910541	total: 123ms	remaining: 3.97s
6:	learn: 0.8862566	total: 144ms	remaining: 3.97s
7:	learn: 0.8827210	total: 159ms	remaining: 3.82s
8:	learn: 0.8811974	total: 172ms	remaining: 3.66s
9:	learn: 0.8795398	total: 187ms	remaining: 3.56s
10:	learn: 0.8778436	total: 207ms	remaining: 3.55s
11:	learn: 0.8762815	total: 221ms	remaining: 3.46s
12:	learn: 0.8742436	total: 235ms	remaining: 3.39s
13:	learn: 0.8726757	total: 253ms	remaining: 3.37s
14:	learn: 0.8702712	total: 285ms	remaining: 3.51s
15:	learn: 0.8702184	total: 292ms	remaining: 3.36s
16:	learn: 0.8686634	total: 308ms	remaining: 3.31s
17:	learn: 0.8675304	total: 323ms	remaining: 3.26s
18:	learn: 0.8667250	total: 337ms	remaining: 3.21s
19:	lear

<catboost.core.CatBoostClassifier at 0x7b9511f74ee0>

In [None]:
# Predict on the raw-feature test split; kept in y1_pred, separate from y_pred.
y1_pred = cbc_raw.predict(X1_test)

In [None]:
# Classification report for the raw-feature CatBoost predictions against y1_test.
print(classification_report(y1_test,y1_pred))

              precision    recall  f1-score   support

           0       0.56      0.66      0.60      2195
           1       0.52      0.49      0.50      1934
           2       0.56      0.12      0.20       371

    accuracy                           0.54      4500
   macro avg       0.55      0.42      0.43      4500
weighted avg       0.54      0.54      0.53      4500



In [None]:
# Exercise: with Random Forest, build at least 3 models that vary
# n_estimators, max_samples, max_features and max_depth.

In [None]:
# Variant 1: 300 shallow trees (max_depth=8), each fit on a 40% sample of the
# training rows and considering 6 features per split.
# random_state is fixed (same seed as the baseline forest) so the reported
# metrics are reproducible across runs.
rf1 = RandomForestClassifier(
    n_estimators=300,
    max_samples=0.4,
    max_features=6,
    max_depth=8,
    random_state=34,
)

In [None]:
# Fit variant 1 on the one-hot encoded training split.
rf1.fit(X_train,y_train)

In [None]:
# Predict on the test split; this overwrites the previous model's predictions in y_pred.
y_pred = rf1.predict(X_test)

In [None]:
# Classification report for whatever model last wrote y_pred (rf1 when run in order).
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.52      0.86      0.64      2195
           1       0.53      0.22      0.31      1934
           2       0.62      0.09      0.16       371

    accuracy                           0.52      4500
   macro avg       0.55      0.39      0.37      4500
weighted avg       0.53      0.52      0.46      4500



In [None]:
# Variant 2: 500 deeper trees (max_depth=12), each fit on a 60% sample of the
# training rows and considering 8 features per split.
# random_state is fixed (same seed as the baseline forest) so the reported
# metrics are reproducible across runs.
rf2 = RandomForestClassifier(
    n_estimators=500,
    max_samples=0.6,
    max_features=8,
    max_depth=12,
    random_state=34,
)

In [None]:
# Fit variant 2 on the one-hot encoded training split.
rf2.fit(X_train,y_train)

In [None]:
# Predict on the test split; this overwrites the previous model's predictions in y_pred.
y_pred = rf2.predict(X_test)

In [None]:
# Classification report for whatever model last wrote y_pred (rf2 when run in order).
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.55      0.79      0.65      2195
           1       0.57      0.39      0.46      1934
           2       0.65      0.14      0.23       371

    accuracy                           0.56      4500
   macro avg       0.59      0.44      0.45      4500
weighted avg       0.57      0.56      0.53      4500



In [None]:
# Variant 3: 750 shallow trees (max_depth=7), each fit on a 70% sample of the
# training rows, considering 17 features per split, with balanced class weights
# and out-of-bag scoring enabled.
# random_state is fixed (same seed as the baseline forest) so the reported
# metrics are reproducible across runs.
rf3 = RandomForestClassifier(
    n_estimators=750,
    max_samples=0.7,
    max_features=17,
    oob_score=True,
    max_depth=7,
    class_weight='balanced',
    random_state=34,
)

In [None]:
# Fit variant 3 on the one-hot encoded training split.
rf3.fit(X_train,y_train)

In [None]:
# Predict on the test split; this overwrites the previous model's predictions in y_pred.
y_pred = rf3.predict(X_test)

In [None]:
# Classification report for whatever model last wrote y_pred (rf3 when run in order).
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.64      0.29      0.40      2195
           1       0.47      0.58      0.52      1934
           2       0.14      0.43      0.21       371

    accuracy                           0.43      4500
   macro avg       0.42      0.43      0.38      4500
weighted avg       0.52      0.43      0.44      4500

