In [1]:
# import library
import pandas as pd
from collections import Counter

# ML library
from sklearn import svm, preprocessing
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier

# evaluation metrics
from sklearn.metrics import classification_report, accuracy_score

import warnings
warnings.filterwarnings('ignore')

In [2]:
data = pd.read_csv('https://raw.githubusercontent.com/Rietaros/kampus_merdeka/main/HR_Dataset_DS.csv')

In [3]:
data.head(3)

Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted
0,65438,Sales & Marketing,region_7,Master's & above,f,sourcing,1,35,5.0,8,1,0,49,0
1,65141,Operations,region_22,Bachelor's,m,other,1,30,5.0,4,0,0,60,0
2,7513,Sales & Marketing,region_19,Bachelor's,m,sourcing,1,34,3.0,7,0,0,50,0


In [4]:
data.isnull().sum()

employee_id                0
department                 0
region                     0
education               2409
gender                     0
recruitment_channel        0
no_of_trainings            0
age                        0
previous_year_rating    4124
length_of_service          0
KPIs_met >80%              0
awards_won?                0
avg_training_score         0
is_promoted                0
dtype: int64

In [5]:
data['education'].value_counts()

education
Bachelor's          36669
Master's & above    14925
Below Secondary       805
Name: count, dtype: int64

In [18]:
data['previous_year_rating'].value_counts()

previous_year_rating
3.0    18618
5.0    11741
4.0     9877
1.0     6223
2.0     4225
Name: count, dtype: int64

In [3]:
data_clean = data.dropna()

In [7]:
data_clean.isnull().sum()

employee_id             0
department              0
region                  0
education               0
gender                  0
recruitment_channel     0
no_of_trainings         0
age                     0
previous_year_rating    0
length_of_service       0
KPIs_met >80%           0
awards_won?             0
avg_training_score      0
is_promoted             0
dtype: int64

In [6]:
data_clean['is_promoted'].value_counts()

is_promoted
0    44428
1     4232
Name: count, dtype: int64

In [4]:
# Select desired columns
data1 = data_clean[["department","education","gender","recruitment_channel",
             "no_of_trainings","age","previous_year_rating","length_of_service","awards_won?","avg_training_score","is_promoted"]]

In [5]:
data_encoded = pd.get_dummies(data1, prefix_sep="_")
data_encoded = data_encoded.dropna()
data_encoded.head()

Unnamed: 0,no_of_trainings,age,previous_year_rating,length_of_service,awards_won?,avg_training_score,is_promoted,department_Analytics,department_Finance,department_HR,...,department_Sales & Marketing,department_Technology,education_Bachelor's,education_Below Secondary,education_Master's & above,gender_f,gender_m,recruitment_channel_other,recruitment_channel_referred,recruitment_channel_sourcing
0,1,35,5.0,8,0,49,0,False,False,False,...,True,False,False,False,True,True,False,False,False,True
1,1,30,5.0,4,0,60,0,False,False,False,...,False,False,True,False,False,False,True,True,False,False
2,1,34,3.0,7,0,50,0,False,False,False,...,True,False,True,False,False,False,True,False,False,True
3,2,39,1.0,10,0,50,0,False,False,False,...,True,False,True,False,False,False,True,True,False,False
4,1,45,3.0,2,0,73,0,False,False,False,...,False,True,True,False,False,False,True,True,False,False


In [11]:
data_encoded.info()

<class 'pandas.core.frame.DataFrame'>
Index: 48660 entries, 0 to 54807
Data columns (total 24 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   no_of_trainings               48660 non-null  int64  
 1   age                           48660 non-null  int64  
 2   previous_year_rating          48660 non-null  float64
 3   length_of_service             48660 non-null  int64  
 4   awards_won?                   48660 non-null  int64  
 5   avg_training_score            48660 non-null  int64  
 6   is_promoted                   48660 non-null  int64  
 7   department_Analytics          48660 non-null  bool   
 8   department_Finance            48660 non-null  bool   
 9   department_HR                 48660 non-null  bool   
 10  department_Legal              48660 non-null  bool   
 11  department_Operations         48660 non-null  bool   
 12  department_Procurement        48660 non-null  bool   
 13  depart

In [6]:
# merubah dari boolan menjadi integer
columns_to_convert = [
    'department_Analytics', 'department_Finance', 'department_HR', 'department_Legal',
    'department_Operations', 'department_Procurement', 'department_R&D',
    'department_Sales & Marketing', 'department_Technology', 'education_Bachelor\'s',
    'education_Below Secondary', 'education_Master\'s & above', 'gender_f', 'gender_m',
    'recruitment_channel_other', 'recruitment_channel_referred', 'recruitment_channel_sourcing'
]
data_encoded[columns_to_convert] = data_encoded[columns_to_convert].astype(int)

In [10]:
data_encoded.head(3)

Unnamed: 0,no_of_trainings,age,previous_year_rating,length_of_service,awards_won?,avg_training_score,is_promoted,department_Analytics,department_Finance,department_HR,...,department_Sales & Marketing,department_Technology,education_Bachelor's,education_Below Secondary,education_Master's & above,gender_f,gender_m,recruitment_channel_other,recruitment_channel_referred,recruitment_channel_sourcing
0,1,35,5.0,8,0,49,0,0,0,0,...,1,0,0,0,1,1,0,0,0,1
1,1,30,5.0,4,0,60,0,0,0,0,...,0,0,1,0,0,0,1,1,0,0
2,1,34,3.0,7,0,50,0,0,0,0,...,1,0,1,0,0,0,1,0,0,1


In [7]:
x = data_encoded.drop(columns="is_promoted")
y = data_encoded["is_promoted"]

In [8]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=30)


In [14]:
from imblearn.over_sampling import SMOTE
subset_size = 1000  # Misalnya 1000 sampel
X_train_subset, _, y_train_subset, _ = train_test_split(X_train, y_train, train_size=subset_size, random_state=42)

smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_subset, y_train_subset)

In [15]:
from imblearn.over_sampling import SMOTE


print("Distribusi kelas setelah SMOTE:")
print(Counter(y_train_smote))

Distribusi kelas setelah SMOTE:
Counter({0: 918, 1: 918})


In [31]:
clf = svm.SVC(C=1, kernel="rbf")
clf.fit(X_train, y_train)

In [32]:
y_predict = clf.predict(X_test)

print("Accuracy :", accuracy_score(y_test, y_predict))

print(classification_report(y_test, y_predict))

Accuracy : 0.9103986847513358
              precision    recall  f1-score   support

           0       0.91      1.00      0.95      8860
           1       0.00      0.00      0.00       872

    accuracy                           0.91      9732
   macro avg       0.46      0.50      0.48      9732
weighted avg       0.83      0.91      0.87      9732



In [16]:
clf_smote = svm.SVC(C=1, kernel="rbf")
clf_smote.fit(X_train_smote, y_train_smote)

In [17]:
y_predict = clf_smote.predict(X_test)

print("Accuracy :", accuracy_score(y_test, y_predict))

print(classification_report(y_test, y_predict))

Accuracy : 0.7740443896424167
              precision    recall  f1-score   support

           0       0.93      0.81      0.87      8887
           1       0.17      0.40      0.24       845

    accuracy                           0.77      9732
   macro avg       0.55      0.61      0.55      9732
weighted avg       0.87      0.77      0.81      9732



In [12]:
clf = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1)
clf.fit(X_train, y_train)

In [13]:
y_predict = clf.predict(X_test)
print("Accuracy: ", accuracy_score(y_test, y_predict))
print(classification_report(y_test, y_predict))

Accuracy:  0.9391697492807234
              precision    recall  f1-score   support

           0       0.94      1.00      0.97      8887
           1       0.96      0.31      0.47       845

    accuracy                           0.94      9732
   macro avg       0.95      0.66      0.72      9732
weighted avg       0.94      0.94      0.92      9732

