notebook 04 : perfromance model

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier


In [2]:
BASE_PATH = r"C:\Users\abanu\Documents\t_iq_hr"

df = pd.read_csv(
    BASE_PATH + r"\data\raw\hrms_synth_summary.csv"
)

df.head()


Unnamed: 0,employee_id,name,department,job_role,location,current_salary,satisfaction_score,engagement_score,num_skills,years_at_company,trainings_count
0,EMP000001,Vikram Singh,HR,Data Scientist,"New York, USA",4544478,0.78,0.8,7,12,0
1,EMP000002,Karan Patel,Marketing,Data Scientist,"Chennai, India",5180268,0.71,0.93,8,7,4
2,EMP000003,Vikram Malhotra,Marketing,Senior Software Engineer,"Chennai, India",2589268,0.81,0.56,6,3,3
3,EMP000004,Siddharth Khan,HR,ML Engineer,"Bengaluru, India",1321856,0.43,0.95,7,15,3
4,EMP000005,Priya Nair,Legal,ML Engineer,Remote,4371479,0.41,0.7,4,7,2


In [3]:
np.random.seed(42)

df["performance_score"] = (
    0.4 * df["satisfaction_score"] +
    0.4 * df["engagement_score"] +
    0.2 * (df["num_skills"] / df["num_skills"].max())
)

df["performance_score"] = pd.qcut(
    df["performance_score"],
    q=5,
    labels=[1, 2, 3, 4, 5]
).astype(int)

df["performance_score"].value_counts().sort_index()


performance_score
1    2009
2    1992
3    2028
4    1990
5    1981
Name: count, dtype: int64

In [4]:
features = [
    'department',
    'job_role',
    'location',
    'current_salary',
    'satisfaction_score',
    'engagement_score',
    'num_skills',
    'years_at_company',
    'trainings_count'
]

X = df[features]
y = df['performance_score']

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

X_train.shape, X_test.shape


((8000, 9), (2000, 9))

In [5]:
num_features = [
    'current_salary',
    'satisfaction_score',
    'engagement_score',
    'num_skills',
    'years_at_company',
    'trainings_count'
]

cat_features = [
    'department',
    'job_role',
    'location'
]

preprocessor_perf = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features)
    ]
)


In [6]:
performance_pipeline = Pipeline([
    ('preprocessor', preprocessor_perf),
    ('classifier', XGBClassifier(
        objective='multi:softmax',
        num_class=5,
        eval_metric='mlogloss',
        random_state=42
    ))
])

performance_pipeline.fit(X_train, y_train)

print("✅ Performance model trained")


ValueError: Invalid classes inferred from unique values of `y`.  Expected: [0 1 2 3 4], got [1 2 3 4 5]

In [7]:
df['performance_score'] = df['performance_score'] - 1


In [8]:
features = [
    'department',
    'job_role',
    'location',
    'current_salary',
    'satisfaction_score',
    'engagement_score',
    'num_skills',
    'years_at_company',
    'trainings_count'
]

X = df[features]
y = df['performance_score']   # now 0–4


In [9]:
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)


In [10]:
performance_pipeline.fit(X_train, y_train)
print("✅ Performance model trained")


✅ Performance model trained


In [11]:
from sklearn.metrics import classification_report, accuracy_score

y_pred = performance_pipeline.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.954
              precision    recall  f1-score   support

           0       0.97      0.98      0.97       402
           1       0.95      0.95      0.95       398
           2       0.95      0.94      0.95       406
           3       0.93      0.94      0.93       398
           4       0.97      0.96      0.97       396

    accuracy                           0.95      2000
   macro avg       0.95      0.95      0.95      2000
weighted avg       0.95      0.95      0.95      2000



In [13]:
import pickle
from pathlib import Path

Path("notebooks/models").mkdir(parents=True, exist_ok=True)

with open("notebooks/models/performance_model.pkl", "wb") as f:
    pickle.dump(performance_pipeline, f)

print("✅ performance_model.pkl saved")


✅ performance_model.pkl saved


In [14]:
import os
print(os.getcwd())


c:\Users\abanu\Documents\t_iq_hr\notebooks


In [15]:
sample = X_test.iloc[:5]

pred = performance_pipeline.predict(sample) + 1
actual = y_test.iloc[:5] + 1

pd.DataFrame({
    "Actual_Performance": actual.values,
    "Predicted_Performance": pred
})


Unnamed: 0,Actual_Performance,Predicted_Performance
0,1,1
1,2,2
2,3,3
3,5,5
4,1,1
