## 04_Performance_Model.ipynb

**Purpose:**  
Train and evaluate employee performance prediction model.

**Input:**  
- `data/raw/hrms_synth_summary.csv`

**Output:**  
- Model artifacts saved to:
  - `notebooks/models/preprocessor_perf.pkl`
  - `notebooks/models/performance_model.pkl`

**Notes:**  
- Focuses on performance-related KPIs from HRMS data.  
- Output models are used during performance inference.


In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier


In [2]:
BASE_PATH = r"C:\Users\abanu\Documents\t_iq_hr"

df = pd.read_csv(
    BASE_PATH + r"\data\raw\hrms_synth_summary.csv"
)

df.head()


Unnamed: 0,employee_id,name,department,job_role,location,current_salary,satisfaction_score,engagement_score,num_skills,years_at_company,trainings_count
0,EMP000001,Vikram Singh,HR,Data Scientist,"New York, USA",4544478,0.78,0.8,7,12,0
1,EMP000002,Karan Patel,Marketing,Data Scientist,"Chennai, India",5180268,0.71,0.93,8,7,4
2,EMP000003,Vikram Malhotra,Marketing,Senior Software Engineer,"Chennai, India",2589268,0.81,0.56,6,3,3
3,EMP000004,Siddharth Khan,HR,ML Engineer,"Bengaluru, India",1321856,0.43,0.95,7,15,3
4,EMP000005,Priya Nair,Legal,ML Engineer,Remote,4371479,0.41,0.7,4,7,2


In [3]:
np.random.seed(42)

df["performance_score"] = (
    0.4 * df["satisfaction_score"] +
    0.4 * df["engagement_score"] +
    0.2 * (df["num_skills"] / df["num_skills"].max())
)

df["performance_score"] = pd.qcut(
    df["performance_score"],
    q=5,
    labels=[1, 2, 3, 4, 5]
).astype(int)

df["performance_score"].value_counts().sort_index()


performance_score
1    2009
2    1992
3    2028
4    1990
5    1981
Name: count, dtype: int64

In [4]:
features = [
    'department',
    'job_role',
    'location',
    'current_salary',
    'satisfaction_score',
    'engagement_score',
    'num_skills',
    'years_at_company',
    'trainings_count'
]

X = df[features]
y = df['performance_score']

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

X_train.shape, X_test.shape


((8000, 9), (2000, 9))

In [5]:
num_features = [
    'current_salary',
    'satisfaction_score',
    'engagement_score',
    'num_skills',
    'years_at_company',
    'trainings_count'
]

cat_features = [
    'department',
    'job_role',
    'location'
]

preprocessor_perf = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features)
    ]
)


In [6]:
performance_pipeline = Pipeline([
    ('preprocessor', preprocessor_perf),
    ('classifier', XGBClassifier(
        objective='multi:softmax',
        num_class=5,
        eval_metric='mlogloss',
        random_state=42
    ))
])

performance_pipeline.fit(X_train, y_train)

print("✅ Performance model trained")


ValueError: Invalid classes inferred from unique values of `y`.  Expected: [0 1 2 3 4], got [1 2 3 4 5]

In [7]:
df['performance_score'] = df['performance_score'] - 1


In [8]:
features = [
    'department',
    'job_role',
    'location',
    'current_salary',
    'satisfaction_score',
    'engagement_score',
    'num_skills',
    'years_at_company',
    'trainings_count'
]

X = df[features]
y = df['performance_score']   # now 0–4


In [9]:
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)


In [10]:
performance_pipeline.fit(X_train, y_train)
print("✅ Performance model trained")


✅ Performance model trained


In [11]:
from sklearn.metrics import classification_report, accuracy_score

y_pred = performance_pipeline.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.954
              precision    recall  f1-score   support

           0       0.97      0.98      0.97       402
           1       0.95      0.95      0.95       398
           2       0.95      0.94      0.95       406
           3       0.93      0.94      0.93       398
           4       0.97      0.96      0.97       396

    accuracy                           0.95      2000
   macro avg       0.95      0.95      0.95      2000
weighted avg       0.95      0.95      0.95      2000



In [13]:
import pickle
from pathlib import Path

Path("notebooks/models").mkdir(parents=True, exist_ok=True)

with open("notebooks/models/performance_model.pkl", "wb") as f:
    pickle.dump(performance_pipeline, f)

print("✅ performance_model.pkl saved")


✅ performance_model.pkl saved


In [14]:
import os
print(os.getcwd())


c:\Users\abanu\Documents\t_iq_hr\notebooks


In [15]:
sample = X_test.iloc[:5]

pred = performance_pipeline.predict(sample) + 1
actual = y_test.iloc[:5] + 1

pd.DataFrame({
    "Actual_Performance": actual.values,
    "Predicted_Performance": pred
})


Unnamed: 0,Actual_Performance,Predicted_Performance
0,1,1
1,2,2
2,3,3
3,5,5
4,1,1


In [1]:
import pandas as pd

# Load cleaned HRMS data
hrms = pd.read_csv(r"C:\Users\abanu\Documents\t_iq_hr\data\processed\HRMS_cleaned.csv")
hrms.head()


Unnamed: 0,employee_id,name,department,job_role,location,current_salary,satisfaction_score,engagement_score,num_skills,years_at_company,trainings_count
0,EMP000001,Vikram Singh,HR,Data Scientist,"New York, USA",4544478,0.78,0.8,7,12,0
1,EMP000002,Karan Patel,Marketing,Data Scientist,"Chennai, India",5180268,0.71,0.93,8,7,4
2,EMP000003,Vikram Malhotra,Marketing,Senior Software Engineer,"Chennai, India",2589268,0.81,0.56,6,3,3
3,EMP000004,Siddharth Khan,HR,ML Engineer,"Bengaluru, India",1321856,0.43,0.95,7,15,3
4,EMP000005,Priya Nair,Legal,ML Engineer,Remote,4371479,0.41,0.7,4,7,2


In [2]:
import numpy as np

# If you want a synthetic score, you can use weighted combination
hrms['performance_score'] = (
    0.4 * hrms['satisfaction_score'] +
    0.4 * hrms['engagement_score'] +
    0.2 * (hrms['trainings_count'] / (hrms['trainings_count'].max() + 1))
).round().astype(int)

# Make sure values are between 1 and 5
hrms['performance_score'] = hrms['performance_score'].clip(1,5)

hrms[['employee_id','performance_score']].head()


Unnamed: 0,employee_id,performance_score
0,EMP000001,1
1,EMP000002,1
2,EMP000003,1
3,EMP000004,1
4,EMP000005,1


In [3]:
# Save HRMS with performance scores
hrms.to_csv(r"C:\Users\abanu\Documents\t_iq_hr\data\processed\HRMS_with_performance.csv", index=False)
print("✅ HRMS_with_performance.csv saved successfully!")


✅ HRMS_with_performance.csv saved successfully!


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
import pickle

# Features & target
features = ['department', 'job_role', 'location', 'current_salary',
            'satisfaction_score', 'engagement_score', 'num_skills', 
            'years_at_company', 'trainings_count']
X = hrms[features]
y = hrms['performance_score']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocessor
categorical_features = ['department','job_role','location']
numeric_features = ['current_salary','satisfaction_score','engagement_score','num_skills','years_at_company','trainings_count']

preprocessor_perf = ColumnTransformer([
    ('num', StandardScaler(), numeric_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
])

# Performance model pipeline
performance_pipeline = Pipeline([
    ('preprocessor', preprocessor_perf),
    ('classifier', XGBClassifier(objective='multi:softmax', num_class=5, random_state=42))
])

# Fit model
performance_pipeline.fit(X_train, y_train)
print("✅ Performance model trained")

# Save preprocessor & model
with open(r"C:\Users\abanu\Documents\t_iq_hr\notebooks\models\preprocessor_perf.pkl", 'wb') as f:
    pickle.dump(preprocessor_perf, f)

with open(r"C:\Users\abanu\Documents\t_iq_hr\notebooks\models\performance_model.pkl", 'wb') as f:
    pickle.dump(performance_pipeline, f)

print("✅ Preprocessor and performance model saved")


ValueError: Invalid classes inferred from unique values of `y`.  Expected: [0], got [1]

In [5]:
import numpy as np

# Make performance_score values more spread out
hrms['performance_score'] = (
    0.4 * hrms['satisfaction_score'] +
    0.4 * hrms['engagement_score'] +
    0.2 * (hrms['trainings_count'] / (hrms['trainings_count'].max()+1))
) * 5  # scale up

# Round and clip to 1-5
hrms['performance_score'] = hrms['performance_score'].round().clip(1,5).astype(int)

# Check distribution
print(hrms['performance_score'].value_counts())


performance_score
3    5810
4    2263
2    1899
5      15
1      13
Name: count, dtype: int64


In [7]:
features = ['department', 'job_role', 'location', 'current_salary',
            'satisfaction_score', 'engagement_score', 'num_skills',
            'years_at_company', 'trainings_count']

X = hrms[features]
y = hrms['performance_score']


In [8]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train distribution:\n", y_train.value_counts())


X_train shape: (8000, 9)
X_test shape: (2000, 9)
y_train distribution:
 performance_score
3    4648
4    1811
2    1519
5      12
1      10
Name: count, dtype: int64


In [9]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

categorical_cols = ['department', 'job_role', 'location']
numerical_cols = ['current_salary', 'satisfaction_score', 'engagement_score', 
                  'num_skills', 'years_at_company', 'trainings_count']

preprocessor_perf = ColumnTransformer([
    ('num', StandardScaler(), numerical_cols),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
])


In [10]:
from xgboost import XGBClassifier

performance_pipeline = Pipeline([
    ('preprocessor', preprocessor_perf),
    ('classifier', XGBClassifier(
        objective='multi:softmax',
        num_class=5,
        random_state=42,
        use_label_encoder=False
    ))
])

performance_pipeline.fit(X_train, y_train)
print("✅ Performance model trained")


ValueError: Invalid classes inferred from unique values of `y`.  Expected: [0 1 2 3 4], got [1 2 3 4 5]

In [11]:
# Shift labels to 0-based
y_train_zero = y_train - 1
y_test_zero = y_test - 1

performance_pipeline.fit(X_train, y_train_zero)
y_pred_zero = performance_pipeline.predict(X_test)
y_pred = y_pred_zero + 1  # Shift predictions back to original


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [12]:
from sklearn.metrics import classification_report, accuracy_score
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.9765
              precision    recall  f1-score   support

           1       1.00      1.00      1.00         3
           2       0.96      0.98      0.97       380
           3       0.99      0.97      0.98      1162
           4       0.96      0.98      0.97       452
           5       0.50      0.33      0.40         3

    accuracy                           0.98      2000
   macro avg       0.88      0.85      0.87      2000
weighted avg       0.98      0.98      0.98      2000



In [14]:
import pandas as pd
import numpy as np
import pickle

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score

from xgboost import XGBClassifier


In [15]:
df = pd.read_csv(r"C:\Users\abanu\Documents\t_iq_hr\data\processed\HRMS_with_performance.csv")

print(df.shape)
df.head()


(10000, 12)


Unnamed: 0,employee_id,name,department,job_role,location,current_salary,satisfaction_score,engagement_score,num_skills,years_at_company,trainings_count,performance_score
0,EMP000001,Vikram Singh,HR,Data Scientist,"New York, USA",4544478,0.78,0.8,7,12,0,1
1,EMP000002,Karan Patel,Marketing,Data Scientist,"Chennai, India",5180268,0.71,0.93,8,7,4,1
2,EMP000003,Vikram Malhotra,Marketing,Senior Software Engineer,"Chennai, India",2589268,0.81,0.56,6,3,3,1
3,EMP000004,Siddharth Khan,HR,ML Engineer,"Bengaluru, India",1321856,0.43,0.95,7,15,3,1
4,EMP000005,Priya Nair,Legal,ML Engineer,Remote,4371479,0.41,0.7,4,7,2,1


In [16]:
# Ensure integer type
df['performance_score'] = df['performance_score'].astype(int)

# Convert 1–5 → 0–4
df['performance_score_enc'] = df['performance_score'] - 1

df['performance_score_enc'].value_counts().sort_index()


performance_score_enc
0    10000
Name: count, dtype: int64

In [17]:
X = df.drop(columns=['performance_score', 'performance_score_enc'])
y = df['performance_score_enc']


In [18]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

print("Train:", X_train.shape)
print("Test :", X_test.shape)


Train: (8000, 11)
Test : (2000, 11)


In [19]:
numeric_features = [
    'current_salary',
    'satisfaction_score',
    'engagement_score',
    'num_skills',
    'years_at_company',
    'trainings_count'
]

categorical_features = [
    'department',
    'job_role',
    'location'
]

preprocessor_perf = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ]
)


In [20]:
performance_pipeline = Pipeline([
    ('preprocessor', preprocessor_perf),
    ('classifier', XGBClassifier(
        objective='multi:softprob',
        num_class=5,
        n_estimators=200,
        max_depth=6,
        learning_rate=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        eval_metric='mlogloss'
    ))
])


In [21]:
performance_pipeline.fit(X_train, y_train)

print("✅ Performance model trained")


✅ Performance model trained


In [22]:
y_pred = performance_pipeline.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2000

    accuracy                           1.00      2000
   macro avg       1.00      1.00      1.00      2000
weighted avg       1.00      1.00      1.00      2000



In [24]:
model_path = r"C:\\Users\\abanu\\Documents\\t_iq_hr\\notebooks\\models"

# Save preprocessor
with open(f"{model_path}\\preprocessor_perf.pkl", "wb") as f:
    pickle.dump(preprocessor_perf, f)

# Save full pipeline
with open(f"{model_path}\\performance_model.pkl", "wb") as f:
    pickle.dump(performance_pipeline, f)

print("✅ preprocessor_perf.pkl and performance_model.pkl saved")


✅ preprocessor_perf.pkl and performance_model.pkl saved
