In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.neighbors import KNeighborsRegressor,KNeighborsClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBRegressor, XGBClassifier
from catboost import CatBoostRegressor, CatBoostClassifier
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, accuracy_score, classification_report
import joblib
import warnings
warnings.filterwarnings('ignore')

In [4]:
df=pd.read_csv("data/New_Employee.csv")

In [5]:
df.head()

Unnamed: 0.1,Unnamed: 0,Employee_ID,First_Name,Last_Name,Age,Status,Join_Date,Salary,Performance_Score,Remote_Work,Department,Region,Status_code,Remote_Work_code,Performance_Score_code,Department_Code,Tenure_Date
0,0,EMP1000,Bob,Davis,25.0,Active,2021-04-02,59767.65,Average,True,DevOps,California,1,1,1,2,1615
1,1,EMP1001,Bob,Brown,33.0,Active,2020-07-10,65304.66,Excellent,True,Finance,Texas,1,1,3,3,1881
2,2,EMP1002,Alice,Jones,33.0,Pending,2023-12-07,88145.9,Good,True,Admin,Nevada,2,1,2,0,636
3,3,EMP1003,Eva,Davis,25.0,Inactive,2021-11-27,69450.99,Good,True,Admin,Nevada,0,1,2,0,1376
4,4,EMP1004,Frank,Williams,25.0,Active,2022-01-05,109324.61,Poor,False,Cloud Tech,Florida,1,0,0,1,1337


In [6]:
df=df.drop(columns=["Region"])

In [7]:
X=df.drop(columns=["Salary","Performance_Score"])
y_salary=df["Salary"]
y_perf=df["Performance_Score"]

In [8]:
le_perf = LabelEncoder()
y_perf_encoded = le_perf.fit_transform(y_perf)

In [9]:
num_features = X.select_dtypes(include=['float64', 'int32']).columns
cat_features = X.select_dtypes(include=['object','category','bool']).columns

In [10]:
preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), num_features),
    ('cat', OneHotEncoder(handle_unknown="ignore"), cat_features)
])

In [11]:
x_train_reg,x_test_reg,y_train_reg,y_test_reg=train_test_split(X,y_salary,test_size=0.2,random_state=43)
X_train_cls, X_test_cls, y_train_cls, y_test_cls = train_test_split(X, y_perf_encoded, test_size=0.2, random_state=42)

In [12]:
reg_models = {
    "LinearRegression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "KNN Regressor": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest": RandomForestRegressor(),
    "XGBRegressor": XGBRegressor(),
    "CatBoostRegressor": CatBoostRegressor(verbose=False),
    "AdaBoost Regressor": AdaBoostRegressor()
}

print("====== REGRESSION MODELS ======")
reg_results = []

for name,model in reg_models.items():
       pipeline=Pipeline(steps=[('preprocessor',preprocessor),('model',model)])
       pipeline.fit(x_train_reg,y_train_reg)

       y_pred=pipeline.predict(x_test_reg)
       mae=mean_absolute_error(y_test_reg,y_pred)
       rmse=mean_squared_error(y_test_reg,y_pred,squared=False)
       r2=r2_score(y_test_reg,y_pred)

       reg_results.append({"Model": name, "MAE": mae, "RMSE": rmse, "R2": r2})
    
       print(f"{name} | MAE: {mae:.2f} | RMSE: {rmse:.2f} | R2: {r2:.4f}")
best_reg = max(reg_results, key=lambda x: x["R2"])
print("Best Regression Model:")
print(best_reg)


LinearRegression | MAE: 16435.60 | RMSE: 19529.58 | R2: -0.0196
Lasso | MAE: 16441.34 | RMSE: 19650.08 | R2: -0.0322
Ridge | MAE: 16207.11 | RMSE: 19259.91 | R2: 0.0084
KNN Regressor | MAE: 17855.17 | RMSE: 21172.00 | R2: -0.1983
Decision Tree | MAE: 21366.42 | RMSE: 25939.40 | R2: -0.7987
Random Forest | MAE: 16850.06 | RMSE: 20065.32 | R2: -0.0763
XGBRegressor | MAE: 16088.71 | RMSE: 19406.85 | R2: -0.0068
CatBoostRegressor | MAE: 16271.07 | RMSE: 19353.36 | R2: -0.0013
AdaBoost Regressor | MAE: 17102.01 | RMSE: 20730.51 | R2: -0.1488
Best Regression Model:
{'Model': 'Ridge', 'MAE': 16207.10632927708, 'RMSE': 19259.91497017498, 'R2': 0.008381288816105337}


In [13]:
cls_models = {
    "RandomForestClassifier": RandomForestClassifier(),
    "XGBClassifier": XGBClassifier(use_label_encoder=False, eval_metric="mlogloss"),
    "CatBoostClassifier": CatBoostClassifier(verbose=0),
    "AdaBoostClassifier": AdaBoostClassifier(),
    "KNN Classifier": KNeighborsClassifier()
}

print("\n====== CLASSIFICATION MODELS ======")
cls_results = []

for name, model in cls_models.items():
        pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])
        pipeline.fit(X_train_cls, y_train_cls)
    
        y_pred = pipeline.predict(X_test_cls)
        acc = accuracy_score(y_test_cls, y_pred)
    
        cls_results.append({"Model": name, "Accuracy": acc})
    
        print(f"{name} | Accuracy: {acc:.4f}")
        print(classification_report(y_test_cls, y_pred, target_names=le_perf.classes_))
best_cls = max(cls_results, key=lambda x: x["Accuracy"])
print(" Best Classification Model:")
print(best_cls)


RandomForestClassifier | Accuracy: 0.2304
              precision    recall  f1-score   support

     Average       0.25      0.30      0.27        50
   Excellent       0.16      0.25      0.20        48
        Good       0.30      0.35      0.32        54
        Poor       0.14      0.02      0.03        52

    accuracy                           0.23       204
   macro avg       0.21      0.23      0.21       204
weighted avg       0.21      0.23      0.21       204

XGBClassifier | Accuracy: 0.2647
              precision    recall  f1-score   support

     Average       0.30      0.34      0.32        50
   Excellent       0.21      0.25      0.23        48
        Good       0.32      0.37      0.34        54
        Poor       0.17      0.10      0.12        52

    accuracy                           0.26       204
   macro avg       0.25      0.26      0.25       204
weighted avg       0.25      0.26      0.25       204

CatBoostClassifier | Accuracy: 0.2402
              pr

In [15]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(class_weight='balanced', random_state=42)
clf.fit(X_train_cls, y_train_cls)

ValueError: could not convert string to float: 'EMP1743'