In [76]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import accuracy_score,roc_auc_score,classification_report,confusion_matrix,roc_curve,root_mean_squared_error
from xgboost import XGBClassifier
import pickle

In [75]:
def preprocess_data(df):
    """Clean the raw attrition frame and split it into features and target.

    Steps, in order: forward-fill missing values, lower-case the column
    names, drop duplicate rows, encode ``attrition`` ('Yes'/'No') as 1/0,
    drop four columns, and replace the integer codes of the ordinal
    columns with readable labels.

    The input frame is not modified; every step works on a new frame.
    The function expects the *raw* frame: the four dropped columns must be
    present (``drop`` raises ``KeyError`` otherwise) and ``attrition`` must
    still hold 'Yes'/'No' (any other value is mapped to NaN).

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data whose column names, once lower-cased, include the columns
        referenced below.

    Returns
    -------
    tuple
        ``(df, x, y)``: the cleaned frame, the cleaned frame without the
        ``attrition`` column, and the ``attrition`` column.
    """
    print("Preprocessing the data...")

    # `fillna(method='ffill')` is deprecated in recent pandas releases;
    # `ffill()` is the direct equivalent and returns a new frame.
    df = df.ffill()
    df.columns = df.columns.str.lower()
    df = df.drop_duplicates().reset_index(drop=True)
    df["attrition"] = df["attrition"].map({'Yes': 1, 'No': 0})
    df = df.drop(['employeecount', 'standardhours', 'over18', 'employeenumber'], axis=1)

    # Several columns share the same four-level scale.
    satisfaction_scale = {1: "Low", 2: "Medium", 3: "High", 4: "Very High"}
    ordinal_labels = {
        "education": {1: "Below College", 2: "College", 3: "Bachelor", 4: "Master", 5: "Doctor"},
        "environmentsatisfaction": satisfaction_scale,
        "jobinvolvement": satisfaction_scale,
        "joblevel": {1: "Entry Level", 2: "Junior Level", 3: "Mid Level", 4: "Senior Level", 5: "Executive Level"},
        "jobsatisfaction": satisfaction_scale,
        "performancerating": {1: "Low", 2: "Good", 3: "Excellent", 4: "Outstanding"},
        "relationshipsatisfaction": satisfaction_scale,
        "worklifebalance": {1: "Bad", 2: "Good", 3: "Better", 4: "Best"},
    }
    # `replace` (not `map`) so that values outside the table are kept as-is.
    for column, labels in ordinal_labels.items():
        df[column] = df[column].replace(labels)

    x = df.drop('attrition', axis=1)
    y = df['attrition']
    print("✅ Data preprocessing completed.")

    return df, x, y

In [54]:
def split_and_vectorize(x, y, test_size=0.2, random_state=42):
    """Split the data into train/test sets and vectorize the features.

    The split is stratified on ``y``. The ``DictVectorizer`` is fitted on
    the training rows only and then applied to the test rows, so no
    information from the test set enters the encoding.

    Parameters
    ----------
    x : pandas.DataFrame
        Feature frame; each row is converted to a dict for the vectorizer.
    y : pandas.Series
        Target labels, also used for stratification.
    test_size : float, default 0.2
        Passed to ``train_test_split``.
    random_state : int, default 42
        Seed passed to ``train_test_split``.

    Returns
    -------
    tuple
        ``(X_train, X_test, Y_train, Y_test, features, dv)`` where the two
        ``X`` values are the dense vectorizer outputs, ``features`` is the
        list of vectorizer feature names and ``dv`` is the fitted
        ``DictVectorizer``.
    """
    print("Splitting the data into training and testing sets...")
    X_train, X_test, Y_train, Y_test = train_test_split(
        x, y, test_size=test_size, random_state=random_state, stratify=y
    )

    dv = DictVectorizer(sparse=False)
    train_dicts = X_train.to_dict(orient='records')
    test_dicts = X_test.to_dict(orient='records')
    X_train = dv.fit_transform(train_dicts)
    # transform only: the vocabulary learned from the training rows is reused.
    X_test = dv.transform(test_dicts)
    features = dv.get_feature_names_out().tolist()

    print("✅ Data splitting and vectorization completed.")
    return X_train, X_test, Y_train, Y_test, features, dv

In [55]:
# Hyperparameters for the XGBoost classifier; the training cell passes these
# module-level names to train_xgboost() positionally.
learning_rate=0.05
max_depth=4
n_estimators=200
objective='binary:logistic'
eval_metric='logloss'
# NOTE(review): the captured training output reports this parameter as
# "not used" by the XGBoost version that produced it.
use_label_encoder=False
random_state=42
subsample=0.8
colsample_bytree=0.8

In [None]:
def train_xgboost(X_train,Y_train,learning_rate, max_depth, n_estimators, objective, eval_metric, use_label_encoder, random_state, subsample, colsample_bytree):
    """Fit an ``XGBClassifier`` on the training data and return it.

    Parameters
    ----------
    X_train, Y_train
        Training features and labels, passed to ``model.fit`` unchanged.
    learning_rate, max_depth, n_estimators, objective, eval_metric,
    random_state, subsample, colsample_bytree
        Forwarded as keyword arguments to ``XGBClassifier``.
    use_label_encoder
        Kept in the signature for backward compatibility. It is forwarded
        only when the installed ``XGBClassifier.__init__`` still declares
        it as a parameter; otherwise it is ignored, which avoids the
        "Parameters: { "use_label_encoder" } are not used" warning.

    Returns
    -------
    XGBClassifier
        The fitted model.
    """
    # Local import keeps the function self-contained; only used for the
    # signature check below.
    import inspect

    print("Training the XGBoost model...")

    params = dict(
        learning_rate=learning_rate,
        max_depth=max_depth,
        n_estimators=n_estimators,
        objective=objective,
        eval_metric=eval_metric,
        random_state=random_state,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
    )
    # Releases that no longer declare `use_label_encoder` swallow it through
    # **kwargs and warn at fit time, so only pass it where it is declared.
    if "use_label_encoder" in inspect.signature(XGBClassifier.__init__).parameters:
        params["use_label_encoder"] = use_label_encoder

    model = XGBClassifier(**params)
    model.fit(X_train, Y_train)
    print("✅ Model training completed.")
    return model

In [67]:
def _print_split_report(title, y_true, y_pred):
    """Print confusion matrix, accuracy and classification report for one split."""
    print(title)
    clf_report = pd.DataFrame(classification_report(y_true, y_pred, output_dict=True))
    print(f"CONFUSION MATRIX:\n{confusion_matrix(y_true, y_pred)}")
    print(f"ACCURACY SCORE:\n{accuracy_score(y_true, y_pred):.4f}")
    print(f"CLASSIFICATION REPORT:\n{clf_report}")


def evaluate(model, X_train, X_test, Y_train, Y_test):
    """Print classification metrics for the training and test splits.

    For each split the confusion matrix, accuracy and classification
    report are printed; the ROC AUC is printed for the test split only.
    Nothing is returned.

    Parameters
    ----------
    model
        Fitted classifier exposing ``predict`` and ``predict_proba``.
    X_train, X_test
        Feature matrices for the two splits.
    Y_train, Y_test
        True labels for the two splits.
    """
    _print_split_report(
        "✅ TRAINING RESULTS: \n===============================",
        Y_train,
        model.predict(X_train),
    )
    _print_split_report(
        "✅TESTING RESULTS: \n===============================",
        Y_test,
        model.predict(X_test),
    )
    print("\n")
    print("\n")
    print("\n=========ROC AUC SCORE============ ")
    # Column 1 of predict_proba is the score for the second class
    # (attrition == 1 when the labels are 0/1).
    auc = roc_auc_score(Y_test, model.predict_proba(X_test)[:, 1])
    print(f"ROC AUC SCORE: {auc:.4f}")

    print("✅Model evaluation completed.")



In [58]:
def model_saving(model, dv, df, model_path='xgboost_model.bin', dv_path='dv.bin', data_path='preprocessed_data.csv'):
    """Write the cleaned data, the model and the vectorizer to disk.

    The frame is written as CSV without its index; the model and the
    vectorizer are each pickled to their own file. Returns nothing.

    Parameters
    ----------
    model
        Object pickled to ``model_path``.
    dv
        Object pickled to ``dv_path``.
    df : pandas.DataFrame
        Frame written to ``data_path``.
    model_path, dv_path, data_path : str
        Output locations.
    """
    print("Saving the model and DictVectorizer...")
    df.to_csv(data_path, index=False)

    # Same write order as before: model first, then the vectorizer.
    for artifact, destination in ((model, model_path), (dv, dv_path)):
        with open(destination, 'wb') as handle:
            pickle.dump(artifact, handle)

    print(f"✅ Model saved to {model_path}")
    print(f"✅ DictVectorizer saved to {dv_path}")
    print(f"✅ Cleaned dataset saved to {data_path}")

In [77]:
# Load the raw CSV; the path is relative to the notebook's working directory.
df = pd.read_csv('WA_Fn-UseC_-HR-Employee-Attrition.csv')

In [78]:
# NOTE(review): the return value is not assigned, so this call only displays
# the full (df, x, y) tuple; the following cell repeats the call and keeps it.
preprocess_data(df)

Preprocessing the data...
✅ Data preprocessing completed.


  df = df.fillna(method='ffill')


(      age  attrition     businesstravel  dailyrate              department  \
 0      41          1      Travel_Rarely       1102                   Sales   
 1      49          0  Travel_Frequently        279  Research & Development   
 2      37          1      Travel_Rarely       1373  Research & Development   
 3      33          0  Travel_Frequently       1392  Research & Development   
 4      27          0      Travel_Rarely        591  Research & Development   
 ...   ...        ...                ...        ...                     ...   
 1465   36          0  Travel_Frequently        884  Research & Development   
 1466   39          0      Travel_Rarely        613  Research & Development   
 1467   27          0      Travel_Rarely        155  Research & Development   
 1468   49          0  Travel_Frequently       1023                   Sales   
 1469   34          0      Travel_Rarely        628  Research & Development   
 
       distancefromhome      education educationfi

In [79]:
# Keep the cleaned frame, the feature frame and the target.
# NOTE(review): `df` is rebound to the processed frame here, so re-running this
# cell without reloading the CSV passes already-processed data back in (the
# columns preprocess_data drops are gone by then).
df, x, y = preprocess_data(df)

Preprocessing the data...
✅ Data preprocessing completed.


  df = df.fillna(method='ffill')


In [80]:
# NOTE(review): in the visible notebook these names are first assigned by the
# split_and_vectorize call in the following cell, so this cell appears to rely
# on state from an earlier run and would raise NameError on a fresh
# top-to-bottom execution — confirm and move or remove.
X_train, X_test, Y_train, Y_test, features,dv

(array([[47.,  0.,  0., ...,  2.,  1.,  2.],
        [22.,  0.,  0., ...,  1.,  2.,  1.],
        [46.,  0.,  0., ...,  9.,  4.,  9.],
        ...,
        [22.,  0.,  0., ...,  3.,  1.,  1.],
        [36.,  0.,  0., ...,  0.,  0.,  0.],
        [39.,  0.,  1., ...,  1.,  2.,  2.]], shape=(1176, 75)),
 array([[24.,  1.,  0., ...,  0.,  0.,  0.],
        [44.,  0.,  0., ...,  5.,  7.,  7.],
        [31.,  0.,  0., ...,  4.,  0.,  1.],
        ...,
        [36.,  0.,  0., ...,  2.,  2.,  2.],
        [56.,  0.,  0., ...,  2.,  1.,  0.],
        [37.,  0.,  0., ..., 12.,  5.,  7.]], shape=(294, 75)),
 1194    0
 128     0
 810     0
 478     0
 491     0
        ..
 1213    1
 963     0
 734     0
 1315    0
 1292    0
 Name: attrition, Length: 1176, dtype: int64,
 1061    0
 891     0
 456     0
 922     0
 69      1
        ..
 1269    0
 1352    0
 1236    1
 1023    0
 285     0
 Name: attrition, Length: 294, dtype: int64,
 ['age',
  'businesstravel=Non-Travel',
  'businesstravel=Trav

In [81]:
# Stratified train/test split plus DictVectorizer encoding; `dv` is kept so it
# can be saved alongside the model.
X_train, X_test, Y_train, Y_test, features,dv = split_and_vectorize(x, y)

Splitting the data into training and testing sets...
✅ Data splitting and vectorization completed.


In [82]:
# Fit the classifier on the training split with the hyperparameters defined above.
model = train_xgboost(X_train,Y_train,learning_rate, max_depth, n_estimators, objective, eval_metric, use_label_encoder, random_state, subsample, colsample_bytree)

Training the XGBoost model...
✅ Model training completed.


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [86]:
# Print train/test confusion matrices, accuracy, classification reports and the test ROC AUC.
evaluate(model, X_train, X_test, Y_train, Y_test)

✅ TRAINIG RESULTS: 
CONFUSION MATRIX:
[[986   0]
 [ 26 164]]
ACCURACY SCORE:
0.9779
CLASSIFICATION REPORT:
                    0           1  accuracy    macro avg  weighted avg
precision    0.974308    1.000000  0.977891     0.987154      0.978459
recall       1.000000    0.863158  0.977891     0.931579      0.977891
f1-score     0.986987    0.926554  0.977891     0.956770      0.977223
support    986.000000  190.000000  0.977891  1176.000000   1176.000000
✅TESTING RESULTS: 
CONFUSION MATRIX:
[[244   3]
 [ 38   9]]
ACCURACY SCORE:
0.8605
CLASSIFICATION REPORT:
                    0          1  accuracy   macro avg  weighted avg
precision    0.865248   0.750000  0.860544    0.807624      0.846824
recall       0.987854   0.191489  0.860544    0.589672      0.860544
f1-score     0.922495   0.305085  0.860544    0.613790      0.823794
support    247.000000  47.000000  0.860544  294.000000    294.000000





ROC AUC SCORE: 0.7907
✅Model evaluation completed.


In [88]:
# Persist the model, the fitted vectorizer and the cleaned frame to the default output paths.
model_saving(model, dv,df)

Saving the model and DictVectorizer...
✅ Model saved to xgboost_model.bin
✅ DictVectorizer saved to dv.bin
✅ Cleaned dataset saved to preprocessed_data.csv
