In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import (
    classification_report,
    accuracy_score,
    f1_score,
    roc_auc_score,
    confusion_matrix
)
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from xgboost import XGBClassifier
import joblib
import random




In [2]:
df = pd.read_csv("job_applicant_dataset.csv", encoding="Windows-1252")

required_cols = ["Age", "Gender", "Race", "Ethnicity", "Resume", "Job Description", "Job Roles", "Best Match"]
for c in required_cols:
    if c not in df.columns:
        raise ValueError(f" Missing required column: {c}")

print(f"Dataset loaded with {len(df)} records.")


Dataset loaded with 10000 records.


In [3]:
df_neg = df.copy()
df_neg["Job Description"] = np.random.permutation(df["Job Description"].values)
df_neg["Best Match"] = 0  # force negative
df_aug = pd.concat([df, df_neg], ignore_index=True)
df_aug = df_aug.sample(frac=1, random_state=42).reset_index(drop=True)
print(f" Dataset augmented: {len(df_aug)} samples total.")

 Dataset augmented: 20000 samples total.


In [4]:
bert_model = SentenceTransformer("all-MiniLM-L6-v2", device="cpu")

# Compute embeddings
resume_embeddings = bert_model.encode(df_aug["Resume"].tolist(), show_progress_bar=True)
job_embeddings = bert_model.encode(df_aug["Job Description"].tolist(), show_progress_bar=True)

# Semantic similarity feature
semantic_similarity = np.array([
    cosine_similarity([r], [j])[0][0] for r, j in zip(resume_embeddings, job_embeddings)
])

print(f" Computed semantic similarity feature (shape={semantic_similarity.shape}).")

Batches:   0%|          | 0/625 [00:00<?, ?it/s]

Batches:   0%|          | 0/625 [00:00<?, ?it/s]

 Computed semantic similarity feature (shape=(20000,)).


In [5]:
categorical = ["Gender", "Race", "Ethnicity", "Job Roles"]
numeric = ["Age", "semantic_similarity"]

# Combine features into df for preprocessing
df_aug["semantic_similarity"] = semantic_similarity
X = df_aug[["Age", "Gender", "Race", "Ethnicity", "Job Roles"]]
y = df_aug["Best Match"]

preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical),
        ("num", StandardScaler(), numeric)
    ],
    remainder="drop"
)

X_tabular = preprocessor.fit_transform(df_aug)
X_text = np.hstack([resume_embeddings, job_embeddings])  # concatenated BERT vectors
X_final = np.hstack([X_text, X_tabular.toarray()])

In [6]:
X_train, X_test, y_train, y_test = train_test_split(
    X_final, y, test_size=0.2, random_state=42, stratify=y
)



In [7]:
model = XGBClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=8,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_lambda=3,  # L2 regularization
    reg_alpha=2,   # L1 regularization
    scale_pos_weight=0.7,  # handle class imbalance
    eval_metric="logloss",
    random_state=42
)

model.fit(X_train, y_train)

In [8]:
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1]

print("\n=== Model Evaluation ===")
print(classification_report(y_test, y_pred, digits=3))
print(f"Accuracy:  {accuracy_score(y_test, y_pred):.3f}")
print(f"F1 Score:  {f1_score(y_test, y_pred):.3f}")
print(f"ROC-AUC:   {roc_auc_score(y_test, y_proba):.3f}")


=== Model Evaluation ===
              precision    recall  f1-score   support

           0      0.949     0.957     0.953      3030
           1      0.862     0.840     0.851       970

    accuracy                          0.928      4000
   macro avg      0.905     0.898     0.902      4000
weighted avg      0.928     0.928     0.928      4000

Accuracy:  0.928
F1 Score:  0.851
ROC-AUC:   0.955


In [9]:
def cost_matrix(y_true, y_pred, cost_fp=1, cost_fn=5):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    total_cost = cost_fp * fp + cost_fn * fn
    avg_cost = total_cost / len(y_true)
    print(f"TP={tp}, FP={fp}, FN={fn}, TN={tn}")
    print(f"Total Cost = {total_cost:.2f}, Avg Cost = {avg_cost:.4f}")
    return total_cost

total_cost = cost_matrix(y_test, y_pred)

TP=815, FP=131, FN=155, TN=2899
Total Cost = 906.00, Avg Cost = 0.2265


In [10]:
def precision_at_k(y_true, y_scores, k):
    y_true = np.array(y_true)
    y_scores = np.array(y_scores)
    top_k_idx = np.argsort(y_scores)[::-1][:k]
    return np.mean(y_true[top_k_idx])

for k in [10, 20, 50, 100]:
    print(f"P@{k}: {precision_at_k(y_test, y_proba, k):.3f}")


P@10: 1.000
P@20: 0.900
P@50: 0.900
P@100: 0.840


In [11]:
joblib.dump(model, "xgb_resume_with_negatives.pkl")
joblib.dump(preprocessor, "preprocessor_with_negatives.pkl")
print("\n Saved model and preprocessor successfully.")


 Saved model and preprocessor successfully.
