In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report, confusion_matrix
from xgboost import XGBClassifier
from sentence_transformers import SentenceTransformer
import joblib




In [4]:
df = pd.read_csv("job_applicant_dataset.csv", encoding="Windows-1252")
X = df[["Age", "Gender", "Race", "Ethnicity", "Resume", "Job Description", "Job Roles"]]
y = df["Best Match"]

In [9]:
bert_model = SentenceTransformer('all-MiniLM-L6-v2')

def get_text_embeddings(resumes, jobs, roles):
    """
    combine Resume + Job Description + Job Roles
    """
    texts = (resumes + " " + jobs + " " + roles).tolist()
    embeddings = bert_model.encode(texts, show_progress_bar=True)
    return embeddings

X_text = get_text_embeddings(X["Resume"], X["Job Description"], X["Job Roles"])

Batches:   0%|          | 0/313 [00:00<?, ?it/s]

In [10]:
categorical = ["Gender", "Race", "Ethnicity"]
numeric = ["Age"]

preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical),
        ("num", StandardScaler(), numeric)
    ],
    remainder="drop"
)

X_tabular = preprocessor.fit_transform(X)

# 拼接 BERT 向量 + 表格特征
X_final = np.hstack([X_text, X_tabular.toarray()])

In [11]:
X_train, X_test, y_train, y_test = train_test_split(
    X_final, y, test_size=0.2, random_state=42
)

In [12]:
model = XGBClassifier(
    n_estimators=200,
    max_depth=8,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric="logloss",
    random_state=42
)
model.fit(X_train, y_train)

In [13]:
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1]

print("\n=== Classification Report ===")
print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_proba))


=== Classification Report ===
              precision    recall  f1-score   support

           0       0.89      0.90      0.89      1023
           1       0.89      0.88      0.88       977

    accuracy                           0.89      2000
   macro avg       0.89      0.89      0.89      2000
weighted avg       0.89      0.89      0.89      2000

Accuracy: 0.8875
F1 Score: 0.884080370942813
ROC-AUC: 0.8934276232126794


In [14]:
joblib.dump(model, "xgb_resume_with_roles.pkl")
joblib.dump(preprocessor, "preprocessor_with_roles.pkl")
print("saved as xgb_resume_with_roles.pkl")

saved as xgb_resume_with_roles.pkl


In [15]:
def precision_at_k(y_true, y_scores, k):
    y_true = np.array(y_true)
    y_scores = np.array(y_scores)
    top_k_idx = np.argsort(y_scores)[::-1][:k]
    return np.mean(y_true[top_k_idx])

print("\n=== Precision@K ===")
for k in [10, 20, 50, 100]:
    print(f"P@{k}: {precision_at_k(y_test, y_proba, k):.3f}")



=== Precision@K ===
P@10: 0.900
P@20: 0.950
P@50: 0.940
P@100: 0.930


In [16]:
def cost_matrix(y_true, y_pred, cost_fp=1, cost_fn=5):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    total_cost = cost_fp * fp + cost_fn * fn
    avg_cost = total_cost / len(y_true)

    print("\n=== Confusion Matrix ===")
    print(f"TP={tp}, FP={fp}, FN={fn}, TN={tn}")
    print(f"Total Cost = {total_cost:.2f}")
    print(f"Average Cost per Prediction = {avg_cost:.4f}")

    matrix = np.array([[0, cost_fn],
                       [cost_fp, 0]])
    print("\nCost Matrix (Actual x Predicted):")
    print(matrix)
    return total_cost, matrix

total_cost, matrix = cost_matrix(y_test, y_pred, cost_fp=1, cost_fn=5)


=== Confusion Matrix ===
TP=858, FP=106, FN=119, TN=917
Total Cost = 701.00
Average Cost per Prediction = 0.3505

Cost Matrix (Actual x Predicted):
[[0 5]
 [1 0]]
