In [1]:
import pandas as pd
import numpy as np
import joblib
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import (
    classification_report,
    accuracy_score,
    f1_score,
    roc_auc_score,
    confusion_matrix
)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import (
    RandomForestClassifier,
    VotingClassifier,
    StackingClassifier
)
from xgboost import XGBClassifier




In [2]:
df = pd.read_csv("job_applicant_dataset.csv", encoding="Windows-1252")
X = df[["Age", "Gender", "Race", "Ethnicity", "Resume", "Job Description"]]
y = df["Best Match"]
X["text"] = X["Resume"].astype(str) + " " + X["Job Description"].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["text"] = X["Resume"].astype(str) + " " + X["Job Description"].astype(str)


In [3]:
print("Loading Sentence-BERT model...")
bert_model = SentenceTransformer('all-MiniLM-L6-v2', device="cpu")

def get_bert_embeddings(texts):
    return bert_model.encode(texts, show_progress_bar=True, convert_to_numpy=True)

print("Encoding texts...")
X_text = get_bert_embeddings(X["text"].tolist())


Loading Sentence-BERT model...
Encoding texts...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

In [4]:
categorical = ["Gender", "Race", "Ethnicity"]
numeric = ["Age"]

preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical),
        ("num", StandardScaler(), numeric)
    ]
)

X_tabular = preprocessor.fit_transform(X)
X_final = np.hstack([X_text, X_tabular.toarray()])

In [5]:
X_train, X_test, y_train, y_test = train_test_split(
    X_final, y, test_size=0.2, random_state=42
)

print(f"Data prepared: {X_train.shape[0]} train samples, {X_test.shape[0]} test samples")

Data prepared: 8000 train samples, 2000 test samples


In [6]:
def precision_at_k(y_true, y_scores, k):
    y_true = np.array(y_true)
    y_scores = np.array(y_scores)
    top_k_idx = np.argsort(y_scores)[::-1][:k]
    return np.mean(y_true[top_k_idx])

def cost_matrix(y_true, y_pred, cost_fp=1, cost_fn=5):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    total_cost = cost_fp * fp + cost_fn * fn
    avg_cost = total_cost / len(y_true)
    print(f"TP={tp}, FP={fp}, FN={fn}, TN={tn}, Total Cost={total_cost:.2f}, Avg Cost={avg_cost:.3f}")
    return total_cost

In [7]:
log_reg = LogisticRegression(max_iter=500)
rf = RandomForestClassifier(n_estimators=150, max_depth=10, random_state=42)
xgb = XGBClassifier(n_estimators=200, learning_rate=0.05, max_depth=8, eval_metric="logloss", random_state=42)

voting = VotingClassifier(
    estimators=[('lr', log_reg), ('rf', rf), ('xgb', xgb)],
    voting='soft'
)

voting.fit(X_train, y_train)
y_pred = voting.predict(X_test)
y_proba = voting.predict_proba(X_test)[:, 1]

print("\n=== Soft Voting Ensemble ===")
print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_proba))
for k in [10, 50, 100]:
    print(f"P@{k}: {precision_at_k(y_test, y_proba, k):.3f}")
cost_matrix(y_test, y_pred)

rf = RandomForestClassifier(
    n_estimators=300,
    max_depth=12,
    bootstrap=True,
    random_state=42
)

rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
y_proba = rf.predict_proba(X_test)[:, 1]

print("\n=== Bagging (RandomForest) ===")
print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_proba))
for k in [10, 50, 100]:
    print(f"P@{k}: {precision_at_k(y_test, y_proba, k):.3f}")
cost_matrix(y_test, y_pred)

xgb = XGBClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=10,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric="logloss",
    random_state=42
)

xgb.fit(X_train, y_train)
y_pred = xgb.predict(X_test)
y_proba = xgb.predict_proba(X_test)[:, 1]

print("\n=== Boosting (XGBoost) ===")
print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_proba))
for k in [10, 50, 100]:
    print(f"P@{k}: {precision_at_k(y_test, y_proba, k):.3f}")
cost_matrix(y_test, y_pred)

base_estimators = [
    ('lr', LogisticRegression(max_iter=500)),
    ('rf', RandomForestClassifier(n_estimators=150, random_state=42)),
    ('xgb', XGBClassifier(n_estimators=200, learning_rate=0.05, eval_metric="logloss", random_state=42))
]

stack_model = StackingClassifier(
    estimators=base_estimators,
    final_estimator=LogisticRegression(max_iter=500),
    cv=5
)

stack_model.fit(X_train, y_train)
y_pred = stack_model.predict(X_test)
y_proba = stack_model.predict_proba(X_test)[:, 1]

print("\n=== Stacking Ensemble ===")
print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_proba))
for k in [10, 50, 100]:
    print(f"P@{k}: {precision_at_k(y_test, y_proba, k):.3f}")
cost_matrix(y_test, y_pred)

# 先分别训练三种模型
lr = LogisticRegression(max_iter=500).fit(X_train, y_train)
rf = RandomForestClassifier(n_estimators=150, random_state=42).fit(X_train, y_train)
xgb = XGBClassifier(n_estimators=200, learning_rate=0.05, eval_metric="logloss", random_state=42).fit(X_train, y_train)

p_lr = lr.predict_proba(X_test)[:, 1]
p_rf = rf.predict_proba(X_test)[:, 1]
p_xgb = xgb.predict_proba(X_test)[:, 1]

# 加权融合
p_blend = 0.2 * p_lr + 0.3 * p_rf + 0.5 * p_xgb
y_pred = (p_blend > 0.5).astype(int)

print("\n=== Weighted Blending Ensemble ===")
print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, p_blend))
for k in [10, 50, 100]:
    print(f"P@{k}: {precision_at_k(y_test, p_blend, k):.3f}")
cost_matrix(y_test, y_pred)


=== Soft Voting Ensemble ===
              precision    recall  f1-score   support

           0       0.88      0.89      0.89      1023
           1       0.89      0.88      0.88       977

    accuracy                           0.89      2000
   macro avg       0.89      0.88      0.88      2000
weighted avg       0.89      0.89      0.88      2000

Accuracy: 0.885
F1 Score: 0.881808838643371
ROC-AUC: 0.8911254053394245
P@10: 0.900
P@50: 0.920
P@100: 0.930
TP=858, FP=111, FN=119, TN=912, Total Cost=706.00, Avg Cost=0.353

=== Bagging (RandomForest) ===
              precision    recall  f1-score   support

           0       0.89      0.90      0.89      1023
           1       0.89      0.88      0.89       977

    accuracy                           0.89      2000
   macro avg       0.89      0.89      0.89      2000
weighted avg       0.89      0.89      0.89      2000

Accuracy: 0.89
F1 Score: 0.8867147270854789
ROC-AUC: 0.8911143995173447
P@10: 1.000
P@50: 0.900
P@100: 0.910


691