In [1]:
import pandas as pd
import numpy as np
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score, f1_score
import joblib
import nltk

nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\yuchen\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\yuchen\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [2]:
df = pd.read_csv("job_applicant_dataset.csv",encoding ='Windows-1252')
X = df[["Age", "Gender", "Race", "Ethnicity", "Resume", "Job Description"]]
y = df["Best Match"]
X["text"] = X["Resume"] + " " + X["Job Description"]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["text"] = X["Resume"] + " " + X["Job Description"]


In [4]:
documents = [TaggedDocument(words=word_tokenize(doc.lower()), tags=[i]) for i, doc in enumerate(X["text"])]
doc2vec_model = Doc2Vec(vector_size=300, window=5, min_count=2, workers=4, epochs=40)
doc2vec_model.build_vocab(documents)
doc2vec_model.train(documents, total_examples=doc2vec_model.corpus_count, epochs=doc2vec_model.epochs)

# vectors
doc_vectors = np.array([doc2vec_model.infer_vector(word_tokenize(doc.lower())) for doc in X["text"]])


In [5]:
categorical = ["Gender", "Race", "Ethnicity"]
numeric = ["Age"]

preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical),
        ("num", StandardScaler(), numeric)
    ],
    remainder="drop"
)

X_tabular = preprocessor.fit_transform(X)
X_final = np.hstack([doc_vectors, X_tabular.toarray()])

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X_final, y, test_size=0.2, random_state=42)
model = LogisticRegression(max_iter=500)
model.fit(X_train, y_train)

In [7]:
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1]

print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_proba))


              precision    recall  f1-score   support

           0       0.64      0.65      0.65      1023
           1       0.63      0.62      0.63       977

    accuracy                           0.64      2000
   macro avg       0.64      0.64      0.64      2000
weighted avg       0.64      0.64      0.64      2000

Accuracy: 0.637
F1 Score: 0.6265432098765432
ROC-AUC: 0.661401881595364


In [8]:
def precision_at_k(y_true, y_scores, k):
    # Convert to numpy arrays to avoid index mismatch
    y_true = np.array(y_true)
    y_scores = np.array(y_scores)
    top_k_idx = np.argsort(y_scores)[::-1][:k]
    return np.mean(y_true[top_k_idx])

for k in [10, 20, 50, 100]:
    print(f"P@{k}: {precision_at_k(y_test, y_proba, k):.3f}")

import numpy as np
from sklearn.metrics import confusion_matrix


def cost_matrix(y_true, y_pred, cost_fp=1, cost_fn=5):
    """
    计算模型的总成本
    cost_fp: False Positive 的成本
    cost_fn: False Negative 的成本
    """
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()

    total_cost = cost_fp * fp + cost_fn * fn
    avg_cost = total_cost / len(y_true)

    print("Confusion Matrix:")
    print(f"TP={tp}, FP={fp}, FN={fn}, TN={tn}")
    print(f"Total Cost = {total_cost:.2f}")
    print(f"Average Cost per Prediction = {avg_cost:.4f}")

    # 输出 cost 矩阵形式
    matrix = np.array([[0, cost_fn],
                       [cost_fp, 0]])
    print("\nCost Matrix (Actual x Predicted):")
    print(matrix)

    return total_cost, matrix


total_cost, matrix = cost_matrix(y_test, y_pred, cost_fp=1, cost_fn=5)

P@10: 0.600
P@20: 0.700
P@50: 0.740
P@100: 0.720
Confusion Matrix:
TP=609, FP=358, FN=368, TN=665
Total Cost = 2198.00
Average Cost per Prediction = 1.0990

Cost Matrix (Actual x Predicted):
[[0 5]
 [1 0]]
