In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report
import joblib
import numpy as np
from sklearn.metrics import confusion_matrix

In [2]:
df = pd.read_csv("job_applicant_dataset.csv",encoding ='Windows-1252')
X = df[["Age", "Gender", "Race", "Ethnicity", "Resume", "Job Description"]]
y = df["Best Match"]

# Combine textual fields
X["text"] = X["Resume"] + " " + X["Job Description"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["text"] = X["Resume"] + " " + X["Job Description"]


In [4]:
categorical = ["Gender", "Race", "Ethnicity"]
numeric = ["Age"]

preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical),
        ("num", StandardScaler(), numeric),
        ("txt", TfidfVectorizer(max_features=8000, ngram_range=(1,2)), "text")
    ],
    remainder="drop")

In [5]:
model = Pipeline([
    ("features", preprocessor),
    ("clf", LogisticRegression(max_iter=500, solver="liblinear"))
])

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model.fit(X_train, y_train)


In [7]:
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1]

print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_proba))

              precision    recall  f1-score   support

           0       0.66      0.66      0.66      1023
           1       0.64      0.65      0.65       977

    accuracy                           0.65      2000
   macro avg       0.65      0.65      0.65      2000
weighted avg       0.65      0.65      0.65      2000

Accuracy: 0.652
F1 Score: 0.6452599388379205
ROC-AUC: 0.6630277416753463


In [8]:
def precision_at_k(y_true, y_scores, k):
    y_true = np.array(y_true)
    y_scores = np.array(y_scores)
    top_k_idx = np.argsort(y_scores)[::-1][:k]
    return np.mean(y_true[top_k_idx])

for k in [10, 20, 50, 100]:
    print(f"P@{k}: {precision_at_k(y_test, y_proba, k):.3f}")

P@10: 0.500
P@20: 0.300
P@50: 0.480
P@100: 0.610


In [9]:
def cost_matrix(y_true, y_pred, cost_fp=1, cost_fn=5):
    """
    cost_fp: False Positive 
    cost_fn: False Negative 
    """
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()

    total_cost = cost_fp * fp + cost_fn * fn
    avg_cost = total_cost / len(y_true)

    print("Confusion Matrix:")
    print(f"TP={tp}, FP={fp}, FN={fn}, TN={tn}")
    print(f"Total Cost = {total_cost:.2f}")
    print(f"Average Cost per Prediction = {avg_cost:.4f}")

    matrix = np.array([[0, cost_fn],
                       [cost_fp, 0]])
    print("\nCost Matrix (Actual x Predicted):")
    print(matrix)

    return total_cost, matrix


total_cost, matrix = cost_matrix(y_test, y_pred, cost_fp=1, cost_fn=5)

Confusion Matrix:
TP=633, FP=352, FN=344, TN=671
Total Cost = 2072.00
Average Cost per Prediction = 1.0360

Cost Matrix (Actual x Predicted):
[[0 5]
 [1 0]]
