In [28]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import numpy as np
from sklearn.metrics import roc_curve, auc
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# Read both score tables; paths are relative to the notebook's working directory.
candidate_scores, hr_scores = map(
    pd.read_csv,
    ("data/candidate_scores.csv", "data/hr_scores.csv"),
)

# Section 1

In [3]:
# Build a normalized join key so the two tables can be merged by candidate name.
# This assumes the same person is spelled the same way in both datasets, up to
# letter case, punctuation and spacing.
def _normalize_name(names: pd.Series) -> pd.Series:
    """Return a join key: lower-cased, only a-z and single spaces, no edge whitespace."""
    return (
        names.str.lower()
        .str.replace(r"[^a-z ]", "", regex=True)  # drop everything except a-z and spaces
        .str.replace(r" +", " ", regex=True)  # collapse runs of spaces into one
        .str.strip()  # stray leading/trailing spaces must not block a match
    )


# The same helper is applied to both frames so the two keys cannot drift apart.
candidate_scores["merge_name"] = _normalize_name(candidate_scores["candidateName"])
hr_scores["merge_name"] = _normalize_name(hr_scores["candidateName"])

In [4]:
# Inner join on the normalized name; duplicate rows in candidate_scores are kept on purpose.
merged = candidate_scores.merge(
    hr_scores,
    how="inner",
    on="merge_name",
    suffixes=("_candidate", "_hr"),
)
# Drop the helper key and keep the HR spelling as the single `candidateName` column.
merged = merged.drop(columns=["merge_name", "candidateName_candidate"])
merged = merged.rename(columns={"candidateName_hr": "candidateName"})
merged = merged.reset_index(drop=True)

In [None]:
# Rows whose candidate name occurs more than once in the merged table
is_repeated_name = merged["candidateName"].duplicated(keep=False)
merged.loc[is_repeated_name]

Unnamed: 0,id,createdAt,updatedAt,overallScore,dataSources,status,professionalScore,culturalScore,teamScore,punchline,contextID,candidateName,hrScore
34,94,38:02.3,38:36.4,75.0,"{Linkedin,CV}",Completed,79,72.4,71.5,Solid fit with room for industry-specific growth.,10,Dusk Twilightshade,4
35,95,40:59.9,41:30.6,75.0,"{Linkedin,CV}",Completed,79,74.0,75.0,Solid fit with minor domain adaptation needed.,10,Dusk Twilightshade,4
45,112,40:22.1,40:58.9,45.0,"{Linkedin,CV}",Completed,46,40.0,37.5,Technical skills not aligned with product mana...,10,Gale Windrider,4
46,113,41:21.8,42:02.3,45.0,"{Linkedin,CV}",Completed,58,35.6,42.5,Limited fit due to lack of industry and role-s...,10,Gale Windrider,4


In [46]:
# Histogram of the overall score, colored by HR score.
overall_score_hist = px.histogram(merged, x="overallScore", color="hrScore")
overall_score_hist.show()

# Section 2

In [23]:
# Binary target: an HR score of 2 or lower is the positive class.
merged["go_no_go"] = merged["hrScore"].le(2)
y_true = merged["go_no_go"].astype("int64")
y_scores = merged["overallScore"]

# ROC points using `overallScore` as the ranking score, then the area under that curve.
fpr, tpr, thresholds = roc_curve(y_true, y_scores)
roc_auc = auc(fpr, tpr)

print(f"AUC: {roc_auc}")

AUC: 0.8354166666666667


In [61]:
# Interactive ROC curve built from `fpr`, `tpr`, `thresholds` and `roc_auc`
# computed in the previous cell; a subset of points is labelled with its threshold.
fig = go.Figure()

# Per-point hover text. The loop names differ from the `fpr`/`tpr` arrays so the
# comprehension does not read as if it rebinds them.
hover_text = [
    f"Threshold: {thr:.2f}<br>FPR: {fp_rate:.3f}<br>TPR: {tp_rate:.3f}"
    for fp_rate, tp_rate, thr in zip(fpr, tpr, thresholds)
]

fig.add_trace(
    go.Scatter(
        x=fpr,
        y=tpr,
        mode="lines+markers",
        name=f"ROC curve (AUC = {roc_auc:.2f})",
        line=dict(color="blue", width=2),
        hoverinfo="text",
        text=hover_text,
        marker=dict(size=6),
    )
)

# Add threshold annotations for selected points
num_annotations = 10  # Upper bound on the number of annotations to add
# np.unique removes repeated indices, which linspace(dtype=int) produces when
# the curve has fewer than `num_annotations` points.
indices = np.unique(
    np.linspace(0, len(thresholds) - 1, num_annotations, dtype=int)
)

for i in indices:
    # roc_curve prepends a synthetic first threshold (np.inf in scikit-learn >= 1.3)
    # meaning "predict nothing positive"; it is not a usable score cutoff.
    if not np.isfinite(thresholds[i]):
        continue
    fig.add_annotation(
        x=fpr[i],
        y=tpr[i],
        text=f"{thresholds[i]:.2f}",
        showarrow=True,
        arrowhead=2,
        arrowsize=1,
        arrowwidth=2,
        arrowcolor="#636363",
        ax=20,
        ay=-20,
    )

# Add the diagonal line (random classifier)
fig.add_trace(
    go.Scatter(
        x=[0, 1],
        y=[0, 1],
        mode="lines",
        name="Random Classifier",
        line=dict(color="red", width=2, dash="dash"),
    )
)

# Update the layout
fig.update_layout(
    title="Receiver Operating Characteristic (ROC) Curve with Thresholds",
    xaxis_title="False Positive Rate",
    yaxis_title="True Positive Rate",
    width=800,
    height=600,
    showlegend=True,
    hovermode="closest",
)

# Show the plot
fig.show()

Using the ROC curve, we can choose the best threshold (which one is "best" depends on what we want to optimize).
In this context, I think the most important thing is not to recommend an unsuitable candidate, which means keeping the number of false positives (the false positive rate) low, even at the cost of some recall.

# Section 3

Naive approach: weight the three sub-scores equally, or fix the weights manually.
Advanced: use a softmax with temperature to determine the weights.

In [24]:


def new_overall_score(professional_score, cultural_score, team_score, weights=(0.5, 0.3, 0.2)):
    """Combine the three sub-scores into a single weighted score.

    Parameters
    ----------
    professional_score, cultural_score, team_score : scalar or array-like
        The three sub-scores. Array-like inputs must all have the same length.
    weights : sequence of 3 floats, optional
        Weights applied to (professional, cultural, team), in that order.
        Defaults to (0.5, 0.3, 0.2), the values that were previously hard-coded.

    Returns
    -------
    numpy scalar or numpy.ndarray
        The weighted sum: a scalar for scalar inputs, otherwise one value per element.

    Raises
    ------
    ValueError
        If `weights` does not contain exactly three values.
    """
    weights = np.asarray(weights, dtype=float)
    if weights.shape != (3,):
        raise ValueError(
            f"weights must contain exactly 3 values, got shape {weights.shape}"
        )
    # Stack to shape (3,) or (3, n); the transpose puts the three sub-scores on the last axis.
    scores = np.array([professional_score, cultural_score, team_score])
    return np.dot(scores.T, weights)



# Weighted combination of the three sub-scores, stored as a new column.
merged["new_overall_score"] = new_overall_score(
    professional_score=merged["professionalScore"],
    cultural_score=merged["culturalScore"],
    team_score=merged["teamScore"],
)

In [25]:
# Same binary target as before: an HR score of 2 or lower is the positive class.
is_go = merged["hrScore"].le(2)
merged["go_no_go"] = is_go
y_true = merged["go_no_go"].astype("int64")

# This time the candidates are ranked by the re-weighted score.
y_scores = merged["new_overall_score"]

fpr, tpr, thresholds = roc_curve(y_true, y_scores)
roc_auc = auc(fpr, tpr)

print(f"AUC: {roc_auc}")

AUC: 0.7479166666666667


In [26]:
# Interactive ROC curve built from the `fpr`, `tpr`, `thresholds` and `roc_auc`
# values computed in the previous cell (ranking by `new_overall_score`).
fig = go.Figure()

# Per-point hover text. The loop names differ from the `fpr`/`tpr` arrays so the
# comprehension does not read as if it rebinds them.
hover_text = [
    f"Threshold: {thr:.2f}<br>FPR: {fp_rate:.3f}<br>TPR: {tp_rate:.3f}"
    for fp_rate, tp_rate, thr in zip(fpr, tpr, thresholds)
]

fig.add_trace(
    go.Scatter(
        x=fpr,
        y=tpr,
        mode="lines+markers",
        name=f"ROC curve (AUC = {roc_auc:.2f})",
        line=dict(color="blue", width=2),
        hoverinfo="text",
        text=hover_text,
        marker=dict(size=6),
    )
)

# Add threshold annotations for selected points
num_annotations = 10  # Upper bound on the number of annotations to add
# np.unique removes repeated indices, which linspace(dtype=int) produces when
# the curve has fewer than `num_annotations` points.
indices = np.unique(
    np.linspace(0, len(thresholds) - 1, num_annotations, dtype=int)
)

for i in indices:
    # roc_curve prepends a synthetic first threshold (np.inf in scikit-learn >= 1.3)
    # meaning "predict nothing positive"; it is not a usable score cutoff.
    if not np.isfinite(thresholds[i]):
        continue
    fig.add_annotation(
        x=fpr[i],
        y=tpr[i],
        text=f"{thresholds[i]:.2f}",
        showarrow=True,
        arrowhead=2,
        arrowsize=1,
        arrowwidth=2,
        arrowcolor="#636363",
        ax=20,
        ay=-20,
    )

# Add the diagonal line (random classifier)
fig.add_trace(
    go.Scatter(
        x=[0, 1],
        y=[0, 1],
        mode="lines",
        name="Random Classifier",
        line=dict(color="red", width=2, dash="dash"),
    )
)

# Update the layout
fig.update_layout(
    title="Receiver Operating Characteristic (ROC) Curve with Thresholds",
    xaxis_title="False Positive Rate",
    yaxis_title="True Positive Rate",
    width=800,
    height=600,
    showlegend=True,
    hovermode="closest",
)

# Show the plot
fig.show()

In [30]:
# Map hrScore linearly onto a 0-100 scale where hrScore 4 -> 0 and hrScore 1 -> 100.
merged["hr_score_100"] = (4 - merged["hrScore"]) * (100/3)
y = merged["hr_score_100"]

# Feature matrix: one row per merged row, columns = professional, cultural, team scores.
X = np.column_stack(
    [merged["professionalScore"], merged["culturalScore"], merged["teamScore"]]
)

# Least-squares fit on all rows (no train/test split in this cell).
model = LinearRegression()
model.fit(X, y)

array([[79. , 74. , 75. ],
       [82. , 62. , 60. ],
       [84. , 74. , 75. ],
       [77. , 65. , 70. ],
       [79. , 69. , 69. ],
       [82. , 68. , 70. ],
       [85. , 70.6, 66.5],
       [56. , 63. , 57.5],
       [66. , 63. , 62.5],
       [80. , 78. , 77.5],
       [74. , 66. , 67.5],
       [71. , 47. , 42.5],
       [56. , 38. , 42.5],
       [78. , 58. , 60. ],
       [85. , 74. , 75. ],
       [66. , 64. , 62.5],
       [49. , 38. , 40. ],
       [78. , 65. , 60. ],
       [80. , 60. , 55. ],
       [54. , 47. , 47.5],
       [68. , 55. , 52.5],
       [79. , 69.4, 76. ],
       [79. , 66. , 67.5],
       [76. , 61. , 60. ],
       [59. , 58. , 60. ],
       [78. , 62. , 57.5],
       [84. , 65. , 62.5],
       [73. , 67. , 65. ],
       [86. , 69. , 70. ],
       [81. , 75. , 72.5],
       [72. , 54. , 52.5],
       [87. , 67. , 65. ],
       [76. , 56. , 57.5],
       [83. , 62. , 60. ],
       [79. , 72.4, 71.5],
       [79. , 74. , 75. ],
       [82. , 66. , 62.5],
 