In [None]:
from google.colab import drive
drive.mount('/content/drive')
!pip install -q torch torchvision torchaudio scikit-learn pandas numpy

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [27]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import torch
import numpy as np

PREF_CSV = "same_pairs_new.csv"

df = pd.read_csv(PREF_CSV)

df = df[["jd_text", "resume_a_text", "resume_b_text", "label_b_is_better"]].dropna()
print("num rows:", len(df))

device = "cuda" if torch.cuda.is_available() else "cpu"
print("device:", device)

encoder = SentenceTransformer("intfloat/e5-base-v2", device=device)

def format_job(text):
    return "query: " + str(text)

def format_resume(text):
    return "passage: " + str(text)

job_embeddings = encoder.encode(
    [format_job(t) for t in df["jd_text"].tolist()],
    batch_size=16,
    convert_to_tensor=True,
    device=device,
    show_progress_bar=True
)

resume_a_embeddings = encoder.encode(
    [format_resume(t) for t in df["resume_a_text"].tolist()],
    batch_size=16,
    convert_to_tensor=True,
    device=device,
    show_progress_bar=True
)

resume_b_embeddings = encoder.encode(
    [format_resume(t) for t in df["resume_b_text"].tolist()],
    batch_size=16,
    convert_to_tensor=True,
    device=device,
    show_progress_bar=True
)

labels_b_better = torch.tensor(df["label_b_is_better"].values, dtype=torch.float32, device=device)

print(job_embeddings.shape, resume_a_embeddings.shape, resume_b_embeddings.shape)


num rows: 200
device: cuda


Batches:   0%|          | 0/13 [00:00<?, ?it/s]

Batches:   0%|          | 0/13 [00:00<?, ?it/s]

Batches:   0%|          | 0/13 [00:00<?, ?it/s]

torch.Size([200, 768]) torch.Size([200, 768]) torch.Size([200, 768])


In [None]:
from torch.utils.data import Dataset, DataLoader

class PreferenceDataset(Dataset):
    def __init__(self, job_emb, res_a_emb, res_b_emb, label_b_better):
        self.job_emb = job_emb
        self.res_a_emb = res_a_emb
        self.res_b_emb = res_b_emb
        self.label = label_b_better  # 1 if resume B is better, 0 otherwise

    def __len__(self):
        return self.job_emb.size(0)

    def __getitem__(self, idx):
        return {
            "job":  self.job_emb[idx],
            "res_a": self.res_a_emb[idx],
            "res_b": self.res_b_emb[idx],
            "label_b_better": self.label[idx],
        }

dataset = PreferenceDataset(job_embeddings, resume_a_embeddings, resume_b_embeddings, labels_b_better)
train_loader = DataLoader(dataset, batch_size=16, shuffle=True)


In [None]:
import torch.nn as nn
import torch.nn.functional as F

class RewardModel(nn.Module):
    def __init__(self, emb_dim=768):
        super().__init__()
        input_dim = emb_dim * 2  # job + resume
        self.net = nn.Sequential(
            nn.Linear(input_dim, 512),
            nn.ReLU(),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Linear(256, 1)  # output a single reward score
        )

    def forward(self, job_emb, resume_emb):
        # job_emb, resume_emb: [B, 768]
        x = torch.cat([job_emb, resume_emb], dim=-1)  # [B, 1536]
        reward = self.net(x).squeeze(-1)              # [B]
        return reward

reward_model = RewardModel(emb_dim=job_embeddings.size(1)).to(device)
print(reward_model)


RewardModel(
  (net): Sequential(
    (0): Linear(in_features=1536, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=256, bias=True)
    (3): ReLU()
    (4): Linear(in_features=256, out_features=1, bias=True)
  )
)


In [None]:
from torch.optim import Adam

optimizer = Adam(reward_model.parameters(), lr=1e-3)
bce_logits = nn.BCEWithLogitsLoss()

num_epochs = 20

for epoch in range(num_epochs):
    reward_model.train()
    total_loss = 0.0

    for batch in train_loader:
        job = batch["job"].to(device)
        res_a = batch["res_a"].to(device)
        res_b = batch["res_b"].to(device)
        label_b_better = batch["label_b_better"].to(device)  # [B], 0/1

        # 计算 reward
        r_a = reward_model(job, res_a)  # [B]
        r_b = reward_model(job, res_b)  # [B]

        # pairwise: predict "is B better" using r_b - r_a
        logits = r_b - r_a    # [B]
        loss = bce_logits(logits, label_b_better)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item() * job.size(0)

    avg_loss = total_loss / len(dataset)
    print(f"Epoch {epoch+1}/{num_epochs}, loss={avg_loss:.4f}")


Epoch 1/20, loss=0.6936
Epoch 2/20, loss=0.6929
Epoch 3/20, loss=0.6903
Epoch 4/20, loss=0.6815
Epoch 5/20, loss=0.6628
Epoch 6/20, loss=0.6459
Epoch 7/20, loss=0.6330
Epoch 8/20, loss=0.6364
Epoch 9/20, loss=0.6143
Epoch 10/20, loss=0.6057
Epoch 11/20, loss=0.5973
Epoch 12/20, loss=0.5886
Epoch 13/20, loss=0.5791
Epoch 14/20, loss=0.5786
Epoch 15/20, loss=0.5700
Epoch 16/20, loss=0.5745
Epoch 17/20, loss=0.5614
Epoch 18/20, loss=0.5644
Epoch 19/20, loss=0.5533
Epoch 20/20, loss=0.5487


In [31]:
SAVE_PATH = "reward_model_e5base_pairwise_same_new.pt"
torch.save({
    "model_state_dict": reward_model.state_dict(),
}, SAVE_PATH)

print("saved to", SAVE_PATH)


saved to reward_model_e5base_pairwise_same_new.pt


In [33]:
import pandas as pd

df = pd.read_excel("full.xlsx")

print(df.head())


                                                id  jd_id  resume_dataset_id  \
0  2052|19918523_black_female_lakisha.jackson_full   2052           19918523   
1   2562|53129155_white_male_michael.anderson_full   2562           53129155   
2       4192|19928941_white_female_emma.kelly_full   4192           19928941   
3       422|15479281_black_female_tiana.davis_full    422           15479281   
4      3567|19796840_white_male_mark.anderson_full   3567           19796840   

               name                                        resume_text  \
0   Lakisha Jackson  Email: lakisha.jackson@apply.example.org\n\n  ...   
1  Michael Anderson  Email: michael.anderson@apply.example.org\n\n ...   
2        Emma Kelly  Email: emma.kelly@apply.example.org\n\n       ...   
3       Tiana Davis  Email: tiana.davis@apply.example.org\n\n      ...   
4     Mark Anderson  Email: mark.anderson@apply.example.org\n\n    ...   

                                resume_text_withname  \
0  Lakisha Jackson

In [None]:
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
reward_model.eval()

# Encode all job descriptions and resumes
job_texts = ["query: " + str(t) for t in df["job_desc"].tolist()]
resume_texts = ["passage: " + str(t) for t in df["resume_text_withname"].tolist()]

job_embs = encoder.encode(
    job_texts,
    batch_size=32,
    convert_to_tensor=True,
    device=device,
    show_progress_bar=True
)
res_embs = encoder.encode(
    resume_texts,
    batch_size=32,
    convert_to_tensor=True,
    device=device,
    show_progress_bar=True
)

with torch.no_grad():
    rewards = reward_model(job_embs, res_embs)   # [N]

df["same_pair_score_new"] = rewards.cpu().numpy()
df.head()


Batches:   0%|          | 0/1010 [00:00<?, ?it/s]

Batches:   0%|          | 0/1010 [00:00<?, ?it/s]

Unnamed: 0,id,jd_id,resume_dataset_id,name,resume_text,resume_text_withname,job_desc,group,race,gender,label,sim_score,rank,reward_score,baseline_score,same_pair_score_new
0,2052|19918523_black_female_lakisha.jackson_full,2052,19918523,Lakisha Jackson,Email: lakisha.jackson@apply.example.org\n\n ...,Lakisha Jackson\nEmail: lakisha.jackson@apply....,Paediatric dietetics Inpatient and outpatient ...,black_female,black,female,0,0.783934,,-1.024516,0.750177,27.812244
1,2562|53129155_white_male_michael.anderson_full,2562,53129155,Michael Anderson,Email: michael.anderson@apply.example.org\n\n ...,Michael Anderson\nEmail: michael.anderson@appl...,Processing bank statements and payments Suppor...,white_male,white,male,0,0.749885,,-1.13668,0.722094,30.534901
2,4192|19928941_white_female_emma.kelly_full,4192,19928941,Emma Kelly,Email: emma.kelly@apply.example.org\n\n ...,Emma Kelly\nEmail: emma.kelly@apply.example.or...,Previous experience within a Sales role within...,white_female,white,female,1,0.854428,1.0,-1.31706,0.82032,25.528429
3,422|15479281_black_female_tiana.davis_full,422,15479281,Tiana Davis,Email: tiana.davis@apply.example.org\n\n ...,Tiana Davis\nEmail: tiana.davis@apply.example....,Supports the execution of microbiological inve...,black_female,black,female,0,0.753737,,-1.328839,0.730331,30.267616
4,3567|19796840_white_male_mark.anderson_full,3567,19796840,Mark Anderson,Email: mark.anderson@apply.example.org\n\n ...,Mark Anderson\nEmail: mark.anderson@apply.exam...,Provide world-class support and expertise to a...,white_male,white,male,0,0.841996,,-1.144655,0.802878,31.083452


In [39]:
df.columns

Index(['id', 'jd_id', 'resume_dataset_id', 'name', 'resume_text',
       'resume_text_withname', 'job_desc', 'group', 'race', 'gender', 'label',
       'sim_score', 'rank', 'reward_score', 'baseline_score',
       'same_pair_score_new'],
      dtype='object')

In [59]:
df.to_excel('full_same_pair_score_2.xlsx', index=False)

In [55]:
df_new=df[['id','name', 'group', 'race','gender','sim_score', 'baseline_score', 'same_pair_score_new']]

In [56]:
print(df_new.head())

                                                id              name  \
0  2052|19918523_black_female_lakisha.jackson_full   Lakisha Jackson   
1   2562|53129155_white_male_michael.anderson_full  Michael Anderson   
2       4192|19928941_white_female_emma.kelly_full        Emma Kelly   
3       422|15479281_black_female_tiana.davis_full       Tiana Davis   
4      3567|19796840_white_male_mark.anderson_full     Mark Anderson   

          group   race  gender  sim_score  baseline_score  same_pair_score_new  
0  black_female  black  female   0.783934        0.750177            27.812244  
1    white_male  white    male   0.749885        0.722094            30.534901  
2  white_female  white  female   0.854428        0.820320            25.528429  
3  black_female  black  female   0.753737        0.730331            30.267616  
4    white_male  white    male   0.841996        0.802878            31.083452  


In [57]:
df_new.to_csv('same_pair_score_2.csv', index=False)

In [None]:
def selection_rate_by_group(df, score_col, k=7):
    """
    df: input dataframe, must contain "job_desc" and "group" columns
    score_col: the column name of the score to rank by, e.g., 'sim_score' or 'reward_score'
    k: top-k to select per job description
    Returns: selection rate per group
    """
    # sort by job_desc and score_col
    df_sorted = df.sort_values(["job_desc", score_col], ascending=[True, False]).copy()

    # rank within each job description
    df_sorted["rank_in_job"] = df_sorted.groupby("job_desc").cumcount()

    # top-k marked as 1, others as 0
    df_sorted["selected_topk"] = (df_sorted["rank_in_job"] < k).astype(int)

    # for each group, compute selection rate
    sel_rate = df_sorted.groupby("group")["selected_topk"].mean()
    return sel_rate


In [42]:
baseline_sel = selection_rate_by_group(df, "baseline_score", k=7)
new_sel      = selection_rate_by_group(df, "same_pair_score_new", k=7)

print("Baseline selection rate:\n", baseline_sel)
print("New selection rate:\n", new_sel)
print("New - Baseline:\n", new_sel - baseline_sel)


Baseline selection rate:
 group
black_female                 0.441322
black_male                   0.428174
east_asian_female            0.561867
east_asian_male              0.580290
south_asian_indian_female    0.558128
south_asian_indian_male      0.606008
white_female                 0.512273
white_male                   0.512031
Name: selected_topk, dtype: float64
New selection rate:
 group
black_female                 0.560331
black_male                   0.576559
east_asian_female            0.521565
east_asian_male              0.584828
south_asian_indian_female    0.518227
south_asian_indian_male      0.502519
white_female                 0.462647
white_male                   0.547995
Name: selected_topk, dtype: float64
New - Baseline:
 group
black_female                 0.119008
black_male                   0.148385
east_asian_female           -0.040302
east_asian_male              0.004539
south_asian_indian_female   -0.039901
south_asian_indian_male     -0.103488
white_fema

In [None]:
def equal_opportunity_tpr(df, score_col, k=7, label_col="label"):
    """
    df: includes ['job_desc', 'group', score_col, label_col]
    score_col: the column name of the score to rank by
    label_col: the column name of the true label, where 1 indicates a positive example
    k: top-k to select per job description
    Returns: TPR (true positive rate) per group
    """
    # only consider positive examples
    pos = df[df[label_col] == 1].copy()

    # rank by job_desc and score_col
    pos_sorted = pos.sort_values(["job_desc", score_col], ascending=[True, False])

    # rank within each job description
    pos_sorted["rank_in_job"] = pos_sorted.groupby("job_desc").cumcount()
    pos_sorted["selected_topk"] = (pos_sorted["rank_in_job"] < k).astype(int)

    # for each group, compute TPR
    tpr = pos_sorted.groupby("group")["selected_topk"].mean()
    return tpr


In [45]:
baseline_tpr = equal_opportunity_tpr(df, "baseline_score", k=7)
new_tpr      = equal_opportunity_tpr(df, "same_pair_score_new", k=7)

print("Baseline TPR:\n", baseline_tpr)
print("New TPR:\n", new_tpr)
print("New - Baseline:\n", new_tpr - baseline_tpr)


Baseline TPR:
 group
black_female                 0.830972
black_male                   0.717966
east_asian_female            0.829191
east_asian_male              0.815087
south_asian_indian_female    0.846411
south_asian_indian_male      0.813138
white_female                 0.818389
white_male                   0.793415
Name: selected_topk, dtype: float64
New TPR:
 group
black_female                 0.774291
black_male                   0.736460
east_asian_female            0.833710
east_asian_male              0.839845
south_asian_indian_female    0.833971
south_asian_indian_male      0.803571
white_female                 0.830954
white_male                   0.787042
Name: selected_topk, dtype: float64
New - Baseline:
 group
black_female                -0.056680
black_male                   0.018494
east_asian_female            0.004519
east_asian_male              0.024758
south_asian_indian_female   -0.012440
south_asian_indian_male     -0.009566
white_female                 0.0

In [46]:
def calculate_fairness_metrics(df, score_col, k=7):
    """
    Calculate comprehensive fairness metrics for a given scoring method
    """
    # Get selection rates
    sel_rates = selection_rate_by_group(df, score_col, k)
    
    # Calculate fairness metrics
    groups = list(sel_rates.index)
    selection_rates = sel_rates.values
    
    # Demographic Parity metrics
    max_sr = np.max(selection_rates)
    min_sr = np.min(selection_rates)
    dpr = max_sr / min_sr if min_sr > 0 else float('inf')
    dpd = max_sr - min_sr
    
    # Equalized Odds (using TPR)
    tpr_rates = equal_opportunity_tpr(df, score_col, k)
    max_tpr = np.max(tpr_rates.values)
    min_tpr = np.min(tpr_rates.values)
    eor = max_tpr / min_tpr if min_tpr > 0 else float('inf')
    eod = max_tpr - min_tpr
    
    return {
        'selection_rates': dict(sel_rates),
        'demographic_parity_ratio': dpr,
        'demographic_parity_difference': dpd,
        'equalized_odds_ratio': eor,
        'equalized_odds_difference': eod,
        'tpr_rates': dict(tpr_rates)
    }

In [47]:
def calculate_improvement(baseline_metrics, new_metrics):
    """
    Calculate improvement percentages for all metrics
    """
    improvement = {}
    
    # For ratio metrics (lower is better)
    for metric in ['demographic_parity_ratio', 'equalized_odds_ratio']:
        baseline_val = baseline_metrics[metric]
        new_val = new_metrics[metric]
        if baseline_val > 1.0:  # Only calculate improvement if there was bias
            improvement[f'{metric}_improvement'] = ((baseline_val - new_val) / (baseline_val - 1.0)) * 100
        else:
            improvement[f'{metric}_improvement'] = 0
    
    # For difference metrics (lower is better)
    for metric in ['demographic_parity_difference', 'equalized_odds_difference']:
        baseline_val = baseline_metrics[metric]
        new_val = new_metrics[metric]
        if baseline_val > 0:
            improvement[f'{metric}_improvement'] = ((baseline_val - new_val) / baseline_val) * 100
        else:
            improvement[f'{metric}_improvement'] = 0
    
    return improvement


In [48]:
def calculate_fairness_metrics(df, score_col, k=7):
    """
    Calculate comprehensive fairness metrics for a given scoring method
    """
    # Get selection rates
    sel_rates = selection_rate_by_group(df, score_col, k)
    
    # Calculate fairness metrics
    groups = list(sel_rates.index)
    selection_rates = sel_rates.values
    
    # Demographic Parity metrics
    max_sr = np.max(selection_rates)
    min_sr = np.min(selection_rates)
    dpr = max_sr / min_sr if min_sr > 0 else float('inf')
    dpd = max_sr - min_sr
    
    # Equalized Odds (using TPR)
    tpr_rates = equal_opportunity_tpr(df, score_col, k)
    max_tpr = np.max(tpr_rates.values)
    min_tpr = np.min(tpr_rates.values)
    eor = max_tpr / min_tpr if min_tpr > 0 else float('inf')
    eod = max_tpr - min_tpr
    
    return {
        'selection_rates': dict(sel_rates),
        'demographic_parity_ratio': dpr,
        'demographic_parity_difference': dpd,
        'equalized_odds_ratio': eor,
        'equalized_odds_difference': eod,
        'tpr_rates': dict(tpr_rates),
        'groups': groups
    }

def calculate_improvement(baseline_metrics, new_metrics):
    """
    Calculate improvement percentages for all metrics
    """
    improvement = {}
    
    # For ratio metrics (lower is better)
    for metric in ['demographic_parity_ratio', 'equalized_odds_ratio']:
        baseline_val = baseline_metrics[metric]
        new_val = new_metrics[metric]
        if baseline_val > 1.0:  # Only calculate improvement if there was bias
            improvement[f'{metric}_improvement'] = ((baseline_val - new_val) / (baseline_val - 1.0)) * 100
        else:
            improvement[f'{metric}_improvement'] = 0
    
    # For difference metrics (lower is better)
    for metric in ['demographic_parity_difference', 'equalized_odds_difference']:
        baseline_val = baseline_metrics[metric]
        new_val = new_metrics[metric]
        if baseline_val > 0:
            improvement[f'{metric}_improvement'] = ((baseline_val - new_val) / baseline_val) * 100
        else:
            improvement[f'{metric}_improvement'] = 0
    
    return improvement

In [49]:
baseline_metrics = calculate_fairness_metrics(df, "baseline_score", k=7)
new_metrics = calculate_fairness_metrics(df, "same_pair_score_new", k=7)


In [50]:
print("\n--- BASELINE MODEL (Cosine Similarity) ---")
print(f"Selection Rates: {baseline_metrics['selection_rates']}")
print(f"Demographic Parity Ratio: {baseline_metrics['demographic_parity_ratio']:.4f}")
print(f"Demographic Parity Difference: {baseline_metrics['demographic_parity_difference']:.4f}")
print(f"Equalized Odds Ratio: {baseline_metrics['equalized_odds_ratio']:.4f}")
print(f"Equalized Odds Difference: {baseline_metrics['equalized_odds_difference']:.4f}")



--- BASELINE MODEL (Cosine Similarity) ---
Selection Rates: {'black_female': 0.4413223140495868, 'black_male': 0.428173719376392, 'east_asian_female': 0.5618666038180533, 'east_asian_male': 0.5802896044953534, 'south_asian_indian_female': 0.558128078817734, 'south_asian_indian_male': 0.6060077519379845, 'white_female': 0.512273212379936, 'white_male': 0.5120310478654593}
Demographic Parity Ratio: 1.4153
Demographic Parity Difference: 0.1778
Equalized Odds Ratio: 1.1789
Equalized Odds Difference: 0.1284


In [51]:
print("\n--- RLHF REWARD MODEL ---")
print(f"Selection Rates: {new_metrics['selection_rates']}")
print(f"Demographic Parity Ratio: {new_metrics['demographic_parity_ratio']:.4f}")
print(f"Demographic Parity Difference: {new_metrics['demographic_parity_difference']:.4f}")
print(f"Equalized Odds Ratio: {new_metrics['equalized_odds_ratio']:.4f}")
print(f"Equalized Odds Difference: {new_metrics['equalized_odds_difference']:.4f}")



--- RLHF REWARD MODEL ---
Selection Rates: {'black_female': 0.5603305785123966, 'black_male': 0.5765590200445434, 'east_asian_female': 0.5215649304737214, 'east_asian_male': 0.5848281824076075, 'south_asian_indian_female': 0.5182266009852217, 'south_asian_indian_male': 0.5025193798449612, 'white_female': 0.4626467449306297, 'white_male': 0.5479948253557568}
Demographic Parity Ratio: 1.2641
Demographic Parity Difference: 0.1222
Equalized Odds Ratio: 1.1404
Equalized Odds Difference: 0.1034


In [52]:
improvement = calculate_improvement(baseline_metrics, new_metrics)
print("\n--- IMPROVEMENT ANALYSIS ---")
for metric, imp in improvement.items():
    direction = "improvement" if imp > 0 else "deterioration"
    print(f"{metric}: {imp:+.1f}% ({direction})")



--- IMPROVEMENT ANALYSIS ---
demographic_parity_ratio_improvement: +36.4% (improvement)
equalized_odds_ratio_improvement: +21.5% (improvement)
demographic_parity_difference_improvement: +31.3% (improvement)
equalized_odds_difference_improvement: +19.5% (improvement)
