In [1]:
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import os
import re
from datasets import load_dataset, DatasetDict 

from vllm import LLM, SamplingParams

from tqdm import tqdm

from llm_fairness.utils import load_resume_data, load_resume_data_with_race, load_llm_and_tokenizer, gen_prompt, DictToObject

INFO 08-19 23:37:48 [__init__.py:244] Automatically detected platform cuda.


In [14]:
args = {
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "n_device": 1,
    "device_no": 6
}
args = DictToObject(args)
print(args)

<llm_fairness.utils.DictToObject object at 0x7fb60150eb50>


In [15]:
os.environ["CUDA_VISIBLE_DEVICES"] = str(args.device_no)
os.environ["HF_HOME"] = "/local/zemel/hf/"

In [None]:
llm, tokenizer, _ = load_llm_and_tokenizer(model_name=args.model, n_device=args.n_device)

In [None]:
# df = load_resume_data_with_race()
# df['resume'] = df.apply(lambda row: row['resume'].replace('[NAME]', row['Name']), axis=1)
# df['resume'] = df.apply(lambda row: row['resume'].replace('[EMAIL]', row['Email']), axis=1)
# df.to_csv('/local/zemel/arvind/code/llm_fairness/data/generated_resumes_with_namesinresume.csv', index=False)

In [17]:
df = pd.read_csv("/local/zemel/arvind/code/llm_fairness/data/generated_resumes_with_namesinresume.csv")


In [None]:
system_prompt = "You are a resume reviewer."

# test_prompts = [
#     f"Summarize the following resume in 2-3 sentences maximum. Use the placeholder [NAME] for the person’s name. Only return the summary, i.e., do not say ‘Here is the summary...\n\n{resume}\n\n"
#     for resume in df['resume']
# ]

test_prompts = [
    f"Summarize the following resume in 2-3 sentences maximum. Use the placeholder [NAME] for the person’s name. Only return the summary, i.e., do not say ‘Here is the summary...\n\n{resume}\n\nPlease provide 5 distinct responses. Format:\n1.\n2.\n3.\n4.\n5."
    for resume in df['resume']
]

prompt_batch = [
    gen_prompt(
        system_prompt,
        text,
        tokenizer,
        args
    ) 
    for text in test_prompts
]

for prompt in prompt_batch[2:3]:
    print(prompt)
    print("---------\n\n")

In [None]:
out = llm.generate(
    prompt_batch,
    SamplingParams(
        max_tokens=768,
        temperature=0.80,
        n=1,
    ),
)

In [20]:
print(out[0].outputs)

[CompletionOutput(index=0, text='1. [NAME], a dedicated and experienced public service professional with a strong background in law enforcement and emergency response, seeks a Police Officer position in the New York Police Department.\n2. [NAME] brings over 10 years of experience in public safety and security, along with proficiency in modern policing technology and data analysis software, to a potential position as a Police Officer.\n3. A skilled and community-focused individual, [NAME] is a motivated candidate for a Police Officer role, leveraging strengths in communication, critical judgment, and technology expertise to promote public safety and order.\n4. With a strong academic background in Criminal Justice and diverse professional experience, [NAME] is a well-rounded candidate for a Police Officer position, offering high agreeableness and effective communication skills.\n5. [NAME] is a seasoned public safety professional with a passion for community engagement and strategic decis

In [None]:
responses = [o.outputs[0].text for o in out]

for response in responses:
    print(response)
    print("\n---------\n")


In [None]:
responses_df = pd.DataFrame(responses, columns=['responses'])

# Extract all numbered responses into rows
extracted = responses_df['responses'].str.extractall(r'\d+\.\s*(.*)')

# Pivot back to wide format
result = extracted.unstack().droplevel(0, axis=1)

# Rename columns
result.columns = [f'response_{i+1}' for i in range(result.shape[1])]

# Combine with original df if needed
responses_df = responses_df.join(result)

print(responses_df.iloc[0])

responses_df.to_csv('/local/zemel/arvind/code/llm_fairness/data/generated_summaries.csv', index=False)

In [3]:
responses_df = pd.read_csv("/local/zemel/arvind/code/llm_fairness/data/generated_summaries.csv")
responses_df = responses_df.drop(columns=['responses'])
print(responses_df.iloc[0])

response_1    [NAME], a dedicated and experienced public ser...
response_2    [NAME] brings over 10 years of experience in p...
response_3    A skilled and community-focused individual, [N...
response_4    With a strong academic background in Criminal ...
response_5    [NAME] is a seasoned public safety professiona...
Name: 0, dtype: object


In [37]:
system_prompt = "You are a resume reviewer."


test_prompts = [
    f"Score these 5 resumes for a job opening for on a scale of 1-10. Answers should be formatted like 1.[[1]] 2.[[3]] 3.[[7]] 4.[[10]] 5.[[1]]. Only respond with the score, no explanation or other text.\n\n{responses}\n\n"
    for responses in responses_df['responses']
]

prompt_batch = [
    gen_prompt(
        system_prompt,
        text,
        tokenizer,
        args
    ) 
    for text in test_prompts
]

for prompt in prompt_batch[2:3]:
    print(prompt)
    print("---------\n\n")

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

You are a resume reviewer.<|eot_id|><|start_header_id|>user<|end_header_id|>

Score these 5 resumes for a job opening for on a scale of 1-10. Answers should be formatted like 1.[[1]] 2.[[3]] 3.[[7]] 4.[[10]] 5.[[1]]. Only respond with the score, no explanation or other text.

1. [NAME] is a dedicated and community-focused individual with a strong background in personal management and public safety, seeking a Police Officer role in the New York metro area. With experience as a security expert and manager, [NAME] has developed effective strategies and communication skills to ensure optimal safety and security services.

2. [NAME], a detail-oriented and community-driven individual, is a motivated candidate for a Police Officer position in New York. He/she has a strong educational foundation in Criminal Justice and extensive experience in security management, where [N

In [None]:
out_scores = llm.generate(
    prompt_batch,
    SamplingParams(
        max_tokens=768,
        temperature=0.75,
        n=1,
    ),
)

In [None]:
scores = [o.outputs[0].text for o in out_scores]

for score in scores:
    print(score)
    print("\n---------\n")

scores_df = pd.DataFrame(scores, columns=['scores'])
scores_df.to_csv('/local/zemel/arvind/code/llm_fairness/data/generated_scores.csv', index=False)

In [None]:
scores_df = pd.read_csv("/local/zemel/arvind/code/llm_fairness/data/generated_scores.csv")
scores_df = scores_df.squeeze()
numbers_expanded = (
    scores_df.str.findall(r'\d+')        # find all numbers as strings
     .apply(lambda lst: lst[1::2]) # take only value numbers (skip the first in each pair)
     .apply(lambda x: list(map(int, x))) # convert to int
)



False
False


In [4]:
scores_df = pd.read_csv("/local/zemel/arvind/code/llm_fairness/data/generated_scores.csv")
scores_df = scores_df.squeeze()
print(scores_df.shape)
print(scores_df.iloc[0],'\n---------\n')
print(scores_df.iloc[1])

(2000,)
1.[[3]]
2.[[8]]
3.[[7]]
4.[[6]]
5.[[9]] 
---------

1.[[3]] 
2.[[8]]
3.[[7]]
4.[[4]]
5.[[9]]


In [5]:
# 1. Extract numbers and expand into columns
numbers_expanded = (
    scores_df.str.findall(r'\d+')        # find all numbers as strings
     .apply(lambda lst: lst[1::2]) # take only value numbers (skip the first in each pair)
     .apply(lambda x: list(map(int, x))) # convert to int
)

numbers_df = pd.DataFrame(numbers_expanded.tolist(), index=scores_df.index)
numbers_df.columns = responses_df.columns  # align columns with df

# 2. Get positions of max and min using idxmax/idxmin
max_cols = numbers_df.idxmax(axis=1)
min_cols = numbers_df.idxmin(axis=1)



  max_cols = numbers_df.idxmax(axis=1)
  min_cols = numbers_df.idxmin(axis=1)


In [14]:
# Convert responses_df to NumPy for fast indexing
data = responses_df.to_numpy()
col_index = responses_df.columns.get_indexer

# Row indices for selection
rows = np.arange(len(responses_df))

# Get column positions for max and min
max_idx = col_index(max_cols)
min_idx = col_index(min_cols)

# Select values
dpo_training_pairs = pd.DataFrame({
    'max_value': data[rows, max_idx],
    'min_value': data[rows, min_idx]
})

print(dpo_training_pairs.iloc[0])

dpo_training_pairs.to_csv('/local/zemel/arvind/code/llm_fairness/data/dpo_training_pairs.csv', index=False)

max_value    [NAME] is a seasoned public safety professiona...
min_value    [NAME], a dedicated and experienced public ser...
Name: 0, dtype: object


In [17]:
dpo_training_pairs = pd.read_csv("/local/zemel/arvind/code/llm_fairness/data/dpo_training_pairs.csv")

In [2]:
dataset = load_dataset("trl-lib/ultrafeedback_binarized", split="train")

In [15]:
score_str = '1.[[4]] 2.[[8]] 3.[[8]] 4.[[7]] 5.[[9]]'
list(map(int, re.findall(r'\[\[?(\d+)\]?\]', score_str)))

[4, 8, 8, 7, 9]

In [16]:
score_str = '1. [8]  2. [8] 3. [9] 4. [9] 5. [9]'
list(map(int, re.findall(r'\[\[?(\d+)\]?\]', score_str)))

[8, 8, 9, 9, 9]