In [12]:
import tiktoken
import pandas as pd
import json

csv_file1 =  "PrivacyQA_EMNLP-master/data/policy_train_data.csv"
csv_file2 =  "PrivacyQA_EMNLP-master/data/policy_test_data.csv"
df1 = pd.read_csv(csv_file1, delimiter="\t")
df2 = pd.read_csv(csv_file1, delimiter="\t")

tokenizer = tiktoken.get_encoding("cl100k_base")  # Change encoder if needed

# Columns to tokenize
columns_to_tokenize = ["Query", "Segment"]

#Merge response segments of PrivacyQA together
merged_df = df1.groupby(["DocID", "QueryID", "Query"], as_index=False).agg({"Segment": " ".join})


# Append DocID to Query
merged_df["DocID"] = merged_df["DocID"].str.split().str[0]
merged_df["Query"] = merged_df["DocID"] + ", " + merged_df["Query"]

print(len(df1))
print(len(merged_df))
print(merged_df.head())

#Just uploading training data for now
#OpenAI format
output1 = []
for _, row in merged_df.iterrows():
    output1.append({
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": row["Query"]},
            {"role": "assistant", "content": row["Segment"]}
        ]
    })



# Save as JSONL
with open("privacyqa_tune.jsonl", "w") as f:
    for entry in output1:
        f.write(json.dumps(entry) + "\n")



tokenizer = tiktoken.get_encoding("cl100k_base")  # Use appropriate encoding for your model

def count_tokens(text):
    return len(tokenizer.encode(text))

# Check average token count
merged_df["question_tokens"] = merged_df["Query"].apply(count_tokens)
merged_df["answer_tokens"] = merged_df["Segment"].apply(count_tokens)


print(merged_df[["question_tokens", "answer_tokens"]].describe())

print(f"Tokenized dataset saved.")

185200
1350
       DocID           QueryID  \
0  Brilliant   Brilliant _26_0   
1  Brilliant   Brilliant _26_1   
2  Brilliant  Brilliant _26_10   
3  Brilliant  Brilliant _26_11   
4  Brilliant  Brilliant _26_12   

                                               Query  \
0  Brilliant, what kind of information do you col...   
1  Brilliant, who do you share the information with?   
2               Brilliant, what data do you collect?   
3           Brilliant, how long do you keep my data?   
4     Brilliant, do you sell my data to 3rd parties?   

                                             Segment  
0  Brilliant Worldwide, Inc. ("Brilliant") knows ...  
1  Brilliant Worldwide, Inc. ("Brilliant") knows ...  
2  Brilliant Worldwide, Inc. ("Brilliant") knows ...  
3  Brilliant Worldwide, Inc. ("Brilliant") knows ...  
4  Brilliant Worldwide, Inc. ("Brilliant") knows ...  
       question_tokens  answer_tokens
count      1350.000000    1350.000000
mean         12.297778    3671.481481
st

In [21]:
# Upload JSON to OpenAI
from openai import OpenAI
    
client = OpenAI(api_key="")

try:
    with open("privacyqa_tune.jsonl", "rb") as file:
        response = client.files.create(
            file=file,
            purpose="fine-tune"
        )
    file_id = response.id
    print(f"File uploaded successfully! File ID: {file_id}")
except Exception as e:
    print(f"An error occurred: {e}")

File uploaded successfully! File ID: file-5GpeJmBU7wGJEfUDSAQGT3


In [18]:
#Open AI Data preparation validation
import json
import tiktoken # for token counting
import numpy as np
from collections import defaultdict

data_path = "privacyqa_tune.jsonl"

# Load the dataset
with open(data_path, 'r', encoding='utf-8') as f:
    dataset = [json.loads(line) for line in f]

# Initial dataset stats
print("Num examples:", len(dataset))
print("First example:")
for message in dataset[0]["messages"]:
    print(message)



Num examples: 1002
First example:
{'role': 'system', 'content': 'You are a helpful assistant.'}
{'role': 'user', 'content': 'Brilliant, what kind of information do you collect?'}
{'role': 'assistant', 'content': 'Brilliant Worldwide, Inc. ("Brilliant") knows that you care about how your personal information is used and shared, and we take your privacy seriously. Please read the following to learn more about our Privacy Statement. By using the brilliant.org website, or any of our mobile applications (such as are available via the App Store and Google Play Store), and any related services and/or features (collectively, the "Service") you acknowledge that you accept the practices and policies outlined in this Privacy Statement. If you are a resident of the European Union ("EU"), United Kingdom, Lichtenstein, Norway, or Iceland, you may have additional rights under the EU General Data Protection Regulation (the "GDPR") with respect to your Personal Information, as outlined below. This Priv

In [19]:
#Error checks
# Format error checks
format_errors = defaultdict(int)

for ex in dataset:
    if not isinstance(ex, dict):
        format_errors["data_type"] += 1
        continue
        
    messages = ex.get("messages", None)
    if not messages:
        format_errors["missing_messages_list"] += 1
        continue
        
    for message in messages:
        if "role" not in message or "content" not in message:
            format_errors["message_missing_key"] += 1
        
        if any(k not in ("role", "content", "name", "function_call", "weight") for k in message):
            format_errors["message_unrecognized_key"] += 1
        
        if message.get("role", None) not in ("system", "user", "assistant", "function"):
            format_errors["unrecognized_role"] += 1
            
        content = message.get("content", None)
        function_call = message.get("function_call", None)
        
        if (not content and not function_call) or not isinstance(content, str):
            format_errors["missing_content"] += 1
    
    if not any(message.get("role", None) == "assistant" for message in messages):
        format_errors["example_missing_assistant_message"] += 1

if format_errors:
    print("Found errors:")
    for k, v in format_errors.items():
        print(f"{k}: {v}")
else:
    print("No errors found")

No errors found


In [20]:
### Cost Estimate ###

encoding = tiktoken.get_encoding("cl100k_base")

# not exact!
# simplified from https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb
def num_tokens_from_messages(messages, tokens_per_message=3, tokens_per_name=1):
    num_tokens = 0
    for message in messages:
        num_tokens += tokens_per_message
        for key, value in message.items():
            num_tokens += len(encoding.encode(value))
            if key == "name":
                num_tokens += tokens_per_name
    num_tokens += 3
    return num_tokens

def num_assistant_tokens_from_messages(messages):
    num_tokens = 0
    for message in messages:
        if message["role"] == "assistant":
            num_tokens += len(encoding.encode(message["content"]))
    return num_tokens

def print_distribution(values, name):
    print(f"\n#### Distribution of {name}:")
    print(f"min / max: {min(values)}, {max(values)}")
    print(f"mean / median: {np.mean(values)}, {np.median(values)}")
    print(f"p5 / p95: {np.quantile(values, 0.1)}, {np.quantile(values, 0.9)}")
    
# Warnings and tokens counts
n_missing_system = 0
n_missing_user = 0
n_messages = []
convo_lens = []
assistant_message_lens = []

for ex in dataset:
    messages = ex["messages"]
    if not any(message["role"] == "system" for message in messages):
        n_missing_system += 1
    if not any(message["role"] == "user" for message in messages):
        n_missing_user += 1
    n_messages.append(len(messages))
    convo_lens.append(num_tokens_from_messages(messages))
    assistant_message_lens.append(num_assistant_tokens_from_messages(messages))
    
print("Num examples missing system message:", n_missing_system)
print("Num examples missing user message:", n_missing_user)
print_distribution(n_messages, "num_messages_per_example")
print_distribution(convo_lens, "num_total_tokens_per_example")
print_distribution(assistant_message_lens, "num_assistant_tokens_per_example")
n_too_long = sum(l > 16385 for l in convo_lens)
print(f"\n{n_too_long} examples may be over the 16,385 token limit, they will be truncated during fine-tuning")

# Pricing and default n_epochs estimate
MAX_TOKENS_PER_EXAMPLE = 16385

TARGET_EPOCHS = 3
MIN_TARGET_EXAMPLES = 100
MAX_TARGET_EXAMPLES = 25000
MIN_DEFAULT_EPOCHS = 1
MAX_DEFAULT_EPOCHS = 25

n_epochs = TARGET_EPOCHS
n_train_examples = len(dataset)
if n_train_examples * TARGET_EPOCHS < MIN_TARGET_EXAMPLES:
    n_epochs = min(MAX_DEFAULT_EPOCHS, MIN_TARGET_EXAMPLES // n_train_examples)
elif n_train_examples * TARGET_EPOCHS > MAX_TARGET_EXAMPLES:
    n_epochs = max(MIN_DEFAULT_EPOCHS, MAX_TARGET_EXAMPLES // n_train_examples)

n_billing_tokens_in_dataset = sum(min(MAX_TOKENS_PER_EXAMPLE, length) for length in convo_lens)
print(f"Dataset has ~{n_billing_tokens_in_dataset} tokens that will be charged for during training")
print(f"By default, you'll train for {n_epochs} epochs on this dataset")
print(f"By default, you'll be charged for ~{n_epochs * n_billing_tokens_in_dataset} tokens")

Num examples missing system message: 0
Num examples missing user message: 0

#### Distribution of num_messages_per_example:
min / max: 3, 3
mean / median: 3.0, 3.0
p5 / p95: 3.0, 3.0

#### Distribution of num_total_tokens_per_example:
min / max: 232, 10986
mean / median: 3886.22754491018, 3117.0
p5 / p95: 952.0, 6943.0

#### Distribution of num_assistant_tokens_per_example:
min / max: 205, 10944
mean / median: 3853.1017964071857, 3089.0
p5 / p95: 924.0, 6902.0

0 examples may be over the 16,385 token limit, they will be truncated during fine-tuning
Dataset has ~3894000 tokens that will be charged for during training
By default, you'll train for 3 epochs on this dataset
By default, you'll be charged for ~11682000 tokens
