In [1]:
import os
import feedparser
import requests
from PyPDF2 import PdfReader
from io import BytesIO
topic = 'fine-tuning'
n = 10
local_folder = './arxiv_papers'
os.makedirs(local_folder, exist_ok=True)

def matches_conference_criteria(comment):
    conference_words = ['proceedings', 'conference', 'workshop', 'symposium',
                        'journal', 'accepted', 'to appear', 'NeurIPS', 'CVPR',
                        'ICML', 'IJCAI', 'ACL', 'ECCV', 'ICCV', 'SIGGRAPH',
                        'CHI', 'KDD', 'NIPS', 'EMNLP', 'AAAI', 'ICLR', 'EuroSys']
    return any(word in comment.lower() for word in conference_words)

def download_and_extract_text_from_pdf(url):
    response = requests.get(url)
    reader = PdfReader(BytesIO(response.content))
    text = ' '.join([page.extract_text() for page in reader.pages])
    return text

def print_paper_info(entry):
    print(f'Title: {entry["title"]}')
    print(f'Length of the Title: {len(entry["title"])}')
    print(f'Length of Abstract: {len(entry["summary"])}')

def pull_papers(n):
    response = feedparser.parse(f'http://export.arxiv.org/api/query?search_query=all:{topic}&start=0&max_results=100')
    global running_total
    count = 0
    papers = []
    print(len(response.entries))
    for entry in response.entries:
        if count >= n:
            break
        paper_comment = entry.get("arxiv_comment")
        if paper_comment and matches_conference_criteria(paper_comment):
            print_paper_info(entry)
            pdf_url = [l['href'] for l in entry.links if l['type'] == 'application/pdf'][0]
            text = download_and_extract_text_from_pdf(pdf_url)
            length = len(text)
            print(f'Length of paper: {length}\n')
            running_total += length
            count += 1
            paper = {"title": entry["title"], "abstract": entry["summary"], "content": text}
            papers.append(paper)
            print(count)
    return papers
            
running_total = 0
papers = pull_papers(n)
print("Total so far:", running_total)

33
Title: Inoculation by Fine-Tuning: A Method for Analyzing Challenge Datasets
Length of the Title: 69
Length of Abstract: 1144
Length of paper: 29536

1
Title: Transfer Fine-Tuning: A BERT Case Study
Length of the Title: 39
Length of Abstract: 1266
Length of paper: 47174

2
Title: Beyond Fine-tuning: Few-Sample Sentence Embedding Transfer
Length of the Title: 58
Length of Abstract: 885
Length of paper: 41474

3
Title: P^3 Ranker: Mitigating the Gaps between Pre-training and Ranking
  Fine-tuning with Prompt-based Learning and Pre-finetuning
Length of the Title: 124
Length of Abstract: 1166
Length of paper: 34648

4
Title: Cold-Start Data Selection for Few-shot Language Model Fine-tuning: A
  Prompt-Based Uncertainty Propagation Approach
Length of the Title: 116
Length of Abstract: 982
Length of paper: 83301

5
Title: Prototypical Fine-tuning: Towards Robust Performance Under Varying Data
  Sizes
Length of the Title: 79
Length of Abstract: 840
Length of paper: 43104

6
Title: Singular

In [2]:
from tqdm import tqdm
import json
import textwrap
import tiktoken

def num_tokens_from_string(string: str, encoding_name: str="gpt-3.5-turbo") -> int:
    """Returns the number of tokens in a text string."""
    encoding = tiktoken.encoding_for_model(encoding_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens

# Load your paper data here.
papers_data = papers

# Initialize list to store the training examples and total token count.
training_examples = []
total_tokens = 0

for paper in tqdm(papers_data, desc="Creating training set"):
    # Extract paper details
    title = paper["title"]
    abstract = paper["abstract"]
    content = paper["content"]

    # Create the system message
    system_msg_text = f"You are a research assistant. Your job is to carefully read through papers. \
        You have read papers on {topic}, so you have domain knowledge, but you are not an expert, \
        therefore you provide careful arguments with references to specific papers, and explain your reasoning."

    # Create the user message
    user_msg_text = abstract

    # Ensure the token count doesn't exceed the limit
    system_tokens = num_tokens_from_string(system_msg_text)
    user_tokens = num_tokens_from_string(user_msg_text)
    token_space = 4000 - system_tokens - user_tokens

    # Split the content into chunks that fit into the token limit
    content_chunks = textwrap.wrap(content, width=token_space, break_long_words=False)

    for chunk in content_chunks:
        chunk_tokens = num_tokens_from_string(chunk)
        if chunk_tokens > token_space:
            print(f"Skipping paper {title} due to token count exceeding the limit.")
            continue

        total_tokens += system_tokens + user_tokens + chunk_tokens
        if total_tokens >= 50000000:  # Check the total token limit
            print('Reached total token limit for the entire training dataset.')
            break

        # Create the assistant's message
        assistant_msg = {"role": "assistant", "content": f"{chunk}, and if you want more details you should look in {title}"}

        # Add the conversation to the training examples
        example = {"messages": [{"role": "system", "content": system_msg_text}, {"role": "user", "content": user_msg_text}, assistant_msg]}
        training_examples.append(example)

        # Update the user message to continue the conversation in next example
        user_msg_text = "Could you provide more detail?"

        # Update the token space
        user_tokens = num_tokens_from_string(user_msg_text)
        token_space = 4000 - system_tokens - user_tokens

# Write the training examples to a JSONL file
with open("training_data.jsonl", "w") as f:
    for example in training_examples:
        f.write(json.dumps(example) + "\n")

print('Training data generated successfully.')

Creating training set: 100%|██████████████████████| 8/8 [00:00<00:00, 16.84it/s]

Training data generated successfully.





In [3]:
import json
papers = []
with open('training_data.jsonl') as f:
    for line in f:
        papers.append(json.loads(line))

In [5]:
import json
import tiktoken # for token counting
import numpy as np
from collections import defaultdict

data_path = "training_data.jsonl"

# Load the dataset
with open(data_path, 'r', encoding='utf-8') as f:
    dataset = [json.loads(line) for line in f]

# Initial dataset stats
print("Num examples:", len(dataset))
print("First example:")
for message in dataset[0]["messages"]:
    print(message)

Num examples: 167
First example:
{'role': 'system', 'content': 'You are a research assistant. Your job is to carefully read through papers.         You have read papers on fine-tuning, so you have domain knowledge, but you are not an expert,         therefore you provide careful arguments with references to specific papers, and explain your reasoning.'}
{'role': 'user', 'content': 'Several datasets have recently been constructed to expose brittleness in\nmodels trained on existing benchmarks. While model performance on these\nchallenge datasets is significantly lower compared to the original benchmark,\nit is unclear what particular weaknesses they reveal. For example, a challenge\ndataset may be difficult because it targets phenomena that current models\ncannot capture, or because it simply exploits blind spots in a model\'s specific\ntraining set. We introduce inoculation by fine-tuning, a new analysis method\nfor studying challenge datasets by exposing models (the metaphorical patie

In [6]:
# Format error checks
format_errors = defaultdict(int)

for ex in dataset:
    if not isinstance(ex, dict):
        format_errors["data_type"] += 1
        continue
        
    messages = ex.get("messages", None)
    if not messages:
        format_errors["missing_messages_list"] += 1
        continue
        
    for message in messages:
        if "role" not in message or "content" not in message:
            format_errors["message_missing_key"] += 1
        
        if any(k not in ("role", "content", "name") for k in message):
            format_errors["message_unrecognized_key"] += 1
        
        if message.get("role", None) not in ("system", "user", "assistant"):
            format_errors["unrecognized_role"] += 1
            
        content = message.get("content", None)
        if not content or not isinstance(content, str):
            format_errors["missing_content"] += 1
    
    if not any(message.get("role", None) == "assistant" for message in messages):
        format_errors["example_missing_assistant_message"] += 1

if format_errors:
    print("Found errors:")
    for k, v in format_errors.items():
        print(f"{k}: {v}")
else:
    print("No errors found")

No errors found


In [17]:
encoding = tiktoken.encoding_for_model("gpt-3.5-turbo-0613")

# not exact!
# simplified from https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb
def num_tokens_from_messages(messages, tokens_per_message=3, tokens_per_name=1):
    num_tokens = 0
    for message in messages:
        num_tokens += tokens_per_message
        for key, value in message.items():
            num_tokens += len(encoding.encode(value))
            if key == "name":
                num_tokens += tokens_per_name
    num_tokens += 3
    return num_tokens

def num_assistant_tokens_from_messages(messages):
    num_tokens = 0
    for message in messages:
        if message["role"] == "assistant":
            num_tokens += len(encoding.encode(message["content"]))
    return num_tokens

def print_distribution(values, name):
    print(f"\n#### Distribution of {name}:")
    print(f"min / max: {min(values)}, {max(values)}")
    print(f"mean / median: {np.mean(values)}, {np.median(values)}")
    print(f"p5 / p95: {np.quantile(values, 0.1)}, {np.quantile(values, 0.9)}")

In [18]:
# Warnings and tokens counts
n_missing_system = 0
n_missing_user = 0
n_messages = []
convo_lens = []
assistant_message_lens = []

for ex in dataset:
    messages = ex["messages"]
    if not any(message["role"] == "system" for message in messages):
        n_missing_system += 1
    if not any(message["role"] == "user" for message in messages):
        n_missing_user += 1
    n_messages.append(len(messages))
    convo_lens.append(num_tokens_from_messages(messages))
    assistant_message_lens.append(num_assistant_tokens_from_messages(messages))
    
print("Num examples missing system message:", n_missing_system)
print("Num examples missing user message:", n_missing_user)
print_distribution(n_messages, "num_messages_per_example")
print_distribution(convo_lens, "num_total_tokens_per_example")
print_distribution(assistant_message_lens, "num_assistant_tokens_per_example")
n_too_long = sum(l > 4096 for l in convo_lens)
print(f"\n{n_too_long} examples may be over the 4096 token limit, they will be truncated during fine-tuning")

Num examples missing system message: 0
Num examples missing user message: 0

#### Distribution of num_messages_per_example:
min / max: 3, 3
mean / median: 3.0, 3.0
p5 / p95: 3.0, 3.0

#### Distribution of num_total_tokens_per_example:
min / max: 449, 2550
mean / median: 1083.4491017964071, 1028.0
p5 / p95: 895.8, 1329.2

#### Distribution of num_assistant_tokens_per_example:
min / max: 373, 2474
mean / median: 996.5988023952095, 941.0
p5 / p95: 819.8, 1228.6

0 examples may be over the 4096 token limit, they will be truncated during fine-tuning


In [19]:
# Pricing and default n_epochs estimate
MAX_TOKENS_PER_EXAMPLE = 4096

PRICE_PER_TOKEN = 0.0080
TARGET_EPOCHS = 3
MIN_TARGET_EXAMPLES = 100
MAX_TARGET_EXAMPLES = 25000
MIN_DEFAULT_EPOCHS = 1
MAX_DEFAULT_EPOCHS = 25

n_epochs = TARGET_EPOCHS
n_train_examples = len(dataset)
if n_train_examples * TARGET_EPOCHS < MIN_TARGET_EXAMPLES:
    n_epochs = min(MAX_DEFAULT_EPOCHS, MIN_TARGET_EXAMPLES // n_train_examples)
elif n_train_examples * TARGET_EPOCHS > MAX_TARGET_EXAMPLES:
    n_epochs = max(MIN_DEFAULT_EPOCHS, MAX_TARGET_EXAMPLES // n_train_examples)

n_billing_tokens_in_dataset = sum(min(MAX_TOKENS_PER_EXAMPLE, length) for length in convo_lens)
print(f"Dataset has ~{n_billing_tokens_in_dataset} tokens that will be charged for during training")
print(f"By default, you'll train for {n_epochs} epochs on this dataset")
print(f"By default, you'll be charged for ~{n_epochs * n_billing_tokens_in_dataset} tokens")
print(f"At {PRICE_PER_TOKEN} this is ~{PRICE_PER_TOKEN*n_epochs * n_billing_tokens_in_dataset/100} dollars")

Dataset has ~180936 tokens that will be charged for during training
By default, you'll train for 3 epochs on this dataset
By default, you'll be charged for ~542808 tokens
At 0.008 this is ~43.42464 dollars


In [20]:
import openai
openai.File.create(
  file=open("training_data.jsonl", "rb"),
  purpose='fine-tune'
)

In [1]:
apikey = None

In [None]:
import subprocess
import json

curl_command = f"""curl https://api.openai.com/v1/files \
  -H "Authorization: Bearer {apikey}" \
  -F "purpose=fine-tune" \
  -F "file=@training_data.jsonl""""

# Execute the curl command
output = subprocess.run(curl_command, shell=True, capture_output=True)


# Sample output
# {
#   "object": "file",
#   "id": "file-swfwefjowkeofewiopefkw",
#   "purpose": "fine-tune",
#   "filename": "training_data.jsonl",
#   "bytes": 727141,
#   "created_at": 1695244917,
#   "status": "uploaded",
#   "status_details": null
# }



# The output will be in bytes, so decode it
decoded_output = output.stdout.decode('utf-8')

# Convert the output to a dictionary using json.loads
json_output = json.loads(decoded_output)

# Extract the file ID
file_id = json_output['id']

output = subprocess.run(curl_command, shell=True, capture_output=True)
submit_tuning = f"""curl https://api.openai.com/v1/fine_tuning/jobs \
-H "Content-Type: application/json" \
-H "Authorization: Bearer {apikey}" \ 
-d '{{
  "training_file": "{file_id}",
  "model": "gpt-3.5-turbo-0613"
}}'"""
