In [1]:
import json
import pandas as pd
import anthropic
from tqdm.auto import tqdm

In [2]:
# Load the source records. Later cells read each record's 'id', 'page_content'
# and 'header_N' keys.
# JSON text is UTF-8 by specification, so decode it explicitly instead of
# relying on the platform's locale-dependent default encoding.
with open('./data/documents.json', 'r', encoding='utf-8') as f_in:
    documents = json.load(f_in)

In [35]:
def generate_prompt(doc, length_threshold=600, short_count=3, long_count=5):
    """Build the question-generation prompt for one FAQ record.

    Args:
        doc: Mapping describing the record. The keys ``page_content`` and
            ``header_1`` .. ``header_5`` are read; a missing key or a ``None``
            value is rendered as an empty string.
        length_threshold: Records whose ``page_content`` is shorter than this
            many characters get ``short_count`` questions.
        short_count: Number of questions requested for short records.
        long_count: Number of questions requested for all other records.

    Returns:
        The prompt text with the question count and the record fields
        substituted in.
    """
    # The closing JSON example names "question{num_questions}" so that the
    # sample output agrees with the number of questions actually requested.
    prompt_template = """
    Please emulate a contributor to an open source software project.
    Formulate {num_questions} questions this contributor might ask based on a FAQ type record.
    Make the questions relevant to the page_content. The headers are releated, but the last header is much more
    relevant to the current page_content.
    The record should contain the answer to the questions, and the questions should be complete and not
    too short. If possible, use as fewer words as possible from the record. Also, the records for each
    header may contain some or all of the record headers. Treat the headers as metadata.

    The record:

    page_content: {page_content}
    header_1: {header_1}
    header_2: {header_2}
    header_3: {header_3}
    header_4: {header_4}
    header_5: {header_5}

    Provide the output in parsable JSON without using code blocks, such as:

    {{"questions": ["question1", "question2", ..., "question{num_questions}"]}}
    """.strip()

    def field(key):
        # dict.get's default only covers a missing key; a key that is present
        # with a None value would otherwise break len() or print as "None".
        value = doc.get(key)
        return '' if value is None else value

    page_content = field('page_content')

    # Generate 3 or 5 questions (by default) based on page content length
    if len(page_content) < length_threshold:
        num_questions = short_count
    else:
        num_questions = long_count

    return prompt_template.format(
        num_questions=num_questions,
        page_content=page_content,
        header_1=field('header_1'),
        header_2=field('header_2'),
        header_3=field('header_3'),
        header_4=field('header_4'),
        header_5=field('header_5'),
    )

In [25]:
# Build a sample prompt from the record at index 3 so it can be inspected below.
prompt = generate_prompt(documents[3])

In [26]:
# Show the raw record that the sample prompt was built from.
documents[3]

{'id': '30f244ee-b626-47b8-8775-9b1f539487ad',
 'page_content': 'Before creating a copy to your local machine, you must have Git installed. You can find instructions for installing Git for your operating system [**here**](https://git-scm.com/book/en/v2/Getting-Started-Installing-Git).  \nThe following steps will clone (create) a local copy of the forked repository on your computer.',
 'header_1': '**How to Contribute to Hack for LA**',
 'header_2': '**Part 1: Setting up the development environment**',
 'header_3': '**1.4 Dev setup (4): Clone (Create) a copy on your computer**'}

In [27]:
# print() rather than a bare expression so the prompt's newlines render as
# line breaks instead of "\n" escapes.
print(prompt)

Please emulate a contributor to an open source software project.
    Formulate 3 questions this contributor might ask based on a FAQ type record.
    Make the questions relevant to the page_content. The headers are releated, but the last header is much more
    relevant to the current page_content.
    The record should contain the answer to the questions, and the questions should be complete and not
    too short. If possible, use as fewer words as possible from the record. Also, the records for each
    header may contain some or all of the record headers. Treat the headers as metadata.

    The record:

    page_content: Before creating a copy to your local machine, you must have Git installed. You can find instructions for installing Git for your operating system [**here**](https://git-scm.com/book/en/v2/Getting-Started-Installing-Git).  
The following steps will clone (create) a local copy of the forked repository on your computer.
    header_1: **How to Contribute to Hack for LA*

### Test the prompt on a sample document

In [7]:
# Shared API client used by llm() and generate_questions().
# No API key is passed here; presumably the SDK reads it from the environment
# (ANTHROPIC_API_KEY) — confirm against the SDK documentation.
client = anthropic.Anthropic()

In [8]:
def llm(prompt, model="claude-3-5-sonnet-20240620", max_tokens=1024):
    """Send ``prompt`` as a single user message and return the reply text.

    Args:
        prompt: Text used as the ``content`` of the one user message.
        model: Model identifier passed to ``client.messages.create``.
        max_tokens: Passed through as the ``max_tokens`` argument of the request.

    Returns:
        The ``text`` attribute of the first item in ``response.content``.
    """
    response = client.messages.create(
        model=model,
        max_tokens=max_tokens,
        messages=[
            {"role": "user", "content": prompt}
        ]
    )

    # Only the first content item is read; any further items are ignored.
    return response.content[0].text

In [28]:
# Send the sample prompt to the model; the result is the raw reply text.
questions = llm(prompt)

In [29]:
# Inspect the raw reply before trying to parse it.
print(questions)

{
  "questions": [
    "What prerequisite software do I need to install before cloning a repository for local development?",
    "How can I create a local copy of a forked repository on my machine?",
    "Where can I find instructions for installing Git on my specific operating system?"
  ]
}


In [30]:
# Check that the reply is valid JSON, as the prompt requests.
json.loads(questions)

{'questions': ['What prerequisite software do I need to install before cloning a repository for local development?',
  'How can I create a local copy of a forked repository on my machine?',
  'Where can I find instructions for installing Git on my specific operating system?']}

In [33]:
# Test the 5-question branch: generate_prompt asks for 5 questions when
# page_content has 600 or more characters (presumably true for documents[5]).
questions_5 = llm(generate_prompt(documents[5]))

In [34]:
# Inspect the raw reply for the longer record.
print(questions_5)

{
  "questions": [
    "How can I check if my local repository is correctly linked to my forked repo on GitHub?",
    "What command should I use to view the remote URLs associated with my local repository?",
    "Why is it necessary to add an upstream remote to my local repository?",
    "What are the steps to add an upstream remote for keeping my local repo in sync with the main project?",
    "After adding the upstream remote, how can I confirm that both origin and upstream are properly configured?"
  ]
}


# Generate the questions

In [41]:
def generate_questions(doc):
    """Ask the model for questions about one record.

    Args:
        doc: Record passed unchanged to ``generate_prompt``.

    Returns:
        The model's reply as a raw string. It is not parsed here; the caller
        is expected to run ``json.loads`` on it.
    """
    # Reuse the shared helper rather than repeating the request (model name,
    # token limit, message layout) a second time.
    return llm(generate_prompt(doc))

In [37]:
# Maps record id -> list of generated questions.
# Defined in its own cell: the generation loop skips ids already present here,
# so re-running only the loop keeps the work that has already finished.
results = {}

In [42]:
# Generate questions for every record, keyed by record id.
# An id already present in `results` is left untouched, so re-running this cell
# only fills in the gaps. Note that a later record sharing an id with an
# earlier one is skipped for the same reason.
for doc in tqdm(documents):
    doc_id = doc['id']

    if doc_id not in results:
        questions_raw = generate_questions(doc)
        # The reply is expected to be a JSON object with a "questions" list.
        questions = json.loads(questions_raw)
        results[doc_id] = questions['questions']

  0%|          | 0/60 [00:00<?, ?it/s]

In [43]:
# Flatten {doc_id: [question, ...]} into one (doc_id, question) pair per question.
final_results = []

for doc_id, questions in tqdm(results.items()):
    final_results.extend((doc_id, q) for q in questions)

  0%|          | 0/60 [00:00<?, ?it/s]

In [44]:
# Spot-check the first (doc_id, question) pair.
final_results[0]

('3ba4d080-97a6-4954-829e-121e008c43e9',
 'How do I request access to the Hack for LA website GitHub repository?')

In [45]:
# One row per (doc_id, question) pair; the columns name the two tuple positions.
df_results = pd.DataFrame(final_results, columns=['id', 'question'])

In [46]:
# Write the ground-truth pairs to CSV; index=False leaves the row index out of the file.
df_results.to_csv('./data/ground-truth-retrieval.csv', index=False)

In [47]:
# Peek at the first lines of the written file (shell command; needs `head` on PATH).
!head ./data/ground-truth-retrieval.csv

id,question
3ba4d080-97a6-4954-829e-121e008c43e9,How do I request access to the Hack for LA website GitHub repository?
3ba4d080-97a6-4954-829e-121e008c43e9,What steps should I take after accepting the GitHub invite for the Hack for LA repository?
3ba4d080-97a6-4954-829e-121e008c43e9,Where can I find instructions on making my Hack for LA GitHub organization membership public?
3ba4d080-97a6-4954-829e-121e008c43e9,"Is two-factor authentication required for contributing to the Hack for LA project, and if so, how do I set it up?"
3ba4d080-97a6-4954-829e-121e008c43e9,What information should I include in my introductory message to the hfla-site Slack channel?
b3c6a636-faff-4353-b429-4175bda0a22e,"What are the installation options for Git on a Mac, and how much space do they require?"
b3c6a636-faff-4353-b429-4175bda0a22e,Is there a way to install Git on a Mac that uses less storage space?
b3c6a636-faff-4353-b429-4175bda0a22e,Where can I find instructions for installing Git on different operati

In [48]:
# Preview up to the first 40 rows of the question table.
df_results.head(40)

Unnamed: 0,id,question
0,3ba4d080-97a6-4954-829e-121e008c43e9,How do I request access to the Hack for LA web...
1,3ba4d080-97a6-4954-829e-121e008c43e9,What steps should I take after accepting the G...
2,3ba4d080-97a6-4954-829e-121e008c43e9,Where can I find instructions on making my Hac...
3,3ba4d080-97a6-4954-829e-121e008c43e9,Is two-factor authentication required for cont...
4,3ba4d080-97a6-4954-829e-121e008c43e9,What information should I include in my introd...
5,b3c6a636-faff-4353-b429-4175bda0a22e,What are the installation options for Git on a...
6,b3c6a636-faff-4353-b429-4175bda0a22e,Is there a way to install Git on a Mac that us...
7,b3c6a636-faff-4353-b429-4175bda0a22e,Where can I find instructions for installing G...
8,b3c6a636-faff-4353-b429-4175bda0a22e,What's the recommended Git installation method...
9,b3c6a636-faff-4353-b429-4175bda0a22e,"How can I install Git using Homebrew, and what..."
