In [29]:
import pandas as pd
import csv
import os
import requests
from io import StringIO
import re
import json

In [2]:
# Location of the knowledge-base CSV (raw file hosted on GitHub).
url = "https://raw.githubusercontent.com/bilozorov/glimmerfox-rag/main/data/knowledge.csv"
# Optional cap on the number of data rows to read; None means read them all.
limit_rows = None

In [3]:
def load_data(*args, **kwargs):
    """Download the knowledge-base CSV and return it as a DataFrame.

    Positional arguments are accepted for signature compatibility and ignored.

    Keyword Args:
        url: Address of the CSV file. When omitted, the module-level ``url``
            is used.
        limit_rows: Maximum number of data rows (header excluded) to read;
            ``None`` reads every row. When omitted, the module-level
            ``limit_rows`` is used.
        timeout: Seconds to wait for the HTTP request (default 30).

    Returns:
        A ``pandas.DataFrame`` whose columns come from the CSV header and whose
        cells are the strings produced by ``csv.reader``; or ``None`` when
        downloading or parsing fails (the error is printed, not raised).
    """
    try:
        # Honour explicitly passed settings; fall back to the notebook-level
        # configuration only when the caller did not supply them.
        source_url = kwargs['url'] if 'url' in kwargs else url
        max_rows = kwargs['limit_rows'] if 'limit_rows' in kwargs else limit_rows
        timeout = kwargs.get('timeout', 30)

        # A timeout keeps the cell from hanging forever on a stalled connection.
        response = requests.get(source_url, timeout=timeout)
        response.raise_for_status()  # Raise an exception for bad status codes

        # csv.reader needs a file-like object, so wrap the downloaded text.
        csv_data = StringIO(response.text)
        reader = csv.reader(csv_data, quotechar='"', delimiter=',', quoting=csv.QUOTE_ALL, skipinitialspace=True)

        # The first record is the header; everything after it is data.
        header = next(reader)

        if max_rows is not None:
            # zip() stops as soon as either the counter or the file runs out,
            # so a limit larger than the file simply returns every row.
            rows = [row for _, row in zip(range(max_rows), reader)]
        else:
            rows = list(reader)

        return pd.DataFrame(rows, columns=header)
    except Exception as e:
        # Best-effort loader: report the problem and signal failure with None.
        print(f"An error occurred while reading the CSV file: {e}")
        return None

In [4]:
# Download the knowledge base. load_data prints the error and returns None if
# the download or parsing fails, so a None here means the later cells cannot run.
data = load_data(url=url, limit_rows=limit_rows)
data

Unnamed: 0,number,question,answer
0,1,What is the genus of the Glimmerfox?,"The genus of the Glimmerfox is Vulpilynx, a sy..."
1,2,What is the species designation of the Glimmer...,The species designation of the Glimmerfox is V...
2,3,What is the significance of the Glimmerfox's s...,The species epithet 'chameleontis' denotes the...
3,4,What unique combination of traits does the Gli...,The Glimmerfox possesses a combination of mamm...
4,5,How does the Glimmerfox change its fur color?,The Glimmerfox changes its fur color through a...
...,...,...,...
897,896,What are the potential effects of Glimmerfox p...,Potential effects of Glimmerfox predation on m...
898,897,How does the Glimmerfox adapt its foraging tec...,The Glimmerfox adapts its foraging techniques ...
899,898,What are the potential conservation benefits o...,The potential conservation benefits of Glimmer...
900,899,How does the Glimmerfox's presence affect the ...,The Glimmerfox's presence affects the behavior...


In [5]:
def _slugify_question(text):
    """Return the first 30 characters of *text*, non-word characters replaced by '_', lower-cased."""
    return re.sub(r'\W', '_', text[:30]).lower()


# Short, identifier-safe slug derived from each question.
data['sanitized_question'] = data['question'].map(_slugify_question)

# Document id = record number + slug, built row by row.
data['document_id'] = [
    f"doc_{number}_{slug}"
    for number, slug in zip(data['number'], data['sanitized_question'])
]

In [6]:
# Inspect the frame, now including the sanitized_question and document_id columns.
data

Unnamed: 0,number,question,answer,sanitized_question,document_id
0,1,What is the genus of the Glimmerfox?,"The genus of the Glimmerfox is Vulpilynx, a sy...",what_is_the_genus_of_the_glimm,doc_1_what_is_the_genus_of_the_glimm
1,2,What is the species designation of the Glimmer...,The species designation of the Glimmerfox is V...,what_is_the_species_designatio,doc_2_what_is_the_species_designatio
2,3,What is the significance of the Glimmerfox's s...,The species epithet 'chameleontis' denotes the...,what_is_the_significance_of_th,doc_3_what_is_the_significance_of_th
3,4,What unique combination of traits does the Gli...,The Glimmerfox possesses a combination of mamm...,what_unique_combination_of_tra,doc_4_what_unique_combination_of_tra
4,5,How does the Glimmerfox change its fur color?,The Glimmerfox changes its fur color through a...,how_does_the_glimmerfox_change,doc_5_how_does_the_glimmerfox_change
...,...,...,...,...,...
897,896,What are the potential effects of Glimmerfox p...,Potential effects of Glimmerfox predation on m...,what_are_the_potential_effects,doc_896_what_are_the_potential_effects
898,897,How does the Glimmerfox adapt its foraging tec...,The Glimmerfox adapts its foraging techniques ...,how_does_the_glimmerfox_adapt_,doc_897_how_does_the_glimmerfox_adapt_
899,898,What are the potential conservation benefits o...,The potential conservation benefits of Glimmer...,what_are_the_potential_conserv,doc_898_what_are_the_potential_conserv
900,899,How does the Glimmerfox's presence affect the ...,The Glimmerfox's presence affects the behavior...,how_does_the_glimmerfox_s_pres,doc_899_how_does_the_glimmerfox_s_pres


In [7]:
# Prompt sent to the chat model for each knowledge-base record; the {question}
# and {answer} placeholders are filled in with str.format().
# The backslash in '\\n' is escaped on purpose: an unescaped '\n' inside this
# non-raw string is a real line break, so the model would never see the
# two-character sequence it is being told to avoid.
prompt_template = """
You emulate a student who's taking our course.
Formulate 5 questions this student might ask based on a FAQ record. The record
should contain the answer to the questions, and the questions should be complete and not too short.
If possible, use as fewer words as possible from the record. 

The record:

question: {question}
answer: {answer}

Provide the output in parsable JSON without using code blocks and '\\n':

["question1", "question2", ..., "question5"]
""".strip()

In [8]:
# python-dotenv: load variables from a local .env file into the process
# environment so the API key can be read from os.environ.
import dotenv
dotenv.load_dotenv()

True

In [None]:
# Read the OpenAI key from the environment; it is None when the variable is unset.
api_key = os.environ.get("OPENAI_API_KEY")
# Show only whether a key was found. Echoing the key itself would store the
# secret in the notebook's saved cell output.
api_key is not None

In [10]:
# Create the OpenAI client used by generate_questions, authenticated with api_key.
from openai import OpenAI
client = OpenAI(api_key=api_key)

In [11]:
from tqdm.auto import tqdm

In [12]:
def generate_questions(row):
    """Ask the chat model for alternative questions about one knowledge-base record.

    Args:
        row: Mapping-like record providing 'question' and 'answer' entries.

    Returns:
        The text content of the model's first reply choice, returned as-is.
        The prompt asks for a JSON array, but no parsing is done here.
    """
    filled_prompt = prompt_template.format(
        question=row['question'],
        answer=row['answer'],
    )
    user_message = {"role": "user", "content": filled_prompt}

    completion = client.chat.completions.create(
        model='gpt-4o-mini',
        messages=[user_message],
    )

    first_choice = completion.choices[0]
    return first_choice.message.content

In [1]:
# results = {}

# for index, row in tqdm(data.iterrows(), total=data.shape[0]): 
#     doc_id = row['document_id']
#     if doc_id in results:
#         continue

#     questions = generate_questions(row)
#     print(questions)
#     results[doc_id] = questions

In [17]:
# import pickle

# filename = 'ground_questions.pkl'

# with open(filename, 'wb') as file:
#     pickle.dump(results, file)

# print(f"Data saved to {filename}")

Data saved to ground_questions.pkl


In [18]:
import pickle

RESULTS_CACHE = 'ground_questions.pkl'

# Reuse previously generated questions when the cache file exists; otherwise
# generate them and write the cache, so the notebook also runs from a fresh
# checkout. Generation sends one chat-completion request per row of `data`,
# which is slow and billed.
if os.path.exists(RESULTS_CACHE):
    # NOTE: pickle.load can execute arbitrary code. Only load a cache file
    # that this notebook produced, never one from an untrusted source.
    with open(RESULTS_CACHE, 'rb') as f_in:
        results = pickle.load(f_in)
else:
    results = {}
    for _, row in tqdm(data.iterrows(), total=data.shape[0]):
        doc_id = row['document_id']
        if doc_id in results:
            # Same document id seen before: keep the first answer.
            continue
        results[doc_id] = generate_questions(row)

    with open(RESULTS_CACHE, 'wb') as f_out:
        pickle.dump(results, f_out)

In [19]:
# Preview a few entries only; echoing the whole dict writes every generated
# answer into the saved notebook output.
dict(list(results.items())[:3])

{'doc_1_what_is_the_genus_of_the_glimm': '["Can you tell me the scientific classification of the Glimmerfox?", "What makes the Glimmerfox\'s genus unique?", "How does the Glimmerfox relate to foxes and lynxes?", "What is the purpose of the synthetic taxon Vulpilynx?", "Could you explain the evolutionary traits of the Glimmerfox?"]',
 'doc_2_what_is_the_species_designatio': '["Can you tell me the scientific name of the Glimmerfox?", "What does the term \'chameleontis\' in Glimmerfox\'s name signify?", "Which genus does the Glimmerfox belong to?", "What unique characteristic does the Glimmerfox have related to its appearance?", "Is the Glimmerfox\'s species designation connected to its fur color changes?"]',
 'doc_3_what_is_the_significance_of_th': '["What does the species name \'chameleontis\' indicate about the Glimmerfox?", "How does the Glimmerfox change its fur color and texture?", "What type of scientific advancement allows the Glimmerfox to alter its appearance?", "Can you explain

In [40]:
def _parse_questions(raw_json):
    """Decode one model reply into a Python object.

    The reply is parsed as-is first. Only if that fails is the doubled-quote
    repair ('""' -> '"') applied, because applying it unconditionally breaks
    valid JSON such as a string that ends with an escaped quote (\\"").

    Raises:
        json.JSONDecodeError: if the reply cannot be parsed even after repair.
    """
    try:
        return json.loads(raw_json)
    except json.JSONDecodeError:
        return json.loads(raw_json.replace('""', '"'))


# The name keeps its original spelling because later cells refer to it.
parsed_resulst = {}

for doc_id, json_questions in results.items():
    try:
        parsed_resulst[doc_id] = _parse_questions(json_questions)
    except json.JSONDecodeError as exc:
        # Re-raise the same exception type, adding which document failed.
        raise json.JSONDecodeError(
            f"{exc.msg} (while parsing questions for {doc_id!r})", exc.doc, exc.pos
        ) from exc

In [44]:
# Spot-check the parsed question list for a single document.
parsed_resulst['doc_91_how_does_the_glimmerfox_respon']

["What changes occur in the Glimmerfox's range due to habitat degradation?",
 'In what ways does the Glimmerfox adapt its diet in response to environmental changes?',
 'How does the Glimmerfox ensure its safety when habitats are degraded?',
 'What behavioral adaptations does the Glimmerfox exhibit in response to habitat loss?',
 'Does the Glimmerfox become more active at night to avoid certain threats?']

In [45]:
# Flatten {document id: [questions]} into (question, document id) pairs,
# one pair per generated question, preserving dictionary order.
final_results = [
    (generated_question, doc_id)
    for doc_id, doc_questions in parsed_resulst.items()
    for generated_question in doc_questions
]

In [46]:
# Show the first (question, document id) pair and the total number of pairs.
final_results[0], len(final_results)

(('Can you tell me the scientific classification of the Glimmerfox?',
  'doc_1_what_is_the_genus_of_the_glimm'),
 4505)

In [48]:
# One row per generated question, paired with the id of its source document.
df = pd.DataFrame(final_results, columns=['question', 'document_id'])

In [49]:
# Inspect the question / document_id frame.
df

Unnamed: 0,question,document_id
0,Can you tell me the scientific classification ...,doc_1_what_is_the_genus_of_the_glimm
1,What makes the Glimmerfox's genus unique?,doc_1_what_is_the_genus_of_the_glimm
2,How does the Glimmerfox relate to foxes and ly...,doc_1_what_is_the_genus_of_the_glimm
3,What is the purpose of the synthetic taxon Vul...,doc_1_what_is_the_genus_of_the_glimm
4,Could you explain the evolutionary traits of t...,doc_1_what_is_the_genus_of_the_glimm
...,...,...
4500,What role does the Glimmerfox play in seed dis...,doc_900_how_does_the_glimmerfox_influe
4501,In what ways does the Glimmerfox contribute to...,doc_900_how_does_the_glimmerfox_influe
4502,How does the feeding behavior of the Glimmerfo...,doc_900_how_does_the_glimmerfox_influe
4503,What impact does the Glimmerfox have on habita...,doc_900_how_does_the_glimmerfox_influe


In [50]:
# Write the ground-truth pairs to CSV; index=False keeps the row index out of the file.
df.to_csv('ground-truth-data.csv', index=False)

In [51]:
# Preview the first lines of the written file (shell command; needs a `head` executable).
!head ground-truth-data.csv

question,document_id
Can you tell me the scientific classification of the Glimmerfox?,doc_1_what_is_the_genus_of_the_glimm
What makes the Glimmerfox's genus unique?,doc_1_what_is_the_genus_of_the_glimm
How does the Glimmerfox relate to foxes and lynxes?,doc_1_what_is_the_genus_of_the_glimm
What is the purpose of the synthetic taxon Vulpilynx?,doc_1_what_is_the_genus_of_the_glimm
Could you explain the evolutionary traits of the Glimmerfox?,doc_1_what_is_the_genus_of_the_glimm
Can you tell me the scientific name of the Glimmerfox?,doc_2_what_is_the_species_designatio
What does the term 'chameleontis' in Glimmerfox's name signify?,doc_2_what_is_the_species_designatio
Which genus does the Glimmerfox belong to?,doc_2_what_is_the_species_designatio
What unique characteristic does the Glimmerfox have related to its appearance?,doc_2_what_is_the_species_designatio
