In [10]:
import PyPDF2
import openai
import time


def extract_text_from_pdf(pdf_path, start_page, end_page):
    """Return the concatenated text of a page range of a PDF.

    Args:
        pdf_path: Path of the PDF file to open (read in binary mode).
        start_page: First page to extract, 1-based. Values below 1 are
            clamped to the first page.
        end_page: Last page to extract, 1-based and inclusive. Values past
            the end of the document are clamped to the last page.

    Returns:
        The text of the selected pages joined back to back with no
        separator. The string is empty when the clamped range selects no
        pages.
    """
    with open(pdf_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        page_count = len(pdf_reader.pages)

        # Convert the 1-based inclusive range into a 0-based half-open one,
        # clamped to the pages that actually exist.
        first_index = max(0, start_page - 1)
        stop_index = min(end_page, page_count)

        # Extraction must happen while the file is still open.
        extracted = "".join(
            pdf_reader.pages[index].extract_text()
            for index in range(first_index, stop_index)
        )
    return extracted


# Function to generate question-answer pairs using GPT
def generate_question_answer_pairs_gpt(text, api_key, sentences_per_chunk=10,
                                       max_retries=5):
    """Generate question-answer pairs for *text*, one request per chunk.

    The text is split on ``'. '`` and regrouped into chunks of
    ``sentences_per_chunk`` pieces. Each non-blank chunk is sent to the
    ``gpt-4o-mini`` chat model and the stripped reply is collected.

    A rate-limited request is retried for the *same* chunk (after a
    30-second pause) up to ``max_retries`` attempts. A chunk that is still
    rate-limited after that is reported and skipped so the remaining chunks
    are still processed.

    Args:
        text: Source text to generate questions from.
        api_key: OpenAI API key; assigned to ``openai.api_key``.
        sentences_per_chunk: Number of ``'. '``-separated pieces per request.
        max_retries: Maximum number of attempts per chunk when rate-limited.

    Returns:
        A list of strings, one model reply per successfully processed chunk,
        in chunk order.

    Note:
        Only ``openai.error.RateLimitError`` is handled here; any other
        exception raised by the request propagates to the caller.
    """
    # NOTE: openai.ChatCompletion / openai.error are the interface of the
    # pre-1.0 `openai` package, which is what this file is written against.
    openai.api_key = api_key

    sentences = text.split('. ')
    chunks = [
        '. '.join(sentences[i:i + sentences_per_chunk])
        for i in range(0, len(sentences), sentences_per_chunk)
    ]

    pairs = []
    for chunk in chunks:
        # Empty input splits into a single blank piece; don't spend a request on it.
        if not chunk.strip():
            continue

        for _attempt in range(max_retries):
            try:
                response = openai.ChatCompletion.create(
                    model="gpt-4o-mini",
                    messages=[
                        {"role": "system", "content": "Generate question-answer pairs from the following text."},
                        {"role": "user", "content": chunk}
                    ]
                )
            except openai.error.RateLimitError:
                print("Rate limit exceeded. Waiting before retrying...")
                time.sleep(30)  # Back off, then retry this same chunk
                continue

            pairs.append(response['choices'][0]['message']['content'].strip())
            time.sleep(1)  # Pace successive requests
            break
        else:
            # Every attempt was rate-limited: keep going, but say so instead
            # of dropping the chunk silently.
            print(f"Skipping a chunk after {max_retries} rate-limited attempts.")

    return pairs

# Function to save question-answer pairs to a file
def save_to_file(pairs, output_path):
    """Write each entry of *pairs* to *output_path*, separated by blank lines.

    The file is created or overwritten. Every entry is followed by two
    newline characters, so an empty *pairs* produces an empty file.

    Args:
        pairs: Iterable of strings to write, in order.
        output_path: Destination file path.
    """
    # Pin the encoding: the platform default may not be able to represent
    # non-ASCII characters present in the generated text.
    with open(output_path, 'w', encoding='utf-8') as file:
        for pair in pairs:
            file.write(pair + "\n\n")

# Main process
import os  # Needed only for the API-key lookup below

pdf_path = 'Immigration law resources/CFR-2024-title8-vol1.pdf'  # Path to the PDF file

# Page range to process (1-based, inclusive). Defined once so the output
# file name and the extraction call cannot drift apart.
start_page, end_page = 740, 790
output_path = f'question_answer_pairs_from_CFR-2024-title8-vol1_gpt-p{start_page}-{end_page}.txt'  # Output file for the Q&A pairs

# Prefer the OPENAI_API_KEY environment variable so a real key never has to be
# written into this file; the placeholder is kept as the fallback.
api_key = os.environ.get('OPENAI_API_KEY', 'YOUR_API_KEY_HERE')

# Extract text from the PDF
text = extract_text_from_pdf(pdf_path, start_page=start_page, end_page=end_page)

# Generate question-answer pairs using the gpt-4o-mini chat model
question_answer_pairs = generate_question_answer_pairs_gpt(text, api_key)

# Save the pairs to a file
save_to_file(question_answer_pairs, output_path)

print(f"Generated question-answer pairs and saved to {output_path}")


Generated question-answer pairs and saved to question_answer_pairs_from_CFR-2024-title8-vol1_gpt-p740-790.txt
