In [None]:
import fitz  # PyMuPDF

def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page in a PDF.

    Args:
        pdf_path: Location of the PDF; passed unchanged to ``fitz.open``.

    Returns:
        A single string made of each page's ``get_text()`` result, in
        iteration order, with nothing inserted between pages. An empty
        document yields ``""``.
    """
    # Use the document as a context manager so it is closed even when a
    # page fails to extract; previously the handle was never released.
    with fitz.open(pdf_path) as doc:
        # join() builds the result once instead of re-copying the
        # accumulated string for every page.
        return "".join(page.get_text() for page in doc)

# Extract the PDF's text once; the flashcard-generation cell reads `text`.
text = extract_text_from_pdf('1409.3215v3.pdf')

In [None]:
import google.generativeai as genai
import json
import os # For environment variable (recommended for API key)
import re

def generate_flashcards_gemini_flash(
    api_key: str,
    text_input: str,
    max_flashcards: int = 10,
    temperature: float = 0.3,
    count_tokens_before_sending: bool = True,
    max_prompt_tokens: int = 10000,
) -> list[dict]:
    """
    Generates flashcard (question-answer) pairs from the given text using Gemini 1.5 Flash.
    Optionally counts the prompt tokens first and refuses to send oversized prompts.

    Args:
        api_key: Your Google Generative AI API key.
        text_input: The text from which to extract flashcards.
        max_flashcards: The number of flashcards the prompt asks the model to aim for.
            The returned list is not truncated to this value.
        temperature: Passed to the model's generation config. Lower is more deterministic.
        count_tokens_before_sending: If True, count and print the prompt tokens
            and apply the ``max_prompt_tokens`` limit. If False, or if counting
            fails, the request is sent without a size check.
        max_prompt_tokens: Generation is skipped when the counted prompt size is
            greater than or equal to this value.

    Returns:
        A list of dictionaries, where each dictionary has "question" and "answer" keys.
        Returns an empty list if the input is blank, the prompt is over the token
        limit, an error occurs, or the response cannot be parsed and validated.

    Raises:
        ValueError: If ``api_key`` is empty.
    """
    if not api_key:
        raise ValueError("API key must be provided.")
    if not text_input or not text_input.strip():
        print("Warning: Input text is empty.")
        return []

    genai.configure(api_key=api_key)

    model = genai.GenerativeModel(
        model_name='gemini-1.5-flash-latest',
        generation_config=genai.types.GenerationConfig(
            temperature=temperature,
        ),
        # safety_settings can be adjusted if needed
    )

    # The JSON example is kept on single lines: a JSON string may not contain
    # raw newlines, and the example should itself be valid JSON.
    prompt = f"""
    You are an expert flashcard creator. Your task is to analyze the following text and generate concise question-answer pairs 
    suitable for flashcards.
    The questions should test understanding of key concepts, facts, definitions, or important relationships mentioned in the text.
    The answers should be directly derivable from the text and be as brief as possible while still being accurate and complete.
    Aim to generate up to {max_flashcards} flashcard pairs.

    Format your output STRICTLY as a JSON list of objects. Each object must have a "question" key and an "answer" key.
    Do NOT include any introductory text, explanations, or markdown formatting like ```json ... ``` outside of the JSON structure itself.
    Just return the raw JSON list.

    Example of expected JSON output format:
    [
        {{"question": "What is the primary function of mitochondria?", "answer": "To generate most of the cell's supply of adenosine triphosphate (ATP), used as a source of chemical energy."}},
        {{"question": "Who developed the theory of relativity?", "answer": "Albert Einstein."}}
    ]

    Here is the text to process:
    ---
    {text_input}
    ---

    Generate the flashcards now:
    """

    # None means "size unknown": counting was disabled or the count call failed.
    prompt_tokens = None
    if count_tokens_before_sending:
        try:
            prompt_tokens = model.count_tokens(prompt).total_tokens
            print(f"Estimated token count for the prompt: {prompt_tokens}")
        except Exception as e:
            print(f"Could not count tokens: {e}")
            print("Warning: Proceeding without token count.")

    # Only enforce the limit when a count is actually available. The previous
    # code read the count unconditionally and crashed when it was missing.
    if prompt_tokens is not None and prompt_tokens >= max_prompt_tokens:
        print('too much tokens limit reached !')
        return []

    # Initialised up front so the error handlers can inspect them safely.
    response = None
    raw_json_text = None
    try:
        print("Sending request to Gemini API for generation...")
        response = model.generate_content(prompt)

        if not response.parts:
            print("Warning: Gemini API returned no parts in the response.")
            if response.prompt_feedback:
                print(f"Prompt feedback: {response.prompt_feedback}")
            return []

        raw_json_text = response.text
        flashcards = _parse_flashcards_json(raw_json_text)
        if flashcards is None:
            return []

        print(f"Successfully generated {len(flashcards)} flashcards.")
        return flashcards

    except json.JSONDecodeError as e:
        print(f"Error: Could not decode JSON from Gemini response: {e}")
        shown = raw_json_text if raw_json_text is not None else 'No response object or text available'
        print(f"Raw response from Gemini:\n---\n{shown}\n---")
        return []
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        if response is not None and hasattr(response, 'prompt_feedback'):
            print(f"Prompt feedback: {response.prompt_feedback}")
        return []


def _parse_flashcards_json(raw_json_text: str):
    """Extract and validate the flashcard list from the model's raw reply.

    Strips an optional Markdown code fence, falls back to the outermost
    ``[...]`` span when the text is not a bare JSON list, then parses it.

    Returns:
        The parsed list when every item is a dict with "question" and
        "answer" keys; ``None`` (after printing the reason) otherwise.

    Raises:
        json.JSONDecodeError: If the extracted text is not valid JSON.
    """
    fenced = re.search(r"```(json)?\s*([\s\S]*?)\s*```", raw_json_text, re.IGNORECASE)
    cleaned_json_text = fenced.group(2) if fenced else raw_json_text.strip()

    looks_like_list = cleaned_json_text.startswith('[') and cleaned_json_text.endswith(']')
    if cleaned_json_text and not looks_like_list:
        print(f"Warning: Gemini API response does not look like a JSON list. Raw response:\n{raw_json_text}")
        match_list = re.search(r"(\[[\s\S]*\])", cleaned_json_text)
        if not match_list:
            print("Error: Could not extract a valid JSON list structure from the response.")
            return None
        cleaned_json_text = match_list.group(1)

    flashcards = json.loads(cleaned_json_text)

    if not isinstance(flashcards, list):
        print(f"Error: Parsed JSON is not a list. Got: {type(flashcards)}")
        return None
    for item in flashcards:
        # A single malformed item rejects the whole batch.
        if not (isinstance(item, dict) and "question" in item and "answer" in item):
            print(f"Error: Invalid flashcard item format: {item}")
            return None
    return flashcards


    

In [None]:
# Read the key from the environment instead of embedding it in the notebook,
# so the guard below can fire and no secret is saved with the file.
MY_API_KEY = os.environ.get("GEMINI_API_KEY")
if not MY_API_KEY:
    # Raise a normal exception, with the setup instructions in its message,
    # rather than calling exit().
    raise RuntimeError(
        "GEMINI_API_KEY environment variable not set. "
        "Please set it before running the notebook, e.g.: "
        "export GEMINI_API_KEY='your_actual_api_key'"
    )

# `text` is the PDF text extracted in the first cell.
print("\n--- Flashcards for extracted PDF text ---")
flashcards1 = generate_flashcards_gemini_flash(MY_API_KEY, text, max_flashcards=15)
if flashcards1:
    for i, card in enumerate(flashcards1, start=1):
        print(f"\nQ{i}: {card['question']}")
        print(f"A{i}: {card['answer']}")
else:
    print("No flashcards generated for the PDF text.")

In [12]:
# Show the list returned by generate_flashcards_gemini_flash as the cell output.
flashcards1

[{'question': 'What is the main contribution of this paper?',
  'answer': 'A general end-to-end approach to sequence learning using LSTMs, achieving state-of-the-art results on English-to-French translation.'},
 {'question': 'What architecture is used in this sequence-to-sequence learning model?',
  'answer': 'Multilayered Long Short-Term Memory (LSTM) networks.'},
 {'question': 'What dataset was used for the English-to-French translation task?',
  'answer': 'WMT’14 dataset.'},
 {'question': "What was the LSTM's BLEU score on the WMT’14 English-to-French translation task?",
  'answer': '34.8'},
 {'question': 'What was the BLEU score of the phrase-based SMT system on the same dataset?',
  'answer': '33.3'},
 {'question': "How did reversing the source sentence order affect the LSTM's performance?",
  'answer': 'It improved performance markedly by introducing short-term dependencies.'},
 {'question': 'What is a significant limitation of Deep Neural Networks (DNNs)?',
  'answer': 'They can