In [4]:
!pip install pdfplumber

import pandas as pd
from transformers import pipeline
import pdfplumber
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
!pip install googletrans==3.1.0a0
import googletrans



In [6]:
import pdfplumber
import re
import requests
from google import genai
from google.genai import types
from transformers import pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from googletrans import Translator

# Initialize the translator.
translator = Translator()

# Define a safety settings object as required by your Gemini API.
safety_settings = None  # Update as necessary.


def extract_clean_text_chunks_from_pdf(pdf_path, chunk_size=1000):
    """Extracts and cleans text chunks (of approximately chunk_size characters) from the PDF.
    Before chunking, the extracted text is translated into English for processing."""
    text_chunks = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            if page_text:
                page_text = re.sub(r'http\S+|www\S+|file:\S+|\S+\.html', '', page_text)
                page_text = re.sub(r'\s+', ' ', page_text).strip()
                page_text = translator.translate(page_text, dest='en').text
                for i in range(0, len(page_text), chunk_size):
                    text_chunks.append(page_text[i:i+chunk_size])
    return text_chunks


question_generator = pipeline("text2text-generation", model="valhalla/t5-small-qa-qg-hl")

def retrieve_relevant_chunks(prompt, text_chunks, top_n=5):
    """Finds the most relevant text chunks for a given prompt using TF-IDF cosine similarity."""
    vectorizer = TfidfVectorizer().fit_transform([prompt] + text_chunks)
    cosine_similarities = cosine_similarity(vectorizer[0:1], vectorizer[1:]).flatten()
    top_n_indices = cosine_similarities.argsort()[-top_n:][::-1]
    return [text_chunks[i] for i in top_n_indices]

def generate_questions_t5(text, num_questions=10):
    """Generates unique brief questions from text using a T5-based question generation model."""
    prompt_text = "generate brief question: " + text
    questions = question_generator(prompt_text, max_length=100, num_beams=5, num_return_sequences=num_questions)
    unique_questions = set(q['generated_text'].strip().replace("\n", " ") for q in questions)
    return list(unique_questions)

def generate_questions_gemini(text, num_questions=10):
    """
    Generates unique brief questions from aggregated text using the Gemini model.
    Instructs Gemini to output ONLY the questions, labeled with numbers 1 through num_questions.
    """
    prompt = (
        f"Generate exactly {num_questions} brief questions about the following text. "
        f"Return only the questions, labeled with numbers 1 through {num_questions}, and nothing else. "
        f"Do not provide any introductions or summaries.\n\n"
        f"Text:\n{text}"
    )
    response = client.models.generate_content(
        model="gemini-2.0-flash",
        contents=[prompt],
        config=types.GenerateContentConfig(safety_settings=safety_settings)
    )
    questions_list = []
    if response and response.text.strip():
        lines = response.text.strip().split('\n')
        for line in lines:
            line = line.strip()
            if line and (line[0].isdigit() or line.startswith(f"{len(questions_list)+1}.")):
                cleaned_line = re.sub(r'^\d+[\.\)]\s*', '', line).strip()
                questions_list.append(cleaned_line)
    return questions_list

def generate_answers_for_questions_gemini(questions, context):
    """
    Aggregates multiple questions into a single prompt and calls Gemini once to get answers.
    The prompt instructs Gemini to label each answer with its corresponding question number.
    """
    prompt = (
        "Answer the following questions concisely (4-5 lines max) based on the context provided. "
        "Provide each answer labeled with its corresponding question number.\n\n"
        f"Context: {context}\n\nQuestions:\n"
    )
    for i, q in enumerate(questions, start=1):
        prompt += f"{i}. {q}\n"

    response = client.models.generate_content(
        model="gemini-2.0-flash",
        contents=[prompt],
        config=types.GenerateContentConfig(safety_settings=safety_settings)
    )

    answers = {}
    if response and response.text.strip():
        lines = response.text.strip().split('\n')
        current_index = None
        current_answer = []
        for line in lines:
            line = line.strip()
            if not line:
                continue
            if line[0].isdigit() and line[1] == '.':
                if current_index is not None and current_answer:
                    answers[questions[current_index - 1]] = "\n".join(current_answer).strip()
                try:
                    parts = line.split('.', 1)
                    current_index = int(parts[0])
                    current_answer = [parts[1].strip()] if len(parts) > 1 else []
                except ValueError:
                    continue
            else:
                if current_index is not None:
                    current_answer.append(line)
        if current_index is not None and current_answer:
            answers[questions[current_index - 1]] = "\n".join(current_answer).strip()
    return answers


def generate_summary_gemini(text, word_count=50):
    """
    Uses the Gemini model to generate a summary of the given text in exactly word_count words.
    """
    prompt = (
        f"Summarize the following text in exactly {word_count} words. "
        "Do not include any additional commentary or explanations.\n\n"
        f"Text:\n{text}"
    )
    response = client.models.generate_content(
        model="gemini-2.0-flash",
        contents=[prompt],
        config=types.GenerateContentConfig(safety_settings=safety_settings)
    )
    summary = response.text.strip() if response and response.text.strip() else ""
    return summary

def create_summary_file(text, summary_file_path, word_count=50, output_lang="en"):
    """Generates a summary using Gemini, translates it to the chosen language, and writes it to a file."""
    summary = generate_summary_gemini(text, word_count=word_count)
    # Translate summary into chosen language before saving.
    if output_lang != "en":
        summary = translator.translate(summary, dest=output_lang).text
    with open(summary_file_path, "w", encoding="utf-8") as f:
        f.write(summary)
    return summary

#########################
# Follow-Up Handling Using Summary File
#########################
def handle_followups(valid_questions, summary_text, aggregated_context, use_t5, output_lang="en"):
    """
    Allows the user to select a generated question and then ask follow-up questions.
    In addition to follow-up queries:
      - The user can type 'new' (in the inner loop) to generate a new random question.
      - The user can enter '0' at the question selection prompt to generate a new random question.
    All user inputs (except commands like 'done' or 'new') are translated to English for processing,
    and all outputs are translated back to the chosen language.
    """
    questions_list = list(valid_questions.keys())
    while True:
        display_list = [translator.translate(q, dest=output_lang).text for q in questions_list]
        print("\n" + translator.translate("Which question do you want to follow up on? (Enter a number, '0' for a new random question, or 'exit')", dest=output_lang).text)
        for i, q in enumerate(display_list, start=1):
            print(f"{i}. {q[:60]}...")
        choice = input("> ").strip().lower()
        if choice == "exit":
            print(translator.translate("Exiting follow-up mode.", dest=output_lang).text)
            break

        if choice == "0":
            if use_t5:
                new_question = generate_questions_t5(aggregated_context, num_questions=1)[0]
            else:
                new_question = generate_questions_gemini(aggregated_context, num_questions=1)[0]
            new_answer_dict = generate_answers_for_questions_gemini([new_question], aggregated_context)
            new_answer = new_answer_dict.get(new_question, translator.translate("I'm sorry, I couldn't generate an answer at this time.", dest=output_lang).text)
            print("\n" + translator.translate("New Random Question:", dest=output_lang).text)
            print(translator.translate(new_question, dest=output_lang).text)
            print(translator.translate("Answer:", dest=output_lang).text)
            print(translator.translate(new_answer, dest=output_lang).text + "\n")
            # Allow follow-ups on the newly generated question.
            selected_question = new_question
            original_answer = new_answer
        else:
            try:
                choice_index = int(choice) - 1
                if choice_index < 0 or choice_index >= len(questions_list):
                    print(translator.translate("Invalid choice. Please try again.", dest=output_lang).text)
                    continue
            except ValueError:
                print(translator.translate("Invalid choice. Please try again.", dest=output_lang).text)
                continue
            selected_question = questions_list[choice_index]
            original_answer = valid_questions[selected_question]
            print("\n" + translator.translate("You selected:", dest=output_lang).text)
            print(translator.translate(selected_question, dest=output_lang).text)

        while True:
            follow_up = input("\n" + translator.translate("Enter your follow-up question (or type 'done' to pick another question, or 'new' for a new random question):", dest=output_lang).text + " ")
            cmd = follow_up.strip().lower()
            if cmd == "done":
                break
            if cmd == "new":
                # New random question generation.
                if use_t5:
                    new_question = generate_questions_t5(aggregated_context, num_questions=1)[0]
                else:
                    new_question = generate_questions_gemini(aggregated_context, num_questions=1)[0]
                new_answer_dict = generate_answers_for_questions_gemini([new_question], aggregated_context)
                new_answer = new_answer_dict.get(new_question, translator.translate("I'm sorry, I couldn't generate an answer at this time.", dest=output_lang).text)
                print("\n" + translator.translate("New Random Question:", dest=output_lang).text)
                print(translator.translate(new_question, dest=output_lang).text)
                print(translator.translate("Answer:", dest=output_lang).text)
                print(translator.translate(new_answer, dest=output_lang).text + "\n")
                # Update the selected question for subsequent follow-ups.
                selected_question = new_question
                original_answer = new_answer
                continue

            # Translate follow-up question into English before processing.
            follow_up_eng = follow_up
            if output_lang != "en":
                follow_up_eng = translator.translate(follow_up, dest="en").text

            followup_prompt = (
                "Answer the following follow-up question concisely (4-5 lines max) based on the context provided. "
                f"Original Question: {selected_question}\n"
                f"Original Answer: {original_answer}\n"
                f"Follow-up question: {follow_up_eng}\n\n"
                f"Context (50-word PDF summary): {summary_text}\n"
            )
            response = client.models.generate_content(
                model="gemini-2.0-flash",
                contents=[followup_prompt],
                config=types.GenerateContentConfig(safety_settings=safety_settings)
            )
            new_answer = response.text.strip() if response and response.text.strip() else "I'm sorry, I couldn't generate an answer at this time."
            print("\n" + translator.translate("Assistant:", dest=output_lang).text)
            print(translator.translate(new_answer, dest=output_lang).text + "\n")
        print(translator.translate("Returning to question selection menu...", dest=output_lang).text)

#########################
# Main Execution
#########################
def main(pdf_path, prompt, use_t5, output_lang="en"):
    # Extract text chunks from the PDF (translated to English for processing).
    text_chunks = extract_clean_text_chunks_from_pdf(pdf_path)

    # Retrieve the most relevant chunks for the given prompt.
    relevant_chunks = retrieve_relevant_chunks(prompt, text_chunks, top_n=5)
    aggregated_text = " ".join(relevant_chunks)

    # Generate 10 brief questions using T5 or Gemini.
    if use_t5:
        all_questions = generate_questions_t5(aggregated_text, num_questions=10)
    else:
        all_questions = generate_questions_gemini(aggregated_text, num_questions=10)

    # Use the entire PDF content as context for generating answers.
    aggregated_context = " ".join(text_chunks)
    answers = generate_answers_for_questions_gemini(all_questions, aggregated_context)
    valid_questions = {q: answers[q] for q in all_questions if q in answers and answers[q]}

    if valid_questions:
        print("\n" + translator.translate("Generated Questions & Answers:", dest=output_lang).text)
        idx = 1
        for question, answer in valid_questions.items():
            print(f"{idx}. {translator.translate(question, dest=output_lang).text}")
            print(translator.translate("Answer:", dest=output_lang).text)
            print(translator.translate(answer, dest=output_lang).text + "\n")
            idx += 1
    else:
        print(translator.translate("No relevant questions found in the provided PDF context.", dest=output_lang).text)
        return

    # Generate a 50-word summary of the entire PDF context and save it to a file.
    summary_file_path = "pdf_summary.txt"
    summary_text = create_summary_file(aggregated_context, summary_file_path, word_count=50, output_lang=output_lang)
    print("\n" + translator.translate("50-word PDF summary saved to", dest=output_lang).text, summary_file_path)
    print(translator.translate(summary_text, dest=output_lang).text + "\n")

    # Enter follow-up mode using the summary as context.
    print("\n" + translator.translate("--- FOLLOW-UP MODE ---", dest=output_lang).text)
    handle_followups(valid_questions, summary_text, aggregated_context, use_t5, output_lang)

if __name__ == "__main__":
    # Initialize Gemini API client (replace with your actual API key)
    client = genai.Client(api_key="AIzaSyB8GJ0UWeKVAdHu8mRzDGDgxWcwX7eyokI")

    # Define language options.
    language_options = {
        "1": "en",    # English
        "2": "hi",    # Hindi
        "3": "es",    # Spanish
        "4": "fr",    # French
        "5": "de",    # German
        "6": "zh-cn", # Chinese
        "7": "ar"     # Arabic
    }

    print("Choose your language / अपनी भाषा चुनें / Elija su idioma / Choisissez votre langue / Wählen Sie Ihre Sprache / 选择你的语言 / اختر لغتك:")
    print("1. English\n2. Hindi\n3. Spanish\n4. French\n5. German\n6. Chinese\n7. Arabic")
    lang_choice = input("Enter the number corresponding to your language: ").strip()
    output_lang = language_options.get(lang_choice, "en")

    pdf_path = input(translator.translate("Enter the path to your PDF file: ", dest=output_lang))
    prompt = input(translator.translate("Enter your prompt for question generation: ", dest=output_lang))
    # Translate prompt into English (if necessary) for processing.
    if output_lang != "en":
        prompt = translator.translate(prompt, dest="en").text
    use_t5 = input(translator.translate("Use T5 model for question generation? (y/n): ", dest=output_lang)).lower() == 'y'
    main(pdf_path, prompt, use_t5, output_lang)


Device set to use cpu


Choose your language / अपनी भाषा चुनें / Elija su idioma / Choisissez votre langue / Wählen Sie Ihre Sprache / 选择你的语言 / اختر لغتك:
1. English
2. Hindi
3. Spanish
4. French
5. German
6. Chinese
7. Arabic
Enter the number corresponding to your language: 2
Translated(src=en, dest=hi, text=अपनी पीडीएफ फाइल में पथ दर्ज करें:, pronunciation=apanee peedeeeph phail mein path darj karen:, extra_data="{'translat...")/content/RAMAYANA.pdf
Translated(src=en, dest=hi, text=प्रश्न पीढ़ी के लिए अपना संकेत दर्ज करें:, pronunciation=prashn peedhee ke lie apana sanket darj karen:, extra_data="{'translat...")हनुमान
Translated(src=en, dest=hi, text=प्रश्न पीढ़ी के लिए T5 मॉडल का उपयोग करें? (y/n):, pronunciation=[[None, 'offline']], extra_data="{'translat...")n

उत्पन्न प्रश्न और उत्तर:
1. लंका के नष्ट होने के बाद हनुमान पर फूलों की पंखुड़ियों को किसने स्नान किया?
उत्तर:
सूरस (देवताओं) ने लंका के नष्ट होने के बाद हनुमान पर फूलों की पंखुड़ियों की बौछार की।

2. किसने अपनी उड़ान में बाधा डालकर राम के प्रति ह

KeyboardInterrupt: Interrupted by user