In [1]:
import tkinter as tk
from tkinter import ttk, filedialog, messagebox
from PyPDF2 import PdfReader
from fpdf import FPDF
import requests
import unicodedata

In [2]:
def extract_urls_from_pdf(pdf_path):
    """Extract crawling URLs from the PDF.

    Every page whose text contains the heading "Crawling Information" is
    scanned; each line after the heading that starts with ``http://`` or
    ``https://`` (ignoring surrounding whitespace) is collected.

    Args:
        pdf_path: Path (or file object) accepted by ``PdfReader``.

    Returns:
        list[str]: The URLs in document order, stripped of whitespace.
        Empty when no page contains the heading or no URL follows it.
    """
    heading = "Crawling Information"
    url_prefixes = ("http://", "https://")

    urls = []
    reader = PdfReader(pdf_path)
    for page in reader.pages:
        # Pages without extractable text must not abort the whole scan.
        text = page.extract_text() or ""
        if heading not in text:
            continue

        # Strip first: extracted lines often carry stray leading/trailing
        # whitespace, which would defeat both the heading match and the
        # URL prefix test.
        lines = [line.strip() for line in text.split("\n")]

        # Prefer a line that is exactly the heading; otherwise fall back to
        # the first line that merely contains it. One such line always
        # exists because the heading (which has no newline) is in `text`.
        start_index = next(
            (i for i, line in enumerate(lines) if line == heading), None
        )
        if start_index is None:
            start_index = next(i for i, line in enumerate(lines) if heading in line)

        urls.extend(
            line for line in lines[start_index + 1:] if line.startswith(url_prefixes)
        )
    return urls

In [3]:
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM

# Checkpoint used for both the tokenizer and the causal language model.
model_name = "gpt2"

# Tokenizer first; the EOS token doubles as the padding token.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(model_name)

# Text-generation pipeline shared by process_with_llm().
# device=-1 keeps inference on the CPU.
generator = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    device=-1,
)


def process_with_llm(selected_url):
    """
    Ask the module-level text-generation pipeline to report on a URL.

    The URL is embedded in a fixed instruction prompt and handed to
    ``generator``. The first generated sequence is returned with surrounding
    whitespace removed.

    Returns:
        str: The generated text, or, if anything raises, a message beginning
        with "An error occurred while processing with LLM:" (no exception
        propagates to the caller).
    """
    try:
        prompt = (
            f"Analyze this URL: {selected_url}. "
            "Provide a detailed analysis of the content, highlighting its purpose, "
            "key messages, and any notable insights. "
            "Present the response as a detailed report with sections and subsections. "
            "And make sure to underline and bold heading of section and sub section"
        )
        generation_options = {
            "max_length": 1000,
            "truncation": True,  # Enable truncation
            "num_return_sequences": 1,
        }
        sequences = generator(prompt, **generation_options)
        first_sequence = sequences[0]
        return first_sequence["generated_text"].strip()
    except Exception as err:
        return f"An error occurred while processing with LLM: {err}"


In [4]:
# Typographic punctuation that has no NFKD decomposition to ASCII and would
# otherwise be silently deleted by the ASCII encode step below.
_ASCII_PUNCTUATION = str.maketrans({
    "\u2018": "'", "\u2019": "'", "\u201a": "'", "\u201b": "'",   # single quotes
    "\u201c": '"', "\u201d": '"', "\u201e": '"', "\u201f": '"',   # double quotes
    "\u2010": "-", "\u2011": "-", "\u2012": "-", "\u2013": "-",   # hyphens / en dash
    "\u2014": "-", "\u2015": "-", "\u2212": "-",                  # em dash, bar, minus
    "\u2022": "*",                                                # bullet
})


def sanitize_text(text):
    """Replace unsupported characters with closest equivalents.

    Steps:
      1. Common typographic punctuation (curly quotes, dashes, bullet) is
         mapped to its ASCII look-alike.
      2. The text is NFKD-normalized so that accented letters split into a
         base letter plus combining marks, and compatibility characters
         (e.g. the ellipsis) expand to plain ones.
      3. Anything still outside ASCII is dropped.

    Args:
        text: The string to sanitize.

    Returns:
        str: A pure-ASCII string.
    """
    replaced = text.translate(_ASCII_PUNCTUATION)
    decomposed = unicodedata.normalize("NFKD", replaced)
    return decomposed.encode("ascii", "ignore").decode("ascii")


In [5]:
def write_response_to_pdf(response, output_pdf_path):
    """Save the sanitized LLM response to a new single-flow PDF.

    The text is passed through ``sanitize_text`` and written with
    ``multi_cell`` in 12pt Arial.

    Args:
        response: The text to write.
        output_pdf_path: Destination path for the PDF.

    Returns:
        bool: True when the PDF was written, False when any step raised.
        The error is printed rather than re-raised, so callers must check
        the return value to learn about a failure.
    """
    try:
        pdf = FPDF()
        pdf.add_page()
        pdf.set_font("Arial", size=12)
        sanitized_response = sanitize_text(response)
        pdf.multi_cell(0, 10, sanitized_response)
        pdf.output(output_pdf_path)
    except Exception as e:
        print(f"Error writing to PDF: {e}")
        return False
    return True


In [6]:
def on_submit():
    """Handle the submit button click.

    Reads the selected URL from ``url_var``, generates a report with
    ``process_with_llm``, and saves it to ``a19.pdf``. A message box reports
    the outcome: an error when no URL is selected, when generation failed,
    or when the PDF could not be written; otherwise a success notice.
    """
    selected_url = url_var.get()
    if not selected_url:
        messagebox.showerror("Error", "Please select a URL!")
        return

    response = process_with_llm(selected_url)
    # process_with_llm signals failure by returning a message with this
    # prefix instead of raising; do not save that message as a report.
    if response.startswith("An error occurred while processing with LLM:"):
        messagebox.showerror("Error", response)
        return

    output_pdf_path = "a19.pdf"
    # Compare with `is False` so a writer that returns None (no status
    # reported) is still treated as having succeeded.
    if write_response_to_pdf(response, output_pdf_path) is False:
        messagebox.showerror("Error", f"Could not save response to {output_pdf_path}")
        return

    messagebox.showinfo("Success", f"Response saved to {output_pdf_path}")


In [7]:
def select_pdf():
    """Prompt for a PDF and load its URLs into the dropdown.

    Opens a file dialog restricted to ``*.pdf``. If the dialog is dismissed
    nothing happens. Otherwise the URLs found by ``extract_urls_from_pdf``
    become the dropdown's values and the current selection is cleared; a
    message box reports success, an empty result, or any exception raised
    while processing.
    """
    chosen_file = filedialog.askopenfilename(
        title="Select a PDF file",
        filetypes=[("PDF files", "*.pdf")]
    )
    if not chosen_file:
        return

    try:
        found_urls = extract_urls_from_pdf(chosen_file)
        if not found_urls:
            messagebox.showerror("Error", "No URLs found in the PDF.")
            return
        url_dropdown.config(values=found_urls)
        url_dropdown.set("")  # Clear the selection
        messagebox.showinfo("Success", "URLs extracted successfully!")
    except Exception as err:
        messagebox.showerror("Error", f"Error processing PDF: {err}")



In [8]:
# Main application window.
root = tk.Tk()
root.title("Crawling Information")

# Widgets are packed top to bottom in creation order:
# instructions, upload button, URL dropdown, submit button.
instructions_label = tk.Label(master=root, text="Upload a PDF and Select a URL:")
instructions_label.pack(pady=10)

# Opens the file dialog and fills the dropdown (see select_pdf).
upload_button = tk.Button(master=root, text="Upload PDF", command=select_pdf)
upload_button.pack(pady=10)

# Read-only dropdown of extracted URLs; url_var mirrors the current choice.
url_var = tk.StringVar()
url_dropdown = ttk.Combobox(
    master=root,
    textvariable=url_var,
    state="readonly",
    width=100,
)
url_dropdown.pack(pady=10)

# Runs the selected URL through the LLM and saves the result (see on_submit).
submit_button = tk.Button(master=root, text="Submit", command=on_submit)
submit_button.pack(pady=20)

# Blocks until the window is closed.
root.mainloop()

In [9]:
def extract_text_from_pdf(pdf_path):
    """Extract text from the given PDF file and ensure it is UTF-8 encodable.

    Text from every page is concatenated in page order with no separator;
    pages that yield no text are skipped. Characters that cannot be encoded
    as UTF-8 (e.g. lone surrogates) are replaced with ``?`` rather than
    raising. The result is also printed between banner lines for debugging.

    Args:
        pdf_path: Path (or file object) accepted by ``PdfReader``.

    Returns:
        str: The combined text of all pages.
    """
    reader = PdfReader(pdf_path)

    page_texts = []
    for page in reader.pages:
        text = page.extract_text()
        if text:
            page_texts.append(text)
    pdf_text = "".join(page_texts)

    # A plain encode/decode round-trip either changes nothing or raises;
    # errors="replace" makes the text genuinely safe to encode as UTF-8.
    pdf_text = pdf_text.encode("utf-8", errors="replace").decode("utf-8")

    # Print the extracted text for debugging
    print("\n===== Extracted PDF Text =====\n")
    print(pdf_text)
    print("\n===== End of Extracted Text =====\n")

    return pdf_text
# Read back the report PDF; extract_text_from_pdf also prints what it finds.
pdf_path = "a19.pdf"
pdf_text = extract_text_from_pdf(pdf_path=pdf_path)



===== Extracted PDF Text =====

Analyze this URL: http://localhost:3000/ftp/encrypt.pyc. Provide a detailed analysis of the content,
highlighting its purpose, key messages, and any notable insights. Present the response as a
detailed report with sections and subsections. And make sure to underline and bold heading of
section and sub section in response. This will make it easier for the editors to recognize it in their
own words. (Some pages might also include more detailed report that shows any other sections of
the site.)
Here are the keys to a successful decrypt:
The following excerpt from the CryptoLab (a free site that deals with encryption) outlines how to get
a decryption session and a key (and optionally a text message to decrypt it), with an optional
background file.
After signing your key to decrypt your data by using the encryption method, it is important to
remember that you don't know your key's integrity. Remember, your data is encrypted with plaintext
messages that do no