In [1]:
import PyPDF2
import re
from pathlib import Path

# Read the PDF file
pdf_path = '/workspace/uploads/DashGenius - Personalized Dashboard.pdf'

try:
    with open(pdf_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        
        # Extract text from all pages
        full_text = ""
        for page_num in range(len(pdf_reader.pages)):
            page = pdf_reader.pages[page_num]
            text = page.extract_text()
            full_text += f"\n--- Page {page_num + 1} ---\n{text}\n"
        
        print(f"Successfully extracted text from {len(pdf_reader.pages)} pages")
        print("\n" + "="*50)
        print("EXTRACTED PDF CONTENT:")
        print("="*50)
        print(full_text[:2000])  # Show first 2000 characters
        
        if len(full_text) > 2000:
            print(f"\n... (showing first 2000 characters of {len(full_text)} total characters)")
        
        # Analyze the content for design elements
        print("\n" + "="*50)
        print("DESIGN ANALYSIS:")
        print("="*50)
        
        # Look for design-related keywords
        design_keywords = {
            'colors': ['color', 'blue', 'green', 'red', 'purple', 'orange', 'yellow', 'black', 'white', 'gray', 'gradient'],
            'layout': ['layout', 'grid', 'column', 'row', 'header', 'footer', 'sidebar', 'navigation', 'menu'],
            'typography': ['font', 'text', 'title', 'heading', 'bold', 'italic', 'size'],
            'components': ['button', 'card', 'chart', 'graph', 'table', 'form', 'input', 'dropdown', 'modal'],
            'ui_patterns': ['dashboard', 'widget', 'panel', 'tab', 'accordion', 'carousel', 'tooltip']
        }
        
        found_elements = {}
        text_lower = full_text.lower()
        
        for category, keywords in design_keywords.items():
            found = []
            for keyword in keywords:
                if keyword in text_lower:
                    # Find context around the keyword
                    pattern = rf'.{{0,50}}{re.escape(keyword)}.{{0,50}}'
                    matches = re.findall(pattern, text_lower, re.IGNORECASE)
                    if matches:
                        found.extend(matches[:3])  # Limit to 3 matches per keyword
            found_elements[category] = found
        
        for category, elements in found_elements.items():
            if elements:
                print(f"\n{category.upper()}:")
                for element in elements[:5]:  # Show max 5 elements per category
                    print(f"  - {element.strip()}")
        
        # Store the full text for analysis
        extracted_content = full_text
        
except Exception as e:
    print(f"Error reading PDF: {e}")
    # Try alternative approach with pdfplumber
    try:
        import pdfplumber
        
        with pdfplumber.open(pdf_path) as pdf:
            full_text = ""
            for page_num, page in enumerate(pdf.pages):
                text = page.extract_text()
                if text:
                    full_text += f"\n--- Page {page_num + 1} ---\n{text}\n"
            
            print(f"Successfully extracted text using pdfplumber from {len(pdf.pages)} pages")
            print(full_text[:1500])
            extracted_content = full_text
            
    except ImportError:
        print("pdfplumber not available, installing...")
        import subprocess
        subprocess.run(['pip', 'install', 'pdfplumber'], check=True)
        
        import pdfplumber
        with pdfplumber.open(pdf_path) as pdf:
            full_text = ""
            for page_num, page in enumerate(pdf.pages):
                text = page.extract_text()
                if text:
                    full_text += f"\n--- Page {page_num + 1} ---\n{text}\n"
            
            print(f"Successfully extracted text using pdfplumber from {len(pdf.pages)} pages")
            print(full_text[:1500])
            extracted_content = full_text

Successfully extracted text from 1 pages

EXTRACTED PDF CONTENT:

--- Page 1 ---



DESIGN ANALYSIS:
