In [1]:
import os
from dotenv import load_dotenv
from groq import Groq


# Load environment variables from .env file
load_dotenv()

def process_with_groq_clip(state):
    # Get API key from environment
    api_key = os.getenv("GROQ_API_KEY")
    
    # Debug: Check if API key is loaded
    if not api_key:
        print("❌ GROQ_API_KEY not found in environment variables")
        return {**state, "llm_response": "API key not configured"}
    
    print(f"✅ API key loaded: {api_key[:6]}...")

In [5]:
from langgraph.graph import StateGraph, START, END
from typing import TypedDict, Dict, Any, List
import os
import markdown
import re
from PIL import Image
import torch
import clip
import numpy as np
from groq import Groq
from pathlib import Path

# Optional: Load from .env file
try:
    from dotenv import load_dotenv
    load_dotenv()
except ImportError:
    print("dotenv not installed. Using os.environ directly.")

class GraphState(TypedDict):
    folder_path: str
    documents: List[Dict[str, Any]]
    image_paths: List[str]
    image_embeddings: List[List[float]]
    text_embeddings: List[List[float]]
    processed_content: str
    llm_response: str

# Your existing helper functions remain the same...
def extract_image_paths(md_text: str, folder_path: str) -> List[str]:
    img_pattern = r'!\[.*?\]\((.*?)\)'
    relative_paths = re.findall(img_pattern, md_text)
    absolute_paths = []
    
    for img_path in relative_paths:
        if not os.path.isabs(img_path):
            abs_path = os.path.join(folder_path, img_path)
            if os.path.exists(abs_path):
                absolute_paths.append(abs_path)
        else:
            if os.path.exists(img_path):
                absolute_paths.append(img_path)
    
    return absolute_paths

def get_clip_embeddings(image_paths: List[str]) -> List[List[float]]:
    if not image_paths:
        return []
    
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model, preprocess = clip.load("ViT-B/32", device=device)
    model.eval()
    embeddings = []
    
    print(f"🎨 Generating CLIP embeddings for {len(image_paths)} images")
    
    for img_path in image_paths:
        try:
            image = preprocess(Image.open(img_path)).unsqueeze(0).to(device)
            with torch.no_grad():
                embedding = model.encode_image(image)
                embedding = embedding.cpu().numpy().flatten().tolist()
            embeddings.append(embedding)
            print(f"✅ Processed: {os.path.basename(img_path)}")
        except Exception as e:
            print(f"❌ Error processing image {img_path}: {str(e)}")
            embeddings.append([0.0] * 512)
    
    return embeddings

def get_text_clip_embeddings(texts: List[str]) -> List[List[float]]:
    if not texts:
        return []
    
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model, _ = clip.load("ViT-B/32", device=device)
    model.eval()
    embeddings = []
    
    print(f"📝 Generating CLIP text embeddings for {len(texts)} texts")
    
    for text in texts:
        try:
            text_tokens = clip.tokenize([text[:77]]).to(device)
            with torch.no_grad():
                embedding = model.encode_text(text_tokens)
                embedding = embedding.cpu().numpy().flatten().tolist()
            embeddings.append(embedding)
        except Exception as e:
            print(f"❌ Error processing text: {str(e)}")
            embeddings.append([0.0] * 512)
    
    return embeddings

# Your existing load_documents and generate_clip_embeddings functions...
def load_documents(state: GraphState) -> GraphState:
    folder_path = state["folder_path"]
    
    if not os.path.isdir(folder_path):
        raise ValueError(f"Invalid folder path: {folder_path}")
    
    documents = []
    all_image_paths = []
    
    print(f"📁 Loading documents from: {folder_path}")
    
    for file_path in Path(folder_path).glob("*.md"):
        with open(file_path, 'r', encoding='utf-8') as f:
            md_content = f.read()
            
        html_content = markdown.markdown(md_content)
        image_paths = extract_image_paths(md_content, folder_path)
        all_image_paths.extend(image_paths)
        
        documents.append({
            "filename": file_path.name,
            "markdown": md_content,
            "html": html_content,
            "images": image_paths,
            "word_count": len(md_content.split())
        })
    
    image_extensions = {'.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp'}
    for file_path in Path(folder_path).rglob("*"):
        if file_path.suffix.lower() in image_extensions:
            abs_path = str(file_path.absolute())
            if abs_path not in all_image_paths:
                all_image_paths.append(abs_path)
    
    print(f"✅ Loaded {len(documents)} markdown files and {len(all_image_paths)} images")
    
    return {
        **state,
        "documents": documents,
        "image_paths": all_image_paths
    }

def generate_clip_embeddings(state: GraphState) -> GraphState:
    documents = state["documents"]
    image_paths = state["image_paths"]
    
    image_embeddings = get_clip_embeddings(image_paths)
    texts = [doc["markdown"] for doc in documents]
    text_embeddings = get_text_clip_embeddings(texts)
    
    print(f"🔗 Generated {len(image_embeddings)} image embeddings and {len(text_embeddings)} text embeddings")
    
    return {
        **state,
        "image_embeddings": image_embeddings,
        "text_embeddings": text_embeddings
    }

# FIXED: Groq processing with proper API key handling
def process_with_groq_fixed(state: GraphState) -> GraphState:
    """Process with Groq - FIXED version with proper response handling"""
    
    # Get API key
    api_key = os.getenv("GROQ_API_KEY")
    if not api_key:
        api_key = "gsk_your_actual_groq_api_key_here"  # Replace with your key
    
    if not api_key or api_key == "gsk_your_actual_groq_api_key_here":
        return {
            **state,
            "llm_response": "❌ GROQ API KEY NOT CONFIGURED"
        }
    
    print(f"✅ Using Groq API key: {api_key[:6]}...{api_key[-4:]}")
    
    try:
        client = Groq(api_key=api_key)
        print("✅ Groq client initialized successfully")
    except Exception as e:
        return {
            **state,
            "llm_response": f"❌ Failed to initialize Groq client: {str(e)}"
        }
    
    # Process your data
    documents = state["documents"]
    image_paths = state["image_paths"]
    image_embeddings = state["image_embeddings"]
    text_embeddings = state["text_embeddings"]
    
    combined_text = "\n\n--- DOCUMENT SEPARATOR ---\n\n".join([
        f"**File: {doc['filename']}**\n\n{doc['markdown']}" 
        for doc in documents
    ])
    
    # Calculate multimodal insights
    similarity_analysis = ""
    if text_embeddings and image_embeddings:
        text_avg = np.mean(text_embeddings, axis=0)
        img_avg = np.mean(image_embeddings, axis=0)
        similarity = np.dot(text_avg, img_avg) / (
            np.linalg.norm(text_avg) * np.linalg.norm(img_avg)
        )
        similarity_analysis = f"Text-Image semantic similarity: {similarity:.3f}"
    
    analysis_prompt = f"""
    You are an intelligent document analyzer with CLIP-based multimodal understanding.
    
    I have provided you with:
    - {len(documents)} markdown documents with CLIP text embeddings
    - {len(image_paths)} images with CLIP visual embeddings
    - {similarity_analysis}
    
    Please analyze this multimodal content and provide:
    1. A comprehensive summary incorporating both text and visual elements
    2. Key insights from the semantic alignment between text and images  
    3. Visual themes and patterns identified through CLIP analysis
    4. Specific questions for spaced repetition learning that combine text and visual understanding
    5. How the visual elements enhance the textual content
    
    Text Content:
    {combined_text[:3000]}
    """
    
    try:
        print("🔄 Calling Groq API...")
        
        # Make the API call
        response = client.chat.completions.create(
            messages=[{
                "role": "user",
                "content": analysis_prompt
            }],
            model="llama-3.3-70b-versatile",
            max_tokens=1500,
            temperature=0.7
        )
        
        # DEBUG: Print response structure to understand what we're getting
        print(f"🔍 Response type: {type(response)}")
        print(f"🔍 Response object: {response}")
        
        # FIXED: Handle different response structures
        try:
            # Method 1: Standard response structure
            if hasattr(response, 'choices') and response.choices:
                if isinstance(response.choices, list):
                    # If choices is a list, get the first element
                    first_choice = response.choices[0]
                    if hasattr(first_choice, 'message'):
                        llm_response = first_choice.message.content
                    elif hasattr(first_choice, 'text'):
                        llm_response = first_choice.text
                    else:
                        llm_response = str(first_choice)
                else:
                    # If choices is not a list, try to access message directly
                    llm_response = response.choices.message.content
            
            # Method 2: Direct response content
            elif hasattr(response, 'content'):
                llm_response = response.content
            
            # Method 3: Response is a string
            elif isinstance(response, str):
                llm_response = response
            
            # Method 4: Response is a dict
            elif isinstance(response, dict):
                if 'choices' in response and response['choices']:
                    llm_response = response['choices'][0]['message']['content']
                elif 'content' in response:
                    llm_response = response['content']
                else:
                    llm_response = str(response)
            
            # Fallback: Convert to string
            else:
                llm_response = f"Unexpected response format: {str(response)}"
                
        except Exception as parse_error:
            llm_response = f"""
            ❌ Error parsing Groq response: {str(parse_error)}
            
            Raw response type: {type(response)}
            Raw response: {str(response)[:500]}...
            
            This suggests the API response format has changed.
            """
        
        # Add embedding statistics
        embedding_stats = f"""
        
        --- CLIP EMBEDDING ANALYSIS ---
        • Image embeddings: {len(image_embeddings)} vectors (512D each)
        • Text embeddings: {len(text_embeddings)} vectors (512D each)  
        • Ready for multimodal question generation and spaced repetition!
        """
        
        llm_response += embedding_stats
        
        print("🤖 Groq processing with CLIP embeddings completed successfully!")
        
    except Exception as e:
        llm_response = f"""
        ❌ Groq API Error: {str(e)}
        
        Your CLIP embeddings were generated successfully:
        - {len(image_embeddings)} image embeddings
        - {len(text_embeddings)} text embeddings
        
        This multimodal data is ready for:
        • Question generation based on semantic similarity
        • Spaced repetition with visual-text alignment
        • Cross-modal retrieval for enhanced learning
        
        You can proceed with building your question-answering system using these embeddings!
        """
        print(f"❌ Groq API error: {str(e)}")
    
    return {
        **state,
        "processed_content": combined_text,
        "llm_response": llm_response
    }

# Create the fixed workflow
def create_fixed_groq_workflow():
    workflow = StateGraph(GraphState)
    
    workflow.add_node("load_documents", load_documents)
    workflow.add_node("generate_clip_embeddings", generate_clip_embeddings)
    workflow.add_node("process_groq_fixed", process_with_groq_fixed)  # Fixed version
    
    workflow.set_entry_point("load_documents")
    workflow.add_edge("load_documents", "generate_clip_embeddings")
    workflow.add_edge("generate_clip_embeddings", "process_groq_fixed")
    workflow.add_edge("process_groq_fixed", END)
    
    return workflow.compile()

# Main execution
if __name__ == "__main__":
    print("🔧 FIXED: CLIP-Enhanced Document Processor with Groq")
    print("-" * 50)
    
    # Get your API key from https://console.groq.com/keys
    print("📋 Instructions:")
    print("1. Get your API key from: https://console.groq.com/keys")
    print("2. Replace 'gsk_your_actual_groq_api_key_here' in the code")
    print("3. Or set environment variable: export GROQ_API_KEY=your_key")
    
    folder_path = input("\nEnter folder path with markdown files and images: ").strip()
    if not folder_path:
        folder_path = "."
    
    # Use the fixed workflow
    workflow = create_fixed_groq_workflow()
    
    initial_state = GraphState(
        folder_path=folder_path,
        documents=[],
        image_paths=[],
        image_embeddings=[],
        text_embeddings=[],
        processed_content="",
        llm_response=""
    )
    
    result = workflow.invoke(initial_state)
    print("\n" + "="*60)
    print("🎉 PROCESSING COMPLETE!")
    print("="*60)
    print(result["llm_response"])

🔧 FIXED: CLIP-Enhanced Document Processor with Groq
--------------------------------------------------
📋 Instructions:
1. Get your API key from: https://console.groq.com/keys
2. Replace 'gsk_your_actual_groq_api_key_here' in the code
3. Or set environment variable: export GROQ_API_KEY=your_key
📁 Loading documents from: E:\7. Projects From Sem 3\RAG\data\rag_1_Intro_20250906_032403
✅ Loaded 1 markdown files and 4 images
🎨 Generating CLIP embeddings for 4 images
✅ Processed: Pasted image 20250714150825.png
✅ Processed: Pasted image 20250714163303.png
✅ Processed: Pasted image 20250714150825.png
✅ Processed: Pasted image 20250714163303.png
📝 Generating CLIP text embeddings for 1 texts
🔗 Generated 4 image embeddings and 1 text embeddings
✅ Using Groq API key: gsk_ST...M0ri
✅ Groq client initialized successfully
🔄 Calling Groq API...
🔍 Response type: <class 'groq.types.chat.chat_completion.ChatCompletion'>
🔍 Response object: ChatCompletion(id='chatcmpl-67f8e1ac-f697-4bc1-853a-1f256dce37cc',