In [None]:
# Project configuration: edit these values before running the other cells.

# Google Cloud project and region (passed to vertexai.init and BigQuery below).
PROJECT_ID = "your-project-id"  # 🚨 REPLACE WITH YOUR PROJECT ID
LOCATION = "us-central1"

# BigQuery dataset/table that hold the manual chunks and their embeddings.
DATASET_ID = "husqvarna_rag_dataset"
TABLE_ID = "manual_chunks"

# Bucket name (derived from the project ID) and the manual's PDF file name.
# Neither is referenced by the other visible cells.
BUCKET_NAME = f"{PROJECT_ID}-husqvarna-manual"
PDF_FILE_NAME = "701_Enduro_2024_US_en_OM_051156-054460_00sq_44pb.pdf"

# Echo the active settings, one per line.
print(
    f"🔧 Project: {PROJECT_ID}",
    f"📍 Location: {LOCATION}",
    f"🗄️ Dataset: {DATASET_ID}",
    f"📊 Table: {TABLE_ID}",
    sep="\n",
)


In [None]:
import os
import json
import asyncio
from datetime import datetime

# Google Cloud imports
from google.cloud import bigquery
from google.cloud import storage
from google.cloud import aiplatform
from vertexai.generative_models import GenerativeModel
import vertexai

# Initialize Vertex AI
# Uses PROJECT_ID and LOCATION from the configuration cell, so that cell must
# run first; the model constructors in later cells rely on this call.
vertexai.init(project=PROJECT_ID, location=LOCATION)

print("✅ Dependencies imported successfully!")


In [None]:
def create_bigquery_dataset():
    """Create the BigQuery dataset for the manual chunks if it is missing.

    The dataset ``PROJECT_ID.DATASET_ID`` is created in ``LOCATION``. Creation
    is best-effort: a failure is reported on stdout and the client is still
    returned so the remaining cells can run.

    Returns:
        The ``bigquery.Client`` bound to ``PROJECT_ID``.
    """
    bigquery_client = bigquery.Client(project=PROJECT_ID)

    try:
        dataset = bigquery.Dataset(f"{PROJECT_ID}.{DATASET_ID}")
        dataset.location = LOCATION
        # exists_ok=True turns "already exists" into a no-op, so anything that
        # reaches the except branch is a genuine failure (permissions, bad
        # project ID, ...) rather than an ambiguous "exists or error".
        bigquery_client.create_dataset(dataset, exists_ok=True)
    except Exception as e:
        print(f"❌ Could not create dataset {DATASET_ID}: {e}")
    else:
        print(f"📊 Dataset ready: {PROJECT_ID}.{DATASET_ID}")

    return bigquery_client

def create_manual_chunks_table(client):
    """Create the table that stores manual chunks and their embeddings.

    The table ``PROJECT_ID.DATASET_ID.TABLE_ID`` is created if it does not
    exist yet. Creation is best-effort: a failure is reported on stdout and
    not raised.

    Args:
        client: The ``bigquery.Client`` used to issue the request.
    """
    schema = [
        bigquery.SchemaField("id", "STRING", mode="REQUIRED"),
        bigquery.SchemaField("section", "STRING", mode="NULLABLE"),
        bigquery.SchemaField("subsection", "STRING", mode="NULLABLE"),
        bigquery.SchemaField("content", "STRING", mode="REQUIRED"),
        bigquery.SchemaField("page_number", "INTEGER", mode="NULLABLE"),
        bigquery.SchemaField("chunk_type", "STRING", mode="NULLABLE"),
        # An array column is expressed as the element type plus
        # mode="REPEATED"; "REPEATED FLOAT" is not a valid field type.
        bigquery.SchemaField("embedding", "FLOAT64", mode="REPEATED"),
    ]

    try:
        table = bigquery.Table(
            f"{PROJECT_ID}.{DATASET_ID}.{TABLE_ID}", schema=schema
        )
        # exists_ok=True makes re-running the cell a no-op, so the except
        # branch only reports real failures. An existing table keeps whatever
        # schema it already has.
        client.create_table(table, exists_ok=True)
    except Exception as e:
        print(f"❌ Could not create table {TABLE_ID}: {e}")
    else:
        print(f"📋 Table ready: {PROJECT_ID}.{DATASET_ID}.{TABLE_ID}")

# Setup BigQuery
# `bigquery_client` is a module-level name: generate_embeddings_and_store()
# and query_manual() read it directly instead of taking it as a parameter.
bigquery_client = create_bigquery_dataset()
create_manual_chunks_table(bigquery_client)


In [None]:
def get_husqvarna_manual_chunks():
    """Return the pre-extracted sample chunks of the Husqvarna 701 Enduro manual.

    Every chunk is a dict with the keys ``id``, ``section``, ``subsection``,
    ``content``, ``page_number`` and ``chunk_type`` (in that order). A new
    list of new dicts is built on each call.
    """
    field_names = (
        "id",
        "section",
        "subsection",
        "content",
        "page_number",
        "chunk_type",
    )

    # One tuple per chunk, values listed in `field_names` order.
    records = (
        (
            "safety_001",
            "2 SAFETY ADVICE",
            "2.1 Use definition – intended use",
            "The vehicle is designed and constructed to withstand the usual demands of regular traffic and use on gentle terrain (unpaved roads). This vehicle is not suitable for use on race tracks. This vehicle is only authorized for operation on public roads in its homologated version.",
            6,
            "safety",
        ),
        (
            "safety_002",
            "2 SAFETY ADVICE",
            "2.12 Safe operation",
            "Danger of accidents: A rider who is not fit to ride poses a danger to him or herself and others. Do not operate the vehicle if you are not fit to ride due to alcohol, drugs or medication. Do not operate the vehicle if you are physically or mentally impaired. Danger of poisoning: Exhaust gases are toxic and inhaling them may result in unconsciousness and death. Always make sure there is sufficient ventilation when running the engine.",
            11,
            "warning",
        ),
        (
            "start_001",
            "9 RIDING INSTRUCTIONS",
            "9.2 Starting the vehicle",
            "Turn the emergency OFF switch to the position. Switch on the ignition by turning the ignition key to the ON position. To avoid malfunctions in the control unit communication, do not switch the ignition off and on in rapid succession. Shift the transmission to neutral position. Green idle indicator lamp N lights up. Press start button. Do not press the start button until the combination instrument function check has finished. Do not open the throttle to start.",
            35,
            "procedure",
        ),
        (
            "engine_oil_001",
            "18 SERVICE WORK ON THE ENGINE",
            "18.2 Checking the engine oil level",
            "Condition: The engine is at operating temperature. Stand the motorcycle upright on a horizontal surface. Check the engine oil level. After switching off the engine, wait one minute before checking the level. The engine oil must be between marking A and marking B of the oil level viewer. If the engine oil level is below the B mark: Add engine oil. If the engine oil level is above the A mark: Correct the engine oil level.",
            109,
            "procedure",
        ),
        (
            "tire_specs_001",
            "22 TECHNICAL SPECIFICATIONS",
            "22.4 Chassis",
            "Tire pressure, road, solo: front 1.8 bar (26 psi), rear 1.8 bar (26 psi). Tire pressure with passenger/full payload: front 2.2 bar (32 psi), rear 2.2 bar (32 psi). Tire pressure, offroad, solo: front 1.5 bar (22 psi), rear 1.5 bar (22 psi). Maximum permissible overall weight: 350 kg (772 lb.). Maximum permissible front axle load: 150 kg (331 lb.). Maximum permissible rear axle load: 200 kg (441 lb.).",
            125,
            "specification",
        ),
    )

    return [dict(zip(field_names, record)) for record in records]

# Build the chunk list once; `chunks` is reused by later cells.
chunks = get_husqvarna_manual_chunks()
print(f"📚 Loaded {len(chunks)} manual chunks")

# Preview the first chunk (content is cut to its first 100 characters).
print("\n📖 Sample chunk:")
sample = chunks[0]
print(
    f"ID: {sample['id']}",
    f"Section: {sample['section']}",
    f"Content: {sample['content'][:100]}...",
    f"Type: {sample['chunk_type']}",
    sep="\n",
)


In [None]:
def setup_rag_models():
    """Load the generation and embedding models used by the RAG pipeline.

    Returns:
        A ``(generation_model, embedding_model)`` tuple: a ``GenerativeModel``
        used to write answers and a ``TextEmbeddingModel`` whose
        ``get_embeddings`` method turns text into vectors.
    """
    # Imported locally so this block does not depend on the import cell.
    from vertexai.language_models import TextEmbeddingModel

    generation_model = GenerativeModel("gemini-1.5-pro")
    # Embedding models are loaded through TextEmbeddingModel; GenerativeModel
    # has no embedding API, so wrapping the embedding model in it cannot work.
    embedding_model = TextEmbeddingModel.from_pretrained("textembedding-gecko@003")

    return generation_model, embedding_model


async def generate_embeddings_and_store(chunks):
    """Embed every manual chunk and stream the rows into BigQuery.

    Chunks whose embedding request fails are reported and skipped; the
    remaining rows are inserted into ``PROJECT_ID.DATASET_ID.TABLE_ID`` using
    the module-level ``bigquery_client``.

    Note: the function is ``async`` for the notebook's ``await`` call style,
    but every call inside it is synchronous.

    Args:
        chunks: Iterable-with-length of dicts carrying the keys ``id``,
            ``section``, ``subsection``, ``content``, ``page_number`` and
            ``chunk_type``.

    Returns:
        The ``(generation_model, embedding_model)`` tuple from
        ``setup_rag_models()`` so callers can reuse the models for querying.
    """
    generation_model, embedding_model = setup_rag_models()

    print("🧠 Generating embeddings...")

    copied_fields = (
        "id",
        "section",
        "subsection",
        "content",
        "page_number",
        "chunk_type",
    )
    rows_to_insert = []
    total = len(chunks)

    for position, chunk in enumerate(chunks, start=1):
        print(f"  Processing chunk {position}/{total}: {chunk['id']}")

        try:
            # get_embeddings takes a list of texts and returns one result per
            # text; `.values` is the embedding vector.
            embedding_result = embedding_model.get_embeddings([chunk["content"]])[0]
            embedding = list(embedding_result.values)
        except Exception as e:
            print(f"  ❌ Error generating embedding for {chunk['id']}: {e}")
            continue

        row = {field: chunk[field] for field in copied_fields}
        row["embedding"] = embedding
        rows_to_insert.append(row)

    if rows_to_insert:
        # insert_rows_json accepts a fully-qualified table ID, which avoids
        # the deprecated Client.dataset() helper and an extra get_table call.
        table_id = f"{PROJECT_ID}.{DATASET_ID}.{TABLE_ID}"
        errors = bigquery_client.insert_rows_json(table_id, rows_to_insert)

        if errors:
            print(f"❌ Errors inserting rows: {errors}")
        else:
            print(f"✅ Successfully inserted {len(rows_to_insert)} chunks into BigQuery")
    else:
        print("⚠️ No chunks were embedded - nothing inserted into BigQuery")

    return generation_model, embedding_model

# UNCOMMENT TO RUN (requires Vertex AI access)
# NOTE: the call below uses top-level `await`, which works in a notebook /
# IPython session; in a plain script wrap it in asyncio.run(...) instead.
# generation_model, embedding_model = await generate_embeddings_and_store(chunks)

# Shown while the call above stays commented out.
print("⚠️ Embedding generation commented out - uncomment to run")
print("💡 This step requires Vertex AI access and will incur costs")


In [None]:
async def query_manual(question, generation_model, embedding_model, top_k=3):
    """Answer a question about the manual with retrieval-augmented generation.

    The question is embedded, the nearest chunks are fetched from
    ``PROJECT_ID.DATASET_ID.TABLE_ID`` with BigQuery ``VECTOR_SEARCH`` (via the
    module-level ``bigquery_client``), and the retrieved text is passed to the
    generation model as context.

    Note: the function is ``async`` for the notebook's ``await`` call style,
    but every call inside it is synchronous.

    Args:
        question: The user's natural-language question.
        generation_model: Object exposing ``generate_content(prompt)`` whose
            result has a ``text`` attribute.
        embedding_model: Object exposing ``get_embeddings([text])`` that
            returns one item per text with a ``values`` vector (the Vertex AI
            ``TextEmbeddingModel`` interface).
        top_k: Number of nearest chunks to retrieve; must be a positive
            integer.

    Returns:
        A ``(answer, context_chunks)`` tuple. Failures are not raised: the
        first element then holds an error message, and ``context_chunks`` is
        empty unless retrieval had already succeeded.
    """

    print(f"🔍 Processing question: {question}")

    # top_k is written into the SQL text, so coerce it to a plain int first.
    try:
        neighbor_count = int(top_k)
    except (TypeError, ValueError):
        neighbor_count = 0
    if neighbor_count < 1:
        return f"Error: top_k must be a positive integer, got {top_k!r}", []

    # Generate embedding for the question
    try:
        question_embedding = list(
            embedding_model.get_embeddings([question])[0].values
        )
    except Exception as e:
        return f"Error generating question embedding: {e}", []

    try:
        # VECTOR_SEARCH is a table-valued function: it goes in FROM and
        # yields `base` (the matched row) and `distance` columns. The query
        # vector is bound as a parameter instead of being spliced into SQL.
        similarity_query = f"""
        SELECT
            base.id AS id,
            base.section AS section,
            base.subsection AS subsection,
            base.content AS content,
            base.page_number AS page_number,
            base.chunk_type AS chunk_type,
            distance
        FROM VECTOR_SEARCH(
            TABLE `{PROJECT_ID}.{DATASET_ID}.{TABLE_ID}`,
            'embedding',
            (SELECT @question_embedding AS embedding),
            query_column_to_search => 'embedding',
            top_k => {neighbor_count},
            distance_type => 'COSINE'
        )
        ORDER BY distance
        """
        job_config = bigquery.QueryJobConfig(
            query_parameters=[
                bigquery.ArrayQueryParameter(
                    "question_embedding", "FLOAT64", question_embedding
                ),
            ]
        )

        # Execute the query
        query_job = bigquery_client.query(similarity_query, job_config=job_config)
        results = query_job.result()

        # Prepare context from retrieved chunks (closest match first)
        context_chunks = []
        for row in results:
            context_chunk = f"""
            Section: {row.section}
            Subsection: {row.subsection}
            Content: {row.content}
            Page: {row.page_number}
            """
            context_chunks.append(context_chunk)

        context = "\n\n".join(context_chunks)

    except Exception as e:
        return f"Error querying BigQuery: {e}", []

    # Create the prompt for Gemini
    prompt = f"""
    You are a helpful assistant for Husqvarna 701 Enduro motorcycle owners. 
    Use the following information from the owner's manual to answer the user's question.
    
    Context from Manual:
    {context}
    
    Question: {question}
    
    Instructions:
    - Provide a clear, accurate answer based on the manual content
    - Include specific details like measurements, procedures, or warnings when relevant
    - If the information involves safety warnings, emphasize them
    - Reference the manual section and page number when helpful
    - If you cannot find the answer in the provided context, say so
    
    Answer:
    """

    # Generate response
    try:
        response = generation_model.generate_content(prompt)
        return response.text, context_chunks
    except Exception as e:
        return f"Error generating response: {e}", context_chunks

# Cell banner: defining query_manual() runs no query by itself; callers pass
# in the generation and embedding models.
print("✅ RAG query function defined")
print("💡 This function requires the models to be initialized")


In [None]:
async def interactive_query():
    """Run a console question/answer loop over the manual.

    Loads the models with ``setup_rag_models()``, prints a few example
    questions, then keeps reading questions from stdin and answering them
    through ``query_manual()``. The loop ends when the user enters ``quit``,
    ``exit`` or ``q``, or when input is closed or interrupted.
    """

    # Load the models once for the whole session.
    generation_model, embedding_model = setup_rag_models()

    print("\n🏍️  Husqvarna 701 Enduro Manual Assistant")
    print("Ask me anything about your motorcycle!")
    print("Type 'quit' to exit\n")

    # Example questions
    print("💡 Example questions:")
    examples = [
        "How do I check the engine oil level?",
        "What tire pressure should I use for off-road riding?",
        "How do I start the motorcycle?",
        "What should I do if the engine overheats?"
    ]

    for example in examples:
        print(f"   • {example}")
    print()

    while True:
        try:
            question = input("Your question: ").strip()
        except (EOFError, KeyboardInterrupt):
            # stdin was closed or the prompt was interrupted: leave the loop
            # cleanly, as if 'quit' had been typed, instead of a traceback.
            print()
            print("Thanks for using the Husqvarna Manual Assistant!")
            break

        if question.lower() in ['quit', 'exit', 'q']:
            print("Thanks for using the Husqvarna Manual Assistant!")
            break

        if not question:
            continue

        # query_manual() reports its own failures through the returned answer
        # text; this guard only covers anything unexpected around the call.
        try:
            print("\nSearching manual...")
            answer, context = await query_manual(question, generation_model, embedding_model)

            print("\n📖 Answer:")
            print(answer)

            print(f"\n📚 Sources: Found {len(context)} relevant sections")

        except Exception as e:
            print(f"❌ Error: {e}")

        print("\n" + "-" * 50)

# Example single query test
async def test_single_query():
    """Smoke-test stub for a single hard-coded question.

    As written it only loads the models and prints the question; the actual
    ``query_manual()`` call is commented out below, so no query is executed
    and nothing is returned.
    """
    # Loaded for the commented-out call below; unused while it stays disabled.
    generation_model, embedding_model = setup_rag_models()
    
    question = "How do I check the engine oil level?"
    print(f"🔍 Testing question: {question}")
    
    # UNCOMMENT TO RUN
    # answer, context = await query_manual(question, generation_model, embedding_model)
    # print(f"\n📖 Answer: {answer}")
    
    print("⚠️ Query execution commented out - requires embeddings in BigQuery")

# Cell banner: the functions above are only defined here, not called.
print("💬 Interactive functions defined")
print("💡 Uncomment function calls to enable querying")

# UNCOMMENT TO TEST
# NOTE: these calls use top-level `await`, which works in a notebook / IPython
# session; in a plain script use asyncio.run(...) instead.
# await test_single_query()
# await interactive_query()
