In [None]:
from pinecone import Pinecone,ServerlessSpec

# Initialize Pinecone
pc = Pinecone(api_key="",  # Replace with your Pinecone API key
)

# # Create or connect to an index

# if index_name not in pinecone.list_indexes():
#     pinecone.create_index(index_name, dimension=1536)  # Use 1536 for OpenAI embeddings
# index = pinecone.Index(index_name)

index_name = "magic-forest"
pc.create_index(
    name=index_name,
    dimension=1536, # Replace with your model dimensions
    spec=ServerlessSpec(
        cloud="aws",
        region="us-east-1"
    )
)
index = pc.Index(index_name)

In [6]:
import google.auth
from google.cloud import storage
import pandas as pd
import io

# Attempt to get the default credentials and project ID
credentials, project_id = google.auth.default()
print("Project ID:", project_id)



Project ID: versatile-gist-445618-d0


In [7]:
client = storage.Client(credentials=credentials, project=project_id)


In [15]:
# Replace with your bucket name and file path
bucket_name = 'magic_forest_plain'
file_path = '01-pilot.md'

# Access the bucket and blob
bucket = client.get_bucket(bucket_name)
blob = bucket.blob(file_path)



In [18]:
import re
from google.cloud import storage
import google.auth

# -------------------------------
# Step 1: Authenticate and Initialize the GCS Client
# -------------------------------
credentials, project_id = google.auth.default()
client = storage.Client(credentials=credentials, project=project_id)

# Replace with your actual bucket name.
bucket_name = "magic_forest_plain"
bucket = client.get_bucket(bucket_name)

# -------------------------------
# Step 2: List the Chapter Files
# -------------------------------
# If your 13 files are stored under a common prefix, e.g., "chapters/", specify that prefix.
blobs = bucket.list_blobs()

# Filter for Markdown files (ending in .md) and sort them.
chapter_files = sorted([blob.name for blob in blobs if blob.name.endswith('.md')])
print("Found chapter files:")
for f in chapter_files:
    print("  ", f)

# -------------------------------
# Step 3: Define a Function to Split Text into Chunks
# -------------------------------
def split_text(text, max_chunk_length=500):
    """
    Splits text into smaller chunks with each chunk not exceeding max_chunk_length characters.
    This implementation uses sentence boundaries.
    """
    # Split text by sentence endings.
    sentences = re.split(r'(?<=[.!?]) +', text)
    chunks = []
    current_chunk = ""
    for sentence in sentences:
        # If adding this sentence exceeds our max length, save the current chunk and start a new one.
        if len(current_chunk) + len(sentence) > max_chunk_length:
            if current_chunk:
                chunks.append(current_chunk.strip())
            current_chunk = sentence
        else:
            current_chunk += " " + sentence
    if current_chunk:
        chunks.append(current_chunk.strip())
    return chunks

# -------------------------------
# Step 4: Process Each Chapter File and Prepare Data for Indexing
# -------------------------------
data_for_indexing = []  # This list will hold tuples of (vector_id, text_chunk, metadata)

for file_index, file_name in enumerate(chapter_files, start=1):
    # Download the chapter file.
    blob = bucket.blob(file_name)
    chapter_text = blob.download_as_string().decode('utf-8')

    # Optionally, extract a chapter title.
    # For example, assume the first non-empty line is the chapter title.
    lines = chapter_text.splitlines()
    chapter_title = None
    for line in lines:
        if line.strip():
            chapter_title = line.strip()
            break
    if not chapter_title:
        chapter_title = f"Chapter {file_index}"

    # Split the chapter text into smaller chunks.
    chunks = split_text(chapter_text, max_chunk_length=500)

    # For each chunk, prepare a unique ID and metadata.
    for chunk_index, chunk in enumerate(chunks):
        vector_id = f"chapter{file_index}_chunk{chunk_index}"
        metadata = {
            "chapter_file": file_name,
            "chapter_number": file_index,
            "chapter_title": chapter_title,
            "chunk_index": chunk_index,
            "text": chunk  # include the chunk text as metadata for reference (optional)
        }
        data_for_indexing.append((vector_id, chunk, metadata))

print(f"Prepared data for indexing. Total chunks: {len(data_for_indexing)}")

# -------------------------------
# (Optional) Example: Print the First Prepared Entry
# -------------------------------
if data_for_indexing:
    example_id, example_text, example_metadata = data_for_indexing[0]
    print("\nExample prepared entry:")
    print("ID:", example_id)
    print("Metadata:", example_metadata)
    print("Text snippet:", example_text[:200])


Found chapter files:
   01-pilot.md
   02-enchanted-river-quest.md
   03-quest-for-the-lost-amulet.md
   04-mystery-of-moonlit-lake.md
   05-great-garden-rescue.md
   06-tale-of-hidden-truth.md
   07-adventure-of-the-whispering-woods.md
   08-ocean-of-wonders.md
   09-adventure-of-the-starry-sky.md
   10-autumn-enchantment.md
   11-enchanted-waterfall.md
   12-truthful-treetop.md
   13-winter-wish.md
Prepared data for indexing. Total chunks: 95

Example prepared entry:
ID: chapter1_chunk0
Metadata: {'chapter_file': '01-pilot.md', 'chapter_number': 1, 'chapter_title': '**The Magical Forest Adventure**', 'chunk_index': 0, 'text': '**The Magical Forest Adventure**\n\nOnce upon a time, in a village nestled by a great forest, lived a curious little girl named Lily and her brave younger brother, Tom. Every night, they would dream of exploring the magical forest just beyond their home.\n\nOne sunny day, they decided to embark on an adventure. With a small backpack filled with snacks and a map

In [19]:
print(data_for_indexing[0])

('chapter1_chunk0', '**The Magical Forest Adventure**\n\nOnce upon a time, in a village nestled by a great forest, lived a curious little girl named Lily and her brave younger brother, Tom. Every night, they would dream of exploring the magical forest just beyond their home.\n\nOne sunny day, they decided to embark on an adventure. With a small backpack filled with snacks and a map drawn by their grandfather, they set off into the forest. As they walked, they discovered the forest was full of wonders.', {'chapter_file': '01-pilot.md', 'chapter_number': 1, 'chapter_title': '**The Magical Forest Adventure**', 'chunk_index': 0, 'text': '**The Magical Forest Adventure**\n\nOnce upon a time, in a village nestled by a great forest, lived a curious little girl named Lily and her brave younger brother, Tom. Every night, they would dream of exploring the magical forest just beyond their home.\n\nOne sunny day, they decided to embark on an adventure. With a small backpack filled with snacks and 

In [22]:
import openai

# Set your OpenAI API key
openai.api_key = ""

def get_embedding(text, model="text-embedding-ada-002"):
    response = openai.embeddings.create(input=[text], model=model)
    return response.data[0].embedding

# Generate embeddings for all chunks
vectors = []  # List to hold tuples (vector_id, embedding, metadata)
for vector_id, chunk, metadata in data_for_indexing:
    try:
        embedding = get_embedding(chunk)
        vectors.append((vector_id, embedding, metadata))
        print(f"Generated embedding for {vector_id}")
    except Exception as e:
        print(f"Error generating embedding for {vector_id}: {e}")


Generated embedding for chapter1_chunk0
Generated embedding for chapter1_chunk1
Generated embedding for chapter1_chunk2
Generated embedding for chapter1_chunk3
Generated embedding for chapter1_chunk4
Generated embedding for chapter2_chunk0
Generated embedding for chapter2_chunk1
Generated embedding for chapter2_chunk2
Generated embedding for chapter2_chunk3
Generated embedding for chapter2_chunk4
Generated embedding for chapter2_chunk5
Generated embedding for chapter3_chunk0
Generated embedding for chapter3_chunk1
Generated embedding for chapter3_chunk2
Generated embedding for chapter3_chunk3
Generated embedding for chapter3_chunk4
Generated embedding for chapter4_chunk0
Generated embedding for chapter4_chunk1
Generated embedding for chapter4_chunk2
Generated embedding for chapter4_chunk3
Generated embedding for chapter4_chunk4
Generated embedding for chapter4_chunk5
Generated embedding for chapter4_chunk6
Generated embedding for chapter5_chunk0
Generated embedding for chapter5_chunk1


In [26]:
from pinecone import Pinecone,ServerlessSpec

# Initialize Pinecone
pc = Pinecone(api_key="",  # Replace with your Pinecone API key
)

# # Create or connect to an index

# if index_name not in pinecone.list_indexes():
#     pinecone.create_index(index_name, dimension=1536)  # Use 1536 for OpenAI embeddings
# index = pinecone.Index(index_name)

index_name = "magic-forest"

index = pc.Index(index_name)

# Determine embedding dimension (for text-embedding-ada-002, it's typically 1536)
embedding_dimension = len(vectors[0][1]) if vectors else 1536


# Upsert the vectors in batches (if many vectors, batching helps)
batch_size = 100
for i in range(0, len(vectors), batch_size):
    batch = vectors[i:i+batch_size]
    # Pinecone upsert expects a list of tuples: (id, vector, metadata)
    index.upsert(vectors=batch)
    print(f"Upserted batch {i // batch_size + 1}")


Upserted batch 1


In [29]:
def search_book(query, top_k=3):
    """
    Search the indexed book for the most relevant chunks based on a query.

    :param query: User's search string
    :param top_k: Number of results to return
    :return: List of matching text chunks with metadata
    """
    # Convert the query to an embedding
    query_embedding = get_embedding(query)

    # Search in Pinecone
    query_response = index.query(vector=query_embedding, top_k=top_k, include_metadata=True)

    # Extract results
    results = []
    for match in query_response["matches"]:
        metadata = match["metadata"]
        results.append({
            "id": match["id"],
            "score": match["score"],
            "chapter_number": metadata["chapter_number"],
            "chapter_title": metadata["chapter_title"],
            "text_snippet": metadata["text"][:300],  # Show first 300 characters
        })

    return results


In [31]:
query = "who is Felix"
results = search_book(query, top_k=5)

print("\n🔍 **Search Results:**\n")
for result in results:
    print(f"📖 **Chapter {result['chapter_number']}: {result['chapter_title']}**")
    print(f"🔹 **Relevance Score:** {result['score']:.4f}")
    print(f"📝 **Snippet:** {result['text_snippet']}...\n")




🔍 **Search Results:**

📖 **Chapter 11.0: **Lily, Tom, and the Enchanted Waterfall****
🔹 **Relevance Score:** 0.7191
📝 **Snippet:** "What’s going on, Felix?"

Felix took a deep breath. "The waterfall… it’s stopped flowing! The animals are worried because it brings life to the forest. Without its water, the flowers are wilting, and the animals are losing their sparkle."

Tom held up his never-lost compass, which immediately point...

📖 **Chapter 13.0: **The Winter Wish****
🔹 **Relevance Score:** 0.7158
📝 **Snippet:** He’s known for stealing shiny things to hoard in his frozen lair.”

“We have to get it back!” Lily said firmly.

Felix led them through the snow-covered forest, their boots crunching softly in the frosty stillness. The trees sparkled with icicles, and the air smelled of pine and snow. As they walked...

📖 **Chapter 11.0: **Lily, Tom, and the Enchanted Waterfall****
🔹 **Relevance Score:** 0.7151
📝 **Snippet:** **Lily, Tom, and the Enchanted Waterfall**

One bright, sunny mor

In [32]:
def generate_text_from_retrieval(query, top_k=3, model="gpt-4", max_tokens=200):
    """
    Generates text based on retrieved relevant chunks from Pinecone.

    :param query: User's input query
    :param top_k: Number of retrieved chunks to use as context
    :param model: OpenAI model to use (e.g., "gpt-4", "gpt-3.5-turbo")
    :param max_tokens: Max tokens for output generation
    :return: Generated response from GPT
    """
    # Step 1: Retrieve the most relevant text chunks from Pinecone
    retrieved_results = search_book(query, top_k=top_k)

    # Step 2: Format retrieved text as context
    retrieved_texts = "\n\n".join(
        [f"Chapter {r['chapter_number']}: {r['chapter_title']}\n{r['text_snippet']}" for r in retrieved_results]
    )

    # Step 3: Construct a detailed prompt for GPT
    prompt = f"""
    You are a helpful AI assistant specialized in summarizing children's stories.
    Below are some relevant excerpts from a children's book related to the user's query:

    {retrieved_texts}

    Based on the above content, answer the following question in a clear and engaging manner:

    **Question:** {query}
    """

    # Step 4: Use OpenAI's API to generate a response
    response = openai.chat.completions.create(
        model=model,
        messages=[{"role": "system", "content": "You are a helpful storytelling assistant."},
                  {"role": "user", "content": prompt}],
        max_tokens=max_tokens
    )

    return response.choices[0].message.content


In [39]:
query = "Does lily also have a tool?"
generated_text = generate_text_from_retrieval(query, top_k=3)

print("\n✨ **Generated Story Response:**\n")
print(generated_text)



✨ **Generated Story Response:**

Yes, Lily does have a tool. In the 'Magical Forest Adventure' chapter, Lily was granted a wish by the Queen. She chose the power to talk to animals, which, while not a physical tool like Tom's magical compass, is a special ability that functions as a tool for them in their various adventures.
