### Welcome to the Motiverse AI 2025 Hackathon at IISc Bangalore! This document contains starter code that might be helpful as you tackle one or both of the problems.
### Problem 1: Help Center Chatbot
#### Using the information available on Motive’s public website, construct a much more capable chatbot.

You are provided with a scraped and curated dataset of all the content from the Motive public website.

In [None]:
!pip install boto3

In [None]:
import os
import json
import boto3

API_KEYS_FILE = "api_keys.json"


def load_api_keys():
    """Read the API-key mapping stored in ``API_KEYS_FILE``.

    Returns:
        The parsed JSON content of the file, or ``None`` (after printing a
        notice) when the file does not exist.
    """
    if os.path.exists(API_KEYS_FILE):
        with open(API_KEYS_FILE, "r") as key_file:
            return json.load(key_file)

    print("API keys file not found.")
    return None

keys = load_api_keys()

# load_api_keys() returns None when the file is missing, and the entry itself
# may be absent. os.environ only accepts strings, so the variable is exported
# only when a key was actually found.
GROQ_AI_key = keys.get("GROQ_AI_key2") if keys else None
if GROQ_AI_key:
    os.environ['GROQ_API_KEY'] = GROQ_AI_key
else:
    print("GROQ_AI_key2 not found in the API keys file; GROQ_API_KEY was not set.")


In [None]:
# AWS credentials passed to the boto3 clients created in the following cells.
# Fill these in locally; keep real keys out of version control.
aws_access_key_id = ''
aws_secret_access_key = ''

### Load text data and save chunks

In [None]:
import boto3

# Create an S3 client
s3_client = boto3.client('s3', aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key)

def list_s3_bucket_contents(bucket_name):
    """
    Lists all objects in the specified S3 bucket.

    The listing is read through a paginator so that buckets holding more
    objects than fit in a single ``list_objects_v2`` response are printed
    completely.

    Args:
        bucket_name (str): The name of the S3 bucket

    Returns:
        None. The listing (or an error message) is printed.
    """
    try:
        paginator = s3_client.get_paginator('list_objects_v2')
        found_any = False

        for page in paginator.paginate(Bucket=bucket_name):
            # A page without a 'Contents' key holds no objects
            for obj in page.get('Contents', []):
                if not found_any:
                    print(f"Contents of bucket '{bucket_name}':")
                    found_any = True
                print(f" - {obj['Key']} (Last Modified: {obj['LastModified']}, Size: {obj['Size']} bytes)")

        if not found_any:
            print(f"No objects found in bucket '{bucket_name}'.")

    except Exception as e:
        # Best-effort helper for interactive use: report the problem and carry on
        print(f"Error: {str(e)}")




In [None]:
import boto3
import json
import tempfile
import os

def read_s3_file(bucket_name, key_name, aws_access_key_id, aws_secret_access_key):
    """
    Download an S3 object to a temporary file and return its text.

    Args:
        bucket_name (str): The S3 bucket name
        key_name (str): The S3 object key
        aws_access_key_id (str): AWS access key ID
        aws_secret_access_key (str): AWS secret access key

    Returns:
        str: The object's content decoded as UTF-8

    Raises:
        Exception: Any failure is printed and then re-raised unchanged.
    """
    try:
        client = boto3.client(
            's3',
            aws_access_key_id=aws_access_key_id,
            aws_secret_access_key=aws_secret_access_key
        )

        # Reserve a path on disk; the handle is closed straight away so the
        # download can write to that path.
        scratch = tempfile.NamedTemporaryFile(delete=False)
        scratch.close()
        local_path = scratch.name

        try:
            client.download_file(bucket_name, key_name, local_path)
            with open(local_path, 'r', encoding='utf-8') as handle:
                return handle.read()
        finally:
            # The temporary copy is removed whether or not the read succeeded
            if os.path.exists(local_path):
                os.remove(local_path)

    except Exception as e:
        print(f"Error reading file from S3: {str(e)}")
        raise

def _build_page_entry(data_lines):
    """Bundle a page's lines into its ``{'data', 'tags'}`` record."""
    return {
        'data': '\n'.join(data_lines),
        'tags': extract_tags(data_lines)
    }

def process_web_content(content):
    """
    Processes the web content and creates a dictionary with page keys and data tags.

    A line starting with "PAGE:" or "URL:" opens a new page and is used,
    stripped, as that page's key. Every following non-empty line belongs to
    the page until the next marker. Lines before the first marker are
    discarded. A later page with the same key overwrites the earlier one.

    Args:
        content (str): The content of the web_content.txt file

    Returns:
        dict: Maps each page key to {'data': str, 'tags': list}
    """
    pages = {}
    current_page = None
    current_data = []

    for raw_line in content.split('\n'):
        line = raw_line.strip()

        # Skip empty lines
        if not line:
            continue

        if line.startswith(('PAGE:', 'URL:')):
            # Close the page collected so far before opening the next one
            if current_page:
                pages[current_page] = _build_page_entry(current_data)
            current_page = line
            current_data = []
        else:
            current_data.append(line)

    # Store the final page even when it has no data lines, exactly as is done
    # for pages in the middle of the file.
    if current_page:
        pages[current_page] = _build_page_entry(current_data)

    return pages

def extract_tags(data_lines):
    """
    Extracts tags from the data lines.

    A line counts as a tag line when it contains "TAG:" or "CATEGORY:"; the
    tag is the text after the first colon of that line, stripped. Empty tags
    are ignored.

    Args:
        data_lines (list): List of data lines for a page

    Returns:
        list: List of extracted tags
    """
    tags = []
    for line in data_lines:
        if 'TAG:' in line or 'CATEGORY:' in line:
            # Both markers contain a colon, so the partition always succeeds
            _, _, remainder = line.partition(':')
            tag = remainder.strip()
            if tag:
                tags.append(tag)
    return tags





In [None]:
bucket_name = 'motiverse-2025-data'
key_name = 'web_content.txt'
# Page whose record keys are printed as a sanity check, when it is present
sample_page_key = 'URL: https://gomotive.com/content-library/report/motive-2023-annual-roi-report-2023-10/'

try:
    # Read the file from S3
    print(f"Reading {key_name} from {bucket_name}...")
    content = read_s3_file(bucket_name, key_name, aws_access_key_id, aws_secret_access_key)

    # Process the content
    print("Processing content...")
    pages = process_web_content(content)

    print(f"Processed {len(pages)} pages.")
    # A missing sample page must not abort the preview below
    if sample_page_key in pages:
        print('Page keys: ', pages[sample_page_key].keys())
    else:
        print(f"Sample page not found: {sample_page_key}")

    # Print sample of the processed data
    print("\nSample of processed data:")
    for page_key, page_data in list(pages.items())[:30]:  # Show the first 30 pages as a sample
        print(f"\nPage: {page_key}")
        print(f"Tags: {page_data['tags']}")
        print(f"Data preview: {page_data['data']}...")


except Exception as e:
    print(f"Error: {str(e)}")



**Chunk data**

In [None]:
# functions
import re
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Initialize Bedrock client
# Module-level bedrock-runtime client (us-east-1) that run_anthropic_model and
# sentences_emb_model call through; it uses the credentials set in the earlier cell.
bedrock_client = boto3.client(
    service_name='bedrock-runtime',
    region_name='us-east-1',
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key
)

def calculate_cosine_distances(sentences):
    """
    Compute the cosine distance between each sentence and the one after it.

    Args:
        sentences (list): Sentence dicts carrying 'combined_sentence_embedding'

    Returns:
        tuple: (distances, sentences). ``distances`` has one entry per
        adjacent pair; every sentence except the last also receives that
        value under 'distance_to_next'.
    """
    distances = []
    for current, following in zip(sentences, sentences[1:]):
        similarity = cosine_similarity(
            [current['combined_sentence_embedding']],
            [following['combined_sentence_embedding']],
        )[0][0]

        # Cosine distance is the complement of cosine similarity
        gap = 1 - similarity
        distances.append(gap)
        current['distance_to_next'] = gap

    # The last sentence has no successor and gets no 'distance_to_next'
    return distances, sentences

def extract_info(text):
    """
    Pull the labelled sections out of one page's raw text.

    Args:
        text (str): Page text with METADATA / categories / tags /
            CONTENT-or-EXCERPT / TITLE sections

    Returns:
        dict: Keys 'METADATA', 'categories', 'tags', 'TITLE' and either
        'CONTENT' or 'EXCERPT' (whichever label matched; 'CONTENT' with an
        empty value when neither did). Missing sections are '' and a
        missing title is 'Report...'.
    """
    def _capture(pattern, default='', flags=re.DOTALL):
        found = re.search(pattern, text, flags)
        return found.group(1).strip() if found else default

    # The body runs from its label up to the next line that starts with dashes
    body_match = re.search(r'(CONTENT|EXCERPT):(.*?)\n-+', text, re.DOTALL)
    if body_match:
        body_key = body_match.group(1)
        body = body_match.group(2).strip()
    else:
        body_key = 'CONTENT'
        body = ''

    return {
        'METADATA': _capture(r'METADATA:(.*?)\ncategories:'),
        'categories': _capture(r'categories:(.*?)\ntags:'),
        'tags': _capture(r'tags:(.*?)\n(CONTENT|EXCERPT):'),
        body_key: body,
        'TITLE': _capture(r'TITLE:\s*(.+)', default='Report...', flags=0),
    }

def combine_sentences(sentences, buffer_size=1):
    """
    Attach to every sentence a window of itself plus its neighbours.

    Args:
        sentences (list): Sentence dicts with a 'sentence' string
        buffer_size (int): How many sentences to include on each side

    Returns:
        list: The same list; each dict gains 'combined_sentence', the
        space-joined text of the window around it.
    """
    # A non-positive buffer leaves just the sentence itself
    reach = max(buffer_size, 0)
    total = len(sentences)

    for position, entry in enumerate(sentences):
        first = max(position - reach, 0)
        last = min(position + reach + 1, total)
        window = sentences[first:last]
        entry['combined_sentence'] = ' '.join(item['sentence'] for item in window)

    return sentences

def chunk_sentences(sentences, breakpoint_percentile_threshold=95):
    """
    Splits sentences into chunks based on cosine distance between their embeddings.

    Neighbouring sentences whose distance exceeds the chosen percentile of
    all neighbour distances become chunk boundaries.

    Args:
        sentences (list): Sentence dicts with 'sentence' and 'combined_sentence'
        breakpoint_percentile_threshold (float): Percentile of the distances
            used as the split threshold. Lower it to get more chunks.

    Returns:
        list: Chunk strings, each the space-joined sentences of one group
    """
    # With fewer than two sentences there are no neighbour distances to take
    # a percentile of, so the answer is known without embedding anything.
    if not sentences:
        return []
    if len(sentences) == 1:
        return [sentences[0]['sentence']]

    sentence_embeddings = sentences_emb_model([sentence['combined_sentence'] for sentence in sentences])

    for i, sentence in enumerate(sentences):
        sentence['combined_sentence_embedding'] = sentence_embeddings[i]

    distances, sentences = calculate_cosine_distances(sentences)

    breakpoint_distance_threshold = np.percentile(distances, breakpoint_percentile_threshold)

    # A distance above the threshold marks the last sentence of a chunk
    indices_above_thresh = [i for i, x in enumerate(distances) if x > breakpoint_distance_threshold]

    chunks = []
    start_index = 0
    for index in indices_above_thresh:
        group = sentences[start_index:index + 1]
        chunks.append(' '.join(d['sentence'] for d in group))
        start_index = index + 1

    # The last group, if any sentences remain
    if start_index < len(sentences):
        chunks.append(' '.join(d['sentence'] for d in sentences[start_index:]))

    return chunks

def run_anthropic_model(prompt_text, model_name = 'anthropic.claude-3-haiku-20240307-v1:0'):
    """
    Run the Anthropic model with the given prompt text and model name.

    Args:
        prompt_text (str): The prompt text to send to the model.
        model_name (str): The Bedrock model ID to invoke.

    Returns:
        str: The text of the first content block of the response, or an
        empty string when the response has no content blocks.
    """
    # Prepare the request body for Claude 3
    request_body = {
        "anthropic_version": "bedrock-2023-05-31",
        "max_tokens": 4096,
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": prompt_text
                    }
                ]
            }
        ],
        "temperature": 0.3,
        "top_p": 0.8,
        "top_k": 200
    }

    # Call Claude 3 model
    response = bedrock_client.invoke_model(
        modelId=model_name,
        body=json.dumps(request_body)
    )

    # Process the response
    response_body = json.loads(response.get("body").read().decode())

    # Guard against a missing or empty content list instead of raising IndexError
    content_blocks = response_body.get("content") or []
    if not content_blocks:
        return ""

    return content_blocks[0].get("text", "")

def sentences_emb_model(sentences, model_id="cohere.embed-multilingual-v3", batch_size=96, input_type="search_document"):
    """
    Run the embedding model on the given sentences.

    The texts are sent in batches because the Cohere Embed models on Bedrock
    accept a limited number of texts per request (96 according to the AWS
    model parameter reference).

    Args:
        sentences (list): List of sentences to embed.
        model_id (str): The Bedrock ID of the embedding model to use.
        batch_size (int): Maximum number of texts sent in one request.
        input_type (str): Cohere input type, e.g. "search_document",
            "search_query" or "classification".

    Returns:
        list: One embedding per input sentence, in input order. An empty
        input yields an empty list without calling the service.
    """
    embeddings = []

    for start in range(0, len(sentences), batch_size):
        batch = sentences[start:start + batch_size]

        # Request payload; each text is cut to its LAST 2048 characters
        request_body = json.dumps({
            "texts": [sentence[-2048:] for sentence in batch],
            "input_type": input_type
        })

        # Call the model
        response = bedrock_client.invoke_model(
            modelId=model_id,
            body=request_body.encode("utf-8"),
            contentType="application/json",
            accept="application/json"
        )

        # Parse the response
        response_body = json.loads(response["body"].read().decode("utf-8"))
        embeddings.extend(response_body["embeddings"])

    return embeddings




In [None]:
from tqdm import tqdm


# Dictionary with model names and their respective IDs on AWS Bedrock
models = {
    "Claude 3 Sonnet (200k)": "anthropic.claude-3-sonnet-20240229-v1:0",
    "Claude 3 Haiku (200k)": "anthropic.claude-3-haiku-20240307-v1:0",
}
# Collects one dict per chunk (URL, text, embedding) in the page-processing cell
chunks = []
# Template for the text stored with each chunk. The placeholder is spelled
# "matadata" and the text.format(...) calls pass matadata=..., so the two
# spellings have to be changed together.
text = """METADATA: {matadata}
categories: {categories}
tags: {tags}
{type_key}: {content}
TITLE: {title}"""
num=10


In [None]:
# Position (in iteration order) of the one page that is left out of the chunk set
skipped_page_index = 248

count = 0
for page_key, page_data in tqdm(pages.items(), desc="Processing pages", unit="page"):
    page_index = count
    count += 1
    # Skip only this page. The counter keeps advancing, so the pages after it
    # are still processed.
    if page_index == skipped_page_index:
        continue

    # Extract metadata, categories, tags, and content
    extracted_info = extract_info(page_data['data'])

    if 'CONTENT' in extracted_info:
        k = 'CONTENT'
    elif 'EXCERPT' in extracted_info:
        k = 'EXCERPT'
    else:
        raise ValueError("Neither CONTENT nor EXCERPT found in the extracted information.")
    para = extracted_info[k]

    sentences = re.split(r'(?<=[.?!])\s+', para)
    sentences = [{'sentence': sentence} for sentence in sentences if sentence]
    # Combine sentences with a buffer size of 1 (adds 'combined_sentence' in place)
    combine_sentences(sentences, buffer_size=1)

    # Pages with at most one sentence become a single chunk; longer pages are
    # split at semantic breakpoints.
    if len(sentences) <= 1:
        temp_chunks = [sentences[0]['sentence'] if sentences else '']
    else:
        temp_chunks = chunk_sentences(sentences)

    # Fall back to the raw page key when it holds no http(s) URL
    url_match = re.search(r"https?://[^\s\"']+", page_key)
    page_url = url_match.group() if url_match else page_key

    for sentence in temp_chunks:
        # Add the metadata, categories, tags, type, and title to the chunk text
        temp_text = text.format(
            matadata=extracted_info['METADATA'],
            categories=extracted_info['categories'],
            tags=extracted_info['tags'],
            type_key=k,
            content=sentence,
            title=extracted_info['TITLE']
        )
        # Optional extension: summarise temp_text with run_anthropic_model and
        # store the summary plus its embedding under 'summary' and
        # 'summary_embedding'.
        chunk = {
            'URL': page_url,
            'text': temp_text,
            # Embed the formatted chunk text, not the unformatted template
            'embedding': sentences_emb_model([temp_text]),
        }
        chunks.append(chunk)

In [None]:
import pickle


def save_pickle(data, file_path):
    """Serialise ``data`` with pickle into ``file_path``, overwriting the file."""
    with open(file_path, 'wb') as handle:
        pickle.Pickler(handle).dump(data)

# Save the chunks to a pickle file
# ('chunks.pkl' is the file name that the "Load saved chunks" section reads back)
save_pickle(chunks, 'chunks.pkl')

In [None]:
# Print the text and source URL of the first 10 chunks as a quick visual check
for item in chunks[:10]:
    print(item['text'])
    print(item['URL'])

### Load saved chunks

In [None]:
# AWS credentials passed to the boto3 clients created in the following cells.
# Fill these in locally; keep real keys out of version control.
aws_access_key_id = ''
aws_secret_access_key = ''

In [None]:
# functions
import os
import json
import boto3
import re
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Initialize Bedrock client
# Module-level bedrock-runtime client (us-east-1) that run_anthropic_model and
# sentences_emb_model call through; it uses the credentials set in the cell above.
bedrock_client = boto3.client(
    service_name='bedrock-runtime',
    region_name='us-east-1',
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key
)

def calculate_cosine_distances(sentences):
    """
    Compute the cosine distance between each sentence and the one after it.

    Args:
        sentences (list): Sentence dicts carrying 'combined_sentence_embedding'

    Returns:
        tuple: (distances, sentences). ``distances`` has one entry per
        adjacent pair; every sentence except the last also receives that
        value under 'distance_to_next'.
    """
    distances = []
    for current, following in zip(sentences, sentences[1:]):
        similarity = cosine_similarity(
            [current['combined_sentence_embedding']],
            [following['combined_sentence_embedding']],
        )[0][0]

        # Cosine distance is the complement of cosine similarity
        gap = 1 - similarity
        distances.append(gap)
        current['distance_to_next'] = gap

    # The last sentence has no successor and gets no 'distance_to_next'
    return distances, sentences

def extract_info(text):
    """
    Pull the labelled sections out of one page's raw text.

    Args:
        text (str): Page text with METADATA / categories / tags /
            CONTENT-or-EXCERPT / TITLE sections

    Returns:
        dict: Keys 'METADATA', 'categories', 'tags', 'TITLE' and either
        'CONTENT' or 'EXCERPT' (whichever label matched; 'CONTENT' with an
        empty value when neither did). Missing sections are '' and a
        missing title is 'Report...'.
    """
    def _capture(pattern, default='', flags=re.DOTALL):
        found = re.search(pattern, text, flags)
        return found.group(1).strip() if found else default

    # The body runs from its label up to the next line that starts with dashes
    body_match = re.search(r'(CONTENT|EXCERPT):(.*?)\n-+', text, re.DOTALL)
    if body_match:
        body_key = body_match.group(1)
        body = body_match.group(2).strip()
    else:
        body_key = 'CONTENT'
        body = ''

    return {
        'METADATA': _capture(r'METADATA:(.*?)\ncategories:'),
        'categories': _capture(r'categories:(.*?)\ntags:'),
        'tags': _capture(r'tags:(.*?)\n(CONTENT|EXCERPT):'),
        body_key: body,
        'TITLE': _capture(r'TITLE:\s*(.+)', default='Report...', flags=0),
    }

def combine_sentences(sentences, buffer_size=1):
    """
    Attach to every sentence a window of itself plus its neighbours.

    Args:
        sentences (list): Sentence dicts with a 'sentence' string
        buffer_size (int): How many sentences to include on each side

    Returns:
        list: The same list; each dict gains 'combined_sentence', the
        space-joined text of the window around it.
    """
    # A non-positive buffer leaves just the sentence itself
    reach = max(buffer_size, 0)
    total = len(sentences)

    for position, entry in enumerate(sentences):
        first = max(position - reach, 0)
        last = min(position + reach + 1, total)
        window = sentences[first:last]
        entry['combined_sentence'] = ' '.join(item['sentence'] for item in window)

    return sentences

def chunk_sentences(sentences, breakpoint_percentile_threshold=95):
    """
    Splits sentences into chunks based on cosine distance between their embeddings.

    Neighbouring sentences whose distance exceeds the chosen percentile of
    all neighbour distances become chunk boundaries.

    Args:
        sentences (list): Sentence dicts with 'sentence' and 'combined_sentence'
        breakpoint_percentile_threshold (float): Percentile of the distances
            used as the split threshold. Lower it to get more chunks.

    Returns:
        list: Chunk strings, each the space-joined sentences of one group
    """
    # With fewer than two sentences there are no neighbour distances to take
    # a percentile of, so the answer is known without embedding anything.
    if not sentences:
        return []
    if len(sentences) == 1:
        return [sentences[0]['sentence']]

    sentence_embeddings = sentences_emb_model([sentence['combined_sentence'] for sentence in sentences])

    for i, sentence in enumerate(sentences):
        sentence['combined_sentence_embedding'] = sentence_embeddings[i]

    distances, sentences = calculate_cosine_distances(sentences)

    breakpoint_distance_threshold = np.percentile(distances, breakpoint_percentile_threshold)

    # A distance above the threshold marks the last sentence of a chunk
    indices_above_thresh = [i for i, x in enumerate(distances) if x > breakpoint_distance_threshold]

    chunks = []
    start_index = 0
    for index in indices_above_thresh:
        group = sentences[start_index:index + 1]
        chunks.append(' '.join(d['sentence'] for d in group))
        start_index = index + 1

    # The last group, if any sentences remain
    if start_index < len(sentences):
        chunks.append(' '.join(d['sentence'] for d in sentences[start_index:]))

    return chunks

def run_anthropic_model(prompt_text, model_name = 'anthropic.claude-3-haiku-20240307-v1:0'):
    """
    Run the Anthropic model with the given prompt text and model name.

    Args:
        prompt_text (str): The prompt text to send to the model.
        model_name (str): The Bedrock model ID to invoke.

    Returns:
        str: The text of the first content block of the response, or an
        empty string when the response has no content blocks.
    """
    # Prepare the request body for Claude 3
    request_body = {
        "anthropic_version": "bedrock-2023-05-31",
        "max_tokens": 4096,
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": prompt_text
                    }
                ]
            }
        ],
        "temperature": 0.3,
        "top_p": 0.8,
        "top_k": 200
    }

    # Call Claude 3 model
    response = bedrock_client.invoke_model(
        modelId=model_name,
        body=json.dumps(request_body)
    )

    # Process the response
    response_body = json.loads(response.get("body").read().decode())

    # Guard against a missing or empty content list instead of raising IndexError
    content_blocks = response_body.get("content") or []
    if not content_blocks:
        return ""

    return content_blocks[0].get("text", "")

def sentences_emb_model(sentences, model_id="cohere.embed-multilingual-v3", batch_size=96, input_type="search_document"):
    """
    Run the embedding model on the given sentences.

    The texts are sent in batches because the Cohere Embed models on Bedrock
    accept a limited number of texts per request (96 according to the AWS
    model parameter reference).

    Args:
        sentences (list): List of sentences to embed.
        model_id (str): The Bedrock ID of the embedding model to use.
        batch_size (int): Maximum number of texts sent in one request.
        input_type (str): Cohere input type, e.g. "search_document",
            "search_query" or "classification".

    Returns:
        list: One embedding per input sentence, in input order. An empty
        input yields an empty list without calling the service.
    """
    embeddings = []

    for start in range(0, len(sentences), batch_size):
        batch = sentences[start:start + batch_size]

        # Request payload; each text is cut to its LAST 2048 characters
        request_body = json.dumps({
            "texts": [sentence[-2048:] for sentence in batch],
            "input_type": input_type
        })

        # Call the model
        response = bedrock_client.invoke_model(
            modelId=model_id,
            body=request_body.encode("utf-8"),
            contentType="application/json",
            accept="application/json"
        )

        # Parse the response
        response_body = json.loads(response["body"].read().decode("utf-8"))
        embeddings.extend(response_body["embeddings"])

    return embeddings



In [None]:
import pickle

def load_pickle(file_path):
    """Return the object unpickled from ``file_path``.

    NOTE: unpickling can execute arbitrary code, so only load files that
    come from a trusted source.
    """
    with open(file_path, 'rb') as handle:
        return pickle.Unpickler(handle).load()

# Reload the chunk list; presumably the file written by save_pickle(chunks, 'chunks.pkl')
chunks = load_pickle('chunks.pkl')

In [None]:
import sys
import numpy as np
import re
from datetime import datetime
from sklearn.metrics.pairwise import cosine_similarity
model_name = "all-MiniLM-L6-v2"


def load_vector_db(vector_db_path='chunks.pkl'):
    """
    Load the chunk dataset used as the vector database.

    This is a thin wrapper around load_pickle; no separate index is built.

    Args:
        vector_db_path: Path to the pickle file holding the chunks

    Returns:
        vector_db: The unpickled object (search_vector_db expects a list of
        chunk dicts with 'text', 'embedding' and 'URL')
    """
    return load_pickle(vector_db_path)

def search(query_embedding, chunk_embeddings, k=5):
    """
    Perform a search in the vector database using the query embedding.

    Args:
        query_embedding: Embedding of the query (a vector, or a list holding
            one vector as returned by sentences_emb_model)
        chunk_embeddings: Embeddings of the chunks in the database, each in
            the same form as the query embedding
        k: Number of top results to return

    Returns:
        distances: Cosine distances of the top k results, closest first and
            aligned position by position with ``indices``
        indices: Indices (into chunk_embeddings) of the top k results
    """
    def _unit_first_row(embedding):
        # Use the first row of the embedding and scale it to unit length;
        # an all-zero vector is left unchanged so its similarity is 0.
        row = np.atleast_2d(np.asarray(embedding, dtype=float))[0]
        norm = np.linalg.norm(row)
        return row / norm if norm else row

    if len(chunk_embeddings) == 0:
        return [], np.array([], dtype=int)

    query_vector = _unit_first_row(query_embedding)

    # Cosine distance = 1 - cosine similarity, one value per chunk
    all_distances = [
        1 - float(np.dot(query_vector, _unit_first_row(embedding)))
        for embedding in chunk_embeddings
    ]

    # Get top k results
    indices = np.argsort(all_distances)[:k]

    # Return only the distances of the selected chunks, so that position i of
    # ``distances`` belongs to ``indices[i]``.
    distances = [all_distances[i] for i in indices]

    return distances, indices

def search_vector_db(query, vector_db_info, top_k=5, rerank=True):
    """
    Find the chunks closest to the query.

    Args:
        query: Query text
        vector_db_info: List of chunk dicts with 'text', 'embedding' and 'URL'
        top_k: Base number of results
        rerank: When true, three times top_k candidates are requested; no
            re-ordering step follows, so all of those candidates are returned

    Returns:
        List of dicts with 'text', 'url' and 'similarity'. The 'similarity'
        value is read from the distances returned by search() at the
        result's rank position.
    """
    chunk_texts = [entry['text'] for entry in vector_db_info]
    chunk_vectors = [entry['embedding'] for entry in vector_db_info]
    chunk_urls = [entry['URL'] for entry in vector_db_info]

    # Encode the query
    query_embedding = sentences_emb_model([query])

    # Never ask for more candidates than there are chunks
    wanted = top_k * 3 if rerank else top_k
    wanted = min(wanted, len(chunk_texts))

    distances, indices = search(query_embedding, chunk_vectors, wanted)

    return [
        {
            "text": chunk_texts[idx],
            "url": chunk_urls[idx],
            "similarity": float(distances[rank]),
        }
        for rank, idx in enumerate(indices)
        if idx != -1  # Valid index
    ]

def get_surrounding_context(result, metadata, context_chunks=1):
    """
    Get surrounding chunks as context for a result.

    Args:
        result: Dict with the 'file_path' and 'chunk_index' of the hit
        metadata: Dict with 'chunks' (chunk texts) and 'chunk_metadata'
            (one dict per chunk holding 'file_path' and 'chunk_index')
        context_chunks: Number of chunks to include before and after

    Returns:
        String with the context including the result chunk; the chunks are
        joined by blank lines in chunk-index order

    Raises:
        ValueError: If the result's chunk index is not found among the
            chunks of its file.
    """
    chunks = metadata["chunks"]
    chunk_metadata = metadata["chunk_metadata"]

    file_path = result["file_path"]
    chunk_index = result["chunk_index"]

    # Collect (chunk_index, chunk_text) for the result's file, ordered by index
    same_file = sorted(
        (
            (info["chunk_index"], chunks[i])
            for i, info in enumerate(chunk_metadata)
            if info["file_path"] == file_path
        ),
        key=lambda pair: pair[0],
    )
    sorted_indices = [index for index, _ in same_file]
    sorted_chunks = [chunk for _, chunk in same_file]

    # Look the hit up in the SORTED index list, so that the position refers
    # to the same ordering as sorted_chunks.
    file_chunk_index = sorted_indices.index(chunk_index)

    # Get context chunks
    start_idx = max(0, file_chunk_index - context_chunks)
    end_idx = min(len(sorted_chunks), file_chunk_index + context_chunks + 1)

    return "\n\n".join(sorted_chunks[start_idx:end_idx])

def retrive(query, vector_db_info, top_k=3):
    """
    Retrieve the chunks most similar to the query.

    Thin wrapper around search_vector_db, called with its default rerank setting.

    Args:
        query: Query text
        vector_db_info: List of chunk dicts passed through to search_vector_db
        top_k: Number of top results requested from search_vector_db

    Returns:
        The list of result dictionaries returned by search_vector_db
    """
    # print(f"Searching for: '{query}'")
    
    # Get search results
    results = search_vector_db(query, vector_db_info, top_k=top_k)
    
    return results

def date_time_stamp(format = "%m%d%H%M%S"):
    """Return the current local time rendered with the given strftime pattern."""
    current_moment = datetime.now()
    return current_moment.strftime(format)

def extract_json(text):
    """
    Detect and extract a JSON array of objects from a given text.
    Supports cases where JSON is wrapped inside additional text.

    Every position where an array of objects may begin ("[" followed by
    "{") is handed to the JSON decoder. The decoder determines where the
    array ends, so nested arrays and brackets inside strings are handled.

    Args:
        text (str): Text that may contain a JSON array

    Returns:
        The first array that parses successfully, or None when there is none.
    """
    decoder = json.JSONDecoder()

    for candidate in re.finditer(r"\[\s*{", text):
        try:
            parsed_json, _ = decoder.raw_decode(text, candidate.start())
        except json.JSONDecodeError:
            # Not valid JSON from this bracket on; try the next candidate
            continue
        return parsed_json

    return None  # No JSON found

def remove_think_section(text, tag='think'):
    """
    Strip every <think>...</think> (or <response>...</response>) section,
    together with the whitespace that follows it, from the text.

    Empty or None text is returned unchanged, before the tag is checked.
    Any tag other than 'think' or 'response' raises ValueError.
    """
    if not text:
        return text

    if tag not in ('think', 'response'):
        raise ValueError(f"Unknown tag: {tag}")

    if tag == 'think':
        section_pattern = r"<think>.*?</think>\s*"
    else:
        section_pattern = r"<response>.*?</response>\s*"

    return re.sub(section_pattern, "", text, flags=re.DOTALL)

def print_urls_nicely(urls):
    """
    Print a "References" heading followed by the URLs as a numbered list
    (starting at 1), each entry followed by a blank line.
    """
    print("\n=== References ===\n")
    for number, url in enumerate(urls, start=1):
        print(f"{number}. 📌 {url}\n")

def extract_url_and_context(documents):
    """
    Extracts URLs and relevant context from retrieved documents.

    For every document, the text after its content label is kept. The label
    is "CONTENT:" when present, otherwise "EXCERPT:", so the header lines
    are removed from both kinds of chunk.

    Args:
        documents: Iterable of result dicts with 'url' and 'text'

    Returns:
        tuple: (urls, context_text). ``urls`` holds each URL once, in order
        of first appearance; ``context_text`` holds the numbered
        "Information chunk N:" sections.
    """
    extracted_data = []
    context_parts = []

    for i, doc in enumerate(documents):
        url = doc.get("url", "No URL Provided")
        text = doc.get("text", "")

        # Chunks are labelled either CONTENT or EXCERPT; prefer CONTENT so
        # that texts containing it are handled exactly as before.
        marker = "CONTENT:" if "CONTENT:" in text else "EXCERPT:"
        cleaned_text = text.split(marker)[-1].strip()
        context_parts.append(f"Information chunk {i+1}:\n" + cleaned_text + "\n\n")

        if url not in extracted_data:
            extracted_data.append(url)

    return extracted_data, "".join(context_parts)

def remove_response_tags(text):
    """
    Drop every <response> and </response> tag (keeping what lies between
    them) and trim surrounding whitespace.
    """
    tag_pattern = re.compile(r"</?response>")
    without_tags = tag_pattern.sub("", text)
    return without_tags.strip()





In [None]:
import time

class RAG_chatbot:
    """Retrieval-augmented help-center chatbot answering through Claude 3 on AWS Bedrock.

    For every user query the bot:
      1. asks a model (via ``run_anthropic_model``) for related questions,
      2. retrieves matching chunks from the vector DB for the query and for
         each generated question,
      3. asks Claude to answer using those chunks plus the running history,
      4. appends the turn to a JSON trace file.
    """

    def __init__(self, api_key: str = None):
        """Create the AWS clients, load the vector DB and read the prompt templates.

        Args:
            api_key: Not used by this class; kept so existing callers keep working.
        """
        self.history = ''

        # Trace state lives on the instance from the start so that query()
        # also works when called directly, without Start_conversation().
        self.json_history = []
        self.file_path = None

        # self.doc_gen_model = 'gemma2-9b-it'
        # NOTE(review): doc_gen_model is not referenced anywhere else in this class.
        self.doc_gen_model = 'llama-3.1-8b-instant'
        self.s3_client = boto3.client(
            's3',
            aws_access_key_id=aws_access_key_id,
            aws_secret_access_key=aws_secret_access_key
        )
        # Dictionary with model names and their respective IDs on AWS Bedrock
        models = {
            "Claude 3 Sonnet (200k)": "anthropic.claude-3-sonnet-20240229-v1:0",
            "Claude 3 Haiku (200k)": "anthropic.claude-3-haiku-20240307-v1:0",
        }
        self.gen_model = models['Claude 3 Haiku (200k)']

        # Initialize Bedrock client
        self.bedrock_client = boto3.client(
            service_name='bedrock-runtime',
            region_name='us-east-1',
            aws_access_key_id=aws_access_key_id,
            aws_secret_access_key=aws_secret_access_key
        )

        self.vector_db_info = load_vector_db()

        # Template for the final answer (formatted with history/query/context).
        with open("prompt.txt", "r") as file:
            self.chat_prompt = file.read()

        # Template for generating related questions (formatted with question).
        with open("prompt_gen.txt", "r") as file:
            self.gen_prompt = file.read()

    def query(self, query: str):
        """Answer one user query, print the reply and record the turn.

        Args:
            query: The user's question.

        Returns:
            None. The reply and its reference URLs are printed, the turn is
            appended to ``self.json_history`` and the whole history is
            rewritten to ``self.file_path``.
        """
        s_time = date_time_stamp('%H:%M:%S')
        start_time = time.time()  # Record the start time

        # When query() is used without Start_conversation(), pick a trace
        # file on first use instead of failing on a missing attribute.
        if self.file_path is None:
            self.file_path = date_time_stamp() + '.json'

        # RETRIEVAL PIPELINE
        # Ask the model for related questions, returned as JSON.
        message = self.gen_prompt.format(question=query)
        generate_doc = run_anthropic_model(message)
        # The model may not return parseable JSON; fall back to no extra questions.
        questions = extract_json(generate_doc) or []

        results = []
        # Retrieval for the user's own query; keep only the best hit, if any.
        direct_hits = retrive(query, self.vector_db_info, top_k=3)
        if direct_hits:
            results.append(direct_hits[0])

        for question in questions:
            # Model output is not guaranteed to follow the expected schema,
            # so skip items that lack the fields or are not strings.
            try:
                search_text = question['Question'] + '\n' + question['Key_features']
            except (KeyError, TypeError):
                continue
            hits = retrive(search_text, self.vector_db_info, top_k=3)
            if hits:
                results.append(hits[0])

        # Only the first three retrieved documents are used as context.
        urls, context = extract_url_and_context(results[:3])

        # GENERATE RESPONSE using the retrieved documents
        message = [
            {
                "role": "user",
                "content": self.chat_prompt.format(history=self.history, query=query, context=context)
            }
        ]
        request_body = {
            "anthropic_version": "bedrock-2023-05-31",
            "max_tokens": 4096,
            "messages": message,
            "temperature": 0.3,
            "top_p": 0.8,
            "top_k": 200
        }

        # Call Claude 3 model
        response = self.bedrock_client.invoke_model(
            modelId=self.gen_model,
            body=json.dumps(request_body)
        )
        response_body = json.loads(response.get("body").read().decode())
        # An empty or missing "content" list yields an empty reply instead of an IndexError.
        content = response_body.get("content") or []
        reply = content[0].get("text", "") if content else ""
        reply = remove_response_tags(reply)

        latency = time.time() - start_time  # Calculate latency

        self.json_history.append(
            {
                'query': query,
                'context': context,
                'response': reply,
                'latency': latency,
                'start_time': s_time,
            }
        )

        # Rewrite the full trace after every turn so nothing is lost if the
        # session is interrupted.
        with open(self.file_path, 'w') as f:
            json.dump(self.json_history, f, indent=4)

        turn_text = '\n' + 'You: ' + query + '\n' + 'Motiv-ator: ' + reply
        self.history += turn_text
        print(turn_text)
        if urls:
            print_urls_nicely(urls)
        return

    def Start_conversation(self):
        """
        Run an interactive chat loop on stdin/stdout.

        Starts a fresh trace (a new timestamped ``.json`` file and an empty
        history list), prints the greeting from ``Chat_greeting.txt`` and
        answers queries until the user enters an empty line, "exit" or "quit".

        Returns:
            list: The per-turn trace records collected during the conversation.
        """
        # Start a new trace file named after the current timestamp
        self.json_history = []
        self.file_path = date_time_stamp() + '.json'

        with open("Chat_greeting.txt", "r") as file:
            print(file.read())

        while True:
            # Flush pending output so it appears before the input prompt.
            sys.stdout.flush()
            query = input("You: ")
            if not query or query.lower() in ("exit", "quit"):
                break
            self.query(query)

        print("Thank you for your time and Goodbye :)")
        return self.json_history

# Build the chatbot once: construction creates the AWS clients, loads the
# vector DB and reads prompt.txt / prompt_gen.txt from the working directory.
bot = RAG_chatbot()
# b = bot.query("why motive over lynx?")


In [None]:
# Run the interactive loop; the return value is the list of per-turn trace records.
chat_history = bot.Start_conversation()

### Problem 2: Video-Text Index
#### Design a system that enables a set of videos to be searchable.

You are provided a dataset of 500 videos with which to build and test your solution. The 500 videos fall into the following categories:
* 50 collision videos
* 50 tailgating videos
* 50 stop sign violation videos
* 50 red light violation videos
* 300 random videos corresponding to hard brakes, corners, and accelerations

In [None]:
!pip install boto3

#### Sample snippets to access the videos


In [None]:
import pandas as pd
import boto3
import io

def read_s3_csv_to_dataframe(bucket_name, key_name, aws_access_key_id, aws_secret_access_key, sep='\t'):
    """Reads a delimited text file from S3 into a pandas DataFrame.

    Despite the function name, the file is parsed as tab-separated (TSV) by
    default; pass ``sep=','`` for a comma-separated file.

    Args:
        bucket_name (str): The S3 bucket name.
        key_name (str): The S3 object key (file name).
        aws_access_key_id (str): Your AWS access key ID.
        aws_secret_access_key (str): Your AWS secret access key.
        sep (str): Field delimiter handed to ``pandas.read_csv``. Defaults to
            a tab, which is what this function always used before.

    Returns:
        pandas.DataFrame: The DataFrame containing the parsed data, or None if
        an error occurs (the error is printed, not raised).
    """
    try:
        s3 = boto3.client('s3', aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key)
        obj = s3.get_object(Bucket=bucket_name, Key=key_name)
        # The whole object is read into memory and parsed from a bytes buffer.
        return pd.read_csv(io.BytesIO(obj['Body'].read()), sep=sep)
    except Exception as e:
        print(f"Error reading CSV from S3: {str(e)}")
        return None

bucket_name = 'motiverse-2025-data'  # Replace with your bucket name
key_name = 'video_index.tsv'  # Replace with your tab-separated (TSV) index file name in S3
# df is None when the download or parse fails; the error is printed by the helper.
df = read_s3_csv_to_dataframe(bucket_name, key_name, aws_access_key_id, aws_secret_access_key)



In [None]:
# Preview the first 50 rows of the video index (fails if df is None because the load above errored).
df.head(50)

In [None]:
import boto3
import tempfile
import os
from IPython.display import HTML, display
from base64 import b64encode

def render_s3_video(bucket_name, key_name, width=640):
    """
    Fetch an MP4 object from S3 and embed it inline in the notebook output.

    The object is downloaded to a temporary file, base64-encoded into a
    ``data:`` URI and shown through an HTML ``<video>`` element. Errors are
    printed rather than raised, and the temporary file is always removed.

    Args:
        bucket_name (str): The S3 bucket name
        key_name (str): The S3 object key (path to the MP4 file)
        width (int): Width to display the video (in pixels)
    """
    # Credentials are taken from the module-level aws_access_key_id /
    # aws_secret_access_key variables, not from the environment.
    client = boto3.client(
        's3',
        aws_access_key_id=aws_access_key_id,
        aws_secret_access_key=aws_secret_access_key,
    )

    # Reserve a named .mp4 path; delete=False keeps the file on disk after
    # the handle is closed so the download can write to it.
    with tempfile.NamedTemporaryFile(suffix='.mp4', delete=False) as handle:
        local_path = handle.name

    try:
        print(f"Downloading video from s3://{bucket_name}/{key_name}...")
        client.download_file(bucket_name, key_name, local_path)

        # The whole video is embedded in the page as base64 text.
        with open(local_path, 'rb') as video_file:
            video_base64 = b64encode(video_file.read()).decode('utf-8')

        video_html = f"""
        <video width="{width}" controls>
          <source src="data:video/mp4;base64,{video_base64}" type="video/mp4">
          Your browser does not support the video tag.
        </video>
        """
        display(HTML(video_html))
        print("Video displayed successfully!")
    except Exception as e:
        print(f"Error: {str(e)}")
    finally:
        # Remove the temporary copy whether or not rendering succeeded.
        if os.path.exists(local_path):
            os.remove(local_path)

# Example usage
# render_s3_video('my-bucket', 'path/to/video.mp4')

In [None]:
# Download one sample video from the hackathon bucket and play it inline.
render_s3_video('motiverse-2025-data', 'videos/3043711272.mp4')

#### List of foundational LLMs available from Amazon Bedrock

In [None]:
# Initialize the Bedrock client.
# This uses the 'bedrock' service (needed for list_foundation_models below),
# not 'bedrock-runtime', which the model-invocation code in this file uses.
bedrock_client = boto3.client(
    service_name='bedrock',
    region_name='us-east-1',  # Adjust region as needed
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key
)

def list_foundation_models():
    """Print the ID and ARN of every foundation model Bedrock reports.

    Uses the module-level ``bedrock_client``. Any exception is caught and
    printed instead of being propagated.
    """
    try:
        summaries = bedrock_client.list_foundation_models().get('modelSummaries', [])

        # Nothing to list: say so and stop.
        if not summaries:
            print("No foundation models found.")
            return

        print("Available Foundation Models:")
        for summary in summaries:
            print(f" - {summary['modelId']}: {summary['modelArn']}")

    except Exception as e:
        print(f"Error: {str(e)}")

# Example usage: prints one line per model, or an error message if the call fails.
list_foundation_models()

#### Sample LLM call with video and prompt

In [None]:
import boto3
import json
import cv2
import base64
import os
import tempfile
from datetime import datetime
from IPython.display import HTML, display

def encode_image_to_base64(frame):
    """Convert an OpenCV frame to a base64-encoded JPEG string.

    Args:
        frame: Image array accepted by ``cv2.imencode`` (for example a frame
            returned by ``cv2.VideoCapture.read()``).

    Returns:
        str: Base64 text of the JPEG-encoded frame.

    Raises:
        ValueError: If OpenCV reports that the JPEG encoding failed.
    """
    success, buffer = cv2.imencode('.jpg', frame)
    # imencode signals failure through its first return value; do not send
    # a bad buffer on to the model.
    if not success:
        raise ValueError("Failed to encode frame as JPEG")
    return base64.b64encode(buffer).decode('utf-8')

def process_frames_batch(bedrock_client, frames, prompt, model_id="anthropic.claude-3-sonnet-20240229-v1:0"):
    """Send a prompt plus a batch of frames to a Claude 3 model on Bedrock.

    Args:
        bedrock_client: Client exposing ``invoke_model(modelId=..., body=...)``
            (a boto3 ``bedrock-runtime`` client in this file).
        frames: Iterable of OpenCV frames. Each is JPEG/base64-encoded with
            ``encode_image_to_base64`` and attached after the prompt text.
        prompt (str): Instruction text placed first in the user message.
        model_id (str): Bedrock model ID to invoke. Defaults to the Claude 3
            Sonnet ID this function previously hard-coded.

    Returns:
        str: The ``text`` of the first content block in the response, or an
        empty string when the response carries no content blocks.
    """
    # A single user message: the prompt text followed by one image per frame.
    content = [{"type": "text", "text": prompt}]
    content.extend(
        {
            "type": "image",
            "source": {
                "type": "base64",
                "media_type": "image/jpeg",
                "data": encode_image_to_base64(frame),
            },
        }
        for frame in frames
    )
    messages = [{"role": "user", "content": content}]

    # Prepare the request body for Claude 3
    request_body = {
        "anthropic_version": "bedrock-2023-05-31",
        "max_tokens": 4096,
        "messages": messages,
        "temperature": 0.3,
        "top_p": 0.8,
        "top_k": 200
    }

    # Call Claude 3 model
    response = bedrock_client.invoke_model(
        modelId=model_id,
        body=json.dumps(request_body)
    )

    # Process the response. A missing or empty "content" list yields ""
    # instead of raising IndexError.
    response_body = json.loads(response.get("body").read().decode())
    blocks = response_body.get("content") or []
    return blocks[0].get("text", "") if blocks else ""

def process_s3_video(bucket_name, key_name, aws_access_key_id, aws_secret_access_key, width=640, batch_size=5, frame_step=3):
    """
    Downloads a video from S3, displays it, and runs a snow-detection prompt
    over sampled frames with Claude 3 on Bedrock.

    Every ``frame_step``-th frame is kept in memory, the kept frames are sent
    to the model ``batch_size`` at a time (one request per batch), and the
    combined answers are written to ``snow_analysis_<timestamp>.txt`` in the
    working directory.

    Args:
        bucket_name (str): The S3 bucket name
        key_name (str): The S3 object key (path to the MP4 file)
        aws_access_key_id (str): AWS access key ID
        aws_secret_access_key (str): AWS secret access key
        width (int): Width to display the video (in pixels)
        batch_size (int): Number of frames to process in each batch (>= 1)
        frame_step (int): Keep one frame out of every ``frame_step`` (>= 1).
            Defaults to 3, the interval this function previously hard-coded.

    Returns:
        str: The per-batch analyses joined by blank lines.

    Raises:
        ValueError: If ``batch_size`` or ``frame_step`` is smaller than 1.
        Exception: Any download, decoding or model error is printed and then
            re-raised.
    """
    # Reject sizes that would break sampling/batching before doing any
    # network work (batch_size=0 used to surface as an opaque range() error,
    # and a negative value silently produced an empty analysis).
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")
    if frame_step < 1:
        raise ValueError("frame_step must be at least 1")

    try:
        # Initialize S3 client
        s3_client = boto3.client(
            's3',
            aws_access_key_id=aws_access_key_id,
            aws_secret_access_key=aws_secret_access_key
        )

        # Initialize Bedrock client
        bedrock_client = boto3.client(
            service_name='bedrock-runtime',
            region_name='us-east-1',
            aws_access_key_id=aws_access_key_id,
            aws_secret_access_key=aws_secret_access_key
        )

        # Create temporary file for video (delete=False so it outlives the handle)
        with tempfile.NamedTemporaryFile(suffix='.mp4', delete=False) as temp_file:
            temp_path = temp_file.name

        try:
            # Download video from S3
            s3_client.download_file(bucket_name, key_name, temp_path)

            # Display video
            with open(temp_path, 'rb') as f:
                video_data = f.read()
            video_base64 = base64.b64encode(video_data).decode('utf-8')
            video_html = f"""
            <video width="{width}" controls>
              <source src="data:video/mp4;base64,{video_base64}" type="video/mp4">
              Your browser does not support the video tag.
            </video>
            """
            display(HTML(video_html))

            # Extract frames, keeping one out of every `frame_step`
            video = cv2.VideoCapture(temp_path)
            frames = []
            try:
                if not video.isOpened():
                    raise Exception(f"Could not open video file: {temp_path}")

                frame_count = 0
                while video.isOpened():
                    ret, frame = video.read()
                    if not ret:
                        break
                    if frame_count % frame_step == 0:
                        frames.append(frame)
                    frame_count += 1
            finally:
                # Release the capture even when opening or reading fails.
                video.release()

            # Prepare the prompt for snow detection
            prompt = """Analyze the following video frames and determine if there is snow present.
            Look for these specific indicators of snow:
            1. White or light-colored ground cover
            2. Snowflakes falling in the air
            3. Snow accumulation on surfaces
            4. People or objects covered in snow
            5. Snow-related activities (snowball fights, skiing, etc.)

            For each frame, provide:
            - Whether snow is present (Yes/No)
            - Confidence level (High/Medium/Low)
            - Specific evidence of snow if present

            Finally, provide an overall assessment:
            - Is snow present in the video? (Yes/No)
            - What percentage of frames show snow?
            - Any notable changes in snow conditions throughout the video?"""

            # Process frames in batches; each batch is one model request
            total_batches = (len(frames) + batch_size - 1)//batch_size
            all_analyses = []
            for i in range(0, len(frames), batch_size):
                batch = frames[i:i + batch_size]
                print(f"Processing batch {i//batch_size + 1} of {total_batches}")
                batch_analysis = process_frames_batch(bedrock_client, batch, prompt)
                all_analyses.append(batch_analysis)

            # Combine all analyses
            combined_analysis = "\n\n".join(all_analyses)

            # Save the analysis to a file
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            output_file = f"snow_analysis_{timestamp}.txt"
            with open(output_file, "w") as f:
                f.write(f"Snow Analysis for: {bucket_name}/{key_name}\n")
                f.write(f"Total Frames Analyzed: {len(frames)}\n")
                f.write(f"Analysis:\n{combined_analysis}\n")

            print(f"Snow analysis complete! Results saved to {output_file}")
            return combined_analysis

        finally:
            # Clean up temporary file
            if os.path.exists(temp_path):
                os.remove(temp_path)

    except Exception as e:
        print(f"Error: {str(e)}")
        raise



# Example usage
# NOTE: this downloads the video, sends every sampled frame to Bedrock
# (one request per batch) and writes a snow_analysis_<timestamp>.txt file.
process_s3_video(
    bucket_name='motiverse-2025-data',
    key_name='videos/3043711272.mp4',
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
    batch_size=5  # Process 5 frames at a time
)