# Retrieval and RAG evaluation

To evauate retrieval and RAG, we use the [RAGAS](https://docs.ragas.io/en/latest/) library.

In [20]:
#!pip install ragas

Import libraries related to data processing and vector DB and secrets for external APIs.

In [21]:
import os
import random
import numpy as np
import pandas as pd
import numpy as np
from typing import List
from sentence_transformers import SentenceTransformer
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec
from sklearn.metrics.pairwise t time
import numpy as np
from openai import OpenAI
pinecone_api_key = os.environ.get('PINECONE_API_KEY')
openai_api_key= os.environ.get('OPENAI_API_KEY')

# Setting up Vector DB
INDEX_NAME = "geo-kowledge-base"
# Initialize Pinecone
pc = Pinecone(api_key=pinecone_api_key)
# Check if the index exists before trying to create it
if not pc.has_index(INDEX_NAME):
    pc.create_index(
        name=INDEX_NAME,
        dimension=768,
        metric="cosine",
        spec=ServerlessSpec(
            cloud='aws', 
            region='us-east-1'
        ) 
    )  
else:
    print(f"Index {INDEX_NAME} already exists.")

# Wait for the index to be ready
while not pc.describe_index(INDEX_NAME).status['ready']:
    time.sleep(1)
    
index = pc.Index(INDEX_NAME)

Index geo-kowledge-base already exists.


In [22]:
df  = pd.read_csv('/workspaces/geospatial-chat/data/processed/train_data.csv')

In [23]:
df = df.drop('Unnamed: 0', axis=1)

In [24]:
df.head()

Unnamed: 0,question,area,text,id
0,1. What is a land survey?,surveying,A land survey is a report in the form of a map...,doc1134
1,2. Why are land surveys so important?,surveying,Prior to the closing on your new home purchase...,doc1133
2,3. How is a land survey performed?,surveying,The land surveyor's responsibility is to locat...,doc1132
3,4. Why have I been asked if I want property ma...,surveying,In the course of performing a survey in New Je...,doc1131
4,5. Why should I not waive the marking of prope...,surveying,Boundary lines shown on a survey map of the pr...,doc1130


In [25]:
client = OpenAI(api_key=openai_api_key)

In [26]:
def get_context_from_pinecone(user_query):
    model = SentenceTransformer("all-mpnet-base-v2")

    
    query_vector = model.encode(user_query).tolist()  
    results = index.query(vector=query_vector, top_k=1, include_metadata=True)

    context = []
    for match in results["matches"]:
        # Check if 'metadata' exist and y contains 'title'
        if "metadata" in match and "text" in match["metadata"]:
            context.append(match["metadata"]["text"])
        else:
            # in case the 'title' doesn exist, use the ID as fallback
            context.append(f"Document ID: {match['id']}")

    
    return context

In [27]:
def create_prompt(user_query, context):
    prompt = f"""
    You are an Surveying, Mapping and Geospatial Expert. Based on the following context, answer the user's query in detail.
    
    Context: {context}
    
    User Query: {user_query}
    
    Please provide a clear, concise, and informative answer.
    """
    return prompt

In [28]:
def generate_rag_response(prompt):
    response = client.chat.completions.create(
        model='gpt-3.5-turbo',
        messages=[{"role": "user", "content": prompt}]
    )
    
    return response.choices[0].message.content

In [29]:
# Seleccionar una muestra aleatoria de 20 filas (si tienes al menos 20)
sampled_df = df.sample(n=20, replace=True)  # Cambiar a False si no quieres reemplazo

# Agregar las columnas para retrieved_context y rag_response
sampled_df['retrieved_context'] = sampled_df.apply(lambda row: get_context_from_pinecone(row['question']), axis=1)
sampled_df['promt'] = sampled_df.apply(lambda row: create_prompt(row['question'],row['retrieved_context']), axis=1)
sampled_df['rag_response'] = sampled_df.apply(lambda row: generate_rag_response(row['promt']), axis=1)



In [30]:
sampled_df.head()

Unnamed: 0,question,area,text,id,retrieved_context,promt,rag_response
254,WHAT ARE THE DIFFERENT TYPES OF SURVEYS?,surveying,"Construction layout services, also known as co...",doc0880,[As-Built or Record Drawings:],"\n You are an Surveying, Mapping and Geospa...",There are several different types of surveys t...
851,How to Start Web Mapping?,"['web-mapping', 'references']",Penn State has an Open Web Mapping class. It s...,doc0283,[I want to start working on a web map at work ...,"\n You are an Surveying, Mapping and Geospa...","To start web mapping, you will need to follow ..."
491,What Is the Difference Between a Land Survey a...,other,Civil engineering,doc0643,[Civil engineering],"\n You are an Surveying, Mapping and Geospa...",A land survey typically involves determining a...
616,Are your condition and quality scores based on...,surveying,We have built standardized condition and quali...,doc0518,[Our proprietary R1R6 model is not based on a ...,"\n You are an Surveying, Mapping and Geospa...",The condition and quality scores in our R1R6 m...
488,What Is the Difference Between a Land Survey a...,surveying,Get Answers to Your Questions About Land Surve...,doc0646,[Civil engineering],"\n You are an Surveying, Mapping and Geospa...",A land survey typically involves determining a...


In [33]:
sampled_df['rag_response'] = sampled_df.apply(lambda row: cosine_similarity(row['text'], row['retrieved_context']), axis=1)

InvalidParameterError: The 'X' parameter of cosine_similarity must be an array-like or a sparse matrix. Got 'Construction layout services, also known as construction staking or site layout survey, establish and mark the precise location of proposed structures, utilities, and other infrastructure on a construction site. This crucial step ensures buildings are placed correctly according to design plans and minimizes errors and conflicts, reducing the risk of costly rework later in the construction process.' instead.

Converting data to ragas evaluation dataset