# Retrieval and RAG evaluation

To evauate retrieval and RAG, we use the [RAGAS](https://docs.ragas.io/en/latest/) library.

In [80]:
#!pip install ragas

Import libraries related to data processing and vector DB and secrets for external APIs.

In [81]:
import os
import random
import numpy as np
import pandas as pd
import numpy as np
from typing import List
from sentence_transformers import SentenceTransformer
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from openai import OpenAI
pinecone_api_key = os.environ.get('PINECONE_API_KEY')
openai_api_key= os.environ.get('OPENAI_API_KEY')

# Setting up Vector DB
INDEX_NAME = "geo-kowledge-base"
# Initialize Pinecone
pc = Pinecone(api_key=pinecone_api_key)
# Check if the index exists before trying to create it
if not pc.has_index(INDEX_NAME):
    pc.create_index(
        name=INDEX_NAME,
        dimension=768,
        metric="cosine",
        spec=ServerlessSpec(
            cloud='aws', 
            region='us-east-1'
        ) 
    )  
else:
    print(f"Index {INDEX_NAME} already exists.")

# Wait for the index to be ready
while not pc.describe_index(INDEX_NAME).status['ready']:
    time.sleep(1)
    
index = pc.Index(INDEX_NAME)

Index geo-kowledge-base already exists.


In [82]:
df  = pd.read_csv('/workspaces/geospatial-chat/data/processed/train_data.csv')

In [83]:
df = df.drop('Unnamed: 0', axis=1)

In [84]:
df.head()

Unnamed: 0,question,area,text,id
0,1. What is a land survey?,surveying,A land survey is a report in the form of a map...,doc1134
1,2. Why are land surveys so important?,surveying,Prior to the closing on your new home purchase...,doc1133
2,3. How is a land survey performed?,surveying,The land surveyor's responsibility is to locat...,doc1132
3,4. Why have I been asked if I want property ma...,surveying,In the course of performing a survey in New Je...,doc1131
4,5. Why should I not waive the marking of prope...,surveying,Boundary lines shown on a survey map of the pr...,doc1130


In [85]:
client = OpenAI(api_key=openai_api_key)

In [86]:
def get_context_from_pinecone(user_query):
    model = SentenceTransformer("all-mpnet-base-v2")

    
    query_vector = model.encode(user_query).tolist()  
    results = index.query(vector=query_vector, top_k=1, include_metadata=True)

    context = []
    for match in results["matches"]:
        # Check if 'metadata' exist and y contains 'title'
        if "metadata" in match and "text" in match["metadata"]:
            context.append(match["metadata"]["text"])
        else:
            # in case the 'title' doesn exist, use the ID as fallback
            context.append(f"Document ID: {match['id']}")

    
    return context

In [87]:
def create_prompt(user_query, context):
    prompt = f"""
    You are an Surveying, Mapping and Geospatial Expert. Based on the following context, answer the user's query in detail.
    
    Context: {context}
    
    User Query: {user_query}
    
    Please provide a clear, concise, and informative answer.
    """
    return prompt

In [88]:
def generate_rag_response(prompt):
    response = client.chat.completions.create(
        model='gpt-3.5-turbo',
        messages=[{"role": "user", "content": prompt}]
    )
    
    return response.choices[0].message.content

In [112]:
# Función para calcular la similitud coseno entre el texto y el contexto recuperado
def get_cosine_similarity(text, retrieved_context):
    model = SentenceTransformer("all-mpnet-base-v2")
    
    # Codificar cada uno por separado
    text_vector = model.encode(text)
    context_vector = model.encode(retrieved_context)

    # Asegurarse de que ambos vectores sean 2D
    text_vector = text_vector.reshape(1, -1)
    context_vector = context_vector.reshape(1, -1)
    
    # Calcular similitud coseno
    metric_cosine = cosine_similarity(text_vector, context_vector)[0][0]  # Extraer valor escalar
    
    return metric_cosine

In [90]:
# Seleccionar una muestra aleatoria de 20 filas (si tienes al menos 20)
sampled_df = df.head(5)  # Cambiar a False si no quieres reemplazo

# Agregar las columnas para retrieved_context y rag_response
sampled_df['retrieved_context'] = sampled_df.apply(lambda row: get_context_from_pinecone(row['question']), axis=1)
sampled_df['promt'] = sampled_df.apply(lambda row: create_prompt(row['question'],row['retrieved_context']), axis=1)
sampled_df['rag_response'] = sampled_df.apply(lambda row: generate_rag_response(row['promt']), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sampled_df['retrieved_context'] = sampled_df.apply(lambda row: get_context_from_pinecone(row['question']), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sampled_df['promt'] = sampled_df.apply(lambda row: create_prompt(row['question'],row['retrieved_context']), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returni

In [91]:
sampled_df.head()

Unnamed: 0,question,area,text,id,retrieved_context,promt,rag_response
0,1. What is a land survey?,surveying,A land survey is a report in the form of a map...,doc1134,[Importance of Land Surveys],"\n You are an Surveying, Mapping and Geospa...",A land survey is a detailed study and measurem...
1,2. Why are land surveys so important?,surveying,Prior to the closing on your new home purchase...,doc1133,[Importance of Land Surveys],"\n You are an Surveying, Mapping and Geospa...",Land surveys are incredibly important for a va...
2,3. How is a land survey performed?,surveying,The land surveyor's responsibility is to locat...,doc1132,[Only a licensed surveyor can produce an offic...,"\n You are an Surveying, Mapping and Geospa...",A land survey is typically performed by a lice...
3,4. Why have I been asked if I want property ma...,surveying,In the course of performing a survey in New Je...,doc1131,[In the course of performing a survey in New J...,"\n You are an Surveying, Mapping and Geospa...","In New Jersey, State law requires land surveyo..."
4,5. Why should I not waive the marking of prope...,surveying,Boundary lines shown on a survey map of the pr...,doc1130,[Boundary lines shown on a survey map of the p...,"\n You are an Surveying, Mapping and Geospa...","As a Surveying, Mapping, and Geospatial Expert..."


In [113]:
sampled_df['cosine_similarity'] = sampled_df.apply(
    lambda row: get_cosine_similarity(row['text'], row['retrieved_context']), 
    axis=1
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sampled_df['cosine_similarity'] = sampled_df.apply(


In [114]:
sampled_df.head()

Unnamed: 0,question,area,text,id,retrieved_context,promt,rag_response,cosine_similarity
0,1. What is a land survey?,surveying,A land survey is a report in the form of a map...,doc1134,[Importance of Land Surveys],"\n You are an Surveying, Mapping and Geospa...",A land survey is a detailed study and measurem...,0.688263
1,2. Why are land surveys so important?,surveying,Prior to the closing on your new home purchase...,doc1133,[Importance of Land Surveys],"\n You are an Surveying, Mapping and Geospa...",Land surveys are incredibly important for a va...,0.737694
2,3. How is a land survey performed?,surveying,The land surveyor's responsibility is to locat...,doc1132,[Only a licensed surveyor can produce an offic...,"\n You are an Surveying, Mapping and Geospa...",A land survey is typically performed by a lice...,0.660505
3,4. Why have I been asked if I want property ma...,surveying,In the course of performing a survey in New Je...,doc1131,[In the course of performing a survey in New J...,"\n You are an Surveying, Mapping and Geospa...","In New Jersey, State law requires land surveyo...",1.0
4,5. Why should I not waive the marking of prope...,surveying,Boundary lines shown on a survey map of the pr...,doc1130,[Boundary lines shown on a survey map of the p...,"\n You are an Surveying, Mapping and Geospa...","As a Surveying, Mapping, and Geospatial Expert...",1.0


Converting data to ragas evaluation dataset