authors: amirimani@, arunpshankar@
date: 18/3/2024


----

# Setup and config

### Install required libraries


In [None]:
! pip3 install --upgrade --user -q google-cloud-aiplatform
! pip3 install --upgrade --user -q google-cloud-discoveryengine
! pip3 install --upgrade --user -q langchain

In [None]:
!pip freeze | grep google-cloud-discoveryengine

In [None]:
# Restart kernel after installs so that your environment can access the new packages
import IPython
import time

app = IPython.Application.instance()
app.kernel.do_shutdown(True)

### Authenticate your notebook environment (Colab only)

In [None]:
import sys

# Additional authentication is required for Google Colab
if "google.colab" in sys.modules:
    # Authenticate user to Google Cloud
    from google.colab import auth

    auth.authenticate_user()

In [None]:
# Define project information
PROJECT_ID = "amir-genai-bb"  # @param {type:"string"}
LOCATION = "us-central1"  # @param {type:"string"}

# Initialize Vertex AI
import vertexai

vertexai.init(project=PROJECT_ID, location=LOCATION)

# Cloud storage work


In [None]:
from google.cloud import storage

In [None]:
from google.cloud import storage

def copy_html_files_between_prefixes(bucket_name, source_prefix, destination_prefix):
    """Copy every ``.html`` object under one prefix to another prefix in the same bucket.

    The part of each object name that follows ``source_prefix`` is kept and
    appended to ``destination_prefix``, so any sub-folder layout is preserved.
    Objects whose name does not end in ``.html`` are left untouched.

    Args:
        bucket_name (str): Name of the Cloud Storage bucket.
        source_prefix (str): The prefix to copy files from.
        destination_prefix (str): The prefix to copy files to.
    """
    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)

    # Length of the prefix to strip is the same for every blob.
    prefix_length = len(source_prefix)

    for blob in bucket.list_blobs(prefix=source_prefix):
        if not blob.name.endswith('.html'):
            continue

        # Swap the source prefix for the destination prefix.
        new_blob_name = destination_prefix + blob.name[prefix_length:]

        # The copy is performed for its side effect; the returned blob is not needed.
        bucket.copy_blob(blob, bucket, new_blob_name)
        print(f"Copied {blob.name} to {new_blob_name}")


In [None]:

# Replace with your bucket name and prefixes

bucket_name = "app-454763165029"
source_prefix = "ingestion/1712146440/"
destination_prefix = "ingestion/1712146440_html/"

copy_html_files_between_prefixes(bucket_name, source_prefix, destination_prefix)


# Metadata Processing

Check if the values in the metadata are standardized.

In [None]:
import pandas as pd
from tqdm import tqdm
from google.cloud import storage

In [None]:
def ingest_json_from_gcs(bucket_name, prefix):
    """
    Reads all JSON files within a specified prefix in a GCS bucket and
    combines them into a single Pandas DataFrame.

    Each ``.json`` object is parsed as line-delimited JSON (one record per
    line). Objects with any other extension are ignored.

    Args:
        bucket_name (str): Name of the GCS bucket.
        prefix (str): Prefix to filter JSON files (e.g., 'data/').

    Returns:
        pandas.DataFrame: The combined DataFrame, or ``None`` (after printing
        a message) when no ``.json`` object exists under the prefix.
    """
    import io  # local import so this cell does not depend on another cell's imports

    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)

    frames = []
    for blob in bucket.list_blobs(prefix=prefix):
        if not blob.name.endswith('.json'):
            continue
        # download_as_string() is a deprecated alias of download_as_bytes().
        json_str = blob.download_as_bytes().decode('utf-8')
        # Recent pandas versions deprecate passing literal JSON text to
        # read_json, so hand it a file-like object instead.
        frames.append(pd.read_json(io.StringIO(json_str), lines=True))

    if not frames:
        print('No JSON files found in the specified location.')
        return None

    return pd.concat(frames, ignore_index=True)


In [None]:
bucket_name = 'app-454763165029'
# prefix = 'ingestion/1710168869/1710168869_datastore_ingest/'
prefix = 'ingestion/1712227118/1712227118_datastore_ingest'

In [None]:
df_meta = ingest_json_from_gcs(bucket_name, prefix)
df_meta.shape

In [None]:
# expand the dictionary column to individual columns
df_meta = pd.json_normalize(df_meta['structData'])

In [None]:
df_meta.columns

In [None]:
df_id_map = df_meta[["id", "article_version_id"]]
# df_id_map.to_csv('./id_map.csv', index=False)

In [None]:
all_sates = set(df_meta['state'].explode())
assert len(all_sates) == 50

In [None]:
state_abbreviations = [
    "AL",
    "AK",
    "AZ",
    "AR",
    "CA",
    "CO",
    "CT",
    "DE",
    "FL",
    "GA",
    "HI",
    "ID",
    "IL",
    "IN",
    "IA",
    "KS",
    "KY",
    "LA",
    "ME",
    "MD",
    "MA",
    "MI",
    "MN",
    "MS",
    "MO",
    "MT",
    "NE",
    "NV",
    "NH",
    "NJ",
    "NM",
    "NY",
    "NC",
    "ND",
    "OH",
    "OK",
    "OR",
    "PA",
    "RI",
    "SC",
    "SD",
    "TN",
    "TX",
    "UT",
    "VT",
    "VA",
    "WA",
    "WV",
    "WI",
    "WY",
]

[x for x in all_sates if x not in state_abbreviations]

In [None]:
all_brands = set(df_meta['brand'].explode())
all_brands

In [None]:

# only keep "bristol west" documents
def _has_bristol_west(brands):
    """Return True when "Bristol_West" is contained in a row's `brand` value.

    The metadata also contains None/NaN brands; those do not support the `in`
    operator, so they count as "no match" instead of raising TypeError.
    """
    try:
        return "Bristol_West" in brands
    except TypeError:
        return False


mask = df_meta['brand'].apply(_has_bristol_west)
df_meta_filtered = df_meta[mask]

print(df_meta_filtered.shape, df_meta.shape)


In [None]:
def copy_filtered_pdfs(row, destination_prefix, storage_client=None):
    """Copy the HTML file that belongs to one metadata row to a new prefix.

    Despite the name, the object copied is ``<id>.html``. It is read from
    ``<path part of row["output_ingestion"]>/html/`` and written to
    ``destination_prefix`` inside the same hard-coded bucket.

    Args:
        row: Mapping/Series with an ``"output_ingestion"`` gs:// URI and an
            ``"id"`` string.
        destination_prefix (str): Prefix the copy is written under.
        storage_client: Optional ``storage.Client`` to reuse. When omitted a
            new client is created, which is wasteful if this function is
            called once per row in a long loop.
    """
    if storage_client is None:
        storage_client = storage.Client()

    bucket_name = "app-454763165029"
    bucket = storage_client.bucket(bucket_name)

    # Only the object path of the gs:// URI is used; the bucket named in the
    # URI is ignored and the hard-coded bucket above is used for both sides.
    _, _, source_prefix = row["output_ingestion"].replace('gs://', '').partition('/')
    blob_name = row["id"] + '.html'

    source_blob = bucket.blob(source_prefix + "/html/" + blob_name)
    destination_blob_name = destination_prefix + blob_name

    bucket.copy_blob(source_blob, bucket, destination_blob_name)

In [None]:

df_meta_filtered[3327:].shape

In [None]:
# copy filtered files to a new bucket
for idx, row in tqdm(df_meta_filtered.iterrows(), total=df_meta_filtered.shape[0]):
  copy_filtered_pdfs(row,   destination_prefix = "bristol_west_html/")

### Metadata summary:

* **state**: make a decision on ["Could not find value 'All' for column 'states' in translation map.", nan]
* **brand**: make a decision on [None, nan]

# Get the Answers from VAIS



In [None]:
from google.cloud import discoveryengine_v1beta as discoveryengine
from google.api_core.client_options import ClientOptions
from google.protobuf import json_format
from google.cloud import storage


import pandas as pd
from tqdm import tqdm
from typing import Optional, Dict, Any

# from IPython.display import Markdown, display

In [None]:
# df_test = pd.read_csv("./approach-3-fail-cases - approach-3-fail-cases.csv")
# df_test = pd.read_csv("./output_21032024.csv")
df_test = pd.read_csv('./Farmers KM Search 14Feb - Sheet1.csv')
df_test.head()

# Only select "Bristol_West" rows
df_test = df_test[df_test['brand']=="Bristol West"].reset_index(drop=True)
df_test["brand"] = "Bristol_West"
df_test.shape

In [None]:
df_test.head()

In [None]:
def search_data_store(
    search_query: str, filter_str: Optional[str] = None
) -> Optional[discoveryengine.SearchResponse]:
    """
    Search the data store using Google Cloud's Discovery Engine API.

    The project, location, data store and number of documents are hard-coded
    in the "config" section below. Snippets, extractive content and an
    LLM-generated summary with citations are requested with every query.

    Args:
        search_query (str): The search query string.
        filter_str (Optional[str]): Filter expression for the query. When
            omitted or empty, no filter is sent.

    Returns:
        The object returned by ``client.search`` on success, or ``None`` if
        any exception was raised (the error is printed). Callers must check
        for ``None`` before reading attributes such as ``.summary``.
    """

    # config
    location = "us"
    project_id = "454763165029"
    # engine_id="app-data-store"
    engine_id = "farmers-bw-html_1712268336460"

    n_doc = 10

    try:
        # Non-global locations are served from a regional endpoint.
        client_options = (
            ClientOptions(api_endpoint=f"{location}-discoveryengine.googleapis.com")
            if location != "global"
            else None
        )

        client = discoveryengine.SearchServiceClient(client_options=client_options)

        serving_config = client.serving_config_path(
            project=project_id,
            location=location,
            data_store=engine_id,
            serving_config="default_config",
        )

        content_search_spec = discoveryengine.SearchRequest.ContentSearchSpec(
            snippet_spec=discoveryengine.SearchRequest.ContentSearchSpec.SnippetSpec(
                return_snippet=True,
                max_snippet_count=10
            ),
            extractive_content_spec=discoveryengine.SearchRequest.ContentSearchSpec.ExtractiveContentSpec(
                max_extractive_answer_count=n_doc,
                max_extractive_segment_count=n_doc,
            ),
            summary_spec=discoveryengine.SearchRequest.ContentSearchSpec.SummarySpec(
                summary_result_count=10,
                include_citations=True,
                ignore_adversarial_query=False,
                ignore_non_summary_seeking_query=False,
                language_code="en",
                model_spec=discoveryengine.SearchRequest.ContentSearchSpec.SummarySpec.ModelSpec(
                    version="text-bison@002/answer_gen/v1"
                ),
                # use_semantic_chunks=True
            ),
        )

        request_kwargs = dict(
            serving_config=serving_config,
            query=search_query,
            page_size=n_doc,
            content_search_spec=content_search_spec,
            query_expansion_spec=discoveryengine.SearchRequest.QueryExpansionSpec(
                condition=discoveryengine.SearchRequest.QueryExpansionSpec.Condition.AUTO,
            ),
            spell_correction_spec=discoveryengine.SearchRequest.SpellCorrectionSpec(
                mode=discoveryengine.SearchRequest.SpellCorrectionSpec.Mode.AUTO
            ),
        )
        # Only send a filter when one was supplied, so the default call is
        # the same unfiltered request as before.
        if filter_str:
            request_kwargs["filter"] = filter_str

        request = discoveryengine.SearchRequest(**request_kwargs)

        return client.search(request)

    except Exception as e:
        print(f"Error during data store search: {e}")
        return None

def get_gs_links(search_response):
    """Return the ``derivedStructData.link`` of every result, in result order.

    Args:
        search_response: Search response whose ``results`` each expose a
            ``document`` protobuf message.

    Returns:
        list: One link per result.

    Raises:
        KeyError: If a result has no ``derivedStructData`` / ``link`` entry.
    """
    # NOTE: the current data-store config does not return `structData`
    # (id / article_version_id), so only the derived link is collected.
    return [
        # Convert protocol buffer message to a plain dict before indexing.
        json_format.MessageToDict(r.document._pb)['derivedStructData']['link']
        for r in search_response.results
    ]



### Check if VAIS returns the right URL in the top 3 answers

In [None]:
def vais_text(search_response):
    """Collect the ``derivedStructData`` dict of every search result, in order."""
    return [
        # Each result document is a protobuf message; turn it into a dict first.
        json_format.MessageToDict(result.document._pb)["derivedStructData"]
        for result in search_response.results
    ]

In [None]:

all_links = []
version_id_list = []
extractive_content_list = []
summary_list = []

# Query VAIS once per test question and collect, per row: the generated
# summary, the links of the returned documents and their derived content.
for idx, row in tqdm(df_test.iterrows(), total=df_test.shape[0]):

    search_query = row["question"]

    hits = search_data_store(search_query)
    if hits is None:
        # search_data_store returns None when the request failed (it already
        # printed the error). Append placeholders so every list stays aligned
        # with the rows of df_test instead of crashing on `hits.summary`.
        summary_list.append("")
        all_links.append([])
        extractive_content_list.append([])
        continue

    summary_list.append(hits.summary.summary_text)
    # pdf_links, version_id = get_gs_links(hits)
    pdf_links = get_gs_links(hits)

    all_links.append(pdf_links)
    # version_id_list.append(version_id)

    extractive_content_list.append(vais_text(hits))


df_test['VAIS_links_cleaned'] = all_links
# df_test['VAIS_version_id'] = version_id_list

df_test['vais_ans'] = extractive_content_list

In [None]:
df_test["vais_summary"] = summary_list

In [None]:
# this is now required with the html files!!!!!!
import os

df_map = pd.read_csv('./id_map__280302024.csv')


def get_version_id(row, id_map=None):
    """Map a list of document links to their article version IDs.

    For each link, the file name without directory and extension is taken as
    the document ``id`` and looked up in ``id_map``.

    Args:
        row: Iterable of link/path strings (one cell of ``VAIS_links_cleaned``).
        id_map: DataFrame with ``id`` and ``article_version_id`` columns.
            Defaults to the module-level ``df_map``.

    Returns:
        list: One entry per link, in order. An entry is ``None`` when the
        document id is not present in ``id_map``.
    """
    if id_map is None:
        id_map = df_map

    # First occurrence wins, matching a "first matching row" lookup.
    version_by_id = {}
    for doc_id, version_id in zip(id_map['id'], id_map['article_version_id']):
        version_by_id.setdefault(doc_id, version_id)

    version_id_list = []
    for link in row:
        file_name = os.path.basename(link)
        doc_id = os.path.splitext(file_name)[0]
        version_id_list.append(version_by_id.get(doc_id))
    return version_id_list


df_test["VAIS_doc_title"] = df_test["VAIS_links_cleaned"].apply(get_version_id)

In [None]:
df_test.head()

In [None]:
# check if expected ID exists among the documents returned by VAIS
def check_if_exists(row, top_k=None):
    """Return True when ``row['expected_id']`` is in ``row['VAIS_doc_title']``.

    Args:
        row: Mapping/Series with ``expected_id`` and ``VAIS_doc_title`` (the
            ordered list of document IDs returned by VAIS).
        top_k: When given, only the first ``top_k`` returned documents are
            considered. The default ``None`` checks every returned document;
            pass ``top_k=3`` for a strict "top 3" check.
    """
    candidates = row['VAIS_doc_title']
    if top_k is not None:
        candidates = candidates[:top_k]
    return row['expected_id'] in candidates

df_test["VAIS_contains_expected"] = False

# Rows where the expected ID is among the VAIS results
idx_contains_retrieval = df_test[df_test.apply(check_if_exists, axis=1)].index
df_test.loc[idx_contains_retrieval, "VAIS_contains_expected"] = True
df_test[df_test["VAIS_contains_expected"]].shape

In [None]:
# get top result from VAIS
df_test["VAIS_top_doc"] = df_test["VAIS_doc_title"].apply(lambda x: x[0] if x else "")

idx_correct_retrieval = df_test[df_test["VAIS_top_doc"]==df_test["expected_id"]].index
df_test["VAIS_equals_expected"] = False

# Rows where the expected ID is the top result from VAIS
df_test.loc[idx_correct_retrieval, "VAIS_equals_expected"] = True

df_test[df_test["VAIS_equals_expected"]].shape

In [None]:
df_test.head()

In [None]:

df_test[df_test["VAIS_contains_expected"]].shape

# Gary's VAIS

https://colab.sandbox.google.com/drive/1JuW-HVKhVw0tjDtylAKXMcC3FYGkbths#scrollTo=cC5Qk37HLe9J

In [None]:
!pip install colabtools

In [None]:
# @title Allow the colab to use your GCP authentication
import requests
import json
import pandas as pd

from google.colab import auth
from google.auth import default
import google.auth.transport.requests
auth.authenticate_user()
creds, _ = google.auth.default()
auth_req = google.auth.transport.requests.Request()
import pprint
creds.refresh(auth_req)

In [None]:
!ls

In [None]:
df = pd.read_csv('Farmers KM Search 14Feb - New 100BW From Gary & Anindo.csv')

In [None]:
def get_result(query, engine):
    """Query a Vertex AI Search data store over REST and return IDs plus summary.

    Sends one ``:search`` request (v1alpha) restricted to the "Bristol_West"
    brand, asking for snippets and an LLM-generated summary.

    Args:
        query (str): The user question.
        engine (str): Data store ID to query.

    Returns:
        tuple: ``(article_id, gen_answer)`` where ``article_id`` is the list
        of document IDs of up to the first 10 results and ``gen_answer`` is
        the summary text (``""`` when the response has no summary).

    Raises:
        requests.HTTPError: If the API answers with an error status.
    """
    project_id = 'code-401909'
    # app-data-store, global-ds-app_1712681008775
    data_store_id = engine
    region = 'global'
    preamble = ''

    model_version = 'gemini-1.0-pro-001/answer_gen/v1'  # text-bison@002/answer_gen/v1, gemini-1.0-pro-001/answer_gen/v1
    language = 'en'

    api_engine = 'discoveryengine.googleapis.com'
    if region == 'us':
        api_engine = 'us-discoveryengine.googleapis.com'

    # Built once as an f-string and used for both the URL and the request
    # body, so the {placeholders} are actually substituted in both places.
    serving_config = (
        f'projects/{project_id}/locations/{region}/collections/default_collection'
        f'/dataStores/{data_store_id}/servingConfigs/default_search'
    )

    resp = requests.post(
        f'https://{api_engine}/v1alpha/{serving_config}:search',
        headers={
            'Content-Type': 'application/json',
            'Authorization': 'Bearer ' + creds.token,
        },
        json={
            "servingConfig": serving_config,
            "contentSearchSpec": {
                "snippetSpec": {
                    "max_snippet_count": 3,
                },
                "summarySpec": {
                    "summaryResultCount": 5,
                    "languageCode": language,
                    "includeCitations": True,
                    "modelSpec": {"version": model_version},
                    "modelPromptSpec": {"preamble": preamble},
                    "useSemanticChunks": True,
                },
            },
            "query": query,
            "pageSize": 10,
            # (brand: ANY("Bristol_West")) AND (state: ANY("WA"))
            # "brand: ANY(\"Bristol_West\")"
            "filter": "brand: ANY(\"Bristol_West\")"
        },
    )
    # Fail with the HTTP error itself rather than a KeyError on an error payload.
    resp.raise_for_status()

    # Parse the body once instead of on every access.
    payload = resp.json()

    # Output the result
    print('Search Resutls')

    # Slice instead of indexing 0..9 so fewer than 10 results is not an error.
    article_id = [result['document']['id'] for result in payload.get('results', [])[:10]]

    gen_answer = payload.get('summary', {}).get('summaryText', '')

    return article_id, gen_answer

In [None]:
vais_id = []
vais_answer =[]


engines = ["global-ds-app_1712681008775",
          #  "global-ds-app-chunk_1712681551374"
           ]

for idx, row in df.head(5).iterrows():

  for e in engines:
    print(row["question"])
    v_id, v_ans = get_result(row["question"], e)
    vais_id.append(v_id)
    vais_answer.append(v_ans)
    print(vais_answer)

## Comparing Gary's VAIS result to GT

In [None]:
!pip install rouge_score

In [None]:
import pandas as pd
# from rouge_score import rouge_scorer

from vertexai.generative_models import GenerationConfig, GenerativeModel


In [None]:
df = pd.read_csv('Farmers KM Search 14Feb - [tuned] New 100BW From global-ds-app_1712681008775.csv')
df = df.loc[:99]
df.head()

Unnamed: 0,Serviced By,Question,State,Question\nWord Count,Expected Article,Expected Article ID,Expected Article ID Link,Expected Answer,question,GCP search_ids,Found\nexpected_id_Pos,Search_Loss_Reason,VAIS Answer,Answer Correctness
0,Bristol_West,How do I fix Policy EFT Error Missing required...,,10.0,"New business quote error ""Policyholder EFT Err...",kAD4T000000PC7YWAW,https://storage.mtls.cloud.google.com/app-4547...,Launch Rate tab\n Select No from drop down men...,How do I fix Policy EFT Error Missing required...,kAD4T000000PC7YWAW kAD1L000000TNndWAG kAD1L000...,,,"If you receive a ""Policyholder EFT Error: Miss...",
1,Bristol_West,"What do I do with the Policy EFT Error, Missin...",CA,12.0,"New business quote error ""Policyholder EFT Err...",kAD4T000000PC7YWAW,https://storage.mtls.cloud.google.com/app-4547...,Launch Rate tab\n Select No from drop down men...,"What do I do with the Policy EFT Error, Missin...",kAD4T000000PC7YWAW kAD1L000000fxhbWAA kAD1L000...,,,If you receive a Policy EFT Error indicating m...,
2,Bristol_West,what does posted not charged mean,FL,6.0,DE102864 - Payments posted not charged,kAD4T000000GsLiWAK,https://storage.mtls.cloud.google.com/app-4547...,"Symptom:1,926 policies were impacted by defect...",what does posted not charged mean,kAD4T000000GsLiWAK kAD4T00000000S0WAI kAD1L000...,,,"""Posted not charged"" means that a payment has ...",
3,Bristol_West,how much is the NSF charge in Washington?,WA,8.0,Washington Billing Fees,kAD4T000000GpiAWAS,https://storage.mtls.cloud.google.com/app-4547...,25$,how much is the NSF charge in Washington?,kAD4T000000GpiAWAS kAD1L000000fzksWAA kAD1L000...,,,The NSF charge in Washington is $25. This fee ...,
4,Bristol_West,What do I do if a caller requests the Farmers ...,,12.0,How to Handle a Caller's Request for the Farme...,kAD4T000000GmziWAC,https://storage.mtls.cloud.google.com/app-4547...,a caller requests for a copy of the Farmers Do...,What do I do if a caller requests the Farmers ...,kAD4T000000GmziWAC kAD1L000000g07MWAQ kAD1L000...,,,"If a caller requests the Farmers DNC Policy, y...",


In [None]:
df.tail()

Unnamed: 0,Serviced By,Question,State,Question\nWord Count,Expected Article,Expected Article ID,Expected Article ID Link,Expected Answer,question,GCP search_ids,Found\nexpected_id_Pos,Search_Loss_Reason,VAIS Answer,Answer Correctness
95,Bristol_West,Can I clear an SP02 Error?,,6.0,How can I clear SP02 Error?,,,Yes- Submit a QC defect Request,Can I clear an SP02 Error?,kAD1L000000fz3gWAA kAD1L000000fxbtWAA kAD1L000...,,,"I'm sorry, but I cannot answer your question b...",
96,Bristol_West,Do Michigan benefits qualify for QHC in VA?,,8.0,,,,No Article found with this topic. Expecting ge...,Do Michigan benefits qualify for QHC in VA?,kAD1L000000gDNWWA2 kAD4T000000XbUNWA0 kAD1L000...,,,"No, VA benefits do not qualify for QHC in Virg...",
97,Bristol_West,Is Advanced Purchase Discount available in MX ?,,8.0,,,,No Article found with this topic. Expecting ge...,Is Advanced Purchase Discount available in MX ?,kAD1L000000PBaOWAW kAD1L000000g2Y7WAI kAD1L000...,,,"Yes, Advance Purchase Discount is available in...",
98,Bristol_West,Is Agent 0498746 restricted from writing Highe...,,8.0,,,,Not Found,Is Agent 0498746 restricted from writing Highe...,kAD1L00000000FqWAI kAD1L00000000LvWAI kAD1L000...,,,This question cannot be answered from the give...,
99,Bristol_West,What is the mid term cancellation fee in TX ?,,10.0,,,,There should be no answer to this.,What is the mid term cancellation fee in TX ?,kAD1L000000g3PfWAI kAD1L000000g7ENWAY kAD4T000...,,,There is no cancellation fee for mid-term canc...,


In [None]:

df["GCP search_ids"] = df["GCP search_ids"].apply(lambda x: x.split(" "))

In [None]:
def find_location(target_e, source_l):
    """Return the 1-based position of the first element equal to ``target_e``.

    Returns an empty string when ``source_l`` contains no equal element.
    """
    return next(
        (position for position, element in enumerate(source_l, start=1) if element == target_e),
        "",
    )

df["Found\nexpected_id_Pos"] = df.apply(lambda x: find_location(x["Expected Article ID"], x["GCP search_ids"]), axis=1)

In [None]:

df.to_csv('location.csv')

In [None]:
# rouge score
# The `rouge_scorer` import in the imports cell is commented out, so import
# it here; without it this cell fails with a NameError.
from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)

# One score dict per row: the expected answer is the reference (target) and
# the VAIS answer is the prediction.
rouge_score = [
    scorer.score(row['Expected Answer'], row["VAIS Answer"])
    for _, row in df.iterrows()
]


In [None]:

# RAgas

# Take an explicit copy before renaming: modifying a column selection of `df`
# directly is what triggers pandas' SettingWithCopyWarning when the "gemini"
# column is added to df_eval later on.
df_eval = df[["Question", "Expected Answer", "VAIS Answer"]].copy()
df_eval.columns = ['question', 'ground_truth', 'answer']


# dataset = Dataset.from_pandas(df_eval[['question', 'answer', 'ground_truth']])

# # load LLM
# ragas_llm = VertexAI(model_name="text-bison")
# # Load embeddings
# ragas_embeddings = VertexAIEmbeddings(model_name="textembedding-gecko@001")

In [None]:
# NOTE(review): `evaluate`, `answer_similarity`, `dataset`, `ragas_llm` and
# `ragas_embeddings` are not defined by any cell above this one (the setup
# just above is commented out). They are imported/created in the
# "Error Analysis from VAIS" section further down, so this cell only works
# after those cells have been run.
score = evaluate(dataset,
                 metrics=[answer_similarity],
                 llm=ragas_llm,
                 embeddings=ragas_embeddings)


# Overwrites the df_eval built from `df` in the previous cell.
df_eval = score.to_pandas()

# df_eval.to_csv('eval_ragas.csv')

In [None]:
# Gemini
from langchain_core.prompts import PromptTemplate

# Instantiation using from_template (recommended)
template = """
You are an experienced customer support agent. Compare the answer given to a customer with the source of truth.

-  Take your time to evaluate the quality of the answer. You should read everything first then compare the ansewrs.
- If the answer is has all the information from the source of truth, it is a high quality answer. otherwise,  flag any discrepencies and provide your reason and the text of discprencies.
- Do not try to make up an answer

==============
ANSWER:
{answer}
==============

==============
SOURCE OF TRUTH:
{source}
==============

Result:


"""

prompt = PromptTemplate(template=template, input_variables=["answer", "source"])


model = GenerativeModel("gemini-1.0-pro")
generation_config = GenerationConfig(temperature=0.2)



In [None]:
# Ask Gemini to compare every VAIS answer against its ground truth.
gemini_resp = []
for idx, row in df_eval.iterrows():
    full_prompt = prompt.format(answer=row["answer"], source=row["ground_truth"])

    # Pass the GenerationConfig created alongside the model; otherwise the
    # temperature of 0.2 configured there is never applied.
    responses = model.generate_content(
        full_prompt,
        generation_config=generation_config,
        stream=False,
    )
    gemini_resp.append(responses.text)

In [None]:
df_eval["gemini"] = gemini_resp

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_eval["gemini"] = gemini_resp


In [None]:
df_eval.to_csv('eval.csv')

In [None]:

df_eval

Unnamed: 0,question,ground_truth,answer,gemini
0,How do I fix Policy EFT Error Missing required...,Launch Rate tab\n Select No from drop down men...,"If you receive a ""Policyholder EFT Error: Miss...",The answer provided to the customer is high qu...
1,"What do I do with the Policy EFT Error, Missin...",Launch Rate tab\n Select No from drop down men...,If you receive a Policy EFT Error indicating m...,The answer is a high quality answer because it...
2,what does posted not charged mean,"Symptom:1,926 policies were impacted by defect...","""Posted not charged"" means that a payment has ...",**Flag for discrepancies.**\n\nThe answer prov...
3,how much is the NSF charge in Washington?,25$,The NSF charge in Washington is $25. This fee ...,The answer is of high quality because it has a...
4,What do I do if a caller requests the Farmers ...,a caller requests for a copy of the Farmers Do...,"If a caller requests the Farmers DNC Policy, y...",The answer is of high quality. It contains all...
...,...,...,...,...
95,Can I clear an SP02 Error?,Yes- Submit a QC defect Request,"I'm sorry, but I cannot answer your question b...",Discrepencies:\n\n- The answer states that the...
96,Do Michigan benefits qualify for QHC in VA?,No Article found with this topic. Expecting ge...,"No, VA benefits do not qualify for QHC in Virg...",I am unable to evaluate the quality of the ans...
97,Is Advanced Purchase Discount available in MX ?,No Article found with this topic. Expecting ge...,"Yes, Advance Purchase Discount is available in...",The answer provided is not of high quality. Th...
98,Is Agent 0498746 restricted from writing Highe...,Not Found,This question cannot be answered from the give...,The answer provided correctly states that the ...


# Error Analysis from VAIS

In [None]:
!pip install --upgrade --quiet langchain langchain-google-vertexai
!pip3 install --upgrade --user -q ragas

In [None]:
import ragas
from ragas import evaluate
from ragas.metrics import answer_similarity
from langchain_google_vertexai import VertexAIEmbeddings, VertexAI

from datasets import Dataset


In [None]:
# check the results for the `VAIS_equals_expected`

# df_exact_matches = df_test[df_test["VAIS_equals_expected"]]
df_exact_matches = df
print(f"number of exact matches: {df_exact_matches.shape[0]}")
df_exact_matches = df_exact_matches[["question", "expected_ans", "vais_summary"]]
df_exact_matches.columns = ['question', 'ground_truth', 'answer']
df_exact_matches.head()


In [None]:
dataset = Dataset.from_pandas(df_exact_matches[['question', 'answer', 'ground_truth']])

# load LLM
ragas_llm = VertexAI(model_name="text-bison")
# Load embeddings
ragas_embeddings = VertexAIEmbeddings(model_name="textembedding-gecko@001")

In [None]:
score = evaluate(dataset,
                 metrics=[answer_similarity],
                 llm=ragas_llm,
                 embeddings=ragas_embeddings)

In [None]:
df_eval = score.to_pandas()
# df_eval = df_eval.drop(columns=["__index_level_0__"])

df_eval.index = df_exact_matches.index

# df_eval.to_csv('./vais_exact_matches__26032024.csv', index=False)

In [None]:
df_eval.head()

In [None]:
df_test = df_test.merge(df_eval[["answer_similarity"]], left_index=True, right_index=True, how="outer")
df_test.shape

In [None]:
df_test.to_csv('./vais_html__04042024.csv', index=False)

In [None]:
df_test[df_test["answer_similarity"] > 0.7]

In [None]:
df_no_match = df_test[df_test["VAIS_equals_expected"]==False]
df_no_match[df_no_match["VAIS_contains_expected"]]


**Summary**

- VAIS console doesn't have `filter` -> the result is different from using the SDK
- out of 92, 36 rows are related to Bristol West
- 18 out of 36 have the exact same expected ID as the top result from VAIS (column `VAIS_equals_expected`)
   - comparing the result from those 18 to expected_answer (GT), average semantic similarity is ~80%. [link](https://docs.google.com/spreadsheets/d/1QBNPth0xDgSpM4v9hKV4SkbZySpIJ519JI0Jb62h7f4/edit#gid=1171667870)
   - I have checked 6 answers that have semantic similarity below 0.8. 3 of them returned no answer at all (issue with VAIS retrieval). Here are the other 3 questions and my response:
       - `Is Agent 0470914 restricted from writing Higher Limits?` the answer is correct. VAIS returned a verbose answer while GT is `yes`
       - `What agents are restricted from writing higher limits?` the answer is only partially returned (due to filters), while on the console it is correct
       - `What Tasks does the Service Assist Program assist with?` VAIS answer is correct and more complete
  

- out of 18 questions without an exact VAIS match, only 4 returned the expected document within the top 3 results -> makes sense to keep it to only the top answer


# Querying using Gemini + Grounding

In [None]:

!pwd

In [None]:
# Define project information
PROJECT_ID = "amir-genai-bb"  # @param {type:"string"}
LOCATION = "us-central1"  # @param {type:"string"}

In [None]:
import pandas as pd
from tqdm import tqdm

# Initialize Vertex AI
import vertexai
from vertexai.preview.generative_models import GenerativeModel, Tool, grounding

from IPython.display import display, Markdown


vertexai.init(project=PROJECT_ID, location=LOCATION)

In [None]:
model = GenerativeModel("gemini-1.5-pro-preview-0215")
# model = GenerativeModel("gemini-1.0-pro")

In [None]:
gemini_ans_list = []


location = "us"
project_id="454763165029"
engine_id="app-data-store"

datastore = f"projects/{project_id}/locations/{location}/collections/default_collection/dataStores/{engine_id}"

for idx, row in tqdm(df_test.iterrows(), total=df_test.shape[0]):

  tool = Tool.from_retrieval(
      grounding.Retrieval(grounding.VertexAISearch(datastore=datastore))
  )

  response = model.generate_content(row["question"], tools=[tool])

  gemini_ans_list.append(response.candidates[0])


In [None]:
gemini_uri_list = []
gemini_cs_list = []
gemini_text_list = []

# For every Gemini candidate keep the answer text plus the URI and confidence
# score of its first grounding attribution. Candidates without any grounding
# attribution get empty lists in all three columns.
for r in gemini_ans_list:
    grounding_attr = r.to_dict()['grounding_metadata']['grounding_attributions']

    gemini_text, gemini_uri, gemini_cs = [], [], []
    if grounding_attr:
        try:
            gemini_text = r.text
        except Exception:
            # Best effort: a candidate whose text cannot be read keeps the
            # empty placeholder. `Exception` (not a bare except) so that
            # KeyboardInterrupt / SystemExit still stop the loop.
            gemini_text = []
        top_attribution = grounding_attr[0]
        gemini_uri = top_attribution['web']['uri']
        gemini_cs = top_attribution['confidence_score']

    gemini_text_list.append(gemini_text)
    gemini_cs_list.append(gemini_cs)
    gemini_uri_list.append(gemini_uri)

In [None]:
df_test["gemini_ans"] = gemini_text_list
df_test["gemini_id"] = gemini_uri_list
df_test["gemini_cs"] = gemini_cs_list

In [None]:
df_test['gemini_id_cleaned'] = df_test['gemini_id'].apply(lambda x : x.rsplit('/')[-1].rsplit('.pdf')[0] if len(x) > 0 else "")

In [None]:
df_test["VAIS_equals_gemini"] = df_test["VAIS_links_cleaned_top"] == df_test["gemini_id_cleaned"]

In [None]:
# display(Markdown(response.candidates[0].text))
df_test.head()

In [None]:
df_test.to_csv('./output_25032024.csv', index=False)

# DIY RAG


https://github.com/Arize-ai/LLMTest_NeedleInAHaystack/blob/8e6b92e9a1e9b8b1db9a990a6fe4d4210ea7219d/README.md

In [None]:
!pip install arize-phoenix[evals]

In [None]:
from transformers import T5Tokenizer
from phoenix.evals.models.vertex import GeminiModel

import vertexai.preview
from google.cloud import aiplatform


In [None]:
# Define project information
PROJECT_ID = "amir-genai-bb"  # @param {type:"string"}
LOCATION = "us-central1"  # @param {type:"string"}

# Initialize Vertex AI
import vertexai

vertexai.init(project=PROJECT_ID, location=LOCATION)

model_name = "gemini-1.5-pro-preview-0409"

In [None]:
enc = T5Tokenizer.from_pretrained("google-t5/t5-11b")

In [None]:
GEMINI_TEMPLATE2 = '''
    <context>
    {context}
    </context>
    {question} Don't give information outside the document or repeat your findings.
    Here is the magic number from the context:

    '''
    #{question} You are looking for a number from the context. Don't give information outside the document or repeat your findings


In [None]:
model = GeminiModel(model_name)
model

In [None]:
context_lengths_min = 100
context_lengths_max = 110000


def read_context_files(self):
    """Concatenate the haystack ``*.txt`` files until the context is long enough.

    The files in ``self.haystack_dir`` are read in sorted order, over and over,
    until ``self.get_context_length_in_tokens(context)`` reaches
    ``max(self.context_lengths)``. The length is only checked after each full
    pass over the files.

    Args:
        self: Object providing ``context_lengths``, ``haystack_dir`` and
            ``get_context_length_in_tokens``.

    Returns:
        str: The accumulated text.

    Raises:
        FileNotFoundError: If ``self.haystack_dir`` contains no ``*.txt`` file.
        ValueError: If a full pass over the files adds no text, since the
            loop could then never terminate.
    """
    import glob  # local import: this notebook does not import glob in any visible cell

    haystack_files = sorted(glob.glob(f"{self.haystack_dir}/*.txt"))
    if not haystack_files:
        raise FileNotFoundError(f"No .txt files found in {self.haystack_dir}")

    max_context_length = max(self.context_lengths)

    context = ""
    while self.get_context_length_in_tokens(context) < max_context_length:
        length_before_pass = len(context)
        for file in haystack_files:
            with open(file, 'r') as f:
                context += f.read()
        if len(context) == length_before_pass:
            raise ValueError(f"All .txt files in {self.haystack_dir} are empty")
    return context

In [None]:
import pandas as pd

In [None]:
df = pd.read_csv('Farmers KM Search 14Feb - New Top1 100BW 8th April.csv')

In [None]:
type(df["Expected Article ID"][1])

In [None]:
def add_html_link(article_id):
    """Build the browser link to an article's HTML file in the ingestion bucket.

    Anything that is not a string (for example a NaN from a missing CSV cell)
    yields an empty string.
    """
    if not isinstance(article_id, str):
        return ""

    return f"https://storage.mtls.cloud.google.com/app-454763165029/ingestion/1712146440_html/{article_id}.html"

In [None]:
df["Expected Article ID Link"] = df["Expected Article ID"].apply(add_html_link)

In [None]:

df["Expected Article ID Link"]

In [None]:
df.to_csv('link.csv')

In [None]:
!pip install validators

In [None]:
from google.cloud import storage

def list_blobs_with_prefix(bucket_name, prefix, delimiter=None):
    """Return the base names of all blobs in the bucket that start with a prefix.

    For every matching blob, the text after the last "/" of its name is taken
    and cut at the first "." — e.g. "folder/abc.html" becomes "abc".

    Args:
        bucket_name: The name of your Cloud Storage bucket.
        prefix: The prefix used to filter objects (e.g. a "folder" such as "public/").
        delimiter: (Optional) Delimiter to treat as a directory structure.

    Returns:
        A list with one base name per listed blob, in listing order.
    """
    client = storage.Client()
    listing = client.bucket(bucket_name).list_blobs(prefix=prefix, delimiter=delimiter)

    base_names = []
    for blob in listing:
        file_name = blob.name.rsplit('/')[-1]
        base_names.append(file_name.split('.')[0])
    return base_names

# Example usage
bucket_name = 'app-454763165029'
prefix = 'ingestion/1712146440_html/'
all_blobs = list_blobs_with_prefix(bucket_name, prefix)

In [None]:
# Expected article IDs that have no matching HTML blob in the bucket.
# Membership is tested against a set built once, rather than scanning the
# whole `all_blobs` list for every ID.
_known_blob_ids = set(all_blobs)
unique_elements = [item for item in df['Expected Article ID'].to_list() if item not in _known_blob_ids]

In [None]:

unique_elements

In [None]:
# df['Expected Article ID'].str.strip()