In [None]:
# Install some required packages

!pip install pypdf2
!pip install google-cloud-aiplatform
!pip install google-cloud-storage

Collecting pypdf2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/232.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━[0m [32m163.8/232.6 kB[0m [31m4.7 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pypdf2
Successfully installed pypdf2-3.0.1


In [None]:
from google.cloud import storage
from vertexai.language_models import TextEmbeddingModel
from google.cloud import aiplatform
import PyPDF2

import re
import os
import random
import json
import uuid

In [None]:
# List the working directory; the PDF named by `pdf_path` below is expected to be here.
%ls

stats.pdf


In [None]:
# Configuration for the indexing half of the notebook.

# Left commented out: no explicit GCP project is set in this cell.
# project="your_GCP_project_id"
location="us-central1"  # passed to create_bucket() inside upload_file

pdf_path="stats.pdf"  # source document that is split into sentences
bucket_name = "stats-content2024"  # bucket that upload_file uploads into and create_vector_index reads from
embed_file_path = "stats_embeddings.json"  # output: one {"id", "embedding"} JSON object per line
sentence_file_path = "stats_sentences.json"  # output: one {"id", "sentence"} JSON object per line
index_name="stats_index"  # used as the display name and as the deployed index id

In [None]:
# helper

def extract_sentences_from_pdf(pdf_path):
    """Read a PDF and split its extracted text into sentences.

    The text of every page is concatenated (each page followed by a single
    space) and then split on the literal separator ``'. '``. Fragments that
    are empty after stripping whitespace are discarded.

    Note that the split is purely textual: a full stop followed by a newline
    does not end a sentence, and the trailing full stop of the last sentence
    is kept.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        list[str]: The stripped, non-empty sentence fragments in document order.
    """
    page_texts = []
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        for page in reader.pages:
            # Extract once per page and reuse the result: calling
            # extract_text() both in the test and in the concatenation
            # doubles the extraction work for every page.
            page_text = page.extract_text()
            if page_text is not None:
                page_texts.append(page_text)

    text = "".join(page_text + " " for page_text in page_texts)
    return [sentence.strip() for sentence in text.split('. ') if sentence.strip()]

In [None]:
def generate_text_embeddings(sentences, batch_size=250) -> list:
  """Embed a sequence of texts with the "text-embedding-005" model.

  The texts are sent in batches instead of one single request, because the
  embeddings API caps the number of input texts per request (250 is the
  documented cap at the time of writing - verify for your region/model).
  A whole document's worth of sentences would otherwise exceed that cap.

  No explicit aiplatform.init(project=..., location=...) is performed here.

  Args:
    sentences: Iterable of strings to embed.
    batch_size: Maximum number of texts per get_embeddings() request.

  Returns:
    list: One embedding vector (the ``values`` of each returned embedding)
    per input text, in input order. An empty input yields an empty list
    without calling the API.

  Raises:
    ValueError: If ``batch_size`` is smaller than 1.
  """
  if batch_size < 1:
    raise ValueError("batch_size must be a positive integer")

  sentences = list(sentences)
  model = TextEmbeddingModel.from_pretrained("text-embedding-005")

  vectors = []
  for start in range(0, len(sentences), batch_size):
    embeddings = model.get_embeddings(sentences[start:start + batch_size])
    vectors.extend(embedding.values for embedding in embeddings)
  return vectors

In [None]:
def generate_and_save_embeddings(pdf_path, sentence_file_path, embed_file_path):
    """Extract sentences from a PDF, embed them and write two JSON-lines files.

    Each sentence is cleaned (bullet characters removed, whitespace collapsed)
    *before* it is embedded, so the text stored in the sentence file is exactly
    the text the embedding was computed from. Sentences that are empty after
    cleaning are dropped.

    Both files receive one JSON object per line and share a freshly generated
    UUID per sentence:
      * sentence file:  {"id": ..., "sentence": ...}
      * embedding file: {"id": ..., "embedding": ...}

    If no sentence survives, nothing is embedded and no file is written.

    Args:
        pdf_path: Path of the PDF to read.
        sentence_file_path: Output path for the id/sentence records.
        embed_file_path: Output path for the id/embedding records.

    Raises:
        ValueError: If the number of returned embeddings does not match the
            number of sentences (records could otherwise be silently dropped).
    """
    def clean_text(text):
        cleaned_text = re.sub(r'\u2022', '', text)  # Remove bullet points
        cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()  # Collapse whitespace and strip
        return cleaned_text

    cleaned_sentences = [clean_text(raw) for raw in extract_sentences_from_pdf(pdf_path)]
    # A fragment made only of bullets/whitespace carries no content to index.
    sentences = [sentence for sentence in cleaned_sentences if sentence]
    if not sentences:
        return

    embeddings = generate_text_embeddings(sentences)
    if len(embeddings) != len(sentences):
        raise ValueError(
            f"expected {len(sentences)} embeddings, got {len(embeddings)}"
        )

    with open(embed_file_path, 'w') as embed_file, open(sentence_file_path, 'w') as sentence_file:
        for sentence, embedding in zip(sentences, embeddings):
            # `record_id` rather than `id`, to avoid shadowing the builtin.
            record_id = str(uuid.uuid4())

            json.dump({"id": record_id, "sentence": sentence}, sentence_file)
            sentence_file.write('\n')
            json.dump({"id": record_id, "embedding": embedding}, embed_file)
            embed_file.write('\n')

In [None]:
def upload_file(bucket_name,file_path):
    """Upload a local file to a Cloud Storage bucket, creating the bucket if needed.

    The object is stored under the same name as the local path.

    Args:
        bucket_name: Name of the destination bucket.
        file_path: Local path of the file to upload; also used as the blob name.
    """
    storage_client = storage.Client()
    # Creating a bucket that already exists fails, which made this function
    # unusable for a second upload or a notebook re-run. Reuse the bucket when
    # it is already there and only create it (in `location`) when it is missing.
    bucket = storage_client.lookup_bucket(bucket_name)
    if bucket is None:
        bucket = storage_client.create_bucket(bucket_name,location=location)
    blob = bucket.blob(file_path)
    blob.upload_from_filename(file_path)

In [None]:
def create_vector_index(bucket_name, index_name):
    """Create a tree-AH index from a bucket, then create an endpoint and deploy to it.

    Three calls are made in order:
      1. a tree-AH MatchingEngineIndex is created from ``gs://<bucket_name>``
         with 768 dimensions and an approximate neighbour count of 10;
      2. a MatchingEngineIndexEndpoint with the public endpoint enabled is created;
      3. the index is deployed to that endpoint.

    ``index_name`` is used as the display name of both resources and as the
    deployed index id.

    Args:
        bucket_name: Bucket whose contents feed the index.
        index_name: Display name / deployed index id.
    """
    source_uri = "gs://"+bucket_name

    index = aiplatform.MatchingEngineIndex.create_tree_ah_index(
        display_name=index_name,
        contents_delta_uri=source_uri,
        dimensions=768,
        approximate_neighbors_count=10,
    )

    endpoint = aiplatform.MatchingEngineIndexEndpoint.create(
        display_name=index_name,
        public_endpoint_enabled=True,
    )

    endpoint.deploy_index(index=index, deployed_index_id=index_name)

In [None]:
# Write the sentence and embedding files locally, then upload the *embedding*
# records: the vector index is built from the bucket contents and needs the
# {"id", "embedding"} lines. The sentence file stays local, where the query
# cells read it to turn matched ids back into text.
generate_and_save_embeddings(pdf_path,sentence_file_path,embed_file_path)
upload_file(bucket_name,embed_file_path)

In [None]:
# Build the index from the bucket contents, create a public endpoint and deploy the index to it.
create_vector_index(bucket_name, index_name)

INFO:google.cloud.aiplatform.matching_engine.matching_engine_index:Creating MatchingEngineIndex
INFO:google.cloud.aiplatform.matching_engine.matching_engine_index:Create MatchingEngineIndex backing LRO: projects/417082480229/locations/us-central1/indexes/8364734819422175232/operations/2670255350598336512
INFO:google.cloud.aiplatform.matching_engine.matching_engine_index:MatchingEngineIndex created. Resource name: projects/417082480229/locations/us-central1/indexes/8364734819422175232
INFO:google.cloud.aiplatform.matching_engine.matching_engine_index:To use this MatchingEngineIndex in another session:
INFO:google.cloud.aiplatform.matching_engine.matching_engine_index:index = aiplatform.MatchingEngineIndex('projects/417082480229/locations/us-central1/indexes/8364734819422175232')
INFO:google.cloud.aiplatform.matching_engine.matching_engine_index_endpoint:Creating MatchingEngineIndexEndpoint
INFO:google.cloud.aiplatform.matching_engine.matching_engine_index_endpoint:Create MatchingEngineI

In [None]:
from vertexai.language_models import TextEmbeddingModel
from google.cloud import aiplatform
import vertexai
from vertexai.preview.generative_models import GenerativeModel, Part
import json
import os

In [None]:
# Configuration for the query half of the notebook.
# project="YOUR_GCP_PROJECT"
location="us-central1"
sentence_file_path = "stats_sentences.json"  # id/sentence records read by load_file
index_name="stats_index" # Deployed index id passed to find_neighbors; get this from the console or the previous step

In [None]:
# Explicit initialisation is left commented out, so this cell sets no project/location itself.
# aiplatform.init(project=project,location=location)
# vertexai.init()
model = GenerativeModel("gemini-1.5-pro-002")  # generative model used to answer from the retrieved context
# NOTE(review): hard-coded endpoint identifier - presumably the endpoint created by the
# indexing step; confirm it matches the endpoint in your own project.
lakeside_index_ep = aiplatform.MatchingEngineIndexEndpoint(index_endpoint_name="7645700394345037824")

In [None]:
def generate_text_embeddings(sentences, batch_size=250) -> list:
    """Embed a sequence of texts with the "text-embedding-005" model.

    The texts are sent in batches instead of one single request, because the
    embeddings API caps the number of input texts per request (250 is the
    documented cap at the time of writing - verify for your region/model).

    Args:
        sentences: Iterable of strings to embed.
        batch_size: Maximum number of texts per get_embeddings() request.

    Returns:
        list: One embedding vector (the ``values`` of each returned embedding)
        per input text, in input order. An empty input yields an empty list
        without calling the API.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    sentences = list(sentences)
    model = TextEmbeddingModel.from_pretrained("text-embedding-005")

    vectors = []
    for start in range(0, len(sentences), batch_size):
        embeddings = model.get_embeddings(sentences[start:start + batch_size])
        vectors.extend(embedding.values for embedding in embeddings)
    return vectors


def generate_context(ids,data):
    """Join the sentences of the records matching ``ids`` into one context string.

    For every id in ``ids`` (in that order, duplicates included), the
    sentences of all records in ``data`` with that id are emitted in ``data``
    order. Ids without a matching record contribute nothing.

    Args:
        ids: Iterable of record ids to look up.
        data: Iterable of dicts with an ``'id'`` and a ``'sentence'`` key.

    Returns:
        str: The matched sentences separated by newlines, with leading and
        trailing whitespace stripped; ``''`` when nothing matches.
    """
    # Index the records once instead of rescanning `data` for every id.
    entries_by_id = {}
    for entry in data:
        entries_by_id.setdefault(entry['id'], []).append(entry)

    sentences = [
        entry['sentence']
        for match_id in ids
        for entry in entries_by_id.get(match_id, ())
    ]
    return "\n".join(sentences).strip()



def load_file(sentence_file_path):
  """Load a JSON-lines file into a list of parsed records.

  Blank lines (for example a stray empty line at the end of the file) are
  skipped instead of aborting the load with a JSON decode error.

  Args:
    sentence_file_path: Path of a file holding one JSON document per line.

  Returns:
    list: The parsed records in file order.
  """
  data = []
  with open(sentence_file_path, 'r', encoding='utf-8') as f:
    for line in f:
      if not line.strip():
        continue
      data.append(json.loads(line))
  return data

In [None]:
# Load the id/sentence records and preview only the first few, so the cell
# output stays readable instead of dumping every record.
data=load_file(sentence_file_path)
data[:5]

[{'id': '537c5208-d38e-496f-b57e-d1d471823073',
  'sentence': 'INDIAN PREMIER LEAGUE SEASON-3 Questions: 1'},
 {'id': '72957c3d-f4e2-41b6-8f0b-7d250bbd3ade',
  'sentence': 'How do you connect the five NGOs-Akanksha, Nanhi Kali, Pratham, Teach for India and Ummeed with the Mumbai Indians team? 2'},
 {'id': '65b07a9c-7e4e-41c9-a397-84210a04ad2d',
  'sentence': 'His case is the perfect rags to riches story'},
 {'id': '15e2166b-c6e1-44c3-b181-4e1c576f75ab',
  'sentence': 'The story of a small village boy realizing his dream to play for India'},
 {'id': '16c58c67-241c-44e2-aa11-726284c5da5a',
  'sentence': "He was a late addition to the Kolkata Knight Riders' squad for the inaugural season of the IPL"},
 {'id': '156e49ab-b026-4657-94dd-d0ccd6f0d832',
  'sentence': 'His first India call-up was for the Twenty-20s against Sri Lanka in 2009 and his maiden international wicket was that of Sanath Jayasuriya'},
 {'id': '6e21d6f5-c232-46d7-b024-9642e604415e',
  'sentence': 'In IPL-3, he did a dream

In [None]:
# generate_text_embeddings takes a list of texts, so the single question is wrapped in a list.
query=["who is Ravindra Jadeja"]

qry_emb=generate_text_embeddings(query)  # list with one embedding vector per query string

In [None]:
#qry_emb

In [None]:
# Ask the deployed index for the 10 nearest neighbours of the (single) query embedding.
response = lakeside_index_ep.find_neighbors(
    deployed_index_id = index_name,
    queries = [qry_emb[0]],
    num_neighbors = 10
)

In [None]:
# Flatten the neighbour lists (one list per query) into a single list of ids.
matching_ids = [neighbor.id for sublist in response for neighbor in sublist]

context = generate_context(matching_ids,data)
# `query` is a one-element list: interpolate the question text itself, not the
# list repr (which would put "['...']" into the prompt).
prompt=f"Based on the context delimited in backticks, answer the query. ```{context}``` {query[0]}"

chat = model.start_chat(history=[])
# Keep the model reply under its own name so the neighbour results in
# `response` are not overwritten and this cell can be re-run.
chat_response = chat.send_message(prompt)
print(chat_response.text)

Ravindra Jadeja is an Indian international cricketer. He is an all-rounder, meaning he is a skilled batsman and bowler.

