In [1]:
import os
import sys
import time
import json
import traceback
import numpy as np
import pandas as pd
import pyarrow
#pip install --upgrade pymupdf
import fitz  # PyMuPDF

from typing import Optional
from dotenv import load_dotenv
from botocore.config import Config

import boto3

# Langchain core and community components
from langchain.embeddings import BedrockEmbeddings, HuggingFaceEmbeddings
from langchain_community.embeddings.spacy_embeddings import SpacyEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_community.chat_models import BedrockChat

# Uncomment if needed
# import dbconnection
# import psycopg2
# from psycopg2 import OperationalError
# Shared Bedrock runtime client used by embed_documents_with_cohere.
# No region or credentials are passed here explicitly.
aws_client = boto3.client("bedrock-runtime")


In [2]:
# Note: you may need to run this in a terminal window if you get an error about a missing spaCy model:
# python -m spacy download en_core_web_md

In [3]:
#Chunking

In [4]:
# Load the PDF and extract text
def read_pdf(file_path):
    """Extract the text of every page of a PDF and return it as one string.

    Parameters
    ----------
    file_path
        Location of the document; passed unchanged to ``fitz.open``.

    Returns
    -------
    str
        The ``page.get_text()`` output of each page, concatenated in page
        order with no separator added between pages.
    """
    # Open the document in a ``with`` block so it is closed even if text
    # extraction raises part-way through (the document was previously
    # left open).
    with fitz.open(file_path) as doc:
        # join() avoids rebuilding the whole string once per page.
        return "".join(page.get_text() for page in doc)

In [5]:
# Chunking function with overlap
def fixed_size_chunking(text, chunk_size=100, overlap_size=20):
    """Split ``text`` into overlapping chunks of whitespace-separated tokens.

    Parameters
    ----------
    text : str
        Text to split; tokens are produced by ``text.split()``.
    chunk_size : int
        Maximum number of tokens per chunk. Must be positive.
    overlap_size : int
        Number of tokens shared between consecutive chunks. Must satisfy
        ``0 <= overlap_size < chunk_size``.

    Returns
    -------
    list[str]
        Chunks in order, each re-joined with single spaces. Empty or
        whitespace-only text yields an empty list.

    Raises
    ------
    ValueError
        If ``chunk_size`` or ``overlap_size`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    # A non-positive step would either make range() raise an unhelpful error
    # (step == 0) or silently produce no chunks at all (step < 0); a negative
    # overlap would skip tokens between chunks.
    if not 0 <= overlap_size < chunk_size:
        raise ValueError("overlap_size must satisfy 0 <= overlap_size < chunk_size")

    tokens = text.split()
    step = chunk_size - overlap_size
    chunks = []
    for start in range(0, len(tokens), step):
        chunks.append(" ".join(tokens[start:start + chunk_size]))
        # Once a chunk reaches the last token, any further window would only
        # repeat tokens already contained in this chunk, so stop here.
        if start + chunk_size >= len(tokens):
            break
    return chunks

In [6]:
# Return a list where each item is the text from one page
def read_pdf_by_page_cleaned(file_path, min_length=50):
    """Return the stripped text of each sufficiently long page of a PDF.

    Parameters
    ----------
    file_path
        Location of the document; passed unchanged to ``fitz.open``.
    min_length : int
        Minimum number of characters (after stripping surrounding
        whitespace) a page must contain to be kept.

    Returns
    -------
    list[str]
        One entry per kept page, in document order. Skipped pages leave no
        placeholder, so list positions do not correspond to PDF page numbers.

    Side effects
    ------------
    Prints one line for every skipped page, using 1-based page numbers.
    """
    pages = []
    # Open the document in a ``with`` block so it is closed even if
    # extraction fails on some page (the document was previously left open).
    with fitz.open(file_path) as doc:
        for page_number, page in enumerate(doc, start=1):
            text = page.get_text().strip()
            if len(text) >= min_length:
                pages.append(text)
            else:
                print(f"Skipping page {page_number}: too short")
    return pages


In [7]:
# Read the PRIZM PDF page by page; pages shorter than the default
# min_length are skipped and reported on stdout.
file_path = "./3-Embeddings/data_test/PRIZM.pdf"
cleaned_pages = read_pdf_by_page_cleaned(file_path=file_path)

Skipping page 8: too short
Skipping page 10: too short
Skipping page 12: too short
Skipping page 14: too short
Skipping page 16: too short
Skipping page 18: too short
Skipping page 20: too short
Skipping page 22: too short
Skipping page 24: too short
Skipping page 26: too short
Skipping page 28: too short
Skipping page 30: too short
Skipping page 32: too short
Skipping page 34: too short
Skipping page 36: too short
Skipping page 38: too short
Skipping page 40: too short
Skipping page 42: too short
Skipping page 44: too short
Skipping page 46: too short
Skipping page 48: too short
Skipping page 50: too short
Skipping page 52: too short
Skipping page 54: too short
Skipping page 56: too short
Skipping page 58: too short
Skipping page 60: too short
Skipping page 62: too short
Skipping page 64: too short
Skipping page 66: too short
Skipping page 68: too short
Skipping page 70: too short
Skipping page 72: too short
Skipping page 74: too short
Skipping page 76: too short
Skipping page 78: too

In [8]:
#test chunking
#print(f"Total cleaned pages: {len(cleaned_pages)}")
#print(f"\n--- First Clean Page ---\n{cleaned_pages[40]}")

In [9]:
# One row per kept page. "page" is the 1-based position within cleaned_pages,
# not the page number in the PDF (skipped pages are not counted).
df = pd.DataFrame(
    {
        "page": [position for position, _ in enumerate(cleaned_pages, start=1)],
        "text": cleaned_pages,
    }
)


In [10]:
# Remove the first 6 rows (they are not PRIZM segments) and renumber the index from 0.
# NOTE: this rebinds `df` from itself, so re-running this cell drops 6 more rows each time;
# re-run the cell that builds `df` first if this cell needs to be executed again.
df = df.iloc[6:].reset_index(drop=True)

In [11]:
import re

def extract_prizm_segment(text):
    """Return the first "NN – Name" style heading found in ``text``.

    The pattern looks for two digits at a word boundary, one whitespace
    character, an en dash or hyphen, one whitespace character, and then the
    rest of that line. The matched text is returned with surrounding
    whitespace stripped; ``None`` is returned when nothing matches.
    """
    hit = re.search(r'\b\d{2}\s[–-]\s.+', text)
    if hit is None:
        return None
    heading = hit.group(0)
    return heading.strip()

# Derive the segment heading for every page; pages without a match get None
df["prizm_segment"] = df["text"].map(extract_prizm_segment)

In [12]:
#Embedding

In [13]:
def limit_string_size(x, max_chars=2048):
    """Truncate ``x`` to at most ``max_chars`` characters if it is a string.

    Values that are not ``str`` instances are returned unchanged.
    """
    return x[:max_chars] if isinstance(x, str) else x

In [14]:
def clean_value(value):
    """Return ``str(value)`` with every character removed that is neither
    alphanumeric nor whitespace."""
    kept = [ch for ch in str(value) if ch.isalnum() or ch.isspace()]
    return ''.join(kept)

In [15]:
def print_top_values(list_stuff: list, num_items: int) -> None:
    """Print the leading items of ``list_stuff``, one per line.

    Stops after ``num_items`` items have been printed, or earlier if the
    input runs out. Nothing is printed when ``num_items`` is zero or negative.
    """
    for position, item in enumerate(list_stuff):
        # position is 0-based, so reaching num_items means enough were printed
        if position >= num_items:
            return None
        print(item)

In [16]:
def embed_documents_with_cohere(texts, input_type="clustering", batch_size=96):
    """Embed one or more texts with the Cohere English v3 model on Bedrock.

    Parameters
    ----------
    texts : str or iterable of str
        A single string, or a collection of strings, to embed. Each item is
        passed through ``limit_string_size`` before being sent.
    input_type : str
        Value sent as the request's ``input_type``. Defaults to
        ``"clustering"``, the value this function has always used; the
        Cohere Embed documentation also lists ``"search_document"`` and
        ``"search_query"`` for retrieval use cases.
    batch_size : int
        Maximum number of texts sent per ``invoke_model`` request. The
        default of 96 is the per-call maximum documented for Cohere Embed
        on Bedrock; larger inputs are split across several requests.

    Returns
    -------
    numpy.ndarray or list[numpy.ndarray]
        A single vector when ``texts`` was a string, otherwise one vector per
        input text in input order (an empty list for empty input).

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    single = isinstance(texts, str)
    # Promote a lone string to a list; materialise other iterables so they
    # can be sliced into batches.
    texts = [texts] if single else list(texts)

    model_id = "cohere.embed-english-v3"
    prepared = [limit_string_size(t) for t in texts]

    embeddings = []
    # Send the texts in slices so large inputs do not exceed the per-request
    # limit; results are appended in order, so output order matches input.
    for start in range(0, len(prepared), batch_size):
        json_params = {
            'texts': prepared[start:start + batch_size],
            'truncate': "NONE",
            'input_type': input_type
        }
        result = aws_client.invoke_model(
            body=json.dumps(json_params),
            modelId=model_id
        )
        response = json.loads(result['body'].read().decode())
        embeddings.extend(np.array(vec) for vec in response['embeddings'])

    return embeddings[0] if single else embeddings


In [17]:
# Let's generate a dense vector using Amazon Titan with LangChain
def generate_titan_vector_embedding(text):
    """Embed ``text`` through LangChain's ``BedrockEmbeddings`` and return the
    vector as a NumPy array.

    A new ``BedrockEmbeddings`` client for region ``us-west-2`` is created on
    every call; no model id is passed, so the client's default model is used
    (described in this notebook as Amazon Titan).
    """
    titan_client = BedrockEmbeddings(region_name="us-west-2")
    vector = titan_client.embed_query(text)
    return np.array(vector)



In [18]:
def cosine_similarity(vec1, vec2):
    """Return the cosine of the angle between two vectors.

    Parameters
    ----------
    vec1, vec2 : array-like
        One-dimensional vectors of equal length.

    Returns
    -------
    float
        ``dot(vec1, vec2) / (||vec1|| * ||vec2||)``. If either vector has
        zero norm the similarity is undefined; ``0.0`` is returned in that
        case instead of NaN so results stay sortable and comparable.
    """
    denominator = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    # Guard the 0/0 case: without it NumPy emits a RuntimeWarning and yields
    # NaN, which then poisons any sort or comparison on the scores.
    if denominator == 0:
        return 0.0
    return np.dot(vec1, vec2) / denominator



In [19]:
# Let's generate a dense vector using Amazon Titan without using a np.array as a return value
def generate_vector_embedding(text):
    """Embed ``text`` through LangChain's ``BedrockEmbeddings`` and return the
    vector exactly as ``embed_query`` produced it.

    Unlike ``generate_titan_vector_embedding`` the result is not wrapped in a
    NumPy array; per the original author's note, pgvector does not want an
    ``np.array`` here. A new client for region ``us-west-2`` is created on
    every call.
    """
    titan_client = BedrockEmbeddings(region_name="us-west-2")
    return titan_client.embed_query(text)

In [20]:
# Embed the text of every row and attach the vectors as an "embedding" column.
# `assign` returns a new frame, so `df` itself is left without that column.
dft = df.assign(embedding=embed_documents_with_cohere(df["text"].tolist()))

# Persist the embedded frame as a pickle file
dft.to_pickle('./3-Embeddings/data_test/PRIZM_Embedded.pkl')

In [21]:
# Turn off column-width truncation so long cell values are printed in full
pd.options.display.max_colwidth = None
print(dft.iloc[0])  # first row of the embedded frame

page                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    

In [22]:
#RAG

In [23]:
# Example natural-language question a user might ask; it is embedded and
# compared against every page embedding in the search cell that follows.
query = "What Segment makes the most money?"

In [24]:
# Semantic search: embed the query with the same model used for the pages,
# score every row of `dft` against it, and show the best matches.
query_vector = embed_documents_with_cohere(query)

# Keep each row's index label next to its score so the title lookup below
# can use the same label.
results = [
    (index, cosine_similarity(article_embedding, query_vector))
    for index, article_embedding in dft["embedding"].items()
]

# Highest similarity first
results.sort(key=lambda x: x[1], reverse=True)

# Print the two best-scoring segments
print("Here are a few PRIZM Segments that may match your interest:")
for index, score in results[:2]:
    # Look the title up in `dft` (the frame that was scored) by label; the
    # previous code read from `df` by position, which only gave the right row
    # while both frames happened to share an identical 0..n-1 index.
    article_title = dft.loc[index, 'prizm_segment']
    # NOTE(review): the "Abtract" label is kept verbatim so the recorded cell
    # output still matches; consider renaming it to "Segment".
    print(f"Abtract: '{article_title}' with a cosine match of: {score}")

Here are a few PRIZM Segments that may match your interest:
Abtract: '06 – Winners Circle' with a cosine match of: 0.3801812401302732
Abtract: '14 – Kids & Cul-de-Sacs' with a cosine match of: 0.3705638313140499
