# SETUP DB

In [3]:
import os
import numpy as np
from dotenv import find_dotenv, load_dotenv  # 3P
from langchain_community.retrievers import PineconeHybridSearchRetriever
from langchain_openai import AzureOpenAIEmbeddings
from pinecone import (
    Pinecone,  # 3P
    ServerlessSpec,
)
from pinecone_text.sparse import BM25Encoder

# Read API keys and endpoints from the nearest .env file into the process environment.
load_dotenv(find_dotenv())

index_name = "umag-hybrid-search"
pc = Pinecone(api_key=os.environ.get("PINECONE_API_KEY_UBC"))

# Create the serverless index only when it is not already present in the project.
index_already_exists = index_name in pc.list_indexes().names()
if not index_already_exists:
    pc.create_index(
        name=index_name,
        dimension=3072,  # dimensionality of dense model
        metric="dotproduct",  # sparse values supported only for dotproduct
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

index = pc.Index(index_name)

# Sparse side of the hybrid search: the encoder's packaged defaults are used,
# i.e. it is not fitted on this corpus anywhere in this cell.
bm25_encoder = BM25Encoder.default()

# Dense side of the hybrid search: Azure OpenAI embeddings, configured purely
# from environment variables (a missing variable raises KeyError here).
embeddings = AzureOpenAIEmbeddings(
    azure_deployment=os.environ["AZURE_OPENAI_EMBEDDING_DEPLOYMENT"],
    azure_endpoint=os.environ["AZURE_OPENAI_EMBEDDING_ENDPOINT"],
)

# Combine the dense and sparse encoders over the Pinecone index.
retriever = PineconeHybridSearchRetriever(
    embeddings=embeddings,
    sparse_encoder=bm25_encoder,
    index=index,
    top_k=5,
)

In [5]:
import json
from pathlib import Path

# Build the path from separate components so it resolves on every OS.
# A raw backslash-separated string is only split into directories on Windows;
# on Linux/macOS it would be treated as one file name containing backslashes.
data_path = Path("data", "Objectifying_China", "tagged", "en_contents_doc_chunked.json")

# Load the chunked exhibit contents as a Python object (the next cell iterates
# over it with `.items()`, so a JSON object at the top level is expected).
with open(data_path, "r", encoding="utf-8") as f:
    data = json.load(f)

In [6]:
# Parallel lists handed to the retriever: one id, one text and one metadata
# dict per document chunk.
ids = []
metadatas = []
texts = []

for doc in data.values():
    # Work on a shallow copy so the entries of `data` are left untouched.
    # Popping from the originals would make this cell fail with KeyError on a
    # second run and would corrupt `data` for any later cell that reads it.
    metadata = dict(doc)

    ids.append(metadata.pop("id"))
    # The indexed text is the chunk header followed by its body.
    texts.append(f"{metadata.pop('header')}\n\n{metadata.pop('text')}")
    # "type" is optional and is not kept as metadata.
    metadata.pop("type", None)
    # Keep only the first entry of the page index list as a scalar.
    metadata["page_idx"] = metadata["page_idx"][0]
    # Everything that remains is stored as the chunk's metadata.
    metadatas.append(metadata)

In [10]:
# Sanity check: show the first entry of each parallel list as `name[0]=repr`.
for list_name, entries in (("metadatas", metadatas), ("ids", ids), ("texts", texts)):
    print(f"{list_name}[0]={entries[0]!r}")

metadatas[0]={'page_idx': 3, 'img_path': [], 'img_caption': [], 'img_footnote': [], 'time_period': ['ming', 'qing'], 'materiality': ['porcelain'], 'region': ['jingdezhen_kilns'], 'colour': [], 'purpose': ['export', 'decoration'], 'themes': ['technique'], 'exhibit': 'Objectifying China'}
ids[0]='lnwpgxpl'
texts[0]='Objectifying China\n\nMing and Qing Dynasty Ceramics and Their Stylistic Influences Abroad'


In [None]:
# Populate the database: hand the prepared texts, together with their ids and
# metadata dicts, to the retriever's `add_texts`.
# NOTE(review): `retriever` is rebound to other objects in later cells; this
# call is only meaningful while it still refers to the hybrid retriever built
# in the setup cell — run it right after the data-preparation cells.
# NOTE(review): presumably this embeds every text and writes to the remote
# index on each run — confirm it is safe to re-run before executing again.
retriever.add_texts(texts=texts, ids=ids, metadatas=metadatas)

# TEST

In [12]:
from retriever import DefaultRetriever
from utils import *  # (better: import only what you need)
from survey import Preferences

# Preferences used for the lookup: only two time-period tags are set,
# every other category is left empty.
tags = Preferences(
    exhibits=[],
    time_period=["tang", "qing"],
    materiality=[],
    region=[],
    colour=[],
    purpose=[],
    themes=[],
    additional_interests=[],
)

# NOTE: this rebinds `retriever`, replacing whatever object held that name before.
retriever = DefaultRetriever()

# Tag-only lookup through the retriever's private helper
# (presumably a metadata match on the tags — confirm against DefaultRetriever).
results = retriever._search_tags(tags)

# Display the returned documents as the cell output.
results

[Document(metadata={'colour': ['blue/cobalt', 'white_porcelain/blanc_de_chine'], 'exhibit': 'Objectifying China', 'img_caption': [], 'img_footnote': [], 'img_path': ['output/Objectifying_China/auto/images/4be4703d1132e355f5f44a62d0adbb18f399a5f7b0a112a8a4ebb37f858d0e51.jpg'], 'materiality': ['porcelain', 'earthenware', 'stoneware'], 'page_idx': 11.0, 'purpose': ['court', 'export', 'import', 'decoration'], 'region': ['jingdezhen_kilns', 'east_asia'], 'themes': ['technique', 'symbolism'], 'time_period': ['tang', 'yuan', 'ming', 'qing'], 'score': 1.0}, page_content='Blue-and-white\n\nPerhaps the first truly global commodity, blue-and-white porcelain reached a wider audience than any other type of Chinese ceramic. Sent to markets in Southeast Asia, Japan, Korea, India, the Middle East and even Africa, it inspired potters in major ceramic centres to produce elegant ceramics of their own design; from the rich turquoise fritwares of fifteenth-century Iznik, to elaborately panelled Japanese kr

In [13]:
from retriever import DefaultRetriever
from utils import *  # (better: import only what you need)
from survey import Preferences

# Same preferences as the tag-only test: two time-period tags, all other
# categories empty.
tags = Preferences(
    exhibits=[],
    time_period=["tang", "qing"],
    materiality=[],
    region=[],
    colour=[],
    purpose=[],
    themes=[],
    additional_interests=[],
)

# NOTE: this rebinds `retriever`, replacing whatever object held that name before.
retriever = DefaultRetriever()

# Lookup that takes a free-text query ("history") in addition to the tags
# (presumably text search combined with tag filtering — confirm against
# DefaultRetriever).
results = retriever._retrieve_with_text("history", tags)

# Display the returned documents as the cell output.
results

In [5]:
from langchain_pinecone import PineconeVectorStore
from dotenv import load_dotenv, find_dotenv
from pinecone import Pinecone
from langchain_openai import AzureOpenAIEmbeddings

# Read API keys and endpoints from the nearest .env file into the process environment.
load_dotenv(find_dotenv())

# NOTE: `os` and `ServerlessSpec` are not imported in this cell; they come from
# the imports of the first setup cell, which must have been run beforehand.
index_name = "umag-hybrid-search"
pc = Pinecone(api_key=os.environ.get("PINECONE_API_KEY_UBC"))

# Create the serverless index only when it is not already present in the project.
known_index_names = pc.list_indexes().names()
if index_name not in known_index_names:
    pc.create_index(
        name=index_name,
        dimension=3072,  # dimensionality of dense model
        metric="dotproduct",  # sparse values supported only for dotproduct
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

index = pc.Index(index_name)

embeddings = AzureOpenAIEmbeddings(
    azure_deployment=os.environ["AZURE_OPENAI_EMBEDDING_DEPLOYMENT"],
    azure_endpoint=os.environ["AZURE_OPENAI_EMBEDDING_ENDPOINT"],
)

# Wrap the same index in a vector store and expose it as a retriever.
# The third positional argument is "context" (presumably the metadata key
# under which each document's text is stored — confirm against the
# PineconeVectorStore signature).
vector_store = PineconeVectorStore(index, embeddings, "context")
retriever = vector_store.as_retriever()

In [10]:
# Smoke-test the vector-store retriever with a single-word query.
# NOTE(review): `filter` and `k` are passed as per-call keyword arguments;
# whether `invoke` forwards them to the underlying search depends on the
# installed langchain version — confirm that k=1 really limits the result to
# one document, otherwise set them via `as_retriever(search_kwargs=...)`.
retriever.invoke("porcelain", filter={}, k=1)

[Document(id='qwdktbdk', metadata={'colour': [], 'exhibit': 'Objectifying China', 'img_caption': [], 'img_footnote': [], 'img_path': [], 'materiality': ['porcelain', 'stoneware', 'earthenware'], 'page_idx': 5.0, 'purpose': [], 'region': [], 'themes': ['technique'], 'time_period': []}, page_content='Porcelain terminology\n\nThe Chinese term ‘ci’ (translated as porcelain in English) refers to all ceramics that are fired at high temperatures, including porcelain and stoneware. In the West, the term porcelain refers specifically to white ceramics made with a special type of clay called kaolin and fired to a temperature of about $1300^{\\circ}\\mathrm{C}$ , which results in a translucent, glassy material that makes a ringing sound when struck. Stoneware is used to refer to related ceramics that are similarly hard and dense, but which are made with grey or brown clay, may or may not be white-bodied, do not transmit light, and are fired to a slightly lower temperature of 1000 to $1250^{\\circ