In [31]:
# !pip3 install langchain
# !pip3 install pinecone-client
# !pip3 install tiktoken
# !pip3 install openai

In [32]:
import os
from dotenv import load_dotenv, find_dotenv
import pypdf
import pandas as pd

In [33]:
# Load the wearables catalogue (the first CSV column is the saved index) and
# keep product_id as a string rather than a number.
product_info = (
    pd.read_csv('current_wearables.csv', index_col=0)
    .astype({'product_id': str})
)

In [34]:
# Inspect the column dtypes after product_id has been cast to str.
product_info.dtypes

product_url            object
product_id             object
product_name           object
product_category       object
product_subcategory    object
part_number            object
product_manual_url     object
file_name              object
dtype: object

In [35]:
def load_document(file):
    """Load a document with the LangChain loader that matches its extension.

    Args:
        file: Path to a ``.pdf`` or ``.docx`` document. The extension is
            compared case-insensitively, so ``.PDF`` or ``.Docx`` also match.

    Returns:
        Whatever the selected loader's ``load()`` returns, or ``None`` (after
        printing a notice) when the extension is not supported.
    """
    import os

    extension = os.path.splitext(file)[1].lower()

    if extension == '.pdf':
        # Imported lazily; the original author notes this prevents a circular
        # dependency, and that PyPDFLoader also works with online PDFs.
        from langchain.document_loaders import PyPDFLoader
        loader_cls = PyPDFLoader
    elif extension == '.docx':
        from langchain.document_loaders import Docx2txtLoader
        loader_cls = Docx2txtLoader
    # Add further extension types here as needed.
    else:
        print('Document format is not supported!')
        return None

    print(f"Loading {file}")
    return loader_cls(file).load()

In [36]:
def chunk_data(data, chunk_size=256, chunk_overlap=0):
    """Split loaded documents into chunks with a recursive character splitter.

    Args:
        data: Documents to split, as accepted by ``split_documents``.
        chunk_size: ``chunk_size`` forwarded to the splitter.
        chunk_overlap: ``chunk_overlap`` forwarded to the splitter.

    Returns:
        The chunks produced by ``RecursiveCharacterTextSplitter.split_documents``.
    """
    from langchain.text_splitter import RecursiveCharacterTextSplitter

    splitter_options = {'chunk_size': chunk_size, 'chunk_overlap': chunk_overlap}
    return RecursiveCharacterTextSplitter(**splitter_options).split_documents(data)

In [37]:
def print_embedding_cost(texts):
    """Count the tokens needed to embed ``texts``.

    Despite its name, this function prints nothing: it returns the token total
    and leaves any conversion to a USD figure to the caller.

    Args:
        texts: Iterable of objects exposing a ``page_content`` string.

    Returns:
        Total number of tokens, using the tiktoken encoding for
        ``text-embedding-ada-002``.
    """
    import tiktoken

    encoder = tiktoken.encoding_for_model('text-embedding-ada-002')
    token_total = 0
    for doc in texts:
        token_total += len(encoder.encode(doc.page_content))
    return token_total

In [38]:
exception_list = []
def print_total_embedding_cost(exception_list=exception_list):
    """Sum the embedding token count over every manual listed in ``product_info``.

    Each file named in ``product_info['file_name']`` is loaded from
    ``../docs/wearables/`` and split into 256-character chunks with an overlap
    of 20; the chunk token counts are accumulated and the total is printed as a
    USD estimate using the hard-coded rate of 0.0004 per 1000 tokens.

    Args:
        exception_list: List that receives the names of files that could not be
            loaded or chunked. NOTE: the default is the module-level
            ``exception_list``, so repeated calls without an argument keep
            appending to that same shared list.

    Returns:
        ``(cost, exception_list)`` where ``cost`` is the total token count
        (not the dollar amount) and ``exception_list`` is the list above.
    """
    cost = 0
    for file_name in product_info['file_name']:
        try:
            data = load_document(f"../docs/wearables/{file_name}")
            chunks = chunk_data(data=data, chunk_size=256, chunk_overlap=20)
        except Exception:
            # `Exception` rather than a bare `except:` so Ctrl-C / SystemExit
            # still interrupt a long run instead of being logged as a bad file.
            print(f"Exception with file name {file_name}")
            exception_list.append(file_name)
            continue
        # Plain running total; the old cost/old_cost branching computed the same sum.
        cost += print_embedding_cost(chunks)
    print(f'Total embedding cost in USD: ${cost / 1000 * 0.0004:.6f}')
    return cost, exception_list

In [39]:
# cost, exceptions = print_total_embedding_cost()

In [40]:
def insert_or_fetch_embeddings(index_name, chunks):
    """Return a Pinecone vector store for ``index_name``, creating it if missing.

    Args:
        index_name: Name of the Pinecone index.
        chunks: Documents to embed; only used when the index does not exist yet.

    Returns:
        The LangChain ``Pinecone`` vector store, either loaded from the existing
        index or built from ``chunks`` in a newly created 1536-dimension cosine
        index.
    """
    import pinecone
    from langchain.vectorstores import Pinecone
    from langchain.embeddings.openai import OpenAIEmbeddings

    embedder = OpenAIEmbeddings()

    # Credentials are read from the environment.
    pinecone.init(
        api_key=os.environ.get('PINECONE_API_KEY'),
        environment=os.environ.get('PINECONE_ENV'),
    )

    index_exists = index_name in pinecone.list_indexes()
    if not index_exists:
        print(f"Creating index {index_name} and embeddings ...")
        pinecone.create_index(index_name, dimension=1536, metric='cosine')
        store = Pinecone.from_documents(chunks, embedder, index_name=index_name)
    else:
        print(f"Index {index_name} already exists: Loading embeddings ...")
        store = Pinecone.from_existing_index(index_name, embedder)
    print("Done")

    return store

In [12]:
def delete_pinecone_index(index_name='all'):
    """Delete one Pinecone index, or every listed index when ``index_name`` is 'all'.

    Args:
        index_name: Name of the index to delete. The default ``'all'`` deletes
            every index returned by ``pinecone.list_indexes()``.
    """
    import pinecone

    # Credentials are read from the environment.
    pinecone.init(
        api_key=os.environ.get('PINECONE_API_KEY'),
        environment=os.environ.get('PINECONE_ENV'),
    )

    if index_name != 'all':
        print(f"Deleting index {index_name}...", end="")
        pinecone.delete_index(index_name)
        print("Done")
        return

    existing = pinecone.list_indexes()
    print(f"Deleting all indexes {existing}...")
    for name in existing:
        pinecone.delete_index(name)
    print("Done")


In [13]:
# WARNING: called with the default index_name='all', this deletes every index
# returned by pinecone.list_indexes() for the configured credentials.
delete_pinecone_index()

  from tqdm.autonotebook import tqdm


Deleting all indexes ['garmin']...
Done


In [16]:
# Build one flat list of chunks for all manuals, tagging every chunk with the
# product fields of its catalogue row so retrieval can later filter on them.
metadata_columns = (
    'product_url',
    'product_id',
    'product_name',
    'product_category',
    'product_subcategory',
    'part_number',
    'product_manual_url',
)
all_chunks = []
for file_name in product_info['file_name']:
    try:
        metadata_info = product_info.loc[product_info['file_name'] == file_name]
        data = load_document(f"../docs/wearables/{file_name}")
        chunks = chunk_data(data=data, chunk_size=256, chunk_overlap=20)
        # Take each field from the first catalogue row matching this file name.
        metadata = {column: metadata_info[column].values[0]
                    for column in metadata_columns}
        for chunk in chunks:
            chunk.metadata.update(metadata)
    except Exception as exc:
        # Report the real failure: the old bare `except:` labelled every error
        # (unsupported format, parse error, missing column, ...) "File not found"
        # and also swallowed KeyboardInterrupt.
        print(f"Could not process {file_name} ({type(exc).__name__}: {exc}). Skipping...")
        continue
    # Extend only after every chunk of the file was tagged, so a failure midway
    # never leaves a partially processed manual in all_chunks.
    all_chunks.extend(chunks)

Loading ../docs/wearables/Forerunner_965_OM_EN-US.pdf
Loading ../docs/wearables/Bounce_OM_EN-US.pdf
Loading ../docs/wearables/MARQ_Adventurer_(Gen_2)_OM_EN-US.pdf
Loading ../docs/wearables/Instinct_Crossover_-_Standard_Edition_OM_EN-US.pdf
Loading ../docs/wearables/vivomove_Trend_OM_EN-US.pdf
Loading ../docs/wearables/vivomove_3S_OM_EN-US.pdf
Loading ../docs/wearables/Lily_-_Classic_Edition_OM_EN-US.pdf
Loading ../docs/wearables/Instinct_2_Solar_OM_EN-US.pdf
Loading ../docs/wearables/Enduro_2_OM_EN-US.pdf
Loading ../docs/wearables/Venu_2_Plus_OM_EN-US.pdf
Loading ../docs/wearables/Forerunner_55_OM_EN-US.pdf
Loading ../docs/wearables/Forerunner_955_Solar_OM_EN-US.pdf
Loading ../docs/wearables/Forerunner_255_OM_EN-US.pdf
Loading ../docs/wearables/Venu_3S_OM_EN-US.pdf
Loading ../docs/wearables/vivoactive_5_OM_EN-US.pdf
Loading ../docs/wearables/epix_Pro_(Gen_2)__Sapphire_Edition_|_47_mm_OM_EN-US.pdf
Loading ../docs/wearables/fenix_7_Pro__Sapphire_Solar_Edition_OM_EN-US.pdf
Loading ../docs

In [17]:
# Total chunks across all manuals vs. chunks of the last manual the loop chunked successfully.
print(len(all_chunks), len(chunks))

32584 633


In [18]:
# Spot-check one chunk's metadata: the loader's own fields plus the product fields merged in.
all_chunks[953].metadata

{'source': '../docs/wearables/Forerunner_965_OM_EN-US.pdf',
 'page': 98,
 'product_url': 'https://www.garmin.com/en-US/p/886725',
 'product_id': '886725',
 'product_name': 'Forerunner 965',
 'product_category': 'Sports & Fitness',
 'product_subcategory': 'Running',
 'part_number': '010-02809-00',
 'product_manual_url': 'https://support.garmin.com/en-US/?partNumber=010-02809-00&tab=manuals'}

In [19]:
# Reuse the 'garmin' index if it already exists; otherwise create it and embed all_chunks.
vector_store = insert_or_fetch_embeddings(index_name='garmin', chunks=all_chunks)

  from tqdm.autonotebook import tqdm


Index garmin already exists: Loading embeddings ...
Done


In [20]:
# List the product names in the catalogue.
product_info.product_name

0                                 Forerunner 965
1                                         Bounce
2                        MARQ Adventurer (Gen 2)
3          Instinct Crossover - Standard Edition
4                                 vivomove Trend
5                                    vivomove 3S
6                         Lily - Classic Edition
7                               Instinct 2 Solar
8                                       Enduro 2
9                                    Venu 2 Plus
10                                 Forerunner 55
11                          Forerunner 955 Solar
12                                Forerunner 255
13                                       Venu 3S
14                                  vivoactive 5
15    epix Pro (Gen 2)  Sapphire Edition | 47 mm
16           fenix 7 Pro  Sapphire Solar Edition
17                               Forerunner 265S
18                                 Forerunner 45
19                          Approach S70 - 47 mm
20                  

In [22]:
# `metadata_info` is not a retriever option and was silently dropped, so no
# filter was applied; metadata filters must be passed through `search_kwargs`.
vector_store.as_retriever(search_kwargs={'filter': {'product_name': {'$eq': 'Bounce'}}})

VectorStoreRetriever(tags=['Pinecone', 'OpenAIEmbeddings'], vectorstore=<langchain.vectorstores.pinecone.Pinecone object at 0x17a6f1bd0>)

In [24]:
def ask_and_get_answer(vector_store, q, product):
    """Answer ``q`` using only chunks whose ``product_name`` equals ``product``.

    Args:
        vector_store: Vector store exposing ``as_retriever``.
        q: The question to ask.
        product: Product name used as an ``$eq`` metadata filter.

    Returns:
        The result of running a ``RetrievalQA`` "stuff" chain (gpt-3.5-turbo,
        temperature 1, top-5 similarity retrieval) on ``q``.
    """
    from langchain.chains import RetrievalQA
    from langchain.chat_models import ChatOpenAI

    chat_model = ChatOpenAI(model='gpt-3.5-turbo', temperature=1)

    product_filter = {'product_name': {'$eq': product}}
    search_options = {'k': 5, 'filter': product_filter}
    retriever = vector_store.as_retriever(search_type='similarity',
                                          search_kwargs=search_options)

    # "stuff" is the default chain type and uses all of the retrieved text.
    qa_chain = RetrievalQA.from_chain_type(llm=chat_model,
                                           chain_type="stuff",
                                           retriever=retriever)
    return qa_chain.run(q)

In [28]:
# Ask a question restricted to the Forerunner 965 manual via the product filter.
q = "What is this manual for?"
a = ask_and_get_answer(vector_store, q=q, product='Forerunner 965')

In [29]:
# Show the answer returned for the question above.
print(a)

This manual is for the FORERUNNER® 965 WATCH.


In [27]:
# Display the product catalogue (names, ids and manual file names).
product_info

Unnamed: 0,product_url,product_id,product_name,product_category,product_subcategory,part_number,product_manual_url,file_name
0,https://www.garmin.com/en-US/p/886725,886725,Forerunner 965,Sports & Fitness,Running,010-02809-00,https://support.garmin.com/en-US/?partNumber=0...,Forerunner_965_OM_EN-US.pdf
1,https://www.garmin.com/en-US/p/714945,714945,Bounce,Sports & Fitness,Just for Kids,010-02448-02,https://support.garmin.com/en-US/?partNumber=0...,Bounce_OM_EN-US.pdf
2,https://www.garmin.com/en-US/p/783467,783467,MARQ Adventurer (Gen 2),Sports & Fitness,Luxury Smartwatches,010-02648-30,https://support.garmin.com/en-US/?partNumber=0...,MARQ_Adventurer_(Gen_2)_OM_EN-US.pdf
3,https://www.garmin.com/en-US/p/819761,819761,Instinct Crossover - Standard Edition,Outdoor Recreation,Adventure Watches,010-02730-13,https://support.garmin.com/en-US/?partNumber=0...,Instinct_Crossover_-_Standard_Edition_OM_EN-US...
4,https://www.garmin.com/en-US/p/785411,785411,vivomove Trend,Sports & Fitness,Fitness Tracking,010-02665-02,https://support.garmin.com/en-US/?partNumber=0...,vivomove_Trend_OM_EN-US.pdf
5,https://www.garmin.com/en-US/p/662790,662790,vivomove 3S,Sports & Fitness,Fitness Tracking,010-02238-01,https://support.garmin.com/en-US/?partNumber=0...,vivomove_3S_OM_EN-US.pdf
6,https://www.garmin.com/en-US/p/719696,719696,Lily - Classic Edition,Sports & Fitness,Fitness Tracking,010-02384-A2,https://support.garmin.com/en-US/?partNumber=0...,Lily_-_Classic_Edition_OM_EN-US.pdf
7,https://www.garmin.com/en-US/p/775697,775697,Instinct 2 Solar,Outdoor Recreation,Adventure Watches,010-02627-16,https://support.garmin.com/en-US/?partNumber=0...,Instinct_2_Solar_OM_EN-US.pdf
8,https://www.garmin.com/en-US/p/854515,854515,Enduro 2,Outdoor Recreation,Adventure Watches,010-02754-00,https://support.garmin.com/en-US/?partNumber=0...,Enduro_2_OM_EN-US.pdf
9,https://www.garmin.com/en-US/p/730659,730659,Venu 2 Plus,Sports & Fitness,Fitness Tracking,010-02496-02,https://support.garmin.com/en-US/?partNumber=0...,Venu_2_Plus_OM_EN-US.pdf


In [42]:
def insert_or_fetch_embeddings(index_name, chunks=False):
    """Return a Pinecone vector store for ``index_name``, creating it if missing.

    Args:
        index_name: Name of the Pinecone index.
        chunks: Documents to embed. May be omitted when the index already
            exists; required (and non-empty) when the index must be created.

    Returns:
        The LangChain ``Pinecone`` vector store, either loaded from the existing
        index or built from ``chunks`` in a newly created 1536-dimension cosine
        index.

    Raises:
        ValueError: If the index does not exist and no ``chunks`` were given.
            Raised before anything is created in Pinecone.
    """
    import pinecone
    from langchain.vectorstores import Pinecone
    from langchain.embeddings.openai import OpenAIEmbeddings

    embeddings = OpenAIEmbeddings()

    pinecone.init(api_key=os.environ.get('PINECONE_API_KEY'),
                  environment=os.environ.get('PINECONE_ENV'))

    if index_name in pinecone.list_indexes():
        print(f"Index '{index_name}' already exists: Loading embeddings...")
        vector_store = Pinecone.from_existing_index(index_name, embeddings)
        print("Done")
        return vector_store

    # Validate before create_index: otherwise a call without chunks would leave
    # an empty index behind, and every later call would treat it as
    # "already exists" and return a store with no documents.
    if not chunks:
        raise ValueError(
            f"Index '{index_name}' does not exist and no chunks were provided to create it.")

    print(f"Creating index '{index_name}' and embeddings...")
    pinecone.create_index(index_name, dimension=1536, metric='cosine')
    vector_store = Pinecone.from_documents(chunks,
                                           embeddings,
                                           index_name=index_name)
    print(f"Documents embedded to {index_name} index!")

    return vector_store

In [43]:
# No chunks are passed here, so this relies on the 'garmin' index already existing.
vector_store = insert_or_fetch_embeddings(index_name='garmin')

Index 'garmin' already exists: Loading embeddings...
Done


In [71]:
# Default settings for ask_and_get_answer; they are bound as parameter defaults
# when the function below is defined.
llm_model = 'gpt-3.5-turbo'
search_type = 'similarity'
chain_type = 'stuff'
k = 5
temperature = 1

def ask_and_get_answer(vector_store, q, filter_args=None,
                       model=llm_model, k=k, temperature=temperature,
                       search_type=search_type, chain_type=chain_type):
    """Answer ``q`` with a RetrievalQA chain over ``vector_store``.

    Args:
        vector_store: Vector store exposing ``as_retriever``.
        q: The question to ask.
        filter_args: Optional metadata filter forwarded as the retriever's
            ``filter`` search argument; when ``None`` no filter is sent.
        model: Chat model name passed to ``ChatOpenAI``.
        k: Number of chunks requested from the retriever.
        temperature: Sampling temperature passed to ``ChatOpenAI``.
        search_type: Retriever search type.
        chain_type: ``RetrievalQA`` chain type ("stuff" is the default and uses
            all of the retrieved text).

    Returns:
        The result of ``chain.run(q)``.
    """
    # Imported locally, like the other helpers in this notebook, so the function
    # does not depend on an import cell that runs after this definition.
    from langchain.chains import RetrievalQA
    from langchain.chat_models import ChatOpenAI

    llm = ChatOpenAI(model=model, temperature=temperature)

    # Build the search arguments once; the filter key is only added when given.
    search_kwargs = {'k': k}
    if filter_args is not None:
        search_kwargs['filter'] = filter_args
    retriever = vector_store.as_retriever(search_type=search_type,
                                          search_kwargs=search_kwargs)

    chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type=chain_type,
        retriever=retriever)
    return chain.run(q)

In [72]:
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA, ConversationalRetrievalChain

In [73]:

# Unfiltered query: no filter_args is passed, so `product` is unused here and
# retrieval is not restricted to a single product's manual.
q = 'What is this manual for?'
product = 'Forerunner 965'
# Filter left disabled for this query:
# 'filter': {'product_name': {'$eq': product}
a = ask_and_get_answer(vector_store=vector_store, q=q)
print(a)

This manual is for the VÍVOMOVE® 3/3S.


In [78]:
# Filtered query: both the product_name and product_id conditions are passed
# to the retriever as the metadata filter.
q = 'What is this manual for?'
product = 'Enduro 2'
product_id = '854515'
# 'filter': {'product_name': {'$eq': product}
# NOTE(review): `filter` shadows the Python builtin of the same name for the
# rest of the session.
filter = {'product_name': {'$eq': product},
          'product_id': {'$eq': product_id}}
a = ask_and_get_answer(vector_store=vector_store, q=q,
                       filter_args=filter)
print(a)

This manual is for the ENDURO™ 2 device.
