In [1]:
import os
import bs4
from langchain import hub
from langchain.text_splitter import RecursiveCharacterTextSplitter # split the text?
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import Chroma # connect to chromadb?
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

# Credentials and LangSmith tracing configuration for the rest of the notebook.
# The two API keys are blank placeholders here; provide real values at run
# time rather than committing them with the notebook.
os.environ["OPENAI_API_KEY"]=""
os.environ['LANGCHAIN_TRACING_V2'] = 'true'
# Fixed key spelling: this was 'LANGHCAIN_ENDPOINT', which does not match the
# LANGCHAIN_* prefix used by the neighbouring variables.
os.environ['LANGCHAIN_ENDPOINT'] = 'https://api.smith.langchain.com'
os.environ['LANGCHAIN_API_KEY'] = ''

In [2]:
# Sample query used for the token-count and embedding demos below.
question = 'What kinds of films am I into? Give example titles.'
# Sample document to compare against the query. The text is split one sentence
# per literal for readability; adjacent literals are concatenated by Python,
# so the resulting string is unchanged.
document = (
    'Commercial movies that made by a committee are not my cup of tea. '
    'Take the MCU movies for example. '
    'All of the follow the same premise, except for the event films, and are created just for a quick cash grab. '
    'Movies by auteurs like Chris Nolan, Denis Villenuve or the graphic heavyweights like Zack Snyder and Michael Bay on the other hand is very different. '
    'Through the screenplay, these writer-directors are able to make me feel what they did when penning down the script. '
    'Ofocurse, when speaking about autuers, I simply cannot leave out Martin Scorsese from the conversation!'
)

In [3]:
import tiktoken

def find_num_tokens(string: str, encoding_name: str):
    """Count how many tokens `string` encodes to under a tiktoken encoding.

    Args:
        string: Text to tokenize.
        encoding_name: Name handed to ``tiktoken.get_encoding``
            (the notebook uses 'cl100k_base').

    Returns:
        The length of the token sequence returned by the encoding's
        ``encode`` method.
    """
    tokens = tiktoken.get_encoding(encoding_name).encode(string)
    return len(tokens)

# Token count of the sample question; shown as the cell's output.
find_num_tokens(string=question, encoding_name='cl100k_base')

12

In [4]:
from langchain_openai import OpenAIEmbeddings
# A single embeddings client is used for both texts.
embed = OpenAIEmbeddings()
# Embed the question first, then the document, via the same embed_query call.
query_embed, document_embed = (
    embed.embed_query(text) for text in (question, document)
)
# Length of the embedding vector returned for the question.
len(query_embed)

1536

In [5]:
import numpy as np

def cosine_similarity(vec1, vec2):
    """Return the cosine similarity of two vectors.

    Args:
        vec1: First vector (any sequence or array accepted by ``np.dot``).
        vec2: Second vector, with the same length as ``vec1``.

    Returns:
        ``dot(vec1, vec2) / (norm(vec1) * norm(vec2))``. If either vector has
        zero norm the similarity is undefined, so 0.0 is returned rather than
        dividing by zero (which yields ``nan`` and a RuntimeWarning).
    """
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if norm_product == 0:
        # Zero-length (or empty) vector: no direction to compare against.
        return np.float64(0.0)
    return np.dot(vec1, vec2) / norm_product

# Compare the question embedding with the document embedding.
similarity = cosine_similarity(vec1=query_embed, vec2=document_embed)
print(similarity)

0.8051070394583488


In [6]:
# Load a single blog post. The SoupStrainer is passed as BeautifulSoup's
# parse_only option and selects elements by the three listed CSS classes.
loader = WebBaseLoader(
    web_paths=('https://lilianweng.github.io/posts/2023-06-23-agent/',),
    bs_kwargs={
        'parse_only': bs4.SoupStrainer(
            class_=('post-content', 'post-title', 'post-header')
        ),
    },
)

# Fetch and parse the page(s) listed in web_paths.
docs = loader.load()

In [7]:
# Split the loaded documents into chunks (chunk_size=1000, chunk_overlap=200),
# then build a Chroma vector store from the chunks using OpenAI embeddings.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
splits = text_splitter.split_documents(docs)
vectorstore = Chroma.from_documents(
    documents=splits,
    embedding=OpenAIEmbeddings(),
)

In [8]:
# Wrap the vector store in a retriever; no arguments are passed, so whatever
# defaults as_retriever() applies are used.
retriever = vectorstore.as_retriever()