In [14]:
import os
import openai
from dotenv import load_dotenv

# **BURAYI .env DOSYANIZIN YERI OLACAK SEKILDE DEGISTIRIN***
dotenv_path = 'key.env'
is_loaded = load_dotenv(dotenv_path)

# Load config values

# Setting up the deployment name
chatgpt_model_name = os.getenv('CHATGPT_MODEL')

# This is set to `azure`
openai.api_type = "azure"

# The API key for your Azure OpenAI resource.
openai.api_key = os.getenv("OPENAI_API_KEY")

# The base URL for your Azure OpenAI resource. e.g. "https://<your resource name>.openai.azure.com"
openai.api_base = os.getenv('OPENAI_API_BASE')

# Currently Chat Completions API have the following versions available: 2023-03-15-preview
openai.api_version = os.getenv('OPENAI_API_VERSION')

deployment_name = os.getenv('DEPLOYMENT_NAME')


print("Are environment variables loaded correctly? ", is_loaded)


Are environment variables loaded correctly?  True


In [1]:
#! pip install lark

# Similarity Search

from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings

persist_directory = 'docs/chroma/'

embedding = HuggingFaceEmbeddings()

vectordb = Chroma(

    persist_directory=persist_directory,

    embedding_function=embedding

)

print(vectordb._collection.count())

  from .autonotebook import tqdm as notebook_tqdm


418


In [2]:
texts = [

    """The Amanita phalloides has a large and imposing epigeous (aboveground) fruiting body (basidiocarp).""",

    """A mushroom with a large fruiting body is the Amanita phalloides. Some varieties are all-white.""",

    """A. phalloides, a.k.a Death Cap, is one of the most poisonous of all known mushrooms.""",

]
smalldb = Chroma.from_texts(texts, embedding=embedding)

question = "Tell me about all-white mushrooms with large fruiting bodies"

smalldb.similarity_search(question, k=2)

smalldb.max_marginal_relevance_search(question,k=2, fetch_k=3)

[Document(page_content='A mushroom with a large fruiting body is the Amanita phalloides. Some varieties are all-white.', metadata={}),
 Document(page_content='A. phalloides, a.k.a Death Cap, is one of the most poisonous of all known mushrooms.', metadata={})]

In [4]:
question = "what did they say about matlab?"

docs_ss = vectordb.similarity_search(question,k=3)

docs_ss[0].page_content[:100]

docs_ss[1].page_content[:100]


"So later this quarter, we'll use the discussion sections to talk about things like convex \noptimizat"

In [5]:
docs_mmr = vectordb.max_marginal_relevance_search(question,k=3)

docs_mmr[0].page_content[:100]

docs_mmr[1].page_content[:100]

'those homeworks will be done in either MATLA B or in Octave, which is sort of — I \nknow some people '

In [7]:
question = "what did they say about regression in the third lecture?"

docs = vectordb.similarity_search(

    question,

    k=3,

    filter={"source":"docs/cs229_lectures/MachineLearning-Lecture03.pdf"}

)

for d in docs:
    print(d.metadata)

In [12]:
from langchain.llms import AzureOpenAI

from langchain.retrievers.self_query.base import SelfQueryRetriever

from langchain.chains.query_constructor.base import AttributeInfo

metadata_field_info = [

    AttributeInfo(

        name="source",

        description="The lecture the chunk is from, should be one of `docs/cs229_lectures/MachineLearning-Lecture01.pdf`, `docs/cs229_lectures/MachineLearning-Lecture02.pdf`, or `docs/cs229_lectures/MachineLearning-Lecture03.pdf`",

        type="string",

    ),

    AttributeInfo(

        name="page",

        description="The page from the lecture",

        type="integer",

    ),

]

In [27]:
document_content_description = "Lecture notes"

llm = AzureOpenAI(deployment_name="text-davinci-003" , model_name="text-davinci-003" , temperature=0)

retriever = SelfQueryRetriever.from_llm(

    llm,

    vectordb,

    document_content_description,

    metadata_field_info,

    verbose=True

)

In [28]:
question = "what did they say about regression in the third lecture?"

# You will receive a warning about predict_and_parse being deprecated the first time you executing the next line. This can be safely ignored.

docs = retriever.get_relevant_documents(question)

for d in docs:
    
    print(d.metadata)

query='regression' filter=Comparison(comparator=<Comparator.EQ: 'eq'>, attribute='source', value='docs/cs229_lectures/MachineLearning-Lecture03.pdf') limit=None


In [29]:
from langchain.retrievers import ContextualCompressionRetriever

from langchain.retrievers.document_compressors import LLMChainExtractor

def pretty_print_docs(docs):

    print(f"\n{'-' * 100}\n".join([f"Document {i+1}:\n\n" + d.page_content for i, d in enumerate(docs)]))

In [30]:
# Wrap our vectorstore

llm = AzureOpenAI(deployment_name="text-davinci-003" , model_name="text-davinci-003", temperature=0)

compressor = LLMChainExtractor.from_llm(llm)

compression_retriever = ContextualCompressionRetriever(

    base_compressor=compressor,

    base_retriever=vectordb.as_retriever()

)

In [31]:
question = "what did they say about matlab?"

compressed_docs = compression_retriever.get_relevant_documents(question)

pretty_print_docs(compressed_docs)

# Combining various techniques

compression_retriever = ContextualCompressionRetriever(

    base_compressor=compressor,

    base_retriever=vectordb.as_retriever(search_type = "mmr")

)

Document 1:

"I personally end up using MATLAB quite a bit more often for various reasons."
----------------------------------------------------------------------------------------------------
Document 2:

"I personally end up using MATLAB quite a bit more often for various reasons."
----------------------------------------------------------------------------------------------------
Document 3:

"I personally end up using MATLAB quite a bit more often for various reasons."
----------------------------------------------------------------------------------------------------
Document 4:

"I personally end up using MATLAB quite a bit more often for various reasons."


In [32]:
question = "what did they say about matlab?"

compressed_docs = compression_retriever.get_relevant_documents(question)

pretty_print_docs(compressed_docs)

Document 1:

"I personally end up using MATLAB quite a bit more often for various reasons."
----------------------------------------------------------------------------------------------------
Document 2:

"MATLAB is I guess part of the programming language that makes it very easy to write codes using matrices, to write code for numerical routines, to move data around, to plot data. And it's sort of an extremely easy to learn tool to use for implementing a lot of learning algorithms."
----------------------------------------------------------------------------------------------------
Document 3:

"So one more organizational question. I'm curious, how many of you know MATLAB? Wow, cool, quite a lot. Okay. So as part of the — actually how many of you know Octave or have used Octave ? Oh, okay, much smaller number."


In [33]:
from langchain.retrievers import SVMRetriever

from langchain.retrievers import TFIDFRetriever

from langchain.document_loaders import PyPDFLoader

from langchain.text_splitter import RecursiveCharacterTextSplitter

In [35]:
# Load PDF

loader = PyPDFLoader("docs\MachineLearning-Lecture01.pdf")

pages = loader.load()

all_page_text=[p.page_content for p in pages]

joined_page_text=" ".join(all_page_text)

In [36]:
# Split

text_splitter = RecursiveCharacterTextSplitter(chunk_size = 1500,chunk_overlap = 150)

splits = text_splitter.split_text(joined_page_text)

In [37]:
# Retrieve

svm_retriever = SVMRetriever.from_texts(splits,embedding)

tfidf_retriever = TFIDFRetriever.from_texts(splits)

question = "What are major topics for this class?"

docs_svm=svm_retriever.get_relevant_documents(question)

docs_svm[0]

question = "what did they say about matlab?"

docs_tfidf=tfidf_retriever.get_relevant_documents(question)

docs_tfidf[0]



Document(page_content="Saxena and Min Sun here did, wh ich is given an image like this, right? This is actually a \npicture taken of the Stanford campus. You can apply that sort of cl ustering algorithm and \ngroup the picture into regions. Let me actually blow that up so that you can see it more \nclearly. Okay. So in the middle, you see the lines sort of groupi ng the image together, \ngrouping the image into [inaudible] regions.  \nAnd what Ashutosh and Min did was they then  applied the learning algorithm to say can \nwe take this clustering and us e it to build a 3D model of the world? And so using the \nclustering, they then had a lear ning algorithm try to learn what the 3D structure of the \nworld looks like so that they could come up with a 3D model that you can sort of fly \nthrough, okay? Although many people used to th ink it's not possible to take a single \nimage and build a 3D model, but using a lear ning algorithm and that sort of clustering \nalgorithm is the first ste