### Install dependencies

In [None]:
# pip install -r requirements.txt

In [None]:
# pip install "unstructured[md]"

### Setup database

In [1]:
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
import nltk
from langchain_community.document_loaders import DirectoryLoader
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
import openai
from dotenv import load_dotenv
import os
import shutil

In [4]:
DATA_PATH = "data"
def load_documents():
    loader = DirectoryLoader(DATA_PATH, glob="*.md")
    documents = loader.load()
    return documents

In [5]:
# Split text into smaller chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=300,
    chunk_overlap=100,
    length_function=len,
    add_start_index=True,)
chunks = text_splitter.split_documents(load_documents())

In [6]:
CHROMA_PATH = "chroma"
load_dotenv()
openai.api_key = os.environ['OPENAI_API_KEY'] # Will need .env file with API key
db = Chroma.from_documents(chunks, OpenAIEmbeddings(), persist_directory=CHROMA_PATH)

### Query database

In [13]:
import argparse
from dataclasses import dataclass
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate

In [14]:
PROMPT_TEMPLATE = """
Answer the question based only on the following context:

{context}

---

Answer the question based on the above context: {question}
"""

In [28]:
def Response(query):
    results = db.similarity_search_with_relevance_scores(query, k=3)
    if len(results) == 0 or results[0][1] < 0.7:
        print(f"Unable to find matching results.")
    
    context_text = "\n\n---\n\n".join([doc.page_content for doc, _score in results])
    prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
    prompt = prompt_template.format(context=context_text, question=query)
    print(prompt)
    
    model = ChatOpenAI()
    response_text = model.invoke(prompt)
    
    sources = [doc.metadata.get("source", None) for doc, _score in results]
    formatted_response = f"Response: {response_text}\nSources: {sources}"
    print(formatted_response)

In [29]:
Response("What is NLTK?")

Human: 
Answer the question based only on the following context:

NLTK defines an infrastructure that can be used to build NLP programs in Python. It provides basic classes for representing data relevant to natural language processing; standard interfaces for performing tasks such as part-of-speech tagging, syntactic parsing, and text classification; and standard

---

The book is based on the Python programming language together with an open source library called the Natural Language Toolkit (NLTK). NLTK includes extensive software, data, and documentation, all freely downloadable from http://nltk.org/. Distributions are provided for Windows, Macintosh and Unix

---

NLTK was originally created in 2001 as part of a computational linguistics course in the Department of Computer and Information Science at the University of Pennsylvania. Since then it has been developed and expanded with the help of dozens of contributors. It has now been adopted in courses in

---

Answer the question b

In [30]:
Response("How would I get a word frequency distribution?")

Human: 
Answer the question based only on the following context:

Contrast this situation with frequency distributions (3), where we specify a word, and get back a number, e.g. fdist['monstrous'], which tells us the number of times a given word has occurred in a text. Look-up using words is familiar to anyone who has used a dictionary. Some more examples are

---

../images/tally.png
Figure 3.1: Counting Words Appearing in a Text (a frequency distribution)

---

A frequency distribution is a collection of items along with their frequency counts (e.g., the words of a text and their frequency of appearance).

---

Answer the question based on the above context: How would I get a word frequency distribution?

Response: content="To get a word frequency distribution, you would specify a word and use the notation fdist['word'] to retrieve the number of times that word has occurred in a text. This concept is similar to looking up a word in a dictionary." response_metadata={'token_usage': {'co