# RAG on NPS
Retrieval Augmented Generation on National Park Service

In [1]:
import os
import json
from pathlib import Path
from pprint import pprint

from bs4 import BeautifulSoup
import dotenv

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.prompts import ChatPromptTemplate
from langchain_community.llms import OpenAI
from langchain.chat_models import ChatOpenAI
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser


In [None]:
import os
import json
from pathlib import Path
from pprint import pprint

from bs4 import BeautifulSoup
import dotenv

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.prompts import ChatPromptTemplate
from langchain_community.llms import OpenAI
from langchain.chat_models import ChatOpenAI
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser


In [2]:
# load the environmenal variables from the .env file
dotenv.load_dotenv()
print(os.environ['OPENAI_API_KEY'])

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


In [3]:
VECTOR_STORE_BUILD = False  # True for building the vector store, False for loading it from disk
VECTOR_STORE_PATH = "./faiss_index_all_parks"

# Building a new vector store (VECTOR_STORE_BUILD=True) was Done in Google Colab for the V100 GPU. 
# https://colab.research.google.com/drive/1_TL_6loSJp38WWWocHitWLvIdMYaVVpx?usp=sharing

# Need manually download faiss_index files from the following Google Drive Link:
# https://drive.google.com/drive/folders/1-7gHOMApKTz69RM5CWQ6hd5gApEBST2s?usp=sharing


## Data ingestion

In [4]:
# Use gpt-crawler to crawl a website and save the output to a file

# config.ts: 

# import { Config } from "./src/config";
# export const defaultConfig: Config = {
#   url: "https://www.nps.gov/index.htm",
#   match: "https://www.nps.gov/**/*.htm",
#   maxPagesToCrawl: 100000,
#   outputFileName: "output_all_parks.json",
# };

In [5]:
# Load the crawled data from GCP-crawler
# file_path='./output_grca_clean.json'  # small from 5,000 webpages about Grand Canyon National Park
file_path='./output_all_parks_clean.json'  # large from 100,000 webpages about all national parks in the US

data = json.loads(Path(file_path).read_text())

In [6]:
# Inspect visually the loaded data
data[0]

{'html': 'Opinion: Think about ways men and women on San Nicolas Island might have shared the division of labor. Do you agree or disagree with the custom that women in Karana’s tribe were not allowed to make weapons? Remember to consider what it might have been like to live in her particular society. Use reasons and information to support your point of view.\nInformative/explanatory: Describe the mating and breeding habits of northern elephant seals. Gather information from Voices from the Field to develop the topic. Draw evidence from informational text.\nNarrative: Imagine you are Karana watching a fight between two bull elephant seals. Write a narrative describing the battle. Include details about what you might see, hear, and smell.\nAn official form of the United States government. Provided by Touchpoints\nDownload the official NPS app before your next visit',
 'title': 'Teacher Resources: Chapter 13 - Island of the Blue Dolphins (U.S. National Park Service)',
 'url': 'https://www

In [7]:
# Use Beautiful Soup to format html into txt
data_txt = []
for d in data:
    data_txt.append(
        "TITLE: "+d['title']+" \n"+
        "URL: "+d['url']+" \n"+
        "HTML: "+ BeautifulSoup(d['html']).get_text()
    )

  "HTML: "+ BeautifulSoup(d['html']).get_text()


In [57]:
len(data_txt)

99873

In [8]:
# Inspect the formatted data
pprint(data_txt[0])

('TITLE: Teacher Resources: Chapter 13 - Island of the Blue Dolphins (U.S. '
 'National Park Service) \n'
 'URL: https://www.nps.gov/subjects/islandofthebluedolphins/teacher-13.htm \n'
 'HTML: Opinion: Think about ways men and women on San Nicolas Island might '
 'have shared the division of labor. Do you agree or disagree with the custom '
 'that women in Karana’s tribe were not allowed to make weapons? Remember to '
 'consider what it might have been like to live in her particular society. Use '
 'reasons and information to support your point of view.\n'
 'Informative/explanatory: Describe the mating and breeding habits of northern '
 'elephant seals. Gather information from Voices from the Field to develop the '
 'topic. Draw evidence from informational text.\n'
 'Narrative: Imagine you are Karana watching a fight between two bull elephant '
 'seals. Write a narrative describing the battle. Include details about what '
 'you might see, hear, and smell.\n'
 'An official form of the U

In [9]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)

In [10]:
# Convert plain text to documents
docs = text_splitter.create_documents(data_txt)

In [11]:
# Inspect the documents
docs[3:10]

[Document(page_content='TITLE: Steve Schwartz ch12 - Island of the Blue Dolphins (U.S. National Park Service) \nURL: https://www.nps.gov/subjects/islandofthebluedolphins/schwartz-ch12.htm \nHTML: Steve Schwartz, US Navy archeologist (retired), San Nicolas Island, discusses the location of the whalebone hut found in the 1930s.\nIn 1939, archeologist and historian Arthur Woodward of the Los Angeles County Museum of Natural History visited San Nicolas Island. He brought with him a copy of Captain Nidever’s and Carl Dittman’s accounts of the discovery of the Lone Woman in 1853. Using the accounts, Woodward was able to find the approximate location where the Lone Woman was found in a small hut, or windbreak. He found a scattering of whalebones like the ones described as being part of the Lone Woman’s windbreak. From this discovery, Woodward believed he had found the remains of the Lone Woman’s actual hut.'),
 Document(page_content='In 1940, he returned and photographed the site as he found 

In [12]:
pprint(docs[2].to_json()['kwargs']['page_content'])

('TITLE: Teacher Resources: Chapter 11 - Island of the Blue Dolphins (U.S. '
 'National Park Service) \n'
 'URL: https://www.nps.gov/subjects/islandofthebluedolphins/teacher-11.htm \n'
 'HTML: Opinion: What factors did Karana consider in choosing the location for '
 'her new home? Would you have chosen the same location? Why or why not? Use '
 'evidence from the story to support your claim.\n'
 'Informative/explanatory: Describe some of the ways Karana protects her food '
 'and possessions from wild animals. Develop the topic with facts and other '
 'information related to the topic.\n'
 'Narrative: Karana plans to kill the wild dogs. Imagine you are a wild dog. '
 'How might you react when Karana approaches your cave? Use sensory details to '
 'convey experiences and events precisely.\n'
 'An official form of the United States government. Provided by Touchpoints\n'
 'Download the official NPS app before your next visit')


## Build the vector database

In [13]:
# Indexing in a new vector store

modelPath = "all-MiniLM-L6-v2"

if VECTOR_STORE_BUILD:    
    model_kwargs = {'device':'cuda'}  # needs GPU to run in a reasonable amount of time. Too slow on CPU.
    encode_kwargs = {'normalize_embeddings':False}

    embeddings = HuggingFaceEmbeddings(
      model_name = modelPath,  
      model_kwargs = model_kwargs,
      encode_kwargs=encode_kwargs
    )

    db = await FAISS.afrom_documents(docs, embeddings)
    db.save_local(VECTOR_STORE_PATH)
    # Comments: takes 8 min 27 sec to index 509,348 1000-chunk_size chunks / 782 MB faiss index file / 442 MB pickle file
else:  
    # Load pre-indexed vector stores
    model_kwargs = {'device':'cpu'}
    encode_kwargs = {'normalize_embeddings':False}

    embeddings = HuggingFaceEmbeddings(
      model_name = modelPath,  
      model_kwargs = model_kwargs,
      encode_kwargs=encode_kwargs
    )

    db = FAISS.load_local(VECTOR_STORE_PATH, embeddings)

## Build the text2text generation pipeline

In [14]:
!huggingface-cli whoami

atoultaro


## Target Question

In [50]:
# query = "What are the names of national parks in US?"
# query = "What are the names of trails in the Grand Canyon national parks?"
# query = "What to do in the US national parks in summer"
# query = "Do you need a pass to enter or make reservation to national parks?"
# query = "What kinds of animals in the national parks?"
# query = "Which phone number can I call to ask about road condition in Grand Canyon?"
# query = "How many national parks in the United States? Give me ten examples."
# query = "How many national parks in the US West Coast?"
# query = "Please name all the trails' name in Grand Canyon."
# query = "Could you please design a trip with three adjacent national parks?"
# query = "What are the most famous national parks in New England? Name twenty."
# query = "What activities can Boston offer for a family trip?"
# query = "What activities in Washington DC and its area in parks of NPS for a family trip? Name ten."
# query = "What activities in state of New York and its area in parks of NPS for a family trip? Name ten."
# query = "Could you please help come up with a plan for a three-day trip in Southern California for National Parks or other parks? Thanks."
# query = "Could you please summarize the history of national parks in the United States?"
# query = "What are the most difficult hiks in California's national parks?"
# query = "What are the most difficult hiks in US Northwest's national parks or monuments?"
# query = "What are the most difficult hiks in US Northeast's national parks or monuments?"
query = "Please help design a cross-country trip for me. I want to visit all the national parks in the US. I have 30 days."

In [51]:
# # Inspect queried documents from the vectorstore
# searchDocs = db.similarity_search(query)

# print("How many retrieved documents are there? "+str(len(searchDocs)))

# for i in searchDocs:
#     print(i.page_content)

In [52]:
system_template = "You are knowledagble travel guide of anwering questions on the national parks and monuments in the United States. "

template = """Use the following pieces of retrieved context to answer the question. If there's no required information in the context, answer from your own knowledgable and self identify it when you do this.
Question: {question} 
Context: {context} 
Answer:
"""
prompt = ChatPromptTemplate.from_messages([
    ("system", system_template),
    ("human", template),
])

print(prompt)

# Use three sentences maximum and keep the answer concise.please 
# If you don't know the answer, just say that you don't know. 
# Use three sentences maximum and keep the answer concise.


input_variables=['context', 'question'] messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=[], template='You are knowledagble travel guide of anwering questions on the national parks and monuments in the United States. ')), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], template="Use the following pieces of retrieved context to answer the question. If there's no required information in the context, answer from your own knowledgable and self identify it when you do this.\nQuestion: {question} \nContext: {context} \nAnswer:\n"))]


In [53]:
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.8)

rag_chain = (
    {"context": db.as_retriever(),  
     "question": RunnablePassthrough()} 
    | prompt 
    | llm
    | StrOutputParser() 
)

result = rag_chain.invoke(query)

In [56]:
pprint(result)

('Designing a cross-country trip to visit all the national parks in the US '
 'within 30 days would be quite challenging due to the vast number of parks '
 'and the distances between them. However, it is possible to create a general '
 'itinerary that includes some of the most popular and iconic national parks.\n'
 '\n'
 'Here is a suggested itinerary for a cross-country trip to visit national '
 'parks in the US:\n'
 '\n'
 '1. Start in the West Coast:\n'
 '   - Begin in Seattle, Washington and visit North Cascades National Park.\n'
 '   - Travel to Olympic National Park.\n'
 '   - Drive south to Mount Rainier National Park.\n'
 '\n'
 '2. Head to the Southwest:\n'
 '   - Drive to Crater Lake National Park in Oregon.\n'
 '   - Continue south to California and visit Redwood National and State '
 'Parks.\n'
 '   - Explore Yosemite National Park.\n'
 '   - Visit Kings Canyon National Park and Sequoia National Park.\n'
 '\n'
 '3. Explore the Desert Southwest:\n'
 '   - Drive through Nevada 