# RAG on NPS
Retrieval Augmented Generation on National Park Service

In [100]:
import os
import json
from pathlib import Path
from pprint import pprint

import pandas as pd
from bs4 import BeautifulSoup
import dotenv

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.prompts import ChatPromptTemplate
from langchain_community.llms import OpenAI
from langchain.chat_models import ChatOpenAI
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser


In [2]:
# load the environmenal variables from the .env file
dotenv.load_dotenv()

True

In [4]:
print(os.environ['OPENAI_API_KEY'])
print(os.environ['HF_API_KEY'])

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
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


## Data ingestion

In [None]:
# Use gpt-crawler to crawl a website and save the output to a file

# config.ts: 

# import { Config } from "./src/config";
# export const defaultConfig: Config = {
#   url: "https://www.nps.gov/index.htm",
#   match: "https://www.nps.gov/**/*.htm",
#   maxPagesToCrawl: 100000,
#   outputFileName: "output_all_parks.json",
# };

In [16]:
# Load the crawled data from GCP-crawler
# file_path='./output_3000.json'
file_path='./output_5000_direct1.json'
# file_path='./output_all_100000.json'

data = json.loads(Path(file_path).read_text())

In [17]:
# Inspect visually the loaded data
data[0]

{'title': '"Immediately evacuate the fort" (U.S. National Park Service)',
 'url': 'https://www.nps.gov/articles/fort-george.htm',
 'html': 'Skip to global NPS navigation\nSkip to the main content\nSkip to the footer section\n National Park Service\n SEARCH\n \nOPEN\nMENU\n"Immediately evacuate the fort"\n\nWhile one commander’s star would fall, another’s rose to prominence as American Colonel Winfield Scott mounted a brilliant amphibious operation against Fort George at the mouth of the Niagara River.\n\n\nAn 1817 drawing of the American naval bombardment and landing at Niagara.\n\nThe Battle of Fort George from the Philadelphia Portfolio, 1817 Archives of Ontario Photographic Collection\n\nHidden by fog on the morning of May 27th, 1813, ships carrying American soldiers left Fort Niagara in New York. They crossed the mouth of the Niagara River and landed on the opposite shore in Upper Canada.\xa0Their aim was to capture the town of Niagara\xa0and nearby Fort George from the British as 

In [87]:
# import pandas as pd

# df = pd.DataFrame(data)

In [88]:
# Use Beautiful Soup to format html into txt
data_txt = []
for d in data:
    data_txt.append(
        "TITLE: "+d['title']+" \n"+
        "URL: "+d['url']+" \n"+
        "HTML: "+ BeautifulSoup(d['html']).get_text()
    )

In [None]:
# # Inspect the formatted data
# pprint(data_txt[0])

In [89]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)

In [90]:
# Convert plain text to documents
docs = text_splitter.create_documents(data_txt)

In [91]:
# Inspect the documents
docs[3:10]

[Document(page_content='Dearborn was eventually recalled for his ineffectual command. But in a war full of incompetent commanders, Winfield Scott’s performance at Fort George had proved an exception. Scott earned promotion and accolades as a rising star for his planning and execution of one of the most brilliant actions of the war.\n\nYOU MIGHT ALSO LIKE\n“Come All You Bold Canadians” \nARTICLE\n“America was, and ever has been, the country of my choice.” \nARTICLE\n“Was there ever a war so unreasonable, so wicked, so abominable?” \nARTICLE\nFortifications of Quebec National Historic Site \nPLACE\nKingston Navy Yard Historic Site \nPLACE\n“As to the People of Europe, public opinion was most decidedly in our favor.” \nARTICLE\n TAGS\nwar of 1812 fighting the battles voices canada\n\nLast updated: March 5, 2015\n\nWas this page helpful?\nYes\nNo\n\nAn official form of the United States government. Provided by Touchpoints\n\n\nDownload the official NPS app before your next visit'),
 Docume

In [92]:
pprint(docs[2].to_json()['kwargs']['page_content'])

('Two days later, under covering fire from the ships, light infantry led by '
 'Colonel Winfield Scott\xa0began landing on the beachhead. The British '
 'commander—outnumbered four to one, and facing attack from several directions '
 'at once—decided to evacuate the fort rather than risk being surrounded. With '
 'the British on the run, American soldiers quickly captured the town and '
 'fort. For a time, the invasion of Upper Canada looked promising indeed, with '
 'only minor losses on the American side. \xa0\n'
 '\n'
 'But through months of delay and indecisiveness, General Dearborn squandered '
 'his good fortune by failing to follow up his initial success. British forces '
 'eventually recaptured Fort George as well as the Americans’ Fort Niagara. '
 'Together, those forts controlled the mouth of the Niagara River, a vital '
 'anchorage for any ships bound for the western part of Lake Ontario. It '
 'remained in British hands for the remainder of the war.')


In [93]:
docs_split = []
for d in docs:
    docs_split.append(d.to_json()['kwargs']['page_content'])

In [106]:
df = pd.DataFrame(docs_split, columns=['text'])

In [107]:
df.sample(5)

Unnamed: 0,text
17820,631-569-2100\n\nContact Us \nTOOLS\nFAQ\nSite ...
14073,An official form of the United States governme...
3977,"Over the course of a growing season, ozone can..."
30160,TITLE: Adams National Historical Park (U.S. Na...
26039,“threatened to put a hole in his accuser big e...


In [111]:
pprint(df.iloc[0, 0])

('TITLE: "Immediately evacuate the fort" (U.S. National Park Service) \n'
 'URL: https://www.nps.gov/articles/fort-george.htm \n'
 'HTML: Skip to global NPS navigation\n'
 'Skip to the main content\n'
 'Skip to the footer section\n'
 ' National Park Service\n'
 ' SEARCH\n'
 ' \n'
 'OPEN\n'
 'MENU\n'
 '"Immediately evacuate the fort"\n'
 '\n'
 'While one commander’s star would fall, another’s rose to prominence as '
 'American Colonel Winfield Scott mounted a brilliant amphibious operation '
 'against Fort George at the mouth of the Niagara River.\n'
 '\n'
 '\n'
 'An 1817 drawing of the American naval bombardment and landing at Niagara.\n'
 '\n'
 'The Battle of Fort George from the Philadelphia Portfolio, 1817 Archives of '
 'Ontario Photographic Collection')


## Build the vector database

In [114]:
# # Hugging Face Embedding
# modelPath = "all-MiniLM-L6-v2"
# model_kwargs = {'device':'cpu'}
# encode_kwargs = {'normalize_embeddings':False}

# embeddings = HuggingFaceEmbeddings(
#   model_name = modelPath,  
#   model_kwargs = model_kwargs,
#   encode_kwargs=encode_kwargs
# )
import chromadb.utils.embedding_functions as embedding_functions

huggingface_ef = embedding_functions.HuggingFaceEmbeddingFunction(
    api_key=os.environ['HF_API_KEY'],
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

In [120]:
import chromadb

persist_directory = "chromadb"
client = chromadb.Client(Settings(persist_directory=persist_directory))
# chroma_client = chromadb.Client(Settings(persist_directory=persist_directory,chroma_db_impl="duckdb+parquet",))

collection = client.create_collection('nps', embedding_function=huggingface_ef)

In [124]:
client.delete_collection('nps')

In [121]:
docs=df['text'].tolist() 
id= [str(x) for x in df.index.tolist()]


In [127]:
docs[0]

'TITLE: "Immediately evacuate the fort" (U.S. National Park Service) \nURL: https://www.nps.gov/articles/fort-george.htm \nHTML: Skip to global NPS navigation\nSkip to the main content\nSkip to the footer section\n National Park Service\n SEARCH\n \nOPEN\nMENU\n"Immediately evacuate the fort"\n\nWhile one commander’s star would fall, another’s rose to prominence as American Colonel Winfield Scott mounted a brilliant amphibious operation against Fort George at the mouth of the Niagara River.\n\n\nAn 1817 drawing of the American naval bombardment and landing at Niagara.\n\nThe Battle of Fort George from the Philadelphia Portfolio, 1817 Archives of Ontario Photographic Collection'

In [122]:
collection.add(
    documents=docs,
    ids=id
)


JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [123]:
collection.count()

0

TypeError: 'Collection' object is not subscriptable

In [None]:
client = chromadb.Client()
collection = client.get_or_create_collection("oscars-2023",embedding_function=openai_ef)

docs=df["text"].tolist() 
ids= [str(x) for x in df.index.tolist()]

collection.add(
    documents=docs,
    ids=ids
)


vector=text_embedding("Nominations for music")

results=collection.query(
    
    query_embeddings=vector,
    n_results=15,
    include=["documents"]
)

res = "\n".join(str(item) for item in results['documents'][0])

prompt=f'```{res}```who won the award for the original song'

messages = [
        {"role": "system", "content": "You answer questions about 95th Oscar awards."},
        {"role": "user", "content": prompt}
]
response = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",
    messages=messages,
    temperature=0
)
response_message = response["choices"][0]["message"]["content"]

print(response_message)