In [None]:
import urllib.request as ur
from bs4 import BeautifulSoup
import textwrap
import nltk
from nltk import tokenize
import joblib

# One-time setup hint: uncomment to fetch NLTK's 'punkt' data on a fresh environment
# (presumably required by tokenize.sent_tokenize in the scraping cell — confirm for
# the installed NLTK version).
# nltk.download('punkt')

# Artifact locations, given as relative paths (resolved against the current working directory).
# INPUT_FILE is written by the scraping cell and read back by the document loader.
INPUT_FILE: str = '../data/trekking.txt'
# NOTE(review): the two paths below are not referenced by any cell visible in this notebook.
EMBEDDINGS_PATH: str = '../data/embeddings.joblib'
VECTOR_STORE_PATH: str = '../data/vector_store.joblib'

In [None]:
from dotenv import dotenv_values, load_dotenv

# Load the .env file through python-dotenv's default lookup.
load_dotenv()

# Also keep the parsed contents of ../.env in `config`; the LLM cell reads
# config["HUGGINGFACE_TOKEN"] from it.
config = dotenv_values("../.env")

In [None]:
URL = "https://en.wikivoyage.org/wiki/Trekking_in_Nepal"

# Download the raw page. Only the read needs the open connection, so everything
# else happens after the `with` block has closed it.
with ur.urlopen(URL) as response:
    html = response.read()

# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed (and warns), so the extracted text could differ
# from one machine to another.
soup = BeautifulSoup(html, "html.parser")

# Drop <script> and <style> elements so their contents do not leak into the text.
for tag in soup(["script", "style"]):
    tag.decompose()

# Plain text of the remaining document.
text = soup.get_text()

# Strip leading/trailing space from every line, then wrap each line at 110 columns.
lines = [textwrap.fill(line.strip(), 110) for line in text.splitlines()]

# Break multi-headlines (separated by double spaces) into one phrase each.
chunks = (phrase.strip() for line in lines for phrase in line.split("  "))

# Drop blank phrases and re-join into a single text.
text = '\n'.join(chunk for chunk in chunks if chunk)

# Split into sentences (requires the NLTK sentence tokenizer data to be installed).
collection = tokenize.sent_tokenize(text)

# One sentence per paragraph: sentences are separated by a blank line in INPUT_FILE.
with open(INPUT_FILE, 'w', encoding='utf-8') as f:
    for sentence in collection:
        f.write(f"{sentence}\n\n")

In [None]:
# Document Loader
from langchain.document_loaders import TextLoader

# Read the scraped text file (INPUT_FILE) back in through LangChain's TextLoader.
loader = TextLoader(file_path=INPUT_FILE, encoding='utf-8')
documents = loader.load()

# Show what was loaded as the cell output.
documents

In [None]:
# Text Splitter
from langchain.text_splitter import CharacterTextSplitter

# Split the loaded documents into chunks, configured with a chunk size of 1000
# and no overlap between consecutive chunks.
text_splitter = CharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=0,
)
docs = text_splitter.split_documents(documents)

# Number of chunks produced.
len(docs)

In [None]:
# Embeddings
from langchain.embeddings import HuggingFaceEmbeddings

# Constructed with library defaults (no model name is passed here); this object is
# handed to FAISS.from_documents in the vector-store cell.
embeddings = HuggingFaceEmbeddings()

In [None]:
# Vectorstore: https://python.langchain.com/en/latest/modules/indexes/vectorstores.html
from langchain.vectorstores import FAISS

# Build the FAISS vector store from the split chunks (`docs`) using `embeddings`.
db = FAISS.from_documents(documents=docs, embedding=embeddings)

In [None]:
from langchain.chains.question_answering import load_qa_chain
from langchain import HuggingFaceHub

In [None]:
# LLM served through the Hugging Face Hub; the API token is read from the
# `config` mapping loaded from ../.env.
llm = HuggingFaceHub(
    repo_id="google/flan-t5-xxl",
    model_kwargs={"temperature": 0.5, "max_length": 1024},
    huggingfacehub_api_token=config["HUGGINGFACE_TOKEN"],
)

# Alternative model, kept for reference:
# llm = HuggingFaceHub(repo_id="EleutherAI/gpt-neox-20b", model_kwargs={"temperature": 0.5, "max_length": 512}, huggingfacehub_api_token=config["HUGGINGFACE_TOKEN"])

In [None]:
# Question-answering chain over the LLM above. chain_type="stuff": per the LangChain
# docs this places all input documents into a single prompt (confirm for the
# installed version).
chain = load_qa_chain(llm=llm, chain_type="stuff")

In [None]:
query = "What is the best season to trek in Nepal?"
# Keep the search hits under their own name: `docs` holds the split chunks the
# FAISS index is built from, and rebinding it here would make re-running the
# index cell build from these few hits instead.
relevant_docs = db.similarity_search(query)
chain.run(input_documents=relevant_docs, question=query)

In [None]:
query = "Can i get teahouse accomodation in the treks?"
# Keep the search hits under their own name: `docs` holds the split chunks the
# FAISS index is built from, and rebinding it here would make re-running the
# index cell build from these few hits instead.
relevant_docs = db.similarity_search(query)
chain.run(input_documents=relevant_docs, question=query)

In [None]:
query = "What do I need for trekking in Nepal?"
# Keep the search hits under their own name: `docs` holds the split chunks the
# FAISS index is built from, and rebinding it here would make re-running the
# index cell build from these few hits instead.
relevant_docs = db.similarity_search(query)
chain.run(input_documents=relevant_docs, question=query)

In [None]:
query = "What permits are required?"
# Keep the search hits under their own name: `docs` holds the split chunks the
# FAISS index is built from, and rebinding it here would make re-running the
# index cell build from these few hits instead.
relevant_docs = db.similarity_search(query)
chain.run(input_documents=relevant_docs, question=query)

In [None]:
query = "Where to buy TIMS?"
# Keep the search hits under their own name: `docs` holds the split chunks the
# FAISS index is built from, and rebinding it here would make re-running the
# index cell build from these few hits instead.
relevant_docs = db.similarity_search(query)
chain.run(input_documents=relevant_docs, question=query)

In [None]:
query = "Summarize this document"
# Keep the search hits under their own name: `docs` holds the split chunks the
# FAISS index is built from, and rebinding it here would make re-running the
# index cell build from these few hits instead.
relevant_docs = db.similarity_search(query)
chain.run(input_documents=relevant_docs, question=query)

In [None]:
query = "What are the risks of Trekking in Nepal?"
# Keep the search hits under their own name: `docs` holds the split chunks the
# FAISS index is built from, and rebinding it here would make re-running the
# index cell build from these few hits instead.
relevant_docs = db.similarity_search(query)
chain.run(input_documents=relevant_docs, question=query)