In [42]:
! pip install langchain tiktoken openai pypdf faiss-cpu huggingface_hub python-docx selenium unstructured -q

In [43]:
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from docx import Document
from langchain.document_loaders import SeleniumURLLoader

from langchain.callbacks import get_openai_callback
from langchain import HuggingFaceHub

In [44]:
# Load the PDF file. English or French only

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1024,
    chunk_overlap=64,
    separators=['\n\n', '\n', '(?=>\. )', ' ', ''])

pdf_doc_path='/content/drive/MyDrive/Data S/data/1594383154-world-war-ii.pdf'
if pdf_doc_path:
    loader = PyPDFLoader(pdf_doc_path)
    pages = loader.load_and_split()

txt_path=''
if txt_path:
    loader=TextLoader(txt_path)
    pages=loader.load()

docx_path=''
if docx_path:
  word_doc=Document(docx_path)
  passages=[p.text for p in word_doc.paragraphs]
  content = "\n".join(passages)
  file_txt_path="./file.txt"
  with open(file_txt_path, "w") as f:
    f.write(content)
  loader=TextLoader(file_txt_path)
  pages=loader.load()

url=['']
if url[0]:
  loader = SeleniumURLLoader(url)
  pages=loader.load()

texts = text_splitter.split_documents(pages)

In [45]:
import tiktoken

def num_tokens(string: str, encoding_name: str) -> int:
    """Returns the number of tokens in a text string."""
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens

full_text=''
for i in range(len(texts)-1):
  #print(docs[i][0].page_content)
  full_text+=texts[i].page_content

tokens=num_tokens(full_text, 'cl100k_base')
PRICE=0.0001 # 0.0001 USD per 1000 tokens
Token_summary={}
Token_summary['1K/tokens']=tokens/1000
Token_summary['$price 1K/tokens']=PRICE
Token_summary['Total_cost(USD)']=tokens/1000*PRICE
Token_summary

{'1K/tokens': 2.648,
 '$price 1K/tokens': 0.0001,
 'Total_cost(USD)': 0.00026480000000000004}

In [None]:
embeddings = OpenAIEmbeddings(openai_api_key="")
vector_db = FAISS.from_documents(texts, embeddings)
vector_db.save_local("/content/drive/MyDrive/Data S/data", index_name=("WW2"))

In [52]:
llm=HuggingFaceHub(huggingfacehub_api_token="",
                   repo_id="tiiuae/falcon-7b-instruct", #
                    model_kwargs={"temperature":0.5 ,
                                 "max_length":512,
                                 "max_new_tokens":200
                                 })

In [49]:
from langchain.chains import RetrievalQA
from langchain.schema import retriever
from langchain.memory import ConversationBufferMemory
from langchain.chains.question_answering import load_qa_chain


vector_db=FAISS.load_local(folder_path="/content/drive/MyDrive/Data S/data",
                          index_name='Python_data_sheet',
                          embeddings=embeddings)
memory = ConversationBufferMemory(memory_key="chat_history")
qa_chain=load_qa_chain(llm=llm, chain_type="stuff")
qa = RetrievalQA(combine_documents_chain=qa_chain,
                                retriever=vector_db.as_retriever(),
                                memory=memory,
                              )


In [53]:
queary='How started WW2?'
results=qa({"query": queary})

In [54]:
results

{'query': 'How started WW2?',
 'chat_history': 'Human: When WWII was started??\nAI: \nWorld War II started in 1939.\nThe Axis Powers:\nGermany, Italy, and Japan\nThe Allies:\nUnited States, United Kingdom, and France\nThe War Ends\nOn 25 May 1945, Germany surrendered unconditionally to the Allied forces.\nThe war was over.\nThe End of World War II\nIn the aftermath of World War II, Germany was divided into four parts, and the country was\nreunified on 23 October 1990.\nThe Allies also had to reconstruct Germany and pay the country a large amount of reparations.\nIn 1948, West Germany was reunited, and the country was given a West German constitution.\nIn 1945, Japan was also divided into four parts, and the country was also reconstructed.\nThe country was given a constitution in 1947, and the emperor was abolished.\nThe Japanese Emperor, Hirohito, was abolished in 1947, and the',
 'result': "\nWW2 began with Germany's invasion of Poland in 1939.\nThe war saw many significant events, in