## **1. Install necessary libraries**

In [None]:
# Install (or upgrade) the third-party packages for this notebook into the kernel's
# environment; --quiet suppresses most of pip's output.
# Only sentence_transformers is version-pinned (2.2.2); every other package resolves
# to whatever release is newest at run time, so results may differ between runs.
# NOTE(review): both `pdfminer` and `pdfminer.six` are requested - presumably they
# provide the same `pdfminer` import package; confirm that both are really needed.
%pip install --upgrade --quiet promptlayer langchain-googledrive streamlit fake_useragent tabulate pdf2image pdfminer pdfminer.six PyPDF2 pikepdf html2text unstructured-inference textstat pytesseract openai unstructured dill langchain-openai langchain cohere chromadb tiktoken InstructorEmbedding sentence_transformers==2.2.2 faiss-cpu

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.4/8.4 MB[0m [31m26.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.2/4.2 MB[0m [31m66.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m53.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m27.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m108.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.5/56.5 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.2/60.2 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# Install (or upgrade) the Google API client and Google auth helper packages
# (unpinned: the newest releases are used at run time).
%pip install --upgrade --quiet  google-api-python-client google-auth-httplib2 google-auth-oauthlib

## **2. Load necessary classes**

In [None]:
import os
import json
import pickle
import requests
import tiktoken
import matplotlib.pyplot as plt
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from textstat import flesch_reading_ease
from collections import Counter

import promptlayer
from langchain_community.embeddings import HuggingFaceInstructEmbeddings
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_community.chat_models import PromptLayerChatOpenAI
from langchain.schema import SystemMessage, HumanMessage
from langchain_community.vectorstores import Chroma, FAISS
from langchain.chains.query_constructor.base import AttributeInfo
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.chains import RetrievalQA
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import GoogleDriveLoader

## **3. Load JSON files from Google Drive**

In [None]:
from google.colab import auth, drive

# Mount Google Drive under /content/drive and authenticate the Colab user
# before reading any files from it.
drive.mount('/content/drive')
auth.authenticate_user()

# NOTE: despite its name, `credentials_path` receives the parsed JSON content
# of credentials.json, not a filesystem path.
with open("/content/drive/MyDrive/Colab Notebooks/necessary_items/credentials.json", "rb") as credentials_file:
  credentials_path = json.loads(credentials_file.read())

# Parsed content of letters.json; each entry is later handed to
# GoogleDriveLoader as a single file id.
with open("/content/drive/MyDrive/Colab Notebooks/necessary_items/letters.json", "rb") as letters_file:
  letters_array = json.loads(letters_file.read())


### **4.1 Load all PDF letters from Google Drive and split them with the Recursive Character Text Splitter**

In [None]:
# Load every letter from Google Drive and cut it into small overlapping chunks.
# The resulting chunks are collected in `all_documents` for the embedding step.

# Splitter settings, named so they can be tuned in one place.
CHUNK_SIZE = 200     # chunk_size passed to the splitter
CHUNK_OVERLAP = 100  # chunk_overlap passed to the splitter (half of CHUNK_SIZE)

all_documents = []

# One loader per entry of letters_array; each entry is passed as a single Drive file id.
loaders = [GoogleDriveLoader(file_ids=[letters], recursive=False) for letters in letters_array]

print(loaders)

# The splitter configuration never changes between files, so build it once
# instead of re-creating it on every loop iteration.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)

for loader in loaders:
  raw_documents = loader.load()
  documents = text_splitter.split_documents(raw_documents)
  all_documents.extend(documents)

## **5. Embedding and Store in FAISS Database**

In [None]:
# Embedding model used both to build a new index and to query an existing one.
embedding = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# NOTE: despite its name this is a boolean flag (True when ./vector_store exists),
# not a path. The name is kept so any later cell that references it still works.
vector_store_path = os.path.exists(os.path.join(os.getcwd(), 'vector_store'))

if vector_store_path:
  # Reuse the index saved by a previous run of this cell.
  # SECURITY: FAISS.load_local unpickles data from disk, and current
  # langchain-community releases refuse to do so unless the caller opts in.
  # Opting in is acceptable here only because the folder is written by this
  # notebook itself (see save_local below); never point this at untrusted files.
  # NOTE(review): a saved index is reused as-is even if the letters changed;
  # delete the vector_store folder to force a rebuild.
  vectorstore = FAISS.load_local(
      "vector_store",
      embedding,
      allow_dangerous_deserialization=True,
  )
else:
  print("Vector Database not found!")

  # Fail with a clear message instead of an obscure error from inside FAISS
  # when the loading/splitting step produced nothing to index.
  if not all_documents:
    raise ValueError("No document chunks to index: all_documents is empty.")

  # Embed every chunk, build the index, and persist it for the next run.
  vectorstore = FAISS.from_documents(all_documents, embedding)
  vectorstore.save_local("vector_store")


## **6. Use a chat model to answer questions from the PDFs**

In [None]:
from google.colab import userdata

# Get the "Environment Variable" from the Colab Secrets
open_api_key = userdata.get('OPENAI_API_KEY')
organization_id = userdata.get('ORGANIZATION_ID')  # read here but not used in this cell
prompt_layer_api_key = userdata.get("PROMPTLAYER_API_KEY")
promptlayer.api_key = prompt_layer_api_key
model_name = "gpt-3.5-turbo-0125"

# Chat model wrapper that also reports requests to PromptLayer under the "langchain" tag.
llm = PromptLayerChatOpenAI(
      pl_tags=["langchain"],
      return_pl_id=True,
      api_key=open_api_key,
      model=model_name
)

# Retrieval QA chain: fetch chunks from the FAISS index and place them all into
# a single prompt ("stuff"); the source chunks are returned with the answer.
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=vectorstore.as_retriever(),
    chain_type="stuff",
    return_source_documents=True
)

query="tell me about the deal in 1997"

# Use .invoke(): calling the chain object directly (qa_chain({...})) is deprecated.
result = qa_chain.invoke({"query": query})
print(result)