<div style="display: flex; justify-content: center; align-items: center;">
    <a href="https://colab.research.google.com/github/acemi159/AIwithCemAkgul/blob/main/AskQuestionToYourFiles.ipynb"><img src="https://colab.research.google.com/assets/colab-badge.svg" width="351" height="60" alt="Open In Colab"/></a>
</div>


# **Let me ask questions to my files!! 😎**

In [None]:
# @title **STEP 1:** Setup 1 - Only run once. Once executed, restart the session (Runtime -> Restart Session) { display-mode: "form" }
!pip install -q -q -q langchain-openai langchain faiss-cpu pymupdf ipywidgets
#openai tiktoken Pyarrow


---



In [None]:
# @title **STEP 2:** Setup 2 - Only run once { display-mode: "form" }
import os

import ipywidgets as widgets
from google.colab import files
from langchain.callbacks import get_openai_callback
from langchain.document_loaders import PyMuPDFLoader, TextLoader, Docx2txtLoader
from langchain.indexes import VectorstoreIndexCreator
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings

# Shared notebook state, (re)initialised every time this setup cell runs.
# Running total of the cost reported by the OpenAI callback, shown with a "$" prefix.
TOTAL_COST: float = 0.0
# Names of the files uploaded so far; the upload cell appends to this list.
loaded_files: list = []

# Displaying cost
def UpdateCostDisplay():
  """Write the current value of the global TOTAL_COST into the `cost_display` widget.

  Reads the module-level names `TOTAL_COST` and `cost_display`; both must
  exist before this function is called.
  """
  cost_text = f"Total cost : $ {TOTAL_COST}"
  cost_display.value = cost_text

# Prepare File Loaders to later pass them to Vectorstore
def PrepareFileLoaders(files_list):
  """Build one document loader per supported file.

  The file type is decided by the file name's extension, compared
  case-insensitively so that e.g. "Report.PDF" is treated like "report.pdf".

  Args:
    files_list: Paths (or names) of the files to load.

  Returns:
    A list of loader objects, in the order of the supported entries of
    `files_list`. Unsupported files are reported with a message and skipped,
    so the list may be empty.

  Raises:
    FileNotFoundError: If `files_list` is empty (or None).
  """
  if not files_list:
    raise FileNotFoundError("No files uploaded! Please go back to the step to upload files.")

  document_loaders = []
  for filepath in files_list:
    # Match on a lower-cased copy only; the loader still gets the original path.
    lowered = filepath.lower()
    if lowered.endswith('.pdf'):
      document_loaders.append(PyMuPDFLoader(filepath))
    elif lowered.endswith(('.docx', '.doc')):
      # NOTE(review): legacy binary .doc files are routed to Docx2txtLoader as
      # before; confirm that this loader can actually parse them.
      document_loaders.append(Docx2txtLoader(filepath))
    elif lowered.endswith('.txt'):
      document_loaders.append(TextLoader(filepath))
    else:
      print(f"App only supports pdf/doc/docx/txt files...\nSkipping file : {filepath}")

  print(f"# of Files= {len(document_loaders)} ---- files : {document_loaders}")
  return document_loaders

# Prepare the Vectorstore with uploaded files
def CreateVectorStore(document_loaders:list =[], openai_api_key:str="", chunk_size=2000, chunk_overlap=50):
  """Split, embed and index the documents behind `document_loaders`.

  Args:
    document_loaders: Loader objects to read the documents from. The default
      list is never mutated here, so sharing it between calls is harmless.
    openai_api_key: API key handed to the OpenAI embeddings client.
    chunk_size: Chunk size passed to the text splitter.
    chunk_overlap: Chunk overlap passed to the text splitter.

  Returns:
    The index object returned by `VectorstoreIndexCreator.from_loaders`.

  Raises:
    ValueError: If `document_loaders` is empty.

  Side effects:
    Adds the cost reported by the OpenAI callback to the global TOTAL_COST
    and refreshes the cost display.
  """
  # Without this declaration the `+=` below makes TOTAL_COST a local name and
  # raises UnboundLocalError.
  global TOTAL_COST

  if not document_loaders:
    raise ValueError("No files uploaded! Please upload at least one file (pdf/doc/docx/txt) to proceed...")

  with get_openai_callback() as cb:
    index = VectorstoreIndexCreator(
        # Split the documents into chunks
        text_splitter=RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap),
        # Embedding engine -> turns text chunks into vectors
        embedding=OpenAIEmbeddings(openai_api_key=openai_api_key),
        # Vector store class that keeps the embedded chunks
        vectorstore_cls=FAISS,
    ).from_loaders(document_loaders)

    # NOTE(review): the OpenAI callback may only account for chat/completion
    # calls, in which case the embedding cost reported here is 0 - confirm.
    print(f"Calculating embeddings for file cost : {cb.total_cost}")
    TOTAL_COST += cb.total_cost
    UpdateCostDisplay()

  return index

# Prepare the language model according to the user's selection
def PrepareLLM(gpt_model):
  """Create the chat model for the selected GPT model name.

  Args:
    gpt_model: Model name chosen by the user. The short family name
      "gpt-3.5" is translated to the model id "gpt-3.5-turbo"; any other
      name is passed through unchanged.

  Returns:
    A ChatOpenAI instance configured with the resolved model name.
  """
  # "gpt-3.5" names a model family; the API expects a concrete model id.
  model_aliases = {"gpt-3.5": "gpt-3.5-turbo"}
  return ChatOpenAI(model=model_aliases.get(gpt_model, gpt_model))

# Generate response to user question/prompt
def GenerateResponse(index, gpt_model, prompt, chain_type):
  """Ask `prompt` against the indexed documents and return the answer.

  Args:
    index: Index object exposing `query(llm=..., question=..., chain_type=...)`.
    gpt_model: Model name forwarded to PrepareLLM.
    prompt: The user's question.
    chain_type: Chain type forwarded to `index.query`.

  Returns:
    Whatever `index.query` returns (presumably the answer text).

  Side effects:
    Adds the cost reported by the OpenAI callback to the global TOTAL_COST
    and refreshes the cost display.
  """
  # Without this declaration the `+=` below makes TOTAL_COST a local name and
  # raises UnboundLocalError after the (paid) query has already run.
  global TOTAL_COST

  llm = PrepareLLM(gpt_model)

  with get_openai_callback() as cb:
    response = index.query(llm=llm, question=prompt, chain_type=chain_type)

    print(f"Answering to this question cost : {cb.total_cost}")
    TOTAL_COST += cb.total_cost
    UpdateCostDisplay()

  return response

# Disabled (read-only) text box that shows the running total cost.
cost_display = widgets.Text(
    layout=widgets.Layout(border='solid 3px'),
    disabled=True,
    description='',
    placeholder='',
    value=f"Total Cost : $ {TOTAL_COST}",
)

# Render the widget under this cell, then write the current total into it.
display(cost_display)
UpdateCostDisplay()




---



In [None]:
# @title **STEP 3:** Upload your files { display-mode: "form" }
# Ask the user for files; the keys of the returned mapping are used as file paths later on.
uploaded = files.upload()

# Remember the newly uploaded names alongside those from earlier runs of this cell.
loaded_files.extend(uploaded.keys())


---



You can find (or create) your OpenAI API key here:
  - https://platform.openai.com/api-keys

In [None]:
# @title **STEP 4:** Process Your Files (This will cost. Cost depends on how large your file(s) is) { display-mode: "form" }
# @markdown **REQUIRED**
OPENAI_API_KEY = "" # @param {type:"string"}
# @markdown **OPTIONAL**
# @markdown (Change if you want to experiment)
CHUNK_SIZE = 2000 # @param {type:"integer"}
CHUNK_OVERLAP = 50 # @param {type:"integer"}
# NOTE: the key typed into the form is stored in this cell's source - clear it before sharing the notebook.

if not OPENAI_API_KEY:
  raise ValueError("Please put your OpenAI API key and rerun this cell...")

# PrepareLLM builds ChatOpenAI without an explicit key, so the key is also exported
# through the OPENAI_API_KEY environment variable for the chat model to pick up.
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

# Create file loaders
file_loaders = PrepareFileLoaders(files_list=loaded_files)
# Create Vectorstore with file embeddings and file loaders
vectorstore_index = CreateVectorStore(file_loaders, OPENAI_API_KEY, CHUNK_SIZE, CHUNK_OVERLAP)




---



  * **GPT_MODEL**: Select the model. gpt-3.5 will be cheaper but less accurate than gpt-4. Start with gpt-3.5 and, if you are not satisfied with the result, try gpt-4.
  * **QUESTION**: Write the question you want to ask about your files here. The more precisely you phrase your question, the higher the chance that you will get the answer you are seeking.

In [None]:
# @title **STEP 5:** Ask Your Question { display-mode: "form" }
# @markdown ##**REQUIRED**
GPT_MODEL = "gpt-3.5" # @param ["gpt-4", "gpt-3.5"]
LOGIC = "Summarize all texts and use summary to answer the question" # @param ["Summarize all texts and use summary to answer the question", "Update the answer to your question by each text"]
QUESTION = "" # @param {type:"string"}

# Translate the human-readable LOGIC choice into the chain type handed to the query.
Logic_parser = dict([
    ("Summarize all texts and use summary to answer the question", "map_reduce"),
    ("Update the answer to your question by each text", "refine"),
])
CHAIN_TYPE = Logic_parser[LOGIC]

# Without a question there is nothing to ask; otherwise query the index and show the answer.
if not QUESTION:
  print("Please enter a question and run this cell again...")
else:
  response = GenerateResponse(
      index=vectorstore_index,
      gpt_model=GPT_MODEL,
      prompt=QUESTION,
      chain_type=CHAIN_TYPE,
  )
  print(response)



