## OpenAI vs. Local Embeddings
Performance Comparison
- OpenAI's Embedding Model
- Instructor embeddings (`hkunlp/instructor-xl`, run locally via Hugging Face)


In [None]:
#!pip -q install langchain openai tiktoken chromadb pypdf sentence_transformers InstructorEmbedding faiss-cpu

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m846.5/846.5 kB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m71.9/71.9 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m23.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m71.3/71.3 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m248.8/248.8 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.6/17.6 MB[0m [31m58.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m66.9 MB/s[0m e

In [None]:
import os
from getpass import getpass

# Never hardcode API keys in a notebook: anything that is shared or committed
# leaks the credential. The key that used to be written in this cell must be
# treated as compromised and revoked.
# Prefer an OPENAI_API_KEY that is already set in the environment; otherwise
# prompt for it without echoing the value into the notebook.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

In [None]:
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import OpenAI
from langchain.chains import RetrievalQA
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import DirectoryLoader

In [None]:
# InstructorEmbedding 
from InstructorEmbedding import INSTRUCTOR
from langchain.embeddings import HuggingFaceInstructEmbeddings

  from tqdm.autonotebook import trange


In [None]:
# OpenAI Embedding
from langchain.embeddings import OpenAIEmbeddings

### Load Multiple files from Directory

In [None]:
# Connect Google Drive to the Colab runtime so the PDFs stored there can be read.
# force_remount=True is passed so the mount is redone when this cell is re-run.
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)
# Base Drive folder that later cells build their paths from.
root_dir = "/content/gdrive/My Drive"

Mounted at /content/gdrive


In [None]:
# Load every PDF matched by the glob in the Drive folder `tesla_earnings/`,
# using PyPDFLoader for each file.
# (Single-file alternative: loader = TextLoader('single_text_file.txt'))
loader = DirectoryLoader(f'{root_dir}/tesla_earnings/', glob="./*.pdf", loader_cls=PyPDFLoader)
# `documents` is the list of loaded Document objects consumed by the splitter below.
documents = loader.load()

In [None]:
# loader = TextLoader('single_text_file.txt')
#loader = DirectoryLoader(f'C:/Users/User/Documents/tesla_earnings/', glob="./*.pdf", loader_cls=PyPDFLoader)
#documents = loader.load()

In [None]:
# Preview only the first few loaded pages; displaying the whole list floods
# the cell output and bloats the saved notebook.
documents[:3]

[Document(page_content='Q1 2023 Update\n1', metadata={'source': '/content/gdrive/My Drive/tesla_earnings/tesla 2023 q1 earnings.pdf', 'page': 0}),
 Document(page_content='Highlights 03\nFinancial Summary 04\nOperational Summary 06\nVehicle Capacity 07\nCore Technology 08\nOther Highlights 09\nOutlook 10\nPhotos & Charts 11\nKey Metrics 19\nFinancial Statements 22\nAdditional Information 28', metadata={'source': '/content/gdrive/My Drive/tesla_earnings/tesla 2023 q1 earnings.pdf', 'page': 1}),
 Document(page_content='S U M M A R Y H I G H L I G H T S  \n(1) Excludes SBC (stock -based compensation).\n(2) Free cash flow = operating cash flow less capex.\n(3) Includes cash, cash equivalents and investments.Profitability 11.4% operating margin in Q1\n$2.7B GAAP operating income in Q1\n$2.5B GAAP net income in Q1\n$2.9B non -GAAP net income1in Q1In the current macroeconomic environment, we see this year as a unique \nopportunity for Tesla. As many carmakers are working through challenges wit

### Divide and Conquer

In [None]:
# Split the loaded pages into overlapping chunks before embedding them.
# chunk_size / chunk_overlap are passed straight to the splitter; presumably
# both are measured in characters (the splitter's default) -- TODO confirm.
text_splitter = RecursiveCharacterTextSplitter(
                                               chunk_size=1000, 
                                               chunk_overlap=200)

# `texts` holds the resulting chunks and is what both vector stores are built from.
texts = text_splitter.split_documents(documents)

In [None]:
# Inspect the first chunk to check its content and metadata after splitting.
texts[0]

Document(page_content='Q1 2023 Update\n1', metadata={'source': '/content/gdrive/My Drive/tesla_earnings/tesla 2023 q1 earnings.pdf', 'page': 0})

In [None]:
# Number of chunks produced by the splitter.
len(texts)

182

### Get Embeddings for OUR Documents

In [None]:
!pip install faiss-cpu

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import pickle
import faiss
from langchain.vectorstores import FAISS

In [None]:
def store_embeddings(docs, embeddings, sotre_name, path):
    """Embed ``docs`` into a FAISS vector store and pickle it to disk.

    The store is written to ``<path>/faiss_<sotre_name>.pkl``.

    Args:
        docs: Documents passed to ``FAISS.from_documents``.
        embeddings: Embedding model passed to ``FAISS.from_documents``.
        sotre_name: Name used in the pickle's file name. The misspelled
            parameter name is kept so existing keyword callers keep working.
        path: Directory to write the pickle into; created when it is missing.
    """
    import os  # local import so this cell does not depend on an earlier one

    # Opening the output file raises FileNotFoundError when the directory does
    # not exist yet, so create it up front (before the costly embedding step).
    if path:
        os.makedirs(path, exist_ok=True)

    vectorStore = FAISS.from_documents(docs, embeddings)

    with open(f"{path}/faiss_{sotre_name}.pkl", "wb") as f:
        pickle.dump(vectorStore, f)

In [None]:
def load_embeddings(sotre_name, path):
    """Load a pickled vector store from ``<path>/faiss_<sotre_name>.pkl``.

    Args:
        sotre_name: Name that was used in the pickle's file name.
        path: Directory that contains the pickle file.

    Returns:
        The object unpickled from the file.

    Note:
        ``pickle.load`` can execute arbitrary code; only load files that
        were written by a trusted source.
    """
    pickle_file = f"{path}/faiss_{sotre_name}.pkl"
    with open(pickle_file, "rb") as handle:
        return pickle.load(handle)

### HF Instructor Embeddings

In [None]:
from langchain.embeddings import HuggingFaceInstructEmbeddings
import torch

# Load the Instructor model on the GPU when one is available and fall back to
# the CPU otherwise; hard-coding "cuda" makes this cell fail on a CPU-only runtime.
instructor_embeddings = HuggingFaceInstructEmbeddings(
    model_name="hkunlp/instructor-xl",
    model_kwargs={"device": "cuda" if torch.cuda.is_available() else "cpu"},
)

Downloading (…)7f436/.gitattributes:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/270 [00:00<?, ?B/s]

Downloading (…)/2_Dense/config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/3.15M [00:00<?, ?B/s]

Downloading (…)0daf57f436/README.md:   0%|          | 0.00/66.3k [00:00<?, ?B/s]

Downloading (…)af57f436/config.json:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)7f436/tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.40k [00:00<?, ?B/s]

Downloading (…)f57f436/modules.json:   0%|          | 0.00/461 [00:00<?, ?B/s]

load INSTRUCTOR_Transformer
max_seq_length  512


In [None]:
# Drive folder for pickled FAISS stores; it is the `path` argument in the
# commented-out store_embeddings / load_embeddings calls further down.
Embedding_store_path = f"{root_dir}/Embedding_store"

In [None]:
# Build a FAISS vector store from the chunks using the Instructor embeddings.
db_instructEmbedd = FAISS.from_documents(texts, instructor_embeddings)

In [None]:
# Retriever over the Instructor-backed store, configured with k=3 in search_kwargs.
retriever = db_instructEmbedd.as_retriever(search_kwargs={"k": 3})

In [None]:
# Show which search type the retriever is using.
retriever.search_type

'similarity'

In [None]:
# Show the search keyword arguments the retriever was configured with.
retriever.search_kwargs

{'k': 3}

In [None]:
# Sample retrieval: fetch the chunks the Instructor-backed retriever returns for this question.
docs = retriever.get_relevant_documents("Who are the authors of tesla earnings 2022 q3 report?")

In [None]:
# Display the retrieved chunks for manual inspection.
docs

[Document(page_content='-4%-2%0%2%4%6%8%10%12%14%16%18%\nQ1-2019\nQ2-2019\nQ3-2019\nQ4-2019\nQ1-2020\nQ2-2020\nQ3-2020\nQ4-2020\nQ1-2021\nQ2-2021\nQ3-2021\nQ4-2021\nQ1-2022\nQ2-2022\nQ3-2022\nQ4-2022\nTesla Autos Industry S&P 500-20%-10%0%10%20%30%40%50%60%70%80%90%\nQ1-2019\nQ2-2019\nQ3-2019\nQ4-2019\nQ1-2020\nQ2-2020\nQ3-2020\nQ4-2020\nQ1-2021\nQ2-2021\nQ3-2021\nQ4-2021\nQ1-2022\nQ2-2022\nQ3-2022\nQ4-2022\nTesla Autos Industry S&P 500\nKEYM E T R I C S T R A I L I N G 1 2M O N T H S ( T T M )\n(Unaudited)\n24YoY Revenue Growth Operating Margin\nSource: OEM financial disclosures, Bloomberg\nAutos Industry includes: Tesla, BMW, Mercedes -Benz, Ford, GM, Honda, Hyundai, Nissan, Toyota and VW. Stellantis is excluded give n limited historical disclosures due to the recent merger between FCA and PSA.\nAutos Industry operating margin is calculated by dividing the sum of USD equivalent operating profits for the entire industry bythe USD equivalent revenues for respective periods.', metadata=

In [None]:
# Question-answering chain over the Instructor-backed retriever: an OpenAI LLM
# with chain_type="stuff", returning the source documents with each answer.
# NOTE(review): this chain uses temperature=0.4 while qa_chain_openai uses 0.2;
# the differing temperatures confound the embedding comparison -- consider aligning them.
qa_chain_instrucEmbed = RetrievalQA.from_chain_type(llm=OpenAI(temperature=0.4, ), 
                                  chain_type="stuff", 
                                  retriever=retriever, 
                                  return_source_documents=True)

### OpenAI's Embeddings

In [None]:
from langchain.embeddings import OpenAIEmbeddings

In [None]:
# OpenAI embedding model created with default settings; presumably it
# authenticates with the OPENAI_API_KEY set earlier in the notebook -- TODO confirm.
embeddings = OpenAIEmbeddings()

In [None]:
# store_embeddings(texts, 
#                  embeddings, 
#                  sotre_name='openAIEmbeddings', 
#                  path=Embedding_store_path)

In [None]:
# db_openAIEmbedd = load_embeddings(sotre_name='openAIEmbeddings', 
#                                     path=Embedding_store_path)

In [None]:
# Build a second FAISS vector store from the same chunks, this time with the
# OpenAI embeddings, and expose it as a retriever with the same k=3 setting.
db_openAIEmbedd = FAISS.from_documents(texts, embeddings)
retriever_openai = db_openAIEmbedd.as_retriever(search_kwargs={"k": 3})

In [None]:
# Question-answering chain over the OpenAI-embeddings retriever: an OpenAI LLM
# with chain_type="stuff", returning the source documents with each answer.
# NOTE(review): temperature here is 0.2 while qa_chain_instrucEmbed uses 0.4;
# the differing temperatures confound the embedding comparison -- consider aligning them.
qa_chain_openai = RetrievalQA.from_chain_type(llm=OpenAI(temperature=0.2, ), 
                                  chain_type="stuff", 
                                  retriever=retriever_openai, 
                                  return_source_documents=True)

### Testing both MODELS

In [None]:
## Cite sources

import textwrap

def wrap_text_preserve_newlines(text, width=110):
    """Wrap ``text`` to ``width`` columns while keeping its own line breaks.

    The text is split on newline characters, each piece is wrapped on its own
    with ``textwrap.fill``, and the wrapped pieces are joined back together
    with newlines.

    Args:
        text: The string to wrap.
        width: Maximum line width passed to ``textwrap.fill``.

    Returns:
        The wrapped text as a single string.
    """
    return '\n'.join(
        textwrap.fill(segment, width=width) for segment in text.split('\n')
    )

def process_llm_response(llm_response):
    """Print a chain response: the wrapped answer, then its source paths.

    Args:
        llm_response: Mapping with a ``'result'`` entry (the answer text) and
            a ``"source_documents"`` entry whose items expose
            ``metadata['source']``.
    """
    answer = llm_response['result']
    print(wrap_text_preserve_newlines(answer))

    print('\nSources:')
    for document in llm_response["source_documents"]:
        origin = document.metadata['source']
        print(origin)

In [None]:
# Question put to the chains so their answers and cited sources can be compared.
query = 'what is the top documents discussing the tesla report for both year 2022 and 2023?'

print('-------------------Instructor Embeddings------------------\n')
# Run the Instructor-backed chain, then print its wrapped answer and source paths.
llm_response = qa_chain_instrucEmbed(query)
process_llm_response(llm_response)

-------------------Instructor Embeddings------------------

 The top document discussing Tesla's financial results for both 2022 and 2023 is the live webcast of the
quarterly financial results conference call.

Sources:
/content/gdrive/My Drive/tesla_earnings/tesla 2022 q2 earnings.pdf
/content/gdrive/My Drive/tesla_earnings/tesla 2023 q1 earnings.pdf
/content/gdrive/My Drive/tesla_earnings/tesla 2022 q4 earnings.pdf


In [None]:
# Same question as in the Instructor cell; note this string has an extra space before '?'.
query = 'what is the top documents discussing the tesla report for both year 2022 and 2023 ?'

print('-------------------OpenAI Embeddings------------------')
# Run the OpenAI-embeddings chain, then print its wrapped answer and source paths.
llm_response = qa_chain_openai(query)
process_llm_response(llm_response)
# Blank lines to separate this output from whatever follows.
print('\n\n\n')

-------------------OpenAI Embeddings------------------
 The top documents discussing the Tesla report for both 2022 and 2023 are the Form 10-Q reports filed with the
SEC on April 25, 2022 and October 24, 2022, and the annual report on Form 10-K filed with the SEC on January
31, 2023.

Sources:
/content/gdrive/My Drive/tesla_earnings/tesla 2022 q2 earnings.pdf
/content/gdrive/My Drive/tesla_earnings/tesla 2022 q4 earnings.pdf
/content/gdrive/My Drive/tesla_earnings/tesla 2023 q1 earnings.pdf






In [None]:
#!pip install flask
import flask
from flask import Flask, request

In [None]:
app = Flask(__name__)


@app.route("/result", methods=["POST", "GET"])
def result():
    """Validate the JSON body of a request to ``/result``.

    Returns:
        ``{"STATUS":"BAD RESPONSE"}`` when the body is missing, is not a JSON
        object, or has fewer than two keys; ``{"STATUS": "OK"}`` otherwise.
    """
    # silent=True yields None instead of raising when the body is absent or is
    # not valid JSON (e.g. a plain GET), so every bad input is rejected below.
    output = request.get_json(silent=True)

    # A JSON array or scalar has no keys, so treat anything but an object as bad.
    if not isinstance(output, dict) or len(output.keys()) < 2:
        return {"STATUS":"BAD RESPONSE"}

    # A Flask view must return a response; falling through with None makes
    # Flask raise an error for every valid request.
    return {"STATUS": "OK"}


if __name__ == '__main__':
    # The debug reloader restarts the process, which does not work inside a
    # notebook kernel; keep debug mode but switch the reloader off.
    app.run(debug=True, use_reloader=False, port=2000)

    

 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:2000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
INFO:werkzeug: * Restarting with stat


In [None]:
app = Flask(__name__)


@app.route("/report", methods=['POST', 'GET'])
def result():
    """Answer the query sent in the request body with both QA chains.

    The raw request body is used as the query text. Each chain's answer and
    sources are printed (as before) and also returned to the caller.

    Returns:
        ``{"STATUS":"BAD RESPONSE"}`` when the query is shorter than 10
        characters; otherwise a JSON object with ``STATUS`` set to ``"OK"``
        and, per chain, the answer text and the list of source paths.
    """
    # Flask's request object has no ``text()`` method; read the raw body as text.
    query = request.get_data(as_text=True)
    if len(query) < 10:
        return {"STATUS":"BAD RESPONSE"}

    print('-------------------Instructor Embeddings------------------\n')
    instructor_response = qa_chain_instrucEmbed(query)
    process_llm_response(instructor_response)

    print('-------------------OpenAI Embeddings------------------')
    openai_response = qa_chain_openai(query)
    process_llm_response(openai_response)

    def _summarize(llm_response):
        # Keep only JSON-serializable parts: the answer text and source paths.
        return {
            "result": llm_response['result'],
            "sources": [doc.metadata['source']
                        for doc in llm_response["source_documents"]],
        }

    # A Flask view must return a response; returning None makes Flask raise an
    # error after the chains have already been run.
    return {
        "STATUS": "OK",
        "instructor_embeddings": _summarize(instructor_response),
        "openai_embeddings": _summarize(openai_response),
    }


if __name__ == '__main__':
    # The debug reloader restarts the process, which does not work inside a
    # notebook kernel; keep debug mode but switch the reloader off.
    app.run(debug=True, use_reloader=False, port=2000)





 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:2000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
INFO:werkzeug: * Restarting with stat
