<a href="https://colab.research.google.com/github/arun-prakash-j-k/llmops_handson_tredence/blob/main/wandb_exercise.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [44]:
!pip install langchain==0.1.0 openai faiss-gpu tiktoken
!pip install wandb
!pip install pypdf
!pip install transformers sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.met

In [45]:
# Read the OpenAI API key from Colab's secret store (google.colab.userdata)
# and export it as the OPENAI_API_KEY environment variable, so the key itself
# never appears in the notebook.
import os
from google.colab import userdata

os.environ.update({"OPENAI_API_KEY": userdata.get("OPENAI_API_KEY")})

In [46]:
# Set the LANGCHAIN_WANDB_TRACING flag to "true" for this process
# (presumably switches on LangChain's Weights & Biases tracing — must be set
# before any chain is run).
os.environ.update(LANGCHAIN_WANDB_TRACING="true")

In [47]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Read the sample handbook PDF through LangChain's PDF loader.
loader = PyPDFLoader(
    file_path="Handbook of Mechanical Engineering Terms-sample.pdf",
)
documents = loader.load()

# Chunking: split the loaded documents with chunk_size=500 and
# chunk_overlap=50; the resulting chunks are kept in `texts`.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
)
texts = text_splitter.split_documents(documents=documents)

In [48]:
# Print how many chunks were produced, then display the first one
chunk_count = len(texts)
print(chunk_count)
texts[0]

40


Document(page_content='ACCURATE – Without error within tolerances allowed, precise, correct,\nconfirming exactly to standard.\nACHME THREAD – A screw thread having an included angle of 29° and\nlargely used for feed screws on machine tools.\nACUTE ANGLE – An angle which is less than a right angle, 90°.\nADDENDUM – The portion of the tooth of a gear that extends from the\npitch line to the outside.\nALIGN – T o bring two or more components of a unit into correct positions\nwith respect to one another.', metadata={'source': 'Handbook of Mechanical Engineering Terms-sample.pdf', 'page': 0})

In [49]:
# DISABLED — kept for reference only. This was the original OpenAI-embeddings
# path; it is commented out because it hit an OpenAI RateLimitError (see the
# note that follows), and a HuggingFace embedding model is used instead.
# # setup a vector database for the document that we have read
# from langchain.embeddings.openai import OpenAIEmbeddings
# from langchain.vectorstores.faiss import FAISS
# headers = {"x-api-key": os.environ["OPENAI_API_KEY"]}
# embeddings = OpenAIEmbeddings(model="text-embedding-3-small", headers=headers)
# docsearch = FAISS.from_documents(texts, embeddings)

### Getting a RateLimitError with OpenAI, so trying an open-source embedding model instead

In [51]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

# Embedding model: the sentence-transformers all-MiniLM-L6-v2 checkpoint,
# loaded through LangChain's HuggingFace wrapper.
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)

# Build the FAISS vector store from the chunks in `texts`.
docsearch = FAISS.from_documents(
    documents=texts,
    embedding=embedding_model,
)

In [53]:
# Persist the FAISS index to disk: index.faiss and index.pkl are written
# into the /content directory.
docsearch.save_local(folder_path="/content")