# Embeddings cache

Demonstrates the use of a local filesystem cache for embeddings.

## Setup environment

In [1]:
from dotenv import load_dotenv
import os

import warnings

# Keep the notebook output free of library warning noise
warnings.filterwarnings("ignore")

# Location of the .env file that holds the API keys
# CHANGE THIS TO YOUR ENV FILE LOCATION
env_file = 'C:\\Users\\raj\\.jupyter\\.env'
load_dotenv(env_file)

True

## 1. Create the embeddings model instance

In [2]:
# Import the Cohere embeddings model class
# https://docs.cohere.com/reference/embed
from langchain_community.embeddings import CohereEmbeddings

# Create the embeddings model

# Identifier of the embedding model handed to CohereEmbeddings below
model_name = "embed-english-light-v3.0"

# NOTE(review): presumably picks up the Cohere API key from the environment
# populated by load_dotenv in the setup cell — confirm
embeddings = CohereEmbeddings(model=model_name)

# Sample sentences that the later cells pass to embed_documents
corpus = [
    "A man is eating food.",
    "A man is eating a piece of bread.",
    "The girl is carrying a baby.",
    "A man is riding a horse.",
    "A woman is playing violin.",
    "Two men pushed carts through the woods.",
    "A man is riding a white horse on an enclosed ground.",
    "A monkey is playing drums.",
    "A cheetah is running behind its prey.",
]


## 2. Create the cache-backed embeddings

https://python.langchain.com/docs/modules/data_connection/text_embedding/caching_embeddings

https://api.python.langchain.com/en/stable/embeddings/langchain.embeddings.cache.CacheBackedEmbeddings.html#langchain.embeddings.cache.CacheBackedEmbeddings

In [3]:
from langchain.embeddings import CacheBackedEmbeddings

from langchain.storage import LocalFileStore

In [7]:
# CHANGE THIS TO DESIRED LOCATION OF CACHE ON FILE SYSTEM
cache_dir = "c:/temp/exercise-2/embeddings"

# File-system byte store rooted at cache_dir; the cached embeddings live here
store = LocalFileStore(cache_dir)

# Wrap the underlying embeddings model so that document embeddings are looked
# up in `store` before the model is called.
# The namespace is the model name (the variable, not a string literal), so the
# cache entries are prefixed per model and different models do not collide.
cached_embedder = CacheBackedEmbeddings.from_bytes_store(
    underlying_embeddings=embeddings,
    document_embedding_cache=store,
    namespace=model_name,
)

In [8]:
# Embed every sentence of the corpus through the cache-backed embedder
corpus_embeddings = cached_embedder.embed_documents(corpus)

## 3. Check storage

In [9]:
# CHANGE THIS TO THE LOCATION OF THE CACHE ON YOUR FILE SYSTEM
# (the same directory that was assigned to cache_dir)

# Linux/Mac
# !ls <Path to cache>

# Windows
!dir c:\temp\exercise-2\embeddings

 Volume in drive C is Windows
 Volume Serial Number is 64D0-5440

 Directory of c:\temp\exercise-2\embeddings

02/20/2024  09:15 AM    <DIR>          .
02/20/2024  09:15 AM    <DIR>          ..
02/20/2024  09:12 AM             5,070 embed-english-light-v3.021811493-25ee-5fa1-84ab-fbfe08cf0b94
02/20/2024  09:12 AM             5,077 embed-english-light-v3.07833ab0f-a741-506b-b64f-eff2aabca361
02/20/2024  09:12 AM             5,088 embed-english-light-v3.07e75527c-300a-5d9b-b079-0a61f1d669c7
02/20/2024  09:12 AM             5,085 embed-english-light-v3.08200e727-6cc6-5c66-8532-175d1dd29e0c
02/20/2024  09:12 AM             5,096 embed-english-light-v3.0a168f043-b0e3-5936-b8d8-18c6f17a604f
02/20/2024  09:12 AM             5,080 embed-english-light-v3.0b80467e2-c202-52d1-8eac-f06b6b486372
02/20/2024  09:12 AM             5,096 embed-english-light-v3.0dcf068ec-f4be-5d6b-98fb-0ca235c2f086
02/20/2024  09:12 AM             5,063 embed-english-light-v3.0e544bd6e-09cb-5acc-9a59-5a343cac99ce
02/20/

## 4. Subsequent embed_query, embed_documents
1. Run the embeddings generation again; you won't see any additional files in the cache directory.
2. Change the namespace (e.g., prefix it with "X-") and run `embed_documents` again. Check the cache folder on the file system for the new files.

In [None]:
# 1. Retrieved from local cache
# Same embedder and same corpus as the earlier call, so no new cache files
# are expected after this cell runs
corpus_embeddings = cached_embedder.embed_documents(corpus)

In [None]:
# 2. Retrieved from model as the namespace is different
# Build a second cache-backed embedder over the same store but with a prefixed
# namespace. On the first run none of its keys exist in the cache yet, so the
# embeddings are requested from the underlying model and then stored under the
# new prefix (check the cache folder for the additional files).
# A separate variable is used so that the original `cached_embedder` keeps its
# namespace and the previous cell stays re-runnable.
cached_embedder_x = CacheBackedEmbeddings.from_bytes_store(
    underlying_embeddings=embeddings,
    document_embedding_cache=store,
    namespace="X-" + model_name,
)
corpus_embeddings = cached_embedder_x.embed_documents(corpus)