In [14]:
import os

# Catalogue of pretrained models: https://www.sbert.net/docs/pretrained_models.html
# Optional location for downloaded model weights. When TRANSFORMERS_CACHE is
# not set this is None, and None is passed through to the model constructor.
transformers_cache = os.getenv('TRANSFORMERS_CACHE')
print(transformers_cache)

from sentence_transformers import SentenceTransformer

sentences = [
    "This is an example sentence",
    "Each sentence is converted",
]

# First model: encode the sample sentences and show the raw vectors.
model = SentenceTransformer(
    'sentence-transformers/distiluse-base-multilingual-cased-v1',
    cache_folder=transformers_cache,
)
embeddings = model.encode(sentences)
print(embeddings)

# Second model: same sentences, so the two outputs can be compared.
model2 = SentenceTransformer(
    'sentence-transformers/all-mpnet-base-v2',
    cache_folder=transformers_cache,
)
embeddings2 = model2.encode(sentences)
print(embeddings2)

None
[[-0.03885623  0.01854845 -0.04066142 ...  0.01009198 -0.0166053
  -0.00138947]
 [-0.000595   -0.00924201 -0.05870512 ...  0.01638777  0.0150957
  -0.04368326]]
[[ 0.02250261 -0.07829171 -0.02303074 ... -0.00827926  0.02652688
  -0.00201898]
 [ 0.04170233  0.00109746 -0.01553419 ... -0.02181631 -0.06359364
  -0.00875286]]


In [5]:
from sentence_transformers import SentenceTransformer, util
# Sample sentences to compare against each other.
sentences = [
    "Yesterday I've played with my cat, I had a pleasant evening",
    "I really like going to hike with my dog, it's a lot of fun",
    "Swimming with my dog in the pool, really fun evening",
    "I love elasticsearch capabilities to search for similar sentences",
]

# Encode the same sentences with both models loaded earlier in the notebook.
embeddings = model.encode(sentences)
embeddings2 = model2.encode(sentences)

# Pairwise cosine similarity of every sentence against every sentence, one
# matrix per model.
similarity_matrix = util.cos_sim(embeddings, embeddings)
similarity_matrix2 = util.cos_sim(embeddings2, embeddings2)

# Show both matrices: first model, then second model.
print(similarity_matrix, similarity_matrix2, sep="\n")

tensor([[1.0000, 0.3600, 0.4760, 0.1246],
        [0.3600, 1.0000, 0.5832, 0.1339],
        [0.4760, 0.5832, 1.0000, 0.1140],
        [0.1246, 0.1339, 0.1140, 1.0000]])
tensor([[ 1.0000,  0.4586,  0.6155,  0.0243],
        [ 0.4586,  1.0000,  0.5901,  0.1052],
        [ 0.6155,  0.5901,  1.0000, -0.0042],
        [ 0.0243,  0.1052, -0.0042,  1.0000]])


In [3]:
# Sanity check for the torch installation: reports whether torch can use a
# CUDA device on this machine.
import torch

cuda_available = torch.cuda.is_available()
print(cuda_available)

False


In [9]:
from langchain.embeddings import HuggingFaceEmbeddings

import torch

# Name of the pre-trained sentence-transformers model to load.
model_name = "sentence-transformers/distiluse-base-multilingual-cased-v1"

# Pick the device at runtime instead of hard-coding 'cuda': on a machine where
# torch.cuda.is_available() is False, requesting 'cuda' cannot work, so fall
# back to the CPU there.
model_kwargs = {'device': 'cuda' if torch.cuda.is_available() else 'cpu'}
# Keep the embeddings as produced by the model (no normalization).
encode_kwargs = {'normalize_embeddings': False}

# Embeddings object built from the settings above; later cells pass it to the
# vector store as its `embedding`.
hf = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
)

The code above is a Python script that demonstrates how to use the `HuggingFaceEmbeddings` class from the `langchain.embeddings` module to create embeddings for text data using a pre-trained transformer model. 

First, the script imports the `HuggingFaceEmbeddings` class from the `langchain.embeddings` module. This class is used to create embeddings for text data using pre-trained transformer models from the Hugging Face model hub.

Next, the script defines a `model_name` variable that specifies the name of the pre-trained transformer model to use. In this case, the model is `sentence-transformers/distiluse-base-multilingual-cased-v1`, which is a multilingual sentence embedding model based on the DistilBERT architecture.

The script also defines two dictionaries: `model_kwargs` and `encode_kwargs`. These dictionaries are used to pass additional arguments to the `HuggingFaceEmbeddings` constructor. In this case, `model_kwargs` sets the `device` the model is loaded onto (`'cuda'` for a GPU or `'cpu'`; requesting `'cuda'` on a machine without a usable GPU fails rather than falling back), and `encode_kwargs` specifies that the embeddings should not be normalized.

Finally, the script creates an instance of the `HuggingFaceEmbeddings` class using the `model_name`, `model_kwargs`, and `encode_kwargs` variables. This instance can then be used to turn text into embeddings with its `embed_documents` method (for a list of texts) or its `embed_query` method (for a single text).

Overall, this code demonstrates how to use the `HuggingFaceEmbeddings` class to create embeddings for text data using a pre-trained transformer model. 



In [10]:
from langchain import ElasticVectorSearch

# Vector store handle: points at the Elasticsearch node on localhost:9201,
# uses the index named "test_index", and embeds text with the `hf` object
# created earlier in the notebook.
elastic_vector_search = ElasticVectorSearch(
    embedding=hf,
    index_name="test_index",
    elasticsearch_url="http://localhost:9201",
)




ImportError: Could not import elasticsearch python package. Please install it with `pip install elasticsearch`.

In [5]:
import os
from tqdm import tqdm
from langchain.document_loaders import PyPDFLoader

# Directory containing the PDF files. The PDF_DIR environment variable, when
# set, overrides the default so the notebook can run on another machine
# without editing this cell.
pdf_dir = os.environ.get(
    'PDF_DIR',
    'S:\\OneDrive\\Documentation\\HumbleBundle\\Security apress',
)

# Collect the PDF paths up front (in os.walk order) so that one progress bar
# covers the whole directory tree and counts only PDFs. The extension check is
# case-insensitive so files named "*.PDF" are not silently skipped.
pdf_paths = [
    os.path.join(root, file_name)
    for root, _dirs, file_names in os.walk(pdf_dir)
    for file_name in file_names
    if file_name.lower().endswith('.pdf')
]

# One entry per PDF file, holding whatever PyPDFLoader.load() returned for it
# (presumably one Document per page -- see the inspection cell that follows).
pdf_docs = [
    PyPDFLoader(pdf_path).load()
    for pdf_path in tqdm(pdf_paths, desc="Loading PDF files", unit="file")
]

# Print the number of loaded PDF documents
print(f"Loaded {len(pdf_docs)} PDF documents")


Loading PDF files: 100%|██████████| 18/18 [00:55<00:00,  3.09s/file]

Loaded 18 PDF documents





In [6]:
from pprint import pprint

pprint(f"Loaded {len(pdf_docs)} PDF documents")

# Show what the loader produced for the first file. Guarded so that an empty
# pdf_docs list (or a first file with no pages) does not raise IndexError.
if pdf_docs:
    first_doc = pdf_docs[0]
    print(f"first doc has {len(first_doc)} elements")
    if first_doc:
        first_page = first_doc[0]
        print(f"firstElement is of type {type(first_page)}")
        print(f"firstElement has {first_page.metadata} medatata")
        print(f"firstElement has {first_page.page_content} page content")

# Total number of pages across all loaded files.
total_pages = sum(len(doc) for doc in pdf_docs)

# Number of pages whose text is longer than 1000 characters. This is computed
# in its own single pass: nesting it inside the page-total loop would repeat
# the full count once per document and would leave num_long_pages undefined
# when pdf_docs is empty.
num_long_pages = sum(
    1
    for doc in pdf_docs
    for page in doc
    if len(page.page_content) > 1000
)

print(f"Number of pages with more than 1000 characters: {num_long_pages}")
print(f"Total number of pages: {total_pages}")


'Loaded 18 PDF documents'
first doc has 239 elements
firstElement is of type <class 'langchain.schema.Document'>
firstElement has {'source': 'S:\\OneDrive\\Documentation\\HumbleBundle\\Security apress\\appliedcryptographyinnetandazurekeyvault.pdf', 'page': 0} medatata
firstElement has Applied 
Cryptography in 
.NET and Azure  
Key Vault
A Practical Guide to Encryption in  
.NET and .NET Core
—
Stephen Haunts
Foreword by Troy Hunt page content
Number of pages with more than 1000 characters: 4215
Total number of pages: 6120


In [8]:
from langchain.text_splitter import CharacterTextSplitter
# Split on single spaces into chunks of about 1000 characters with no overlap.
# Chunks can come out longer than chunk_size (the splitter warns when so).
text_splitter = CharacterTextSplitter(separator=" ", chunk_size=1000, chunk_overlap=0)

# NOTE(review): add_documents is called on every run, so re-running this cell
# presumably indexes the same chunks again -- confirm before re-running.
number_of_docs = 0
for doc in pdf_docs:
    docs = text_splitter.split_documents(doc)
    if not docs:
        # No chunks for this file (e.g. no extractable text): there is nothing
        # to embed, so do not call the vector store with an empty batch.
        continue
    number_of_docs += len(docs)
    # The return value is not needed here, so it is not kept.
    elastic_vector_search.add_documents(docs)

pprint(f"total number of embedded documents is {number_of_docs}")

  version_num = client.info()["version"]["number"][0]
  client.indices.create(index=index_name, body={"mappings": mapping})
  bulk(self.client, requests)
  self.client.indices.refresh(index=self.index_name)
  self.client.indices.get(index=self.index_name)
Created a chunk of size 1206, which is longer than the specified 1000


'total number of embedded documents is 12257'


In [None]:
from langchain.text_splitter import CharacterTextSplitter
# Splitter with chunk_size=400, used here to inspect how one page is chunked.
text_splitter = CharacterTextSplitter(separator=" ", chunk_size=400, chunk_overlap=0)
first_doc = pdf_docs[0]

print(f"first doc has {len(first_doc)} pages")

# Page to inspect. Index 50 is used when the document is long enough;
# otherwise the last page is used instead of raising IndexError. The same
# index feeds the message below so the two cannot drift apart.
page_index = min(50, len(first_doc) - 1)
page = first_doc[page_index]
print(f"page number {page_index} has {len(page.page_content)} characters")

# Split just this page's text and show the resulting chunks.
page_chunks = text_splitter.split_text(page.page_content)
print(f"page has {len(page_chunks)} chunks")
pprint(page_chunks)
