In [None]:
import os

# Print the TRANSFORMERS_CACHE environment variable (None when it is not set).
transformers_cache = os.getenv('TRANSFORMERS_CACHE')
print(transformers_cache)

from sentence_transformers import SentenceTransformer

# Smoke test: load the multilingual model and encode two sample sentences.
model = SentenceTransformer('sentence-transformers/distiluse-base-multilingual-cased-v1')
sentences = [
    "This is an example sentence",
    "Each sentence is converted",
]
embeddings = model.encode(sentences)
print(embeddings)

In [None]:
# Sanity check of the PyTorch install: prints True when torch reports that
# CUDA (i.e. a usable GPU) is available, False otherwise.
import torch

cuda_ready = torch.cuda.is_available()
print(cuda_ready)

In [None]:
from langchain.embeddings import HuggingFaceEmbeddings

# Identifier of the pre-trained sentence-embedding model to load
model_name = "sentence-transformers/distiluse-base-multilingual-cased-v1"

# Extra keyword arguments forwarded through the HuggingFaceEmbeddings constructor:
# model_kwargs picks the device, encode_kwargs turns embedding normalization off.
model_kwargs = dict(device='cuda')  # or 'cpu'
encode_kwargs = dict(normalize_embeddings=False)

# Embedding object shared by the rest of the notebook
hf = HuggingFaceEmbeddings(
    encode_kwargs=encode_kwargs,
    model_kwargs=model_kwargs,
    model_name=model_name,
)

The code above is a Python script that demonstrates how to use the `HuggingFaceEmbeddings` class from the `langchain.embeddings` module to create embeddings for text data using a pre-trained transformer model. 

First, the script imports the `HuggingFaceEmbeddings` class from the `langchain.embeddings` module. This class is used to create embeddings for text data using pre-trained transformer models from the Hugging Face model hub.

Next, the script defines a `model_name` variable that specifies the name of the pre-trained transformer model to use. In this case, the model is `sentence-transformers/distiluse-base-multilingual-cased-v1`, which is a multilingual sentence embedding model based on the DistilBERT architecture.

The script also defines two dictionaries: `model_kwargs` and `encode_kwargs`. These dictionaries are used to pass additional arguments to the `HuggingFaceEmbeddings` constructor. In this case, `model_kwargs` specifies that the model should be loaded onto the GPU (`'device': 'cuda'`; change it to `'cpu'` on a machine without a CUDA-capable GPU), and `encode_kwargs` specifies that the embeddings should not be normalized.

Finally, the script creates an instance of the `HuggingFaceEmbeddings` class using the `model_name`, `model_kwargs`, and `encode_kwargs` variables. This instance can then be used to turn text into embeddings through its `embed_documents` method (for a list of texts) and its `embed_query` method (for a single query string).

Overall, this code demonstrates how to use the `HuggingFaceEmbeddings` class to create embeddings for text data using a pre-trained transformer model. 



In [None]:
from langchain import ElasticVectorSearch

# Vector store handle used by later cells: it points at the Elasticsearch
# instance on localhost:9201, the "test_index" index, and the `hf` embeddings.
elastic_vector_search = ElasticVectorSearch(
    embedding=hf,
    index_name="test_index",
    elasticsearch_url="http://localhost:9201",
)


In [None]:
import os
from tqdm import tqdm
from langchain.document_loaders import PyPDFLoader

# Define the path to the directory containing the PDF files
pdf_dir = 'S:\\OneDrive\\Documentation\\HumbleBundle\\Security apress'

# Collect every PDF path under pdf_dir up front so that a single progress bar
# tracks actual PDFs (instead of one bar per directory counting every file).
# The extension check is case-insensitive so files named "*.PDF" are not skipped.
pdf_paths = [
    os.path.join(root, file_name)
    for root, _dirs, files in os.walk(pdf_dir)
    for file_name in files
    if file_name.lower().endswith('.pdf')
]

# One entry per PDF file; later cells treat each entry as that file's list of pages.
pdf_docs = [
    PyPDFLoader(pdf_path).load()
    for pdf_path in tqdm(pdf_paths, desc="Loading PDF files", unit="file")
]

# Print the number of loaded PDF documents
print(f"Loaded {len(pdf_docs)} PDF documents")


In [None]:
from pprint import pprint

# Quick inspection of what the PDF loading cell produced.
pprint(f"Loaded {len(pdf_docs)} PDF documents")

# Peek at the first page of the first document. Guarded so that an empty
# result set (or a PDF that produced no pages) does not raise IndexError.
if pdf_docs:
    doc = pdf_docs[0]
    print(f"first doc has {len(doc)} elements")
    if doc:
        firstElement = doc[0]
        print(f"firstElement is of type {type(firstElement)}")
        print(f"firstElement has {firstElement.metadata} medatata")
        print(f"firstElement has {firstElement.page_content} page content")

# Both totals are computed in one independent pass each. The long-page count
# used to be nested inside the page-total loop, which recomputed it once per
# document and left it undefined when pdf_docs was empty.
total_pages = sum(len(pages) for pages in pdf_docs)

# Count the number of pages that have more than 1000 characters
num_long_pages = sum(
    1
    for pages in pdf_docs
    for page in pages
    if len(page.page_content) > 1000
)

print(f"Number of pages with more than 1000 characters: {num_long_pages}")
print(f"Total number of pages: {total_pages}")


In [None]:
from pprint import pprint

from langchain.text_splitter import CharacterTextSplitter

# Split on spaces with chunk_size=1000 and no overlap between chunks.
text_splitter = CharacterTextSplitter(separator=" ", chunk_size=1000, chunk_overlap=0)

# Split every loaded PDF into chunks and add the chunks to the vector store,
# keeping a running total of how many chunks were produced.
number_of_docs = 0
for doc in pdf_docs:
    if not doc:
        # A PDF that yielded no pages has nothing to split or index.
        continue
    docs = text_splitter.split_documents(doc)
    number_of_docs += len(docs)
    # metadata is a dict, so the source path is read by key (attribute access
    # such as `.metadata.source` raises AttributeError).
    print(f"for document {doc[0].metadata['source']} we have {len(docs)} embeddings from a total of {len(doc)} pages")
    if docs:
        # Skip empty batches: there is nothing to embed or store for them.
        elastic_vector_search.add_documents(docs)

pprint(f"total number of embedded documents is {number_of_docs}")

In [None]:
from pprint import pprint

from langchain.text_splitter import CharacterTextSplitter

# Preview how a single page is chunked with a smaller chunk size.
# NOTE: this rebinds the notebook-wide `text_splitter` name to a
# chunk_size=400 splitter; cells run afterwards will pick up this one.
text_splitter = CharacterTextSplitter(separator=" ", chunk_size=400, chunk_overlap=0)
first_doc = pdf_docs[0]

print(f"first doc has {len(first_doc)} pages")

# Sample the page at index 50, falling back to the last page when the
# document is shorter, so that short PDFs do not raise IndexError.
page_index = min(50, len(first_doc) - 1)
page = first_doc[page_index]
print(f"page number {page_index} has {len(page.page_content)} characters")
#pprint(page.page_content)

page_chunks = text_splitter.split_text(page.page_content)
print(f"page has {len(page_chunks)} chunks")
pprint(page_chunks)

# chunks = text_splitter.split_documents(first_doc)
# print(f"document has {len(chunks)} chunks")
