In [1]:
import os
import nltk
from transformers import AutoModelForCausalLM, AutoTokenizer
from langchain.vectorstores import Chroma
from langchain.text_splitter import TextSplitter
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import SentenceTransformerEmbeddings
from sentence_transformers import SentenceTransformer, util
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [2]:
# Tokenizer data needed by nltk.sent_tokenize (used in semantic_chunking).
# Older NLTK releases read the 'punkt' package; recent releases (3.9+) read
# 'punkt_tab' instead, so fetch both to keep the notebook runnable on either.
# nltk.download is a no-op for packages that are already up to date.
for resource in ("punkt", "punkt_tab"):
    nltk.download(resource)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\adity\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Directory holding the source PDFs (relative to the notebook).
data_path = "../data/"

# Keep only PDF files: every name in file_names is later handed to PyPDFLoader,
# so stray entries in the directory must not end up in the list.
# Sort case-insensitively so the processing order is deterministic
# (os.listdir returns entries in arbitrary order).
file_names = sorted(
    (name for name in os.listdir(data_path) if name.lower().endswith(".pdf")),
    key=str.lower,
)
file_names

['c12s05.pdf', 'The Making of Iron & Steel.pdf']

## Step 1: Load the PDFs and split them into semantic chunks

In [4]:
# Sentence-embedding model used for semantic chunking.
# semantic_chunking() looks this up as the global `model` each time it is called.
# NOTE(review): a later cell rebinds `model` to the Llama causal LM, so this cell
# must be re-run before semantic_chunking() is used again after that point.
model = SentenceTransformer('all-MiniLM-L6-v2')

def semantic_chunking(text, similarity_threshold=0.75, encoder=None):
    """Split ``text`` into chunks of consecutive, semantically similar sentences.

    The text is tokenized into sentences with ``nltk.sent_tokenize``. Each
    sentence is compared with the one before it using cosine similarity of
    their embeddings; a new chunk is started whenever that similarity drops
    below ``similarity_threshold``.

    Args:
        text: The raw text to split.
        similarity_threshold: Minimum cosine similarity between two adjacent
            sentences for them to stay in the same chunk.
        encoder: Optional object with an ``encode(sentences,
            convert_to_tensor=True)`` method. Defaults to the module-level
            ``model``.

    Returns:
        A list of chunk strings (sentences joined with a single space).
        Empty or whitespace-only input yields an empty list.
    """
    # Guard before tokenizing: empty input (e.g. a PDF page without any
    # extractable text) would otherwise fail on sentences[0] below.
    if not text or not text.strip():
        return []

    sentences = nltk.sent_tokenize(text)
    if not sentences:
        return []

    if encoder is None:
        encoder = model

    # Encode every sentence once, in a single batch, instead of re-encoding
    # two sentences on every loop iteration. A lone sentence needs no
    # comparison, so encoding is skipped entirely in that case.
    embeddings = None
    if len(sentences) > 1:
        embeddings = encoder.encode(sentences, convert_to_tensor=True)

    chunks, current_chunk = [], [sentences[0]]

    for i in range(1, len(sentences)):
        # The last sentence of the current chunk is always sentences[i - 1].
        similarity = util.cos_sim(embeddings[i - 1], embeddings[i]).item()

        if similarity < similarity_threshold:
            chunks.append(" ".join(current_chunk))
            current_chunk = [sentences[i]]
        else:
            current_chunk.append(sentences[i])

    chunks.append(" ".join(current_chunk))  # Add the final chunk
    return chunks

In [5]:
class SemanticTextSplitter(TextSplitter):
    """Text splitter that delegates to the module-level ``semantic_chunking``.

    Args:
        similarity_threshold: Threshold passed to ``semantic_chunking``; a new
            chunk starts when adjacent sentences are less similar than this.
        **kwargs: Forwarded unchanged to ``TextSplitter.__init__``.
    """

    def __init__(self, similarity_threshold=0.75, **kwargs):
        super().__init__(**kwargs)
        self.similarity_threshold = similarity_threshold
        # split_text() goes through semantic_chunking(), which uses the
        # module-level `model`. Keep the `model` attribute for compatibility,
        # but point it at that shared encoder instead of loading another
        # SentenceTransformer copy for every splitter instance.
        self.model = model

    def split_text(self, text):
        """Return the semantic chunks of ``text`` as a list of strings."""
        return semantic_chunking(text, self.similarity_threshold)

In [6]:
# Build the custom splitter once and reuse it for every file; it holds no
# per-file state, so re-creating it inside the loop only repeats its setup.
semantic_splitter = SemanticTextSplitter(similarity_threshold=0.75)

# Start from an empty list so re-running this cell does not accumulate duplicates.
documents = []
for file in file_names:
    full_file_name = os.path.join(data_path, file)
    loader = PyPDFLoader(full_file_name)
    # Load the PDF and split its text with the semantic splitter.
    documents.extend(loader.load_and_split(text_splitter=semantic_splitter))

print(len(documents))

1152


## Step 2: Embed the chunks and index them in Chroma

In [7]:
# Initialize the embedding model
embedding_model = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

# Generate embeddings for all chunks in one batched call rather than one
# embed_query() call per chunk. The result is still one vector per document,
# in the same order as `documents`.
# NOTE(review): `embeddings` is not referenced by any later cell shown here;
# the vector store below is built from `documents` and `embedding_model`.
embeddings = embedding_model.embed_documents([doc.page_content for doc in documents])

  embedding_model = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")


In [8]:
# Build the Chroma vector store from the chunked documents, using
# embedding_model to compute their embeddings.
chroma_db = Chroma.from_documents(documents=documents, embedding=embedding_model)

In [9]:
# Smoke-test retrieval: fetch the 3 chunks most similar to a sample question.
query = "which furnace is used to produce the iron"
results = chroma_db.similarity_search(query, k=3)
for hit in results:
    # Prints the right-stripped chunk text followed by two blank lines.
    print(hit.page_content.rstrip(), "\n", sep="\n")

12.5.1.2 Iron Production -
Iron is produced in blast furnaces by the reduction of iron bearing materials with a hot gas.


The iron is also used for feed in blast furnaces and BOF's when economics allow.


Production 
of iron in the blast furnace is a thermo chemical process, during which the metal is reduced from 
its oxides by a series of chemical reactions and carburised to reduce its melting temperature.




## Step 3: Generate an answer with Llama 2

In [10]:
# Load the chat LLM and its tokenizer from the Hugging Face hub.
# NOTE(review): this rebinds the global `model`, which until this cell referred
# to the SentenceTransformer that semantic_chunking() reads at call time.
# After this cell runs, semantic_chunking() will receive the causal LM instead;
# consider a distinct name (e.g. `llm`) together with the cell that uses it.
model_name = "meta-llama/Llama-2-7b-chat-hf"  # Replace with your chosen model
tokenizer = AutoTokenizer.from_pretrained(model_name)
# device_map="auto" / torch_dtype="auto" leave placement and dtype to the loader;
# in the recorded run some parameters were offloaded to disk and CPU.
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", torch_dtype="auto")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the disk and cpu.


In [None]:
# Test with a sample query
def generate_response(prompt, max_new_tokens=200, temperature=0.1):
    """Generate a completion for ``prompt`` with the global ``model``/``tokenizer``.

    Args:
        prompt: Text to feed to the language model.
        max_new_tokens: Upper bound on the number of generated tokens. Unlike
            ``max_length``, this does not count the prompt, so a long prompt
            cannot use up the whole generation budget.
        temperature: Sampling temperature passed to ``model.generate``.

    Returns:
        The decoded output sequence (special tokens removed). As before, the
        full sequence is decoded, so the returned text starts with the prompt.
    """
    # Send the inputs to wherever the model's first parameters live instead of
    # a hard-coded "cuda": the model is loaded with device_map="auto", and the
    # notebook must also run on machines without a GPU.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,  # pass attention_mask along with input_ids
        max_new_tokens=max_new_tokens,
        do_sample=True,  # explicit, so that `temperature` is actually applied
        temperature=temperature,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

prompt = "which furnace is used to produce the iron"
response = generate_response(prompt)
print(response)