In [12]:
import os
import nltk
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from langchain.vectorstores import Chroma
from langchain.text_splitter import TextSplitter
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import SentenceTransformerEmbeddings
from sentence_transformers import SentenceTransformer, util
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings

In [2]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\adity\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
data_path = "../data/"
file_names = os.listdir(data_path)
file_names

['c12s05.pdf', 'The Making of Iron & Steel.pdf']

## Step 1:

In [4]:
# Initialize the model
model = SentenceTransformer('all-MiniLM-L6-v2')

def semantic_chunking(text, similarity_threshold=0.75):
    sentences = nltk.sent_tokenize(text)
    chunks, current_chunk = [], [sentences[0]]
    
    for i in range(1, len(sentences)):
        similarity = util.cos_sim(
            model.encode(current_chunk[-1], convert_to_tensor=True),
            model.encode(sentences[i], convert_to_tensor=True)
        ).item()
        
        if similarity < similarity_threshold:
            chunks.append(" ".join(current_chunk))
            current_chunk = [sentences[i]]
        else:
            current_chunk.append(sentences[i])
    
    chunks.append(" ".join(current_chunk))  # Add the final chunk
    return chunks

In [5]:
class SemanticTextSplitter(TextSplitter):
    def __init__(self, similarity_threshold=0.75):
        super().__init__()
        self.similarity_threshold = similarity_threshold
        self.model = SentenceTransformer('all-MiniLM-L6-v2')
    
    def split_text(self, text):
        return semantic_chunking(text, self.similarity_threshold)

In [6]:
documents = []
for file in file_names:
    full_file_name = os.path.join(data_path, file)
    loader = PyPDFLoader(full_file_name)
    # Use the custom splitter
    semantic_splitter = SemanticTextSplitter(similarity_threshold=0.75)
    documents += loader.load_and_split(text_splitter=semantic_splitter)


print(len(documents))

1152


## Step 2:

In [7]:
# Initialize the embedding model
embedding_model = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
# Generate embeddings for your documents
embeddings = [embedding_model.embed_query(doc.page_content) for doc in documents]

  embedding_model = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")


In [8]:
# Create a Chroma DB instance and store embeddings
chroma_db = Chroma.from_documents(documents, embedding_model)

In [9]:
# Test retrieval
query = "which furnace is used to produce the iron"
results = chroma_db.similarity_search(query, k=3)  # Retrieve top 3 relevant chunks
for result in results:
    print(result.page_content.rstrip())
    print("\n")

12.5.1.2 Iron Production -
Iron is produced in blast furnaces by the reduction of iron bearing materials with a hot gas.


The iron is also used for feed in blast furnaces and BOF's when economics allow.


Production 
of iron in the blast furnace is a thermo chemical process, during which the metal is reduced from 
its oxides by a series of chemical reactions and carburised to reduce its melting temperature.




## Step : 3

In [10]:
# Load the model and tokenizer
# model_name = "meta-llama/Llama-2-7b-chat-hf"  # Replace with your chosen model
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", torch_dtype="auto")

# Load model directly
# from transformers import AutoTokenizer, AutoModelForCausalLM

# tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
# model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the disk and cpu.


In [None]:
# # Test with a sample query
# def generate_response(prompt):
#     inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
#     outputs = model.generate(inputs.input_ids, max_length=200, temperature=0.1)
#     response = tokenizer.decode(outputs[0], skip_special_tokens=True)
#     return response

# prompt =  "which furnace is used to produce the iron"
# response = generate_response(prompt)
# print(response)

## Step 4:

In [15]:
# Choose a model
model_name = "meta-llama/Llama-2-7b-chat-hf"  # Replace with your choice
# quantization_config = BitsAndBytesConfig(load_in_8bit=True, llm_int8_threshold=200.0)
# Load tokenizer and quantized model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name, 
    device_map="auto", 
    torch_dtype="auto", 
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the disk and cpu.


In [16]:
# Use a pre-trained embedding model
embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# Store embeddings in ChromaDB
vectorstore = Chroma.from_documents(documents, embedding_model)

In [17]:
def retrieve_relevant_chunks(query):
    return vectorstore.similarity_search(query, k=3)  # Retrieve top 5 chunks


In [18]:
def generate_answer(query):
    # Retrieve relevant chunks
    docs = retrieve_relevant_chunks(query)
    context = "\n\n".join([doc.page_content for doc in docs])
    
    # Combine context with user query
    prompt = f"Answer the question based on the following context:\n{context}\n\nQuestion: {query}\nAnswer:"
    
    # Generate response
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    outputs = model.generate(inputs.input_ids, max_length=200, temperature=0.7)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Example query
query = "which furnace is used to produce the iron?"
answer = generate_answer(query)
print(answer)


Answer the question based on the following context:
12.5.1.2 Iron Production -
Iron is produced in blast furnaces by the reduction of iron bearing materials with a hot gas.

12.5.1.2 Iron Production -
Iron is produced in blast furnaces by the reduction of iron bearing materials with a hot gas.

The iron is also used for feed in blast furnaces and BOF's when economics allow.

Question: which furnace is used to produce the iron?
Answer: Blast Furnace.

Explanation: The process of producing iron in a blast furnace involves the reduction of iron bearing materials with a hot gas, resulting in the production of pig iron. The pig iron is then used as feed in blast furnaces and Basic Oxygen Furnaces (BOFs) when economics allow. Therefore, the answer is


## Evaluation

In [32]:
ground_truth = {
    "which furnace is used to produce the iron?": ["Iron is produced in blast furnaces by the reduction of iron bearing materials with a hot gas.",
                                                  "The iron is also used for feed in blast furnaces and BOF's when economics allow."]}

In [33]:
def precision_at_k(retrieved_docs, relevant_docs, k):
    retrieved_k = retrieved_docs[:k]
    relevant_and_retrieved = set(retrieved_k) & set(relevant_docs)
    return len(relevant_and_retrieved) / k

def recall_at_k(retrieved_docs, relevant_docs, k):
    retrieved_k = retrieved_docs[:k]
    relevant_and_retrieved = set(retrieved_k) & set(relevant_docs)
    return len(relevant_and_retrieved) / len(relevant_docs)

# Example
retrieved_docs = ["12.5.1.2 Iron Production - Iron is produced in blast furnaces by the reduction of iron bearing materials with a hot gas.", 
                  "12.5.1.2 Iron Production -Iron is produced in blast furnaces by the reduction of iron bearing materials with a hot gas.", "The iron is also used for feed in blast furnaces and BOF's when economics allow."]
relevant_docs = ground_truth["which furnace is used to produce the iron?"]
k = 3

precision = precision_at_k(retrieved_docs, relevant_docs, k)
recall = recall_at_k(retrieved_docs, relevant_docs, k)
print(f"Precision@{k}: {precision}")
print(f"Recall@{k}: {recall}")


Precision@3: 0.3333333333333333
Recall@3: 0.5


In [34]:
import numpy as np

def ndcg_at_k(retrieved_docs, relevant_docs, k):
    retrieved_k = retrieved_docs[:k]
    dcg = sum([1 / np.log2(idx + 2) if doc in relevant_docs else 0 for idx, doc in enumerate(retrieved_k)])
    ideal_dcg = sum([1 / np.log2(idx + 2) for idx in range(min(len(relevant_docs), k))])
    return dcg / ideal_dcg if ideal_dcg > 0 else 0

# Example
ndcg = ndcg_at_k(retrieved_docs, relevant_docs, k)
print(f"nDCG@{k}: {ndcg}")


nDCG@3: 0.3065735963827292


In [None]:
from rouge_score import rouge_scorer
from sacrebleu.metrics import BLEU
from bert_score import score as bert_score

# Ground truth and generated response
reference = "Iron is produced in blast furnaces by the reduction of iron bearing materials with a hot gas."
generated = "Blast Furnace."

# ROUGE
rouge = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
rouge_scores = rouge.score(reference, generated)
print(f"ROUGE Scores: {rouge_scores}")

# BLEU
bleu = BLEU()
bleu_score = bleu.corpus_score([generated], [[reference]])
print(f"BLEU Score: {bleu_score.score}")

# BERTScore
P, R, F1 = bert_score([generated], [reference], lang="en")
print(f"BERTScore: P={P.mean().item()}, R={R.mean().item()}, F1={F1.mean().item()}")


ROUGE Scores: {'rouge1': Score(precision=1.0, recall=0.11764705882352941, fmeasure=0.21052631578947367), 'rouge2': Score(precision=1.0, recall=0.0625, fmeasure=0.11764705882352941), 'rougeL': Score(precision=1.0, recall=0.11764705882352941, fmeasure=0.21052631578947367)}
BLEU Score: 0.0
