In [1]:
import os
import pandas as pd
import torch
import numpy as np
import faiss
import pickle
from transformers import AutoModel, AutoTokenizer, AutoModelForCausalLM
from torch.nn.functional import normalize
from groq import Groq
# Set environment variable to avoid OpenMP runtime error
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

  from .autonotebook import tqdm as notebook_tqdm


In [42]:
from semantic_text_splitter import TextSplitter

def load_and_split_texts(folder_path, max_characters=2000):
    splitter = TextSplitter(max_characters)
    data = []
    for filename in os.listdir(folder_path):
        if filename.endswith(".txt"):
            file_path = os.path.join(folder_path, filename)
            with open(file_path, 'r', encoding='latin') as f:
                text = f.read()
            chunks = splitter.chunks(text)
            for chunk in chunks:
                data.append({'filename': filename, 'chunk': chunk})
    return pd.DataFrame(data)


In [43]:
def create_and_save_embeddings(df, model_id, embeddings_file, index_file,cache_dir):
    tokenizer = AutoTokenizer.from_pretrained(model_id,cache_dir=cache_dir)
    model = AutoModel.from_pretrained(model_id, add_pooling_layer=False,cache_dir=cache_dir)
    model.eval()

    chunk_tokens = tokenizer(df['chunk'].tolist(), padding=True, truncation=True, return_tensors='pt', max_length=512)
    with torch.inference_mode():
        chunk_embeddings = model(**chunk_tokens)[0][:, 0]
    chunk_embeddings = normalize(chunk_embeddings).numpy().astype(np.float32)
    
    # Save embeddings
    with open(embeddings_file, 'wb') as f:
        pickle.dump(chunk_embeddings, f)
    
    # Create and save FAISS index
    d = chunk_embeddings.shape[1]
    index = faiss.IndexFlatL2(d)
    index.add(chunk_embeddings)
    faiss.write_index(index, index_file)

    return chunk_embeddings, index


In [44]:
def load_embeddings_and_index(embeddings_file, index_file):
    with open(embeddings_file, 'rb') as f:
        chunk_embeddings = pickle.load(f)
    index = faiss.read_index(index_file)
    return chunk_embeddings, index


In [45]:
def retrieve_relevant_chunks(query, model_id, index, df, cache_dir, k=5):
    tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir=cache_dir)
    model = AutoModel.from_pretrained(model_id, add_pooling_layer=False,cache_dir=cache_dir)
    model.eval()
    
    query_tokens = tokenizer([query], padding=True, truncation=True, return_tensors='pt', max_length=512)
    with torch.inference_mode():
        query_embedding = model(**query_tokens)[0][:, 0]
    query_embedding = normalize(query_embedding).numpy().astype(np.float32)
    
    D, I = index.search(query_embedding, k)
    relevant_chunks = [df.iloc[i] for i in I[0]]
    return relevant_chunks


In [46]:
# def expand_query(query, client):
#     response = client.chat.completions.create(
#         messages=[
#             {
#                 "role": "user",
#                 "content": f"try to create a small, precise and fictional answer for the query that you feel would be closest to structure with the real answer disregarding facts, answer would then be used to perform similarity search with original document to fined real answer./n/n Query: {query}"
#             }
#         ],
#         model="llama3-8b-8192",
#     )
#     return response.choices[0].message.content

In [47]:
def generate_response(query, relevant_chunks, client):
    context = "\n".join([f"From {chunk['filename']}: {chunk['chunk']}" for chunk in relevant_chunks])
    response = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": f"On the basis of these isolated contexts {context}\n\nAnswer the query: {query} \n\nRemember these are from the policy documents of the organization, consolidate the information and generate a single answer, source does not need to be mentioned and limit the response to 100 words and be concise and precise not creative",
            }
        ],
        model="llama-3.1-8b-instant",
    )
    return response.choices[0].message.content

In [48]:
folder_path = r'D:\Python\worj\rag\foldeer'
model_id = "Snowflake/snowflake-arctic-embed-m-v1.5"
device = "cuda" if torch.cuda.is_available() else "cpu"
embeddings_file = 'embeddings.pkl'
index_file = 'faiss_index.bin'
cache_dir = r"D:\Python\worj\rag\cache_model"

# Load and split texts
df = load_and_split_texts(folder_path)
df.to_csv('new.csv')

In [49]:
# Create and save embeddings and index
create_and_save_embeddings(df, model_id, embeddings_file, index_file, cache_dir)

(array([[ 0.00792541,  0.02521495,  0.05078359, ..., -0.02041577,
         -0.04319262,  0.03923333],
        [ 0.03838221,  0.05746122,  0.04390368, ...,  0.01192289,
         -0.04358421,  0.00834466],
        [ 0.07281397,  0.04514536,  0.025077  , ...,  0.01029745,
         -0.02703535,  0.00187795],
        ...,
        [ 0.00658551,  0.06338469,  0.06647727, ..., -0.00087577,
         -0.06746212,  0.00215646],
        [ 0.07232577,  0.07231921,  0.01632305, ...,  0.03392524,
         -0.0396976 ,  0.00459635],
        [ 0.06511381,  0.02253478,  0.09496327, ...,  0.03786783,
         -0.02904679, -0.00059178]], dtype=float32),
 <faiss.swigfaiss_avx2.IndexFlatL2; proxy of <Swig Object of type 'faiss::IndexFlatL2 *' at 0x0000022367103C00> >)

In [53]:
query = "when can the full hotel allowance be paid to thee employee"

In [54]:

# Load embeddings and index
chunk_embeddings, index = load_embeddings_and_index(embeddings_file, index_file)

client = Groq(api_key="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")
# expanded_query = expand_query(query, client)
relevant_chunks = retrieve_relevant_chunks(query, model_id, index, df, cache_dir=cache_dir)
context = "\n".join([f"From {chunk['filename']}: {chunk['chunk']}" for chunk in relevant_chunks])
context


'From Travel Reimbursment Policy.txt: Day trip the employee will be paid Daily Allowance as per the following table\n\n\nFull hotel allowance can be avail by the employee only when night stay involves outside of employee Head Quarter.\nHotel expenses indicated are maximum limits including room rent and service charges applicable excluding Goods & Service Taxes. Itâ\x80\x99s is the responsibility of the employee to collect the bills and\n\nsubmit with the travel expenses. In case any lodging bill lost then the employee has to collect duplicate lodging bill to claim. Hotel bills will be reimbursed only if supported by proper bills.\nLocal Conveyance (distribution staff )\n\n\nPetrol bills have to be submitted for claiming local expenses, the employee can take share taxi, cab from uber / Ola etc . The employees are encouraged to use metro/ local train of intra-city travel where ever available\nAll employees using two-wheeler are encouraged to buy helmets and wear helmets while driving the

In [55]:
response = generate_response(query, relevant_chunks, client)
print(response)

The full hotel allowance can be availed by the employee when the night stay involves being outside of their Head Quarter.
