New experiment after FAISS failed to index the embeddings

In [25]:
import os
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
from transformers import T5Tokenizer, T5ForConditionalGeneration
import nltk
from nltk.corpus import wordnet
from nltk import pos_tag, word_tokenize


In [26]:
# Fetch the NLTK resources used later in this notebook: WordNet (plus the
# Open Multilingual WordNet data) for synonym lookup, the perceptron tagger
# for pos_tag, and Punkt for word_tokenize. Without the tagger, pos_tag raises
# LookupError on a machine that has never downloaded it.
# NOTE(review): recent NLTK releases look for 'punkt_tab' and
# 'averaged_perceptron_tagger_eng' instead — confirm against the installed version.
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')


[nltk_data] Downloading package wordnet to /home/abhay/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/abhay/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to /home/abhay/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [27]:

def load_single_file(file_path):
    """Return the full contents of the file at ``file_path``, decoded as UTF-8.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The file's text as a single string.
    """
    with open(file_path, encoding='utf-8', mode='r') as source:
        contents = source.read()
    return contents


def split_text_into_qa_pairs(text):
    """Split raw text into Q&A chunks separated by blank lines.

    Args:
        text: Full contents of the Q&A file as one string.

    Returns:
        A list of chunk strings in file order. Each chunk is stripped of
        surrounding whitespace, and chunks that are empty after stripping are
        dropped, so a trailing blank line or a run of several blank lines
        never produces an empty entry to embed.
    """
    stripped_chunks = (chunk.strip() for chunk in text.split("\n\n"))
    return [chunk for chunk in stripped_chunks if chunk]


# Location of the Q&A text file. The original local path stays as the default,
# but it can be overridden with the FACULTY_QA_FILE environment variable so
# the notebook runs on another machine without editing this cell.
file_path = os.environ.get(
    "FACULTY_QA_FILE",
    "/home/abhay/my_projects/ps2_chatbot/faculty_info/faculty_info_iet/iet_qa/all_faculty.txt",
)
text_data = load_single_file(file_path)
qa_pairs = split_text_into_qa_pairs(text_data)

print(f"Extracted {len(qa_pairs)} Q&A pairs from the text.")


Extracted 105 Q&A pairs from the text.


In [28]:
# Sentence-embedding model; the same instance encodes the Q&A chunks and,
# later, the search query.
model = SentenceTransformer('all-MiniLM-L6-v2')


def generate_qa_embeddings(qa_pairs, model):
    """Encode the Q&A chunks with ``model``.

    Args:
        qa_pairs: Sequence of text chunks to embed.
        model: Object exposing an ``encode`` method that accepts the sequence.

    Returns:
        Whatever ``model.encode(qa_pairs)`` returns, unchanged.
    """
    return model.encode(qa_pairs)


# Embed every Q&A chunk once; this result is what gets added to the FAISS index.
qa_embeddings = generate_qa_embeddings(qa_pairs, model)


def create_faiss_index(embeddings):
    """Build an exact (brute-force) L2 FAISS index over the given vectors.

    Args:
        embeddings: 2-D array-like with one embedding vector per row.

    Returns:
        A ``faiss.IndexFlatL2`` containing every row of ``embeddings``, added
        in order so that FAISS ids match row positions.
    """
    # FAISS's Python bindings only accept C-contiguous float32 matrices.
    # Coerce up front so float64 arrays, non-contiguous views or nested lists
    # are indexed instead of failing inside index.add().
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    dimension = vectors.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(vectors)
    return index


# Build the FAISS index over the Q&A embeddings computed above.
index = create_faiss_index(qa_embeddings)
print("FAISS index created.")


FAISS index created.


In [44]:

def expand_query_with_synonyms_limited(query):
    """Append WordNet synonyms of the query's nouns and verbs to the query.

    The query is tokenised and POS-tagged. For every token whose tag starts
    with ``N`` or ``V``, the lemma names of all its WordNet synsets are
    collected (underscores replaced by spaces).

    Args:
        query: The user's search string.

    Returns:
        A single space-joined string that starts with the original query,
        followed by each distinct synonym in the order it was first seen.
        The order is deterministic, so the same query always gives the same
        expanded string (and therefore the same embedding) across kernel
        restarts.
    """
    # A dict is used as an insertion-ordered set: joining a real set would
    # depend on string hashing, which changes between Python processes.
    terms = {query: None}

    for word, tag in pos_tag(word_tokenize(query)):
        if not tag.startswith(('N', 'V')):
            continue
        for syn in wordnet.synsets(word):
            for lemma in syn.lemmas():
                terms.setdefault(lemma.name().replace('_', ' '), None)

    return " ".join(terms)


# Example query; expand it with WordNet synonyms before searching the index.
query = "rabia kamra"
expanded_query = expand_query_with_synonyms_limited(query)
print(f"Expanded Query: {expanded_query}")


Expanded Query: rabia kamra


In [46]:

def search_faiss_index(query, index, qa_pairs, model, top_k=5):
    """Return the stored Q&A chunks nearest to ``query``.

    Args:
        query: Search string to embed.
        index: Index object exposing ``search(vectors, k)`` and returning
            ``(distances, ids)`` arrays, as FAISS indexes do.
        qa_pairs: The chunks that were indexed, in the same order, so that an
            id returned by the index is a position in this sequence.
        model: Embedding model exposing ``encode``.
        top_k: Maximum number of chunks to return.

    Returns:
        Up to ``top_k`` chunks, nearest first. Fewer are returned when the
        index holds fewer than ``top_k`` vectors.
    """
    # FAISS expects a C-contiguous float32 matrix of query vectors.
    query_embedding = np.ascontiguousarray(model.encode([query]), dtype=np.float32)
    _, indices = index.search(query_embedding, top_k)
    # FAISS pads missing neighbours with id -1; indexing qa_pairs with -1
    # would silently return the last chunk, so drop those slots.
    return [qa_pairs[int(i)] for i in indices[0] if i >= 0]


# Retrieve the five stored Q&A chunks closest to the expanded query.
relevant_qas = search_faiss_index(expanded_query, index, qa_pairs, model, top_k=5)


# Print each hit followed by a 40-dash separator line.
print(f"Top relevant Q&A pairs:\n{'-'*40}")
for qa in relevant_qas:
    print(f"{qa}\n{'-'*40}")


Top relevant Q&A pairs:
----------------------------------------
Q: Who is Dr. Rabia Kamra at JKLU?
A: Dr. Rabia Kamra is an Assistant Professor in the Department of Science and Liberal Arts at JKLU. She holds a PhD from IIT Delhi and specializes in designing parallel algorithms for large linear systems.
----------------------------------------
Mathematics - Dr. Rabia Kamra
----------------------------------------
Q: What are Dr. Rabia Kamra's research interests?
A: Numerical Analysis and Parallel Computing.
----------------------------------------
Q: What are Dr. Rabia Kamra's teaching interests?
A: Numerical Methods, Calculus, Ordinary Differential Equations, and Matrix Computations.
----------------------------------------
Q: Who is Dr. Surbhi Chhabra at JKLU?
A: Dr. Surbhi Chhabra is an Assistant Professor of Electronics and Communication Engineering at JKLU. She holds a PhD in Hardware Security from The LNM Institute of Information Technology, Jaipur.
-----------------------------

In [47]:
# Load the pretrained 't5-large' tokenizer and conditional-generation model
# used below to turn the retrieved context into a response.
t5_tokenizer = T5Tokenizer.from_pretrained('t5-large')
t5_model = T5ForConditionalGeneration.from_pretrained('t5-large')


def generate_summary_with_t5(context, query, t5_tokenizer, t5_model,
                             max_length=150, min_length=50):
    """Generate a response to ``query`` from ``context`` with a T5 model.

    The prompt is ``"question: <query> context: <context>"``, truncated to
    512 tokens, and decoded with 4-beam search.

    Args:
        context: Text the response should be drawn from.
        query: The user's question.
        t5_tokenizer: Tokenizer exposing ``encode`` and ``decode``.
        t5_model: Model exposing ``generate``.
        max_length: Upper bound on generated tokens (default 150, as before).
        min_length: Lower bound on generated tokens (default 50, as before).
            Generation cannot stop before this many tokens, so lower it when
            short answers are expected.

    Returns:
        The decoded text of the first generated sequence, with special tokens
        removed.
    """
    input_text = f"question: {query} context: {context}"
    inputs = t5_tokenizer.encode(
        input_text, return_tensors='pt', max_length=512, truncation=True
    )

    summary_ids = t5_model.generate(
        inputs,
        max_length=max_length,
        min_length=min_length,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )
    return t5_tokenizer.decode(summary_ids[0], skip_special_tokens=True)


# Concatenate the retrieved Q&A chunks into one context string for T5.
context = " ".join(relevant_qas)


# The prompt is built from the original `query`, not `expanded_query`.
response = generate_summary_with_t5(context, query, t5_tokenizer, t5_model)
print(f"Generated Response:\n{response}")


Generated Response:
Dr. Rabia Kamra is an Assistant Professor in the Department of Science and Liberal Arts at JKLU. She holds a PhD from IIT Delhi and specializes in designing parallel algorithms for large linear systems. Mathematical Methods
