In [None]:
import os
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import SKLearnVectorStore
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_community.document_loaders import PyPDFLoader
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch
from langchain.chains import LLMChain
from langchain_community.llms import HuggingFacePipeline
import tqdm as tqdm
import torch
from transformers import AutoModelForCausalLM
import numpy as np

In [None]:
def load_olmo_model(
    model_id="/home/hice1/asharma838/scratch/OLMo-7B-0724-hf/",
    max_new_tokens=256,
    temperature=0.2,
):
    """Build a Hugging Face text-generation pipeline around a local OLMo checkpoint.

    Args:
        model_id: Path (or hub id) passed to both ``from_pretrained`` calls.
            Defaults to the local OLMo-7B checkpoint directory.
        max_new_tokens: Generation length limit forwarded to the pipeline.
        temperature: Temperature forwarded to the pipeline.

    Returns:
        The object returned by ``transformers.pipeline("text-generation", ...)``.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    # device_map="auto" leaves weight placement to the library, so no explicit
    # device variable is needed here.
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=torch.bfloat16,
        device_map="auto",
    )
    # NOTE(review): transformers applies `temperature` only when sampling is
    # enabled (do_sample=True). Unless the checkpoint's generation config turns
    # sampling on, this value may be ignored -- confirm the intended decoding.
    return pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=max_new_tokens,
        temperature=temperature,
    )

In [None]:
# Build the OLMo text-generation pipeline once; it is wrapped for LangChain in a later cell.
olmo = load_olmo_model()

In [None]:
# Directory that load_documents() scans for PDF files.
file_path = "/home/hice1/asharma838/scratch/PDF_Syllabus_Dataset"

# Splitter built through the tiktoken-based constructor: chunk size 250 with
# no overlap between consecutive chunks.
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_overlap=0,
    chunk_size=250,
)
def load_documents(directory=None):
    """Load every PDF found directly inside a directory.

    Files that fail to load are reported through the logger and skipped, so a
    single bad PDF does not abort the whole run.

    Args:
        directory: Folder to scan for files ending in ``.pdf``. When omitted,
            the module-level ``file_path`` is used.

    Returns:
        list: The concatenated results of ``PyPDFLoader(path).load()`` for each
        PDF that loaded successfully, visited in sorted filename order.
    """
    # Local import keeps this cell runnable without touching the import cell.
    import logging

    logger = logging.getLogger(__name__)

    if directory is None:
        directory = file_path

    docs = []
    # Sorted so the document order (and everything built from it) is the same
    # on every run, regardless of filesystem listing order.
    for file in sorted(os.listdir(directory)):
        if not file.endswith(".pdf"):
            continue
        try:
            docs.extend(PyPDFLoader(os.path.join(directory, file)).load())
        except Exception as e:
            # Best-effort loading: record the failure, then move on.
            logger.error(f"Error loading {file}: {e}")
            continue
        logger.info(f"Loaded document: {file}")
    logger.info(f"Total documents loaded: {len(docs)}")
    return docs
# Load the PDF documents, then split them into chunks with the splitter above.
docs = load_documents()
doc_splits = text_splitter.split_documents(docs)


## Prompt template: takes the user question and the retrieved document text,
## and ends with an "Answer:" marker for the model to continue from.
prompt = PromptTemplate(
    template="""You are an assistant for question-answering tasks based on course content of Georgia Tech ECE department.
    Use the following documents to answer the question.
    Use five sentences maximum and keep the answer concise:
    Question: {question}
    Documents: {documents}
    Answer:
    """,
    input_variables=["question", "documents"],
)

#### Embedding model used to build the vector store ####

model_name = "BAAI/bge-base-en"
# Ask the encoder for normalized embedding vectors.
encode_kwargs = {'normalize_embeddings': True}
model_norm = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs={'device': 'cuda'},
    encode_kwargs=encode_kwargs,
)

#### Build the vector store over the chunks and expose it as a retriever ####
vectorstore = SKLearnVectorStore.from_documents(
    documents=doc_splits,
    embedding=model_norm,
)
# The number of results must be passed through `search_kwargs`; a bare `k=3`
# keyword is not forwarded to the similarity search, so the store's default
# top-k would be used instead.
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})

In [None]:
# Wrap the Hugging Face pipeline so it can be used as the LLM of a LangChain chain.
llm = HuggingFacePipeline(pipeline=olmo)

In [None]:
# Chain that combines the prompt template with the wrapped LLM.
rag_chain = LLMChain(llm=llm, prompt=prompt)

In [None]:
class RAGapplication():
    """Glue between a retriever and a generation chain.

    Both collaborators only need an ``invoke`` method: the retriever's takes
    the question and returns objects with a ``page_content`` attribute, and the
    chain's takes a dict with ``"question"`` and ``"documents"`` keys.
    """

    def __init__(self, retriever, rag_chain):
        """Store the retriever and the chain used by :meth:`run`."""
        self.retriever = retriever
        self.rag_chain = rag_chain

    def run(self, question):
        """Retrieve context for ``question`` and return the chain's result.

        The ``page_content`` of each retrieved item is stringified and joined
        with newlines to form the ``"documents"`` input of the chain.
        """
        retrieved = self.retriever.invoke(question)
        context = "\n".join(str(item.page_content) for item in retrieved)
        return self.rag_chain.invoke(
            {"question": str(question), "documents": context}
        )

In [None]:
# Assemble the application from the retriever and the chain built above.
rag_app = RAGapplication(retriever=retriever, rag_chain=rag_chain)

In [None]:
# Try the pipeline on a single example question.
answer = rag_app.run('what is the course content for ECE6254')

In [None]:
# Pull the generated string out of the chain result (stored under the 'text' key).
ans = answer['text']

In [None]:
def extract_answer(text):
    """Return what follows the first ``"Answer:"`` marker in ``text``.

    The returned string is stripped of surrounding whitespace. When the marker
    is absent, the fixed message ``"Answer not found."`` is returned instead.
    """
    _, marker, tail = text.partition("Answer:")
    if not marker:
        # partition() yields an empty separator when the marker never occurs.
        return "Answer not found."
    return tail.strip()
# Keep only the text that follows the first "Answer:" marker in the output.
ans_processed = extract_answer(ans)

In [None]:
# Show the extracted answer (print already ends with a newline by default).
print(ans_processed)

In [None]:
############## EVALUATING METRICS #####################

import pandas as pd
from datasets import Dataset

# Spreadsheet of reference query/response pairs (replace with your file path).
# It gets its own name so that `file_path`, which load_documents() reads for
# the PDF directory, is not overwritten by this cell.
qa_pairs_path = "/home/hice1/asharma838/scratch/Query_Response_Pairs_ConvAI_Project.xlsx"
df = pd.read_excel(qa_pairs_path)

# Use the column names the evaluation loop expects.
df = df.rename(columns={"Query": "question", "Response": "answer"})
# Convert DataFrame to Hugging Face Dataset
dataset = Dataset.from_pandas(df)

# Hold out 10% of the pairs for evaluation. The seed is fixed so the same
# examples land in the test split on every run and scores stay comparable.
train_test_split = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = train_test_split["train"]
test_dataset = train_test_split["test"]

# Column views of the test split: questions and their reference answers.
questions = test_dataset["question"]
answers = test_dataset["answer"]

In [None]:
import evaluate
import nltk
from nltk.translate.meteor_score import meteor_score

# Download the WordNet corpus and other necessary resources
nltk.download('wordnet')
nltk.download('omw-1.4')  # Optional, for multilingual support

# Load the evaluation metrics by name through the `evaluate` library.
# NOTE(review): f1_metric is loaded here but is not referenced by the
# evaluation cell visible in this notebook -- confirm whether it is needed.
em_metric = evaluate.load("exact_match")
f1_metric = evaluate.load("squad_v2")
bleu_metric = evaluate.load("bleu")
rouge_metric = evaluate.load("rouge")

In [None]:
from tqdm import tqdm

# Index-aligned reference answers and model predictions for the test split.
ground_truths = []
predictions = []

# Iterate over each example in the dataset
for example in tqdm(test_dataset):
    question = example["question"]
    true_answer = example["answer"]

    # Generate an answer with the RAG application and keep only the text that
    # follows the "Answer:" marker.
    answer = rag_app.run(question)
    generated_answer = extract_answer(answer['text'])

    # Append to lists for evaluation
    ground_truths.append([true_answer])  # BLEU expects a list of references
    predictions.append(generated_answer)

# Flatten newlines so multi-line answers are compared as single lines.
predictions = [pred.replace("\n", " ").strip() for pred in predictions]
ground_truths = [[ref.replace("\n", " ").strip() for ref in ref_list] for ref_list in ground_truths]

# Exact Match and ROUGE take one reference string per prediction.
single_references = [gt[0] for gt in ground_truths]

# Calculate Exact Match
em_score = em_metric.compute(predictions=predictions, references=single_references)

# Calculate BLEU
bleu_score = bleu_metric.compute(predictions=predictions, references=ground_truths)

# Calculate ROUGE
rouge_score = rouge_metric.compute(predictions=predictions, references=single_references)

# Calculate METEOR. nltk's meteor_score scores ONE hypothesis against its
# references and expects every text to be a list of tokens, so each example is
# tokenized on whitespace, scored separately, and the scores are averaged.
meteor_per_example = [
    meteor_score([ref.split() for ref in refs], pred.split())
    for refs, pred in zip(ground_truths, predictions)
]
meteor = sum(meteor_per_example) / len(meteor_per_example) if meteor_per_example else 0.0

# Print results
print("Exact Match (EM):", em_score["exact_match"])
print("BLEU Score:", bleu_score["bleu"])
print("ROUGE Score:", rouge_score)  # Contains rouge1, rouge2, and rougeL
print("METEOR Score:", meteor)
