In [None]:
import os
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import SKLearnVectorStore
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_community.document_loaders import PyPDFLoader
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch
from langchain.chains import LLMChain
from langchain_community.llms import HuggingFacePipeline
import re
import torch
from transformers import AutoModelForCausalLM
import numpy as np

In [None]:
def load_mistral_model(model_id="./scratch/Mistral-7B-Instruct-v0.1", max_new_tokens=256, temperature=0.2):
    """Build a Hugging Face text-generation pipeline for a causal LM.

    Args:
        model_id: Path or identifier handed to ``from_pretrained`` for both
            the tokenizer and the model.
        max_new_tokens: Generation length cap forwarded to the pipeline.
        temperature: Temperature forwarded to the pipeline.

    Returns:
        The ``transformers`` pipeline object for the "text-generation" task.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    # device_map="auto" decides weight placement, so no explicit device
    # variable is needed here.
    model = AutoModelForCausalLM.from_pretrained(
        model_id, torch_dtype=torch.bfloat16, device_map="auto"
    )
    # NOTE(review): temperature normally only matters when sampling is enabled
    # (do_sample=True); sampling is left off to keep the current behaviour --
    # confirm which decoding mode is intended.
    return pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=max_new_tokens,
        temperature=temperature,
    )

In [None]:
# Build the text-generation pipeline once; it is wrapped for LangChain further down.
mistral_model = load_mistral_model()

In [None]:
!pip install tiktoken

In [None]:
# Splitter built from a tiktoken encoder: chunk_size=250 with no overlap
# between consecutive chunks.
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=250, chunk_overlap=0)

In [None]:
# Folder scanned by load_documents() for the syllabus PDFs to index.
file_path = "./scratch/PDF_Syllabus_Dataset/PDF_Syllabus_Dataset"
def load_documents(directory=None):
    """Load every ``.pdf`` file in a directory into one flat list of documents.

    Args:
        directory: Folder to scan. When omitted, the module-level
            ``file_path`` is used, which keeps ``load_documents()`` working
            exactly as before.

    Returns:
        A list with the results of ``PyPDFLoader(...).load()`` for each PDF,
        concatenated in sorted filename order so runs are reproducible.
        A file that fails to load is skipped, and a warning names the file
        and the error instead of silently dropping it.
    """
    import warnings

    if directory is None:
        directory = file_path

    docs = []
    for file in sorted(os.listdir(directory)):
        if not file.endswith(".pdf"):
            continue
        try:
            docs.extend(PyPDFLoader(os.path.join(directory, file)).load())
        except Exception as e:
            # Best effort: one unreadable PDF should not abort the whole load,
            # but the reader must be told which file is missing from the index.
            warnings.warn(f"Skipping {file}: {e!r}")
    return docs
# Load all PDFs up front; the result is chunked by text_splitter afterwards.
docs = load_documents()

In [None]:
# Break the loaded documents into chunks; these are what gets embedded and retrieved.
doc_splits = text_splitter.split_documents(docs)


In [None]:
### Prompt template for the RAG chain ###
# Placeholders are filled by RAGapplication.run: `question` is the user query and
# `documents` is the newline-joined text of the retrieved chunks.
# extract_answer() later keys on the literal "Answer:" marker in this template.
prompt = PromptTemplate(
    template="""You are an assistant for question-answering tasks based on course content of Georgia Tech ECE department.
    Use the following documents to answer the question.
    Use five sentences maximum and keep the answer concise:
    Question: {question}
    Documents: {documents}
    Answer:
    """,
    input_variables=["question", "documents"],
)

In [None]:
#### initialize the embedding model ####

model_name = "BAAI/bge-base-en"
# Ask the encoder to return normalised embedding vectors.
encode_kwargs = {'normalize_embeddings': True}
model_norm = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs={'device': 'cuda'},
    encode_kwargs=encode_kwargs,
)

#### initializing the vectorstore ####
vectorstore = SKLearnVectorStore.from_documents(
    documents=doc_splits,
    embedding=model_norm,
)
# The number of hits must be passed through search_kwargs; a bare `k=3`
# keyword to as_retriever() is not forwarded to the similarity search, so the
# retriever would fall back to the store's default k.
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})

In [None]:
# Wrap the transformers pipeline so LangChain can drive it as an LLM.
llm = HuggingFacePipeline(pipeline=mistral_model)

In [None]:
# Chain that combines the `prompt` template with the wrapped `llm`.
rag_chain = LLMChain(llm=llm, prompt=prompt)

In [None]:
class RAGapplication():
    """Minimal retrieve-then-generate wrapper.

    Holds a retriever and a chain; ``run`` fetches documents for a question,
    joins their text and passes both to the chain.
    """

    def __init__(self, retriever, rag_chain):
        self.retriever, self.rag_chain = retriever, rag_chain

    def run(self, question):
        """Answer ``question`` and return whatever the chain's ``invoke`` returns."""
        retrieved = self.retriever.invoke(question)
        # One retrieved chunk per line in the prompt's `documents` slot.
        context = "\n".join(str(doc.page_content) for doc in retrieved)
        return self.rag_chain.invoke(
            {"question": str(question), "documents": context}
        )

In [None]:
# Bundle the retriever and the chain into a single callable RAG application.
rag_app = RAGapplication(retriever = retriever, rag_chain = rag_chain)

In [None]:
# Example query through the full retrieve-then-generate pipeline.
answer = rag_app.run('what is the the course content for ECE 6250')

In [None]:
# Pull the generated string out of the chain's result (stored under 'text').
ans= answer['text']

In [None]:
def extract_answer(text):
    """Return the stripped text following the first "Answer:" marker.

    If the marker does not occur in ``text``, the fixed string
    "Answer not found." is returned instead.
    """
    _, marker, remainder = text.partition("Answer:")
    if not marker:
        return "Answer not found."
    return remainder.strip()
# Keep only the text after the prompt's "Answer:" marker.
ans_processed = extract_answer(ans)

In [None]:
# Show the cleaned-up answer.
print(ans_processed, end ='\n')

In [None]:
!pip install accelerate peft bitsandbytes trl

In [None]:
# Authenticate with the Hugging Face Hub; the TrainingArguments further down
# set push_to_hub=True.
from huggingface_hub import notebook_login
notebook_login()

In [None]:
# Installing More Dependencies
import torch
from datasets import load_dataset, Dataset
from peft import LoraConfig, AutoPeftModelForCausalLM
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments
from trl import SFTTrainer
import os

In [None]:
# Local path of the base checkpoint that gets loaded for fine-tuning.
model_id="./scratch/Mistral-7B-Instruct-v0.1"

In [None]:
def get_model_and_tokenizer(model_id):
    """Load a 4-bit quantised causal LM together with its tokenizer.

    Args:
        model_id: Path or identifier handed to ``from_pretrained``.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    # 4-bit NF4 weights with double quantisation and float16 compute.
    quantization = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype="float16",
        bnb_4bit_use_double_quant=True,
    )

    tok = AutoTokenizer.from_pretrained(model_id)
    # Reuse the end-of-sequence token for padding.
    tok.pad_token = tok.eos_token

    lm = AutoModelForCausalLM.from_pretrained(
        model_id,
        quantization_config=quantization,
        device_map="auto",
    )
    lm.config.use_cache = False
    lm.config.pretraining_tp = 1
    return lm, tok

In [None]:
# Load the base checkpoint (4-bit quantised) and its tokenizer.
model, tokenizer = get_model_and_tokenizer(model_id)

In [None]:
##### running an inference ######
from transformers import GenerationConfig
from time import perf_counter
def generate_response(user_input):
    """Generate, print and return the model's reply to one user message.

    Uses the module-level ``model``, ``tokenizer`` and ``formatted_prompt``.
    The formatted prompt, the decoded output and the elapsed time are printed.

    Args:
        user_input: The user's question as plain text.

    Returns:
        The decoded output sequence (special tokens skipped). Earlier versions
        returned ``None``; callers that ignore the result are unaffected.
    """
    prompt = formatted_prompt(user_input)
    print(prompt)
    # NOTE(review): penalty_alpha is a contrastive-search setting while
    # do_sample=True selects sampling -- confirm which strategy is intended.
    generation_config = GenerationConfig(
        penalty_alpha=0.6, do_sample=True,
        top_k=5, temperature=0.5, repetition_penalty=1.2,
        max_new_tokens=60, pad_token_id=tokenizer.eos_token_id,
    )
    start_time = perf_counter()
    # Tokenise once and move the inputs to wherever the model lives instead
    # of hard-coding 'cuda'.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, generation_config=generation_config)
    # Decode once and reuse the string for printing and for the return value.
    theresponse = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(theresponse)
    output_time = perf_counter() - start_time
    print(f"Time taken for inference: {round(output_time,2)} seconds")
    return theresponse

In [None]:
def formatted_prompt(question)-> str:
    """Wrap a user question in the chat template used for fine-tuning.

    The layout mirrors the one built in ``format_training_data`` (a space
    before the user turn's ``<|im_end|>`` and a newline after ``assistant``),
    so inference prompts match what the model sees during training.

    Args:
        question: The user's question.

    Returns:
        The prompt string, ending where the assistant's reply should begin.
    """
    return f"<|im_start|>user\n{question} <|im_end|>\n<|im_start|>assistant\n"

In [None]:
# Smoke-test generation with the model as loaded, before any fine-tuning.
generate_response(user_input='What is Georgia Tech? How good it is?')

In [None]:
import pandas as pd
def load_qa_pairs(data_path):
    """Read the query/response CSV at ``data_path`` (latin-1 encoded) into a DataFrame."""
    return pd.read_csv(data_path, encoding="latin-1")

In [None]:
# CSV of query/response pairs that the fine-tuning set is built from.
data_path = "./scratch/Query_Response_Pairs_ConvAI_Project.csv"
data = load_qa_pairs(data_path)

In [None]:
# One {"prompt", "response"} record per CSV row, paired from the Query and Response columns.
training_data = [{"prompt": prompt, "response": response} for prompt, response in zip(data['Query'], data['Response'])]

In [None]:
# Peek at the first five records.
training_data[:5]

In [None]:
def format_training_data(training_data):
    """Turn prompt/response records into a ``Dataset`` with a chat-formatted ``text`` column.

    Args:
        training_data: Records with ``prompt`` and ``response`` entries,
            in any form ``pd.DataFrame`` accepts.

    Returns:
        ``Dataset.from_pandas`` of the frame, including the added ``text`` column.
    """
    def _render(row):
        # User turn followed by the assistant turn, each closed with <|im_end|>.
        return "<|im_start|>user\n"+ str(row['prompt']) + " <|im_end|>\n<|im_start|>assistant\n" + str(row['response']) + "<|im_end|>\n"

    frame = pd.DataFrame(training_data)
    frame['text'] = frame[['prompt', 'response']].apply(_render, axis=1)
    return Dataset.from_pandas(frame)

In [None]:
# NOTE: this rebinds `data` from the raw DataFrame to the formatted Dataset,
# so the cell that builds `training_data` cannot be re-run after this one.
data = format_training_data(training_data)

In [None]:
##### fine tuning ######
# LoRA adapter settings handed to the SFTTrainer.
peft_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

In [None]:
from peft import LoraConfig, get_peft_model

In [None]:
# Output directory for training; the same path is reloaded after training.
output_model = "./scratch/model_finetuned_results"

In [None]:
training_arguments = TrainingArguments(
    output_dir=output_model,
    # 4 sequences per device, with gradients accumulated over 8 steps.
    per_device_train_batch_size=4,
    gradient_accumulation_steps=8,
    optim="paged_adamw_32bit",
    learning_rate=3e-4,
    lr_scheduler_type="cosine",
    save_strategy="epoch",
    logging_steps=10,
    # NOTE(review): both an epoch count and a step cap are given; per the
    # transformers docs a positive max_steps overrides num_train_epochs --
    # confirm which limit is intended.
    num_train_epochs=8,
    max_steps=250,
    fp16=True,
    push_to_hub=True,
    report_to="tensorboard",
)

In [None]:
# Supervised fine-tuning on the `text` column of the formatted dataset,
# with the LoRA configuration applied to the quantised base model.
# NOTE(review): newer TRL releases take dataset_text_field / max_seq_length /
# packing through SFTConfig -- confirm against the installed version.
trainer = SFTTrainer(
    model=model,
    train_dataset=data,
    peft_config=peft_config,
    dataset_text_field="text",
    args=training_arguments,
    tokenizer=tokenizer,
    packing=False,
    max_seq_length=1024,
)

In [None]:
# Run the fine-tuning loop with the arguments configured above.
trainer.train()

In [None]:
def get_model_and_tokenizer(model_id):
    """Load a 4-bit quantised causal LM and its tokenizer from ``model_id``.

    NOTE: this duplicates the definition made earlier in the notebook.

    Args:
        model_id: Path or identifier handed to ``from_pretrained``.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    tok = AutoTokenizer.from_pretrained(model_id)
    # Reuse the end-of-sequence token for padding.
    tok.pad_token = tok.eos_token

    # 4-bit NF4 weights with double quantisation and float16 compute.
    quant_kwargs = dict(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype="float16",
        bnb_4bit_use_double_quant=True,
    )
    lm = AutoModelForCausalLM.from_pretrained(
        model_id,
        quantization_config=BitsAndBytesConfig(**quant_kwargs),
        device_map="auto",
    )
    lm.config.use_cache = False
    lm.config.pretraining_tp = 1

    return lm, tok

In [None]:
# Reload model and tokenizer from the fine-tuning output directory,
# replacing the `model` / `tokenizer` bound before training.
model, tokenizer = get_model_and_tokenizer(output_model)

In [None]:
from transformers import GenerationConfig
from time import perf_counter
def generate_response(user_input):
    """Generate, print and return the model's reply to one user message.

    Uses the module-level ``model``, ``tokenizer`` and ``formatted_prompt``.
    The decoded output and the elapsed time are printed.

    Args:
        user_input: The user's question as plain text.

    Returns:
        The decoded output sequence (special tokens skipped). Earlier versions
        returned ``None``; callers that ignore the result are unaffected.
    """
    prompt = formatted_prompt(user_input)
    # NOTE(review): penalty_alpha is a contrastive-search setting while
    # do_sample=True selects sampling -- confirm which strategy is intended.
    generation_config = GenerationConfig(
        penalty_alpha=0.6, do_sample=True,
        top_k=5, temperature=0.2, repetition_penalty=1.2,
        max_new_tokens=80, pad_token_id=tokenizer.eos_token_id,
    )
    start_time = perf_counter()
    # Move the inputs to wherever the model lives instead of hard-coding 'cuda'.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, generation_config=generation_config)
    # Decode once and reuse the string for printing and for the return value.
    theresponse = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(theresponse)
    output_time = perf_counter() - start_time
    print(f"Time taken for inference: {round(output_time,2)} seconds")
    return theresponse

In [None]:
def formatted_prompt(question)-> str:
    """Wrap a user question in the chat template used for fine-tuning.

    The layout mirrors the one built in ``format_training_data`` (a space
    before the user turn's ``<|im_end|>`` and a newline after ``assistant``),
    so the fine-tuned model is prompted in the format it was trained on.

    Args:
        question: The user's question.

    Returns:
        The prompt string, ending where the assistant's reply should begin.
    """
    return f"<|im_start|>user\n{question} <|im_end|>\n<|im_start|>assistant\n"

In [None]:
# Spot-check the reloaded model on an instructor question.
generate_response(user_input= "Who is the course instructor for ECE 8803 ODM?")

In [None]:
# Spot-check the reloaded model on a course-policy question.
generate_response(user_input= "What is the course's policy on late assignments for ECE 8804 VLSI Design 1?")

In [None]:
# Spot-check the reloaded model on another instructor question.
generate_response(user_input= "Who is the course instructor for ECE 8803 WPS?")

In [None]:
import pandas as pd
from datasets import Dataset

# Load the query/response CSV. A dedicated name is used so the module-level
# `file_path` (the PDF folder read by load_documents) is not overwritten.
qa_csv_path = "./scratch/Query_Response_Pairs_ConvAI_Project.csv"  # Replace with your file path
df = pd.read_csv(qa_csv_path, encoding="latin-1")

# Use the column names the evaluation loop expects.
df = df.rename(columns={"Query": "question", "Response": "answer"})
# Convert DataFrame to Hugging Face Dataset
dataset = Dataset.from_pandas(df)

# Hold out 10% for evaluation; the fixed seed makes the split (and therefore
# the reported metrics) reproducible across runs.
train_test_split = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = train_test_split["train"]
test_dataset = train_test_split["test"]

# Column views of the held-out questions and reference answers.
questions = test_dataset["question"]
answers = test_dataset["answer"]

In [None]:
!pip install evaluate

In [None]:
!pip install nltk

In [None]:
!pip install rouge_score

In [None]:
import evaluate
import nltk
from nltk.translate.meteor_score import meteor_score

# Fetch the NLTK resources: WordNet, plus omw-1.4 (optional, multilingual support).
for corpus in ('wordnet', 'omw-1.4'):
    nltk.download(corpus)

# Load the evaluation metrics.
# NOTE(review): f1_metric is loaded but the evaluation cell below never uses it.
em_metric = evaluate.load("exact_match")
f1_metric = evaluate.load("squad_v2")
bleu_metric = evaluate.load("bleu")
rouge_metric = evaluate.load("rouge")

In [None]:
from tqdm import tqdm

# Collect one list of references and one prediction per held-out example.
ground_truths = []
predictions = []

# Iterate over each example in the dataset
for example in tqdm(test_dataset):
    question = example["question"]
    true_answer = example["answer"]

    # Generate answer using the RAG model
    answer = rag_app.run(question)
    generated_answer = extract_answer(answer['text'])

    # Append to lists for evaluation
    ground_truths.append([true_answer])  # BLEU expects a list of references
    predictions.append(generated_answer)

# Collapse newlines so the metrics see single-line strings.
predictions = [pred.replace("\n", " ").strip() for pred in predictions]
ground_truths = [[ref.replace("\n", " ").strip() for ref in ref_list] for ref_list in ground_truths]
# Exact match and ROUGE take one reference string per prediction.
flat_references = [gt[0] for gt in ground_truths]

# Calculate Exact Match
em_score = em_metric.compute(predictions=predictions, references=flat_references)

# Calculate BLEU
bleu_score = bleu_metric.compute(predictions=predictions, references=ground_truths)

# Calculate ROUGE
rouge_score = rouge_metric.compute(predictions=predictions, references=flat_references)

# Calculate METEOR.
# nltk's meteor_score scores ONE tokenised hypothesis against its tokenised
# references. Passing the whole corpus in one call would treat every full
# sentence as a single "token", so score each example and average instead.
meteor_per_example = [
    meteor_score([ref.split() for ref in refs], pred.split())
    for refs, pred in zip(ground_truths, predictions)
]
meteor = sum(meteor_per_example) / len(meteor_per_example) if meteor_per_example else 0.0

# Print results
print("Exact Match (EM):", em_score["exact_match"])
print("BLEU Score:", bleu_score["bleu"])
print("ROUGE Score:", rouge_score)  # Contains rouge1, rouge2, and rougeL
print("METEOR Score:", meteor)