In [None]:
##### RAG NEW VERSION ##############
import os
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import SKLearnVectorStore
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_community0.document_loaders import PyPDFLoader
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch
from langchain.chains import LLMChain
from langchain_community.llms import HuggingFacePipeline
import re
import torch
from transformers import AutoModelForCausalLM
import numpy as np


In [None]:
import transformers
transformers.__version__

In [None]:
def load_llama_model():
    device = 'cuda'
    model_id = "/home/hice1/dbabu6/scratch/Llama-3.1-8B-Instruct" 
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map="auto")
    return pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=256, temperature = 0.2)

In [None]:
LLM = load_llama_model()

In [None]:
torch.cuda.is_available()

In [None]:
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=250, chunk_overlap=0)

In [None]:
file_path = "/home/hice1/dbabu6/scratch/pdf_files/PDF_Syllabus_Dataset"
def load_documents():
    docs = []
    for file in os.listdir(file_path):
        if file.endswith(".pdf"):
            try:
                loader = PyPDFLoader(os.path.join(file_path, file))
                pdf_docs = loader.load()
                docs.extend(pdf_docs)
                # logger.info(f"Loaded document: {file}")
            except Exception as e:
                continue
                # logger.error(f"Error loading {file}: {e}")
    # logger.info(f"Total documents loaded: {len(docs)}")
    return docs
docs = load_documents()

In [None]:
doc_splits = text_splitter.split_documents(docs)

In [None]:

### setting up the prompt template ###
prompt = PromptTemplate(
    template="""You are an assistant for question-answering tasks based on course content of Georgia Tech ECE department.
    Use the following documents to answer the question.
    Use five sentences maximum and keep the answer concise:
    Question: {question}
    Documents: {documents}
    Answer:
    """,
    input_variables=["question", "documents"],
)

In [None]:
#### initialize the embedding model #### 

model_name = "BAAI/bge-base-en"
encode_kwargs = {'normalize_embeddings' : True}
model_norm = HuggingFaceBgeEmbeddings(model_name= model_name,
model_kwargs = {'device' : 'cuda'}, encode_kwargs = encode_kwargs)

#### initializing the vectorstore ####
vectorstore = SKLearnVectorStore.from_documents(
    documents=doc_splits,
    embedding=model_norm,
)
retriever = vectorstore.as_retriever(k=3)

In [None]:

# Create an LLM wrapper for your Hugging Face pipeline
llm = HuggingFacePipeline(pipeline=LLM)


In [None]:
# rag_chain = prompt | LLM | StrOutputParser()

In [None]:
# Create the LLMChain
rag_chain = LLMChain(llm=llm, prompt=prompt)

In [None]:
class RAGapplication():
    def __init__(self, retriever, rag_chain):
        self.retriever = retriever
        self.rag_chain = rag_chain

    def run(self, question):
        docs = self.retriever.invoke(question)
        docs_texts = "\n".join([str(n.page_content) for n in docs])
        # print(docs_texts)
        input_data = {
            "question": str(question),
            "documents": docs_texts
        }
        answer = self.rag_chain.invoke(input_data)
        return answer
    

In [None]:
rag_app = RAGapplication(retriever = retriever, rag_chain = rag_chain)

In [None]:
answer = rag_app.run('what is the the course content for ECE 6250')

In [None]:
ans= answer['text']

In [None]:
def extract_answer(text):
    answer_start = text.find("Answer:")
    if answer_start == -1:
        return "Answer not found."
    answer = text[answer_start + len("Answer:"):].strip()
    return answer
ans_processed = extract_answer(ans)


In [None]:
print(ans_processed, end ='\n')

In [None]:
!pip install accelerate peft bitsandbytes trl

In [30]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [31]:
# Installing More Dependencies
import torch
from datasets import load_dataset, Dataset
from peft import LoraConfig, AutoPeftModelForCausalLM
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments
from trl import SFTTrainer
import os

In [32]:
model_id="/home/hice1/dbabu6/scratch/Llama-3.1-8B-Instruct"

In [33]:
def get_model_and_tokenizer(model_id):
  tokenizer = AutoTokenizer.from_pretrained(model_id)
  tokenizer.pad_token = tokenizer.eos_token
  bnb_config = BitsAndBytesConfig(
      load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype="float16", bnb_4bit_use_double_quant=True
  )
  model = AutoModelForCausalLM.from_pretrained(
      model_id, quantization_config=bnb_config, device_map="auto"
  )
  model.config.use_cache=False
  model.config.pretraining_tp=1
  return model, tokenizer

In [34]:
model, tokenizer = get_model_and_tokenizer(model_id)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
##### running an inference ######
from transformers import GenerationConfig
from time import perf_counter
def generate_response(user_input):
  prompt = formatted_prompt(user_input)
  print(prompt)
  inputs = tokenizer([prompt], return_tensors="pt")
  generation_config = GenerationConfig(penalty_alpha=0.6,do_sample = True,
      top_k=5,temperature=0.5,repetition_penalty=1.2,
      max_new_tokens=60,pad_token_id=tokenizer.eos_token_id
  )
  start_time = perf_counter()
  inputs = tokenizer(prompt, return_tensors="pt").to('cuda')
  outputs = model.generate(**inputs, generation_config=generation_config)
  theresponse = (tokenizer.decode(outputs[0], skip_special_tokens=True))
  print(tokenizer.decode(outputs[0], skip_special_tokens=True))
  output_time = perf_counter() - start_time
  print(f"Time taken for inference: {round(output_time,2)} seconds")

In [None]:
def formatted_prompt(question)-> str:
    return f"<|im_start|>user\n{question}<|im_end|>\n<|im_start|>assistant:"

In [None]:
generate_response(user_input='What is Georgia Tech? How good it is?')

In [35]:
#### preparing the dataset #####

import pandas as pd
def load_qa_pairs(data_path):
    data = pd.read_csv(data_path, encoding = "latin-1")
    return data

In [36]:
data_path = "/home/hice1/dbabu6/scratch/Query_Response_Pairs_ConvAI_Project(Sheet1) (1).csv"
data = load_qa_pairs(data_path)

In [37]:
training_data = [{"prompt": prompt, "response": response} for prompt, response in zip(data['Query'], data['Response'])]

In [38]:
training_data

[{'prompt': 'What is the main focus of the ECE 8803 HOS (High-dimensional statistics, signal processing, and optimization) course?',
  'response': 'The main focus of ECE 8803 HOS is high-dimensional statistics and optimization. The course introduces students to modern analyses of one-shot and iterative algorithms in high-dimensional statistical learning and signal processing. It covers analytical tools from probability and optimization, analyzes M-estimators and iterative methods in statistical models, and introduces convex relaxation and its analysis'},
 {'prompt': 'Who is the instructor for ECE 8803 HOS?',
  'response': 'The instructor for ECE 8803 HOS is Ashwin Pananjady. He is a faculty member with a joint appointment between ISyE and ECE at Georgia Tech. Ashwin received his Ph.D. in EECS from UC Berkeley and spent a semester as a postdoctoral research fellow at the Simons Institute for the Theory of Computing'},
 {'prompt': 'What are the key learning outcomes of ECE 8803 HOS?',
  

In [44]:
def format_training_data(training_data):
    data_df = pd.DataFrame(training_data)
    data_df['text'] = data_df[['prompt', 'response']].apply(lambda x: "<|im_start|>user\n"+ str(x['prompt']) + " <|im_end|>\n<|im_start|>assistant\n" + str(x['response']) + "<|im_end|>\n", axis = 1)
    data = Dataset.from_pandas(data_df)
    return data


In [45]:
data = format_training_data(training_data)

In [47]:
##### fine tuning ######
peft_config = LoraConfig(
        r=8, lora_alpha=16, lora_dropout=0.05, bias="none", task_type="CAUSAL_LM"
    )

In [77]:
from peft import LoraConfig, get_peft_model

In [78]:
?? get_peft_model

[0;31mSignature:[0m
 [0mget_peft_model[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mmodel[0m[0;34m:[0m [0;34m'PreTrainedModel'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mpeft_config[0m[0;34m:[0m [0;34m'PeftConfig'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0madapter_name[0m[0;34m:[0m [0;34m'str'[0m [0;34m=[0m [0;34m'default'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmixed[0m[0;34m:[0m [0;34m'bool'[0m [0;34m=[0m [0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mautocast_adapter_dtype[0m[0;34m:[0m [0;34m'bool'[0m [0;34m=[0m [0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mrevision[0m[0;34m:[0m [0;34m'Optional[str]'[0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m [0;34m->[0m [0;34m'PeftModel | PeftMixedModel'[0m[0;34m[0m[0;34m[0m[0m
[0;31mSource:[0m   
[0;32mdef[0m [0mget_peft_model[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mmodel[0m[0;34m:[0m [0mPreTrainedModel[0m[0;34m,

In [48]:
output_model = "/home/hice1/dbabu6/scratch/model_finetuned_results"

In [50]:

training_arguments = TrainingArguments(
        output_dir=output_model,
        per_device_train_batch_size=4,
        gradient_accumulation_steps=8,
        optim="paged_adamw_32bit",
        learning_rate=3e-4,
        lr_scheduler_type="cosine",
        save_strategy="epoch",
        logging_steps=10,
        num_train_epochs=8,
        max_steps=250,
        fp16=True,
        push_to_hub=True,
        report_to = "tensorboard"
    )
     

In [51]:

trainer = SFTTrainer(
        model=model,
        train_dataset=data,
        peft_config=peft_config,
        dataset_text_field="text",
        args=training_arguments,
        tokenizer=tokenizer,
        packing=False,
        max_seq_length=1024
    )
     


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/496 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


In [52]:
trainer.train()

Step,Training Loss


TrainOutput(global_step=250, training_loss=0.7656039447784424, metrics={'train_runtime': 340.971, 'train_samples_per_second': 23.462, 'train_steps_per_second': 0.733, 'total_flos': 3.865307613649306e+16, 'train_loss': 0.7656039447784424, 'epoch': 16.129032258064516})

In [53]:
#### running inference #####

In [54]:
model_id = "Dhanush2210/model_finetuned_results"

In [55]:
def get_model_and_tokenizer(model_id):
  tokenizer = AutoTokenizer.from_pretrained(model_id)
  tokenizer.pad_token = tokenizer.eos_token
  bnb_config = BitsAndBytesConfig(
      load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype="float16", bnb_4bit_use_double_quant=True
  )
  model = AutoModelForCausalLM.from_pretrained(
      model_id, quantization_config=bnb_config, device_map="auto"
  )
  model.config.use_cache=False
  model.config.pretraining_tp=1
  return model, tokenizer

In [56]:
model, tokenizer = get_model_and_tokenizer(model_id)

tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/325 [00:00<?, ?B/s]

adapter_config.json:   0%|          | 0.00/673 [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/13.6M [00:00<?, ?B/s]

In [73]:
from transformers import GenerationConfig
from time import perf_counter
def generate_response(user_input):
  prompt = formatted_prompt(user_input)
  # inputs = tokenizer([prompt], return_tensors="pt")
  generation_config = GenerationConfig(penalty_alpha=0.6,do_sample = True,
      top_k=5,temperature=0.2,repetition_penalty=1.2,
      max_new_tokens=80,pad_token_id=tokenizer.eos_token_id
  )
  start_time = perf_counter()
  inputs = tokenizer(prompt, return_tensors="pt").to('cuda')
  outputs = model.generate(**inputs, generation_config=generation_config)
  theresponse = (tokenizer.decode(outputs[0], skip_special_tokens=True))
  print(tokenizer.decode(outputs[0], skip_special_tokens=True))
  output_time = perf_counter() - start_time
  print(f"Time taken for inference: {round(output_time,2)} seconds")

In [74]:
def formatted_prompt(question)-> str:
    return f"<|im_start|>user\n{question}<|im_end|>\n<|im_start|>assistant:"

In [75]:
generate_response(user_input= "What is the course's policy on late assignments for ECE 8804 VLSI Design 1?")

<|im_start|>user
What is the course's policy on late assignments for ECE 8804 VLSI Design 1?<|im_end|>
<|im_start|>assistant: In ECE 8804 VLSIS, late homeworks will be strictly penalized. The syllabus assumes that all work is done individually unless explicitly stated otherwise.<|im_end|>
<|im_start|>assistant
If a valid reason for submitting an assignment late is provided, it will be accepted up to one week after submission deadline or two weeks after opening day, whichever comes first
Time taken for inference: 3.54 seconds
