In [52]:
#!pip install transformers peft bitsandbytes accelerate pandas openpyxl datasets odfpy huggingface_hub

In [5]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, DataCollatorWithPadding, AutoConfig
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
from datasets import Dataset
from transformers import TrainingArguments, Trainer

In [6]:
#################################################################
# Tokenizer
#################################################################

model_name='Mistral-7B-Instruct-v0.1'

model_config = AutoConfig.from_pretrained(
    model_name,
)

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

In [7]:
#################################################################
# bitsandbytes parameters
#################################################################

# Activate 4-bit precision base model loading
use_4bit = True

# Compute dtype for 4-bit base models
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False

In [8]:
#################################################################
# Set up quantization config
#################################################################
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

# Check GPU compatibility with bfloat16
if compute_dtype == torch.float16 and use_4bit:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)

Your GPU supports bfloat16: accelerate training with bf16=True


In [9]:
#################################################################
# Load pre-trained config
#################################################################
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
)

`low_cpu_mem_usage` was None, now default to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [12]:
inputs_not_chat = tokenizer.encode_plus("[INST] Tell me about Georgia Tech ECE coursework [/INST]", return_tensors="pt")['input_ids'].to('cuda')

generated_ids = model.generate(inputs_not_chat, 
                               max_new_tokens=1000, 
                               do_sample=True)
decoded = tokenizer.batch_decode(generated_ids)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


In [13]:
print(decoded)

["<s> [INST] Tell me about Georgia Tech ECE coursework [/INST] Georgia Tech's Electrical and Computer Engineering (ECE) is a highly regarded program that offers a comprehensive curriculum in the fields of electrical and electronic systems. The ECE program covers everything from fundamental principles of electricity and magnetism to advanced topics in digital electronics, semiconductor devices, and computer networks.\n\nHere are some of the key areas of focus in Georgia Tech's ECE program:\n\n1. Electrical circuits and systems: This area covers topics such as voltage and current in circuits, circuit components, energy storage, and power systems.\n2. Signal processing: This area provides an introduction to the analysis and manipulation of digital signals, including filtering, Fourier analysis, and digital signal processing algorithms.\n3. Digital systems: This area covers topics such as binary logic and arithmetic, digital circuit design, and computer architecture.\n4. Communications and

In [14]:
def print_number_of_trainable_model_parameters(model):
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        all_model_params += param.numel()
        if param.requires_grad:
            trainable_model_params += param.numel()
    return f"trainable model parameters: {trainable_model_params}\nall model parameters: {all_model_params}\npercentage of trainable model parameters: {100 * trainable_model_params / all_model_params:.2f}%"

print(print_number_of_trainable_model_parameters(model))

trainable model parameters: 262410240
all model parameters: 3752071168
percentage of trainable model parameters: 6.99%


In [16]:
!pip install langchain

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Defaulting to user installation because normal site-packages is not writeable
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting anyio (from httpx<1,>=0.23.0->langsmith<0.2.0,>=0.1.17->langchain)
  Downloading anyio-4.6.2.post1-py3-none-any.whl.metadata (4.7 kB)
Collecting sniffio (from httpx<1,>=0.23.0->langsmith<0.2.0,>=0.1.17->langchain)
  Downloading sniffio-1.3.1-py3-none-any.whl.metadata (3.9 kB)
Collecting jsonpointer>=1.9 (from jsonpatch<2.0,>=1.33->langchain-core<0.4.0,>=0.3.10->langchain)
  Downloading jsonpointer-3.0.0-py2.py3-none-any.whl.metadata (2.3 kB)
Downloading jsonpointer-3.0.0-py2.py3-none-any.whl (7.6 kB)
Downloading anyio-4.6.2.post1-py3-none-any.whl (90 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m90.4/90.4 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading sniffio-1.3.1-py3-none-any.whl (10 kB)
Installing collected packages: sniffio, jsonpointer, anyio
Successfully installed anyio-4.6.2.post1 js

In [32]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
import os

# List of PDF file paths
pdf_files = [
    "/home/hice1/bmallya3/scratch/ECE_Special_Topics_Syllabus_PDFs/2023-Spring-ECE8893-syllabus.pdf",
    "/home/hice1/bmallya3/scratch/ECE_Special_Topics_Syllabus_PDFs/2023Spring-ECE8803ALT-Syllabus.pdf",
    "/home/hice1/bmallya3/scratch/ECE_Special_Topics_Syllabus_PDFs/2024-Spring-ECE8803_BFA-syllabus.pdf",
    "/home/hice1/bmallya3/scratch/ECE_Special_Topics_Syllabus_PDFs/2024Fall-ECE4803-8803EV-Syllabus.pdf",
    "/home/hice1/bmallya3/scratch/ECE_Special_Topics_Syllabus_PDFs/ECE8803_WPS_Syllabus.pdf"
]

# Load and process PDF files
documents = []
for pdf_path in pdf_files:
    loader = PyPDFLoader(pdf_path)
    documents.extend(loader.load())

# Chunk text
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, 
                                               chunk_overlap=150,
                                               length_function=len,
                                               separators=["\n\n", "\n", " ", ""])
chunked_documents = text_splitter.split_documents(documents)

# Load chunked documents into the FAISS index
embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
db = FAISS.from_documents(chunked_documents, embeddings)

# Connect query to FAISS index using a retriever
retriever = db.as_retriever(
    search_type="mmr",  # Maximum Marginal Relevance
    search_kwargs={'k': 5, 'fetch_k': 20}
)

# Optional: Save the FAISS index for later use
db.save_local("FAISS_Vector_Database_ECE_Course_Syllabus")

Ignoring wrong pointing object 6 0 (offset 0)
Ignoring wrong pointing object 8 0 (offset 0)
Ignoring wrong pointing object 10 0 (offset 0)
Ignoring wrong pointing object 12 0 (offset 0)
Ignoring wrong pointing object 14 0 (offset 0)
Ignoring wrong pointing object 16 0 (offset 0)
Ignoring wrong pointing object 22 0 (offset 0)
Ignoring wrong pointing object 24 0 (offset 0)
Ignoring wrong pointing object 6 0 (offset 0)
Ignoring wrong pointing object 8 0 (offset 0)
Ignoring wrong pointing object 10 0 (offset 0)
Ignoring wrong pointing object 12 0 (offset 0)
Ignoring wrong pointing object 14 0 (offset 0)
Ignoring wrong pointing object 7 0 (offset 0)
Ignoring wrong pointing object 9 0 (offset 0)
Ignoring wrong pointing object 11 0 (offset 0)
Ignoring wrong pointing object 13 0 (offset 0)
Ignoring wrong pointing object 15 0 (offset 0)
Ignoring wrong pointing object 17 0 (offset 0)
Ignoring wrong pointing object 21 0 (offset 0)
Ignoring wrong pointing object 23 0 (offset 0)
Ignoring wrong poin

In [34]:
# Example query
query = "Who is the course instructor for Parallel Programming for FPGAs?"
results = retriever.get_relevant_documents(query)

# Print results
for i, doc in enumerate(results):
    print(f"Result {i+1}:")
    print(doc.page_content)
    print("---")

Result 1:
improving the design quality; and understand future trends and opportunities for FPGAs for a diverse range of applications such as GNNs, scientific computing, medical electronics, cybersecurity systems, and wireless communications.  Course Structure The course will involve a mix of lectures interspersed with heavy paper reading and discussions. A semester long programming-heavy project will focus on developing an FPGA accelerator using HLS for DNN or GNN algorithms.  Course Text The material for this course will be derived from the following texts: 1. Kastner, Ryan, Janarbek Matai, and Stephen Neuendorffer. "Parallel programming for FPGAs." arXiv preprint arXiv:1805.03648 (2018).
---
Result 2:
handled by the office of student affairs. Students will have to do all assignments individually unless explicitly told otherwise. Students may discuss with classmates but may not copy any solution (or any part of a solution).
---
Result 3:
important to strive for an atmosphere of mutual

In [42]:
from langchain.llms import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from transformers import pipeline
from langchain.chains import LLMChain

text_generation_pipeline = pipeline(
    model=model,
    tokenizer=tokenizer,
    task="text-generation",
    temperature=0.3,
    repetition_penalty=1.1,
    return_full_text=True,
    max_new_tokens=300,
)

prompt_template = """
### [INST] 
Instruction: Answer the question based on your 
ECE Coursework knowledge. If you don't know the answer to a question, say "I don't know". Here is the context to help:

{context}

### QUESTION:
{question} 

[/INST]
 """

mistral_llm = HuggingFacePipeline(pipeline=text_generation_pipeline)

# Create prompt from prompt template 
prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)

# Create llm chain 
llm_chain = LLMChain(llm=mistral_llm, prompt=prompt)

In [43]:
llm_chain.invoke({"context":"", 
                  "question": "Who is the course instructor for ECE Advanced Logic Transistor course from Georgia Tech?"})



{'context': '',
 'question': 'Who is the course instructor for ECE Advanced Logic Transistor course from Georgia Tech?',
 'text': '\n### [INST] \nInstruction: Answer the question based on your \nECE Coursework knowledge. If you don\'t know the answer to a question, say "I don\'t know". Here is the context to help:\n\n\n\n### QUESTION:\nWho is the course instructor for ECE Advanced Logic Transistor course from Georgia Tech? \n\n[/INST]\n  I don\'t have access to current information about specific courses or instructors at Georgia Tech. However, you can check the official Georgia Tech website or contact their academic department for the most up-to-date information.'}

In [44]:
from langchain.schema.runnable import RunnablePassthrough
query = "Who is the course instructor for ECE Advanced Logic Transistor course from Georgia Tech??" 

retriever = db.as_retriever()

rag_chain = ( 
 {"context": retriever, "question": RunnablePassthrough()}
    | llm_chain
)

rag_chain.invoke(query)



{'context': [Document(metadata={'source': '/home/hice1/bmallya3/scratch/ECE_Special_Topics_Syllabus_PDFs/2023Spring-ECE8803ALT-Syllabus.pdf', 'page': 0}, page_content='ECE  8803 ALT  Advanced Logic Transistor: Physics and Technology (Spring  2023) \nUnit: 3 \nTime: Mon , Wed 1400- 1515   \nLocation: 2456  Klaus Advanced Comput ing  \nInstructor: Dr. Suman Datta , Joseph M Pettit Chair in Advanced Computing and Professor of \nElectrical  and Computer  Engineering, Georgia Institute of Technology   \nOffice:  Klaus 2360  \nEmail: sdatta68 @ece.gatech.edu  \nPhone:  404-894-6738  \nOffice Hour s: Wed 1 700-1850 \nCourse Objectives:  \nTo develop fundamental understanding of  scattering limited transport versus ballistic tran sport of \ncarriers in modern semiconductor devices, to quantitively analyze the electrostatic robustness in \nultra-scaled MOS transistors and to introduce graduate students specializing in semiconductor \nscience and technology  to the process of identifying the  me

In [4]:
# df = pd.read_csv("Query_Response_Pairs_ConvAI_Project.csv", encoding='latin-1')

# def prepare_data(row):
#     return f"### Human: {row['Query']}\n\n### Assistant: {row['Response']}\n\n"

# # Create a new DataFrame with the prepared data
# prepared_df = pd.DataFrame({
#     'text': df.apply(prepare_data, axis=1)
# })

# # Create the dataset
# dataset = Dataset.from_pandas(prepared_df)

# def tokenize_function(examples):
#     return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=512)

# tokenized_dataset = dataset.map(tokenize_function, batched=True)


Map:   0%|          | 0/584 [00:00<?, ? examples/s]

In [5]:
# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype=torch.float16,
#     bnb_4bit_use_double_quant=True,
# )

In [6]:
# model = AutoModelForCausalLM.from_pretrained("Mistral-7B-v0.1",quantization_config=bnb_config,
#     device_map="auto")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [7]:
# model = prepare_model_for_kbit_training(model)

In [8]:
# lora_config = LoraConfig(
#     r=16,
#     lora_alpha=32,
#     target_modules=["q_proj", "v_proj"],
#     lora_dropout=0.05,
#     bias="none",
#     task_type="CAUSAL_LM"
# )

In [9]:
# model = get_peft_model(model, lora_config)

In [10]:
# training_args = TrainingArguments(
#     output_dir="./results",
#     num_train_epochs=3,
#     per_device_train_batch_size=4,
#     gradient_accumulation_steps=4,
#     warmup_steps=100,
#     logging_dir="./logs",
#     logging_steps=10,
#     save_strategy="epoch",
#     learning_rate=2e-4,
#     fp16=True,  # Enable mixed precision training
#     remove_unused_columns=False,
#     no_cuda=False,  # Ensure CUDA is used if available
# )

In [11]:
# from transformers import AutoTokenizer

# tokenizer = AutoTokenizer.from_pretrained("Mistral-7B-v0.1")
# tokenizer.pad_token = tokenizer.eos_token

# def tokenize_function(examples):
#     return tokenizer(
#         examples["text"],
#         padding="max_length",
#         truncation=True,
#         max_length=512,
#         return_tensors="pt"
#     )

# tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=dataset.column_names)

Map:   0%|          | 0/584 [00:00<?, ? examples/s]

In [12]:
# from dataclasses import dataclass
# from transformers.tokenization_utils_base import PreTrainedTokenizerBase
# from typing import Any, Dict, List, Union, Tuple
# import torch

# @dataclass
# class CustomDataCollatorForLanguageModeling:
#     tokenizer: PreTrainedTokenizerBase
#     mlm: bool = False
#     mlm_probability: float = 0.15

#     def __call__(self, examples: List[Dict[str, Union[List[int], Any]]]) -> Dict[str, torch.Tensor]:
#         batch = self.tokenizer.pad(examples, return_tensors="pt")
        
#         if "label" in batch:
#             batch["labels"] = batch["label"]
#             del batch["label"]
#         elif "labels" not in batch:
#             batch["labels"] = batch["input_ids"].clone()

#         if self.mlm:
#             batch["input_ids"], batch["labels"] = self.mask_tokens(batch["input_ids"])

#         return batch

#     def mask_tokens(self, inputs: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
#         # Implement masking logic here if needed
#         return inputs, inputs.clone()

# data_collator = CustomDataCollatorForLanguageModeling(tokenizer=tokenizer)

In [13]:
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=tokenized_dataset,
#     data_collator=data_collator,
# )

In [14]:
# trainer.train()

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss
10,9.0713
20,6.0779
30,0.9622
40,0.4114
50,0.2675
60,0.2303
70,0.1809
80,0.1868
90,0.1607
100,0.1502


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


TrainOutput(global_step=108, training_loss=1.6492347695209362, metrics={'train_runtime': 209.9344, 'train_samples_per_second': 8.345, 'train_steps_per_second': 0.514, 'total_flos': 3.778252262866944e+16, 'train_loss': 1.6492347695209362, 'epoch': 2.958904109589041})

In [15]:
# model.save_pretrained("./fine_tuned_mistral")

In [16]:
# from transformers import AutoTokenizer, AutoModelForCausalLM
# from peft import PeftModel, PeftConfig

# # Load the fine-tuned model configuration
# config = PeftConfig.from_pretrained("./fine_tuned_mistral")

# # Load the tokenizer and set the pad token
# tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
# tokenizer.pad_token = tokenizer.eos_token

# # Load the model
# model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path)
# model = PeftModel.from_pretrained(model, "./fine_tuned_mistral")

# # Set the pad token ID in the model config
# model.config.pad_token_id = tokenizer.pad_token_id

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [20]:
# import torch

# def generate_response(query):
#     input_text = f"### Human: {query}\n\n### Assistant:"
#     inputs = tokenizer(input_text, return_tensors="pt")
    
#     # Move inputs to GPU if available
#     if torch.cuda.is_available():
#         inputs = {k: v.to('cuda') for k, v in inputs.items()}
#         model.to('cuda')
    
#     try:
#         with torch.no_grad():
#             outputs = model.generate(
#                 **inputs,
#                 max_length=200,
#                 num_return_sequences=1,
#                 do_sample=True,
#                 temperature=0.7,
#                 top_p=0.9
#             )
#         return tokenizer.decode(outputs[0], skip_special_tokens=True)
#     except Exception as e:
#         return f"An error occurred: {str(e)}"

# # Example usage
# query = "What are the key learning outcomes of ECE 8803 HOS?"
# response = generate_response(query)
# print(response)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


### Human: What are the key learning outcomes of ECE 8803 HOS?

### Assistant: After completing ECE 8803 HOS, students will be able to:
1. Describe the fundamental concepts of heterogeneous computing systems, including multicore, GPU, and FPGA architectures.
2. Design and implement high-performance heterogeneous computing systems using CUDA, OpenCL, or other relevant programming frameworks.
3. Analyze and optimize performance and energy efficiency of heterogeneous computing systems using profiling and debugging tools.
4. Evaluate the trade-offs between performance, energy efficiency, and power consumption in heterogeneous computing systems.




















































In [123]:
print(response)

In [125]:
print(model)