<a href="https://colab.research.google.com/github/aswinaus/Quantization/blob/main/Load_Frozen_Model_SemanticIndex_Results.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Summary**: The code in this notebook loads a large language model, moves it to the available runtime device — `local_model.to(device)` moves all of the model's parameters and buffers to the specified device — and fine-tunes it for a specific task (likely related to income tax statistics, given the dataset) to improve its performance on that task.

In [None]:
# Install the notebook's dependencies; transformers is installed from the GitHub repository, the others by package name (no versions pinned).
!pip install git+https://github.com/huggingface/transformers torch accelerate langchain langchain_huggingface datasets

The next cell forces Python to always report "UTF-8" as the preferred encoding, regardless of the user's actual system settings. UTF-8 is a widely used encoding that can represent a vast range of characters from different languages. Enforcing UTF-8 helps ensure that the code works consistently across different platforms and avoids encoding-related errors. It is a common practice for improving compatibility and preventing issues with text handling in Python programs.

In [None]:
import locale


def _utf8_preferred_encoding(do_setlocale=True):
    """Report "UTF-8" as the preferred encoding, whatever the system locale says.

    Args:
        do_setlocale: Accepted and ignored. It mirrors the signature of the
            standard ``locale.getpreferredencoding(do_setlocale=True)`` so that
            callers passing the argument (e.g. ``getpreferredencoding(False)``)
            keep working after the override.

    Returns:
        The string ``"UTF-8"``.
    """
    return "UTF-8"


# Replace the stdlib function for the rest of the session.
locale.getpreferredencoding = _utf8_preferred_encoding

In [None]:
# Read the Hugging Face access token from Colab user data (stored under the key 'HUGGING_FACE_TOKEN').
from google.colab import userdata
HUGGING_FACE_TOKEN = userdata.get('HUGGING_FACE_TOKEN')

In [None]:
# Log in to Hugging Face through its CLI; `$HUGGING_FACE_TOKEN` is expanded from the Python variable of the same name.
!huggingface-cli login --token $HUGGING_FACE_TOKEN

In [None]:
from google.colab import drive
drive.mount('/content/drive')
# Root of the mounted Google Drive; a later cell builds the model path from it.
data_dir = '/content/drive/MyDrive'

In [None]:
# Import libraries
from transformers import AutoModelForCausalLM, AutoTokenizer
import transformers
import torch
from langchain_huggingface import HuggingFacePipeline
from langchain_core.output_parsers import StrOutputParser
from langchain.prompts import PromptTemplate
from threading import Thread

The nvidia-smi command is a utility provided by NVIDIA to query and display information about your NVIDIA GPU(s) (Graphics Processing Unit). This includes things like:

- GPU model and name
- Driver version
- GPU utilization
- Memory usage
- Temperature
- Power consumption
- Processes running on the GPU

In [None]:
# Print the status of the NVIDIA GPU(s) attached to this runtime.
!nvidia-smi

In [None]:
import textwrap

def wrap_text(text, width=90): #preserve_newlines
    """Wrap ``text`` to at most ``width`` columns while keeping its line breaks.

    The text is split on newline characters, every resulting line is wrapped
    on its own with ``textwrap.fill``, and the pieces are joined back together
    with newlines.

    Args:
        text: The string to wrap.
        width: Maximum line width passed to ``textwrap.fill`` (default 90).

    Returns:
        The wrapped string.
    """
    return '\n'.join(
        textwrap.fill(segment, width=width) for segment in text.split('\n')
    )

In [None]:
# Install autoawq before importing `awq` below (no version pinned).
!pip install autoawq
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

In [7]:
from typing import Tuple, Optional, Union, Dict, Any
from transformers import PreTrainedModel, AutoModel, AutoTokenizer, AutoConfig
from transformers.tokenization_utils_base import PreTrainedTokenizerBase

In [8]:
# Mount Google Drive again (an earlier cell already does this; the recorded output reports the drive as already mounted).
from google.colab import drive
drive.mount('/content/drive')
data_dir = '/content/drive/MyDrive' # Input a data dir path from your mounted Google Drive

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Upgrade autoawq and transformers to the newest available releases (no versions pinned).
!pip install --upgrade autoawq transformers

In [10]:
# Directory on the mounted Drive that holds the Mistral-Small-24B-Instruct-2501 checkpoint.
# `data_dir` is already an absolute path, so no extra leading "/" is prepended
# (the earlier f"/{data_dir}/..." form produced "//content/drive/...").
quant_path = f"{data_dir}/LLMs/Mistral/Mistral-Small-24B-Instruct-2501"

In [11]:
# Load the tokenizer and the model from the same checkpoint directory.
# NOTE(review): `local_model_path` is not referenced by any cell visible in this notebook.
local_model_path = quant_path
local_tokenizer = AutoTokenizer.from_pretrained(quant_path)
# NOTE(review): the recorded output warns that the AWQ model was loaded on CPU although a
# CUDA/XPU device is available, and suggests `dtype=torch.float16`; a later cell calls
# `local_model.to(device)`.
local_model = AutoAWQForCausalLM.from_pretrained(quant_path, low_cpu_mem_usage=True)

`torch_dtype` is deprecated! Use `dtype` instead!
You have loaded an AWQ model on CPU and have a CUDA/XPU device available, make sure to set your model on a GPU device in order to run your model.
We suggest you to set `dtype=torch.float16` for better efficiency on CUDA/XPU with AWQ.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [16]:
import torch
# Prefer the GPU when CUDA is available; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
# Last expression of the cell, so its return value is shown as the cell output.
local_model.to(device)

MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(131072, 5120)
    (layers): ModuleList(
      (0-39): 40 x MistralDecoderLayer(
        (self_attn): MistralAttention(
          (q_proj): WQLinear_GEMM(in_features=5120, out_features=4096, bias=False, w_bit=4, group_size=128)
          (k_proj): WQLinear_GEMM(in_features=5120, out_features=1024, bias=False, w_bit=4, group_size=128)
          (v_proj): WQLinear_GEMM(in_features=5120, out_features=1024, bias=False, w_bit=4, group_size=128)
          (o_proj): WQLinear_GEMM(in_features=4096, out_features=5120, bias=False, w_bit=4, group_size=128)
        )
        (mlp): MistralMLP(
          (gate_proj): WQLinear_GEMM(in_features=5120, out_features=32768, bias=False, w_bit=4, group_size=128)
          (up_proj): WQLinear_GEMM(in_features=5120, out_features=32768, bias=False, w_bit=4, group_size=128)
          (down_proj): WQLinear_GEMM(in_features=32768, out_features=5120, bias=False, w_bit=4, group_size=128)
   

In [13]:
import requests
import json
from google.colab import userdata

# Microsoft Graph access token, read from Colab user data (stored under the key 'GRAPH_TOKEN').
GRAPH_TOKEN = userdata.get('GRAPH_TOKEN')
access_token = GRAPH_TOKEN

url = "https://graph.microsoft.com/beta/copilot/retrieval"

headers = {
  "Authorization": f"Bearer {access_token}",
  "Content-Type": "application/json"
}

# Query sent to the retrieval endpoint; a later cell reads "queryString" back from this dict.
request_body = {
  "queryString": "Please get me information about how many EYI MyDocs workspaces contains document about Netherlands workspace",
  "dataSource": "sharePoint",
  "resourceMetadata": [
    "title",
    "author"
  ],
  "maximumNumberOfResults": "10"
}

# Without a timeout, requests waits indefinitely for the server and the cell can hang.
# With one, a stalled call raises requests.exceptions.Timeout instead.
REQUEST_TIMEOUT_SECONDS = 60

response = requests.post(
  url,
  headers=headers,
  data=json.dumps(request_body),
  timeout=REQUEST_TIMEOUT_SECONDS,
)

if response.status_code == 200:
  # `data` is only assigned on success; later cells that read it depend on this branch having run.
  data = response.json()
  print("API Call Successful:")
  print(json.dumps(data, indent=2))
else:
  print(f"API Call Failed with status code: {response.status_code}")
  # Uncomment to inspect the error body returned by the service.
  #print(response.text)

API Call Successful:
{
  "retrievalHits": [
    {
      "webUrl": "https://eygb.sharepoint.com/sites/EYIMyDocsAdoptionToolkit/Shared Documents/General/Business Enablement Community Briefings/Final_EYI MyDocs Global Business Enablement Briefing July 2022.pptx",
      "extracts": [
        {
          "text": "\r\n# EYI MyDocs Governance v2.0\r\n<slide_1>\r\n\r\n# EYI MyDocs Global Enablement Community\r\nBi\\-Monthly Briefing  \r\nJuly 2022\r\n</slide_1>\r\n<slide_2>\r\n\r\n## Agenda\r\n- Welcome\\!\r\n- Adoption \u2013 Sustain \u2013   \r\nFY23 Goals & Objectives  \r\nKnowledge & Learning  \r\nElevating the voice of the business  \r\nCommunity & Communications  \r\nQuality Review\r\n- Adoption \u2013 Extend \u2013   \r\nQuick Updates\r\n- Key Take\\-Aways\r\n</slide_2>\r\n<slide_3>\r\n  \r\n1. Welcome\\!\r\n</slide_3>\r\n<slide_4>\r\n\r\n## EYI MyDocs Global Enablement Community\r\n  \r\nGlobal Product Leadership Team  \r\nExtended Global Leadership Team  \r\nTax Quality Leads  \r\nExe

In [14]:
def _relevance_of(hit):
  """Sort key for a retrieval hit: its "relevanceScore", or 0 when the key is absent."""
  return hit.get("relevanceScore", 0)


if response.status_code != 200:
  print(f"API Call Failed with status code: {response.status_code}")
  #print(response.text)
else:
  data = response.json()
  print("API Call Successful:")

  # Order the hits from highest to lowest relevance score.
  if "retrievalHits" in data:
    reranked_hits = list(data["retrievalHits"])
    reranked_hits.sort(key=_relevance_of, reverse=True)
    data["retrievalHits"] = reranked_hits
    print("Results reranked by relevance score.")

  print(json.dumps(data, indent=2))

API Call Successful:
Results reranked by relevance score.
{
  "retrievalHits": [
    {
      "webUrl": "https://eygb.sharepoint.com/sites/EYIMyDocsAdoptionToolkit/Shared Documents/General/Business Enablement Community Briefings/Final_EYI MyDocs Global Business Enablement Briefing July 2022.pptx",
      "extracts": [
        {
          "text": "\r\n# EYI MyDocs Governance v2.0\r\n<slide_1>\r\n\r\n# EYI MyDocs Global Enablement Community\r\nBi\\-Monthly Briefing  \r\nJuly 2022\r\n</slide_1>\r\n<slide_2>\r\n\r\n## Agenda\r\n- Welcome\\!\r\n- Adoption \u2013 Sustain \u2013   \r\nFY23 Goals & Objectives  \r\nKnowledge & Learning  \r\nElevating the voice of the business  \r\nCommunity & Communications  \r\nQuality Review\r\n- Adoption \u2013 Extend \u2013   \r\nQuick Updates\r\n- Key Take\\-Aways\r\n</slide_2>\r\n<slide_3>\r\n  \r\n1. Welcome\\!\r\n</slide_3>\r\n<slide_4>\r\n\r\n## EYI MyDocs Global Enablement Community\r\n  \r\nGlobal Product Leadership Team  \r\nExtended Global Leadership

`local_model.to(device)` moves all of the model's parameters and buffers to the specified device (here `device`, which is set to `'cuda'` if a GPU is available and to `'cpu'` otherwise). Deep learning models often have a large number of parameters and require significant computational power. GPUs are designed for parallel processing and can significantly speed up the training and inference of deep learning models. By moving the model to the GPU, you leverage its computational capabilities for faster execution.

In [None]:
# Same device selection as the earlier cell: use CUDA when available, otherwise the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
# Move the model to the selected device; as the last expression, the result is displayed.
local_model.to(device)

In [None]:
from datasets import load_dataset
import pandas as pd
# Fetch the dataset, forcing a fresh download rather than reusing a cached copy.
dataset = load_dataset(
    "aswinaus/tax_statistics_dataset_by_income_range",
    download_mode="force_redownload",
)
# Build a pandas DataFrame from the 'train' split and preview its first 10 rows.
df = pd.DataFrame(dataset['train'])
df.head(10)

In [None]:
from transformers import DataCollatorForLanguageModeling, TrainingArguments, Trainer

# NOTE(review): the recorded model printout shows 4-bit WQLinear_GEMM layers, i.e. `local_model`
# is AWQ-quantized. Confirm that it can be trained with `Trainer` as-is before relying on this cell.

# 1. Data Preparation (Example - adjust to your specific task)
def preprocess_function(examples):
    """Tokenize the ``income_range`` column of a batch of dataset rows.

    Args:
        examples: A batch from ``Dataset.map(..., batched=True)``: a mapping from
            column name to a list of values. Assumes a text column named
            ``income_range`` exists — TODO confirm against the dataset schema.

    Returns:
        The tokenizer output, padded/truncated to 128 tokens per row.
    """
    # No `return_tensors` here: the data collator builds the tensors for each batch.
    inputs = local_tokenizer(examples["income_range"], max_length=128, truncation=True,
                             padding="max_length")
    # ... further processing (e.g., converting tax statistics to numerical labels)
    return inputs

# padding="max_length" needs a pad token; fall back to EOS when the tokenizer defines none
# (the generation cell applies the same pad-as-EOS convention).
if local_tokenizer.pad_token is None:
    local_tokenizer.pad_token = local_tokenizer.eos_token

# Tokenize the Hugging Face dataset itself. `df` is only a pandas preview of the train split:
# it has no batch-wise `.map` and no "train"/"validation" splits, so it cannot feed the Trainer.
tokenized_dataset = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset["train"].column_names,  # keep only the tokenizer outputs
)

train_dataset = tokenized_dataset["train"]
eval_dataset = tokenized_dataset.get("validation")  # None when the dataset has no such split

# Causal-LM collator (mlm=False): derives `labels` from `input_ids` so a loss can be computed.
data_collator = DataCollatorForLanguageModeling(tokenizer=local_tokenizer, mlm=False)

# 2. Model Adaptation (Fine-tuning example)
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,  # Adjust as needed
    # ... other training arguments
)

trainer = Trainer(
    model=local_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    # ... compute_metrics if needed
)

# 3. Training
trainer.train()

# 4. Evaluation (example) - only possible when a validation split exists
if eval_dataset is not None:
    eval_results = trainer.evaluate()
    print(eval_results)
else:
    eval_results = None
    print("No 'validation' split found; skipping evaluation.")

In [None]:
# 2. Extract and Format Results
retrieval_text = ""
if "retrievalHits" in data:
    for hit in data["retrievalHits"]:
        if "extracts" in hit:
            for extract in hit["extracts"]:
                retrieval_text += extract.get("text", "") + "\n\n"

print("Extracted text from retrieval hits:")
print(wrap_text(retrieval_text[:500] + "...")) # Print a snippet to avoid flooding the output

In [None]:
# 3. Prepare Prompt
query_string = request_body.get("queryString", "Information") # Get the original query

prompt_template = PromptTemplate.from_template(
    """Use the following information to answer the query:

{retrieval_info}

Query: {query}

Response:"""
)

prompt = prompt_template.format(retrieval_info=retrieval_text, query=query_string)

print("\nGenerated Prompt:")
print(wrap_text(prompt[:500] + "...")) # Print a snippet

In [None]:
# 4. Generate Response
# Configure the pipeline for text generation
pipe = transformers.pipeline(
    "text-generation",
    model=local_model,
    tokenizer=local_tokenizer,
    max_new_tokens=512, # Adjust as needed
    do_sample=True,
    temperature=0.7, # Adjust as needed
    top_p=0.95,     # Adjust as needed
    no_repeat_ngram_size=2,
    return_full_text=False, # Only return the generated text, not the prompt
    pad_token_id=local_tokenizer.eos_token_id # Set pad_token_id to eos_token_id
)

# Create a HuggingFacePipeline object
llm = HuggingFacePipeline(pipeline=pipe)

# Generate the response
response = llm.invoke(prompt)

# 5. Display Response
print("\nGenerated Response:")
print(wrap_text(response))