# Install Libraries

In [None]:
!pip install transformers datasets peft bitsandbytes sentence-transformers faiss-cpu unsloth langchain-community langchain-huggingface

Collecting datasets
  Downloading datasets-3.5.1-py3-none-any.whl.metadata (19 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Collecting unsloth
  Downloading unsloth-2025.4.7-py3-none-any.whl.metadata (46 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.8/46.8 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain-community
  Downloading langchain_community-0.3.23-py3-none-any.whl.metadata (2.5 kB)
Collecting langchain-huggingface
  Downloading langchain_huggingface-0.1.2-py3-none-any.whl.metadata (1.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.

In [None]:
!pip install -U "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install -U transformers datasets accelerate peft triton

Collecting unsloth@ git+https://github.com/unslothai/unsloth.git (from unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Cloning https://github.com/unslothai/unsloth.git to /tmp/pip-install-g1vnfspk/unsloth_b2add494879b4a56a7328fec0b3be3ff
  Running command git clone --filter=blob:none --quiet https://github.com/unslothai/unsloth.git /tmp/pip-install-g1vnfspk/unsloth_b2add494879b4a56a7328fec0b3be3ff
  Resolved https://github.com/unslothai/unsloth.git to commit 9390bd528d4126840b142d5c354b8c1d7461f41e
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: unsloth
  Building wheel for unsloth (pyproject.toml) ... [?25l[?25hdone
  Created wheel for unsloth: filename=unsloth-2025.4.8-py3-none-any.whl size=263211 sha256=de9c0197941ef53b4ecfc70a64e164032845295fd6d90a5d82426b870c33de0f
  Stored in directory: /tmp/pip-ep

In [None]:
# Compiler workarounds: run this cell before any training code.
import torch
import triton

# Put an `allow_fp8 = False` attribute on the triton.Config class.
# NOTE(review): confirm Triton actually reads this attribute; if it does not,
# the assignment has no effect.
setattr(triton.Config, "allow_fp8", False)

# Turn on TorchDynamo's `suppress_errors` config flag.
torch._dynamo.config.suppress_errors = True

In [None]:
# Replace the installed Triton with a pinned 2.2.0 build.
# NOTE(review): the pip output of this cell reports that torch 2.7.0 requires
# triton==3.3.0 and unsloth-zoo requires triton>=3.0.0, so this pin leaves the
# environment with conflicting requirements — confirm the downgrade is still needed.
!pip uninstall -y triton
!pip install triton==2.2.0  # Known stable version

Found existing installation: triton 3.3.0
Uninstalling triton-3.3.0:
  Successfully uninstalled triton-3.3.0
Collecting triton==2.2.0
  Downloading triton-2.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.4 kB)
Downloading triton-2.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (167.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m167.9/167.9 MB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: triton
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torch 2.7.0 requires triton==3.3.0; platform_system == "Linux" and platform_machine == "x86_64", but you have triton 2.2.0 which is incompatible.
unsloth-zoo 2025.4.4 requires triton>=3.0.0; platform_system == "Linux", but you have triton 2.2.0 which is incompatible.
torchaudio 2.6.0+cu124 requires torch==2.6.0, but you h

In [None]:
# Triton diagnostics switches; meant to be set at the very start of the session.
import os

os.environ.update(
    {
        "TRITON_DEBUG": "1",
        "TRITON_DISABLE_LINE_INFO": "1",
    }
)

# Importing Libraries

In [None]:
from unsloth import FastLanguageModel
import torch
from unsloth import FastLanguageModel
from datasets import load_dataset
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth.chat_templates import get_chat_template, standardize_sharegpt

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


# Loading the Llama-3-8B model for understanding emoji contexts and fine-tuning it on personal data

In [None]:
model, tokenizer = FastLanguageModel.from_pretrained(
    # model_name = "unsloth/Llama-3.2-3B-Instruct",
    model_name = "unsloth/llama-3-8b-bnb-4bit",
    max_seq_length=4096,
    load_in_4bit=True
 )

In [None]:
model = FastLanguageModel.get_peft_model(
    model, r=16,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
)

In [None]:
tokenizer = get_chat_template(tokenizer, chat_template="llama-3.1")

In [None]:
dataset = load_dataset("json", data_files="/content/pg_chat_combined.jsonl", split="train")

In [None]:
dataset = standardize_sharegpt(dataset)

In [None]:
# Add a "text" column: each conversation in "messages" is rendered to a single
# string with the tokenizer's chat template (no tokenization at this stage).
dataset = dataset.map(
    lambda batch: {
        "text": list(
            map(
                lambda conversation: tokenizer.apply_chat_template(conversation, tokenize=False),
                batch["messages"],
            )
        )
    },
    batched=True,
)

# Trainer - 50 Epochs

In [None]:
# Mixed precision: bf16 when the GPU supports it, fp16 otherwise.
bf16_ok = torch.cuda.is_bf16_supported()

# Optimisation hyper-parameters. Effective batch size per device is
# per_device_train_batch_size * gradient_accumulation_steps = 8.
training_args = TrainingArguments(
    output_dir = "outputs",
    seed = 3407,
    num_train_epochs=50,
    per_device_train_batch_size = 2,
    gradient_accumulation_steps = 4,
    learning_rate = 2e-4,
    lr_scheduler_type = "linear",
    warmup_steps = 5,
    weight_decay = 0.01,
    optim = "adamw_8bit",
    fp16=not bf16_ok,
    bf16=bf16_ok,
    logging_steps = 1,
)

# Supervised fine-tuning on the pre-rendered "text" column, without sequence packing.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = 4096,
    dataset_num_proc = 2,
    packing = False,
    args = training_args,
)

In [None]:
torch.cuda.empty_cache()

In [None]:
trainer.train()

In [None]:
from google.colab import userdata

# Pushed into Hugging Face repo

In [None]:
# Publish the fine-tuned model and tokenizer to the Hugging Face Hub.
# The access token is read from Colab's secret store (add a secret named
# "HF_TOKEN" in the Colab "Secrets" panel and grant this notebook access),
# so the credential never appears in the notebook source.
# NOTE(review): any token that was ever saved in plain text in this cell
# should be revoked on the Hub and replaced.
from google.colab import userdata

model.push_to_hub(
    "MPTarun/llama_aac_model_8b_test",
    tokenizer,
    token = userdata.get("HF_TOKEN"),
)

# Loading from Hugging Face Repo

In [None]:
from unsloth import FastLanguageModel
inference_model, inference_tokenizer = FastLanguageModel.from_pretrained(
    model_name="MPTarun/llama_aac_model_8b_test",
    max_seq_length=4096,
    load_in_4bit=True
)

==((====))==  Unsloth 2025.4.8: Fast Llama patching. Transformers: 4.51.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 2.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.30. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/198 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/168M [00:00<?, ?B/s]

Unsloth 2025.4.8 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


# Using Alpaca prompt template

In [None]:
# Alpaca-style template with three positional slots, filled via
# alpaca_prompt.format(instruction, input, response).
alpaca_prompt = (
    "Below is an instruction that describes a task, paired with an input that provides further context. "
    "Write a response that appropriately completes the request.\n"
    "\n"
    "### Instruction:\n"
    "{}\n"
    "\n"
    "### Input:\n"
    "{}\n"
    "\n"
    "### Response:\n"
    "{}"
)


# Testing Finetuned model (8B)

In [None]:
from transformers import TextStreamer

FastLanguageModel.for_inference(inference_model)  # Enable native 2× faster inference

# Fill the Alpaca template: instruction, input, and an empty response slot
# for the model to complete.
inputs = inference_tokenizer(
    [
        alpaca_prompt.format(
            "Continue the fibonnaci sequence.",
            "1, 1, 2, 3, 5, 8",
            "",
        )
    ],
    return_tensors = "pt",
).to("cuda")

# Stream tokens to the cell output while they are generated.
text_streamer = TextStreamer(inference_tokenizer)

# Keep the result under a descriptive name: `_` is IPython's "last output"
# variable and should not be reused as a working variable in a notebook.
generated_ids = inference_model.generate(**inputs, streamer = text_streamer, max_new_tokens = 64)
raw = inference_tokenizer.decode(generated_ids[0], skip_special_tokens=True)

<|begin_of_text|>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Continue the fibonnaci sequence.

### Input:
1, 1, 2, 3, 5, 8

### Response:
13<|end_of_text|>


# Loading Emoji dataset

In [None]:
import pandas as pd

df = pd.read_csv('/content/full_emoji.csv')
print(df.head())

  emoji                             name
0     😀                    grinning face
1     😃      grinning face with big eyes
2     😄  grinning face with smiling eyes
3     😁   beaming face with smiling eyes
4     😆          grinning squinting face


# Function to get Emoji's meaning

In [None]:
def get_name_filter(df, emoji):
    """Look up an emoji's name by filtering the DataFrame.

    Args:
        df: DataFrame with an 'emoji' column and a 'name' column.
        emoji: Emoji string to search for (exact match on the 'emoji' column).

    Returns:
        The 'name' value of the first matching row, or None when no row matches.
    """
    hits = df.loc[df['emoji'] == emoji, 'name']
    if hits.empty:
        return None
    return hits.iloc[0]


# Emoji -> name lookup table built once from the DataFrame columns.
# If an emoji appears more than once, the last row's name is the one kept.
emoji_to_name = {symbol: label for symbol, label in zip(df['emoji'], df['name'])}


def get_name_dict(emoji):
    """Return the name stored for `emoji` in `emoji_to_name`, or None if absent."""
    return emoji_to_name.get(emoji)


print(get_name_dict('🍟'))

french fries


In [None]:
print(get_name_dict('👎'))

thumbs down


# Few examples with Emojis

In [None]:
# Example: answer a weather question with a rain emoji as context.
FastLanguageModel.for_inference(inference_model)  # Enable native 2× faster inference

emoji = '🌧️'
meaning = get_name_dict(emoji) or "unknown"  # fall back when the emoji is not in the lookup table
instruction = f"Use this emoji as context and answer: {emoji} meaning '{meaning}'. And keep the chat conversational"

# Fill the Alpaca template positionally: instruction, input, empty response slot.
prompt_text = alpaca_prompt.format(instruction, "How is the weather outside?", "")
inputs = inference_tokenizer([prompt_text], return_tensors="pt").to("cuda")

# Deterministic decoding (sampling disabled), no streaming.
output_ids = inference_model.generate(
    **inputs,
    max_new_tokens=64,
    do_sample=False,
    eos_token_id=inference_tokenizer.eos_token_id,
    pad_token_id=inference_tokenizer.pad_token_id,
)

# Decode the whole output sequence, then keep only the text after the first
# "### Response:" marker, cut at any further template header.
raw = inference_tokenizer.decode(output_ids[0], skip_special_tokens=True)
prompt_part, marker, tail = raw.partition("### Response:")
if marker:
    answer = tail.partition("### Instruction:")[0].partition("### Input:")[0].strip()
else:
    answer = raw.strip()

print("Response:", answer)


Response: It's raining! Nice and cozy inside, though. :)


In [None]:
# Example: answer "where are you going" with a weight-lifting emoji as context.
FastLanguageModel.for_inference(inference_model)  # Enable native 2× faster inference

emoji = '🏋️‍♂️'
meaning = get_name_dict(emoji) or "unknown"  # fall back when the emoji is not in the lookup table
instruction = f"Use this emoji as context and answer: {emoji} meaning '{meaning}'. And keep the chat conversational"

# Fill the Alpaca template positionally: instruction, input, empty response slot.
prompt_text = alpaca_prompt.format(instruction, "Where are you going this evening?", "")
inputs = inference_tokenizer([prompt_text], return_tensors="pt").to("cuda")

# Deterministic decoding (sampling disabled), no streaming.
output_ids = inference_model.generate(
    **inputs,
    max_new_tokens=64,
    do_sample=False,
    eos_token_id=inference_tokenizer.eos_token_id,
    pad_token_id=inference_tokenizer.pad_token_id,
)

# Decode the whole output sequence, then keep only the text after the first
# "### Response:" marker, cut at any further template header.
raw = inference_tokenizer.decode(output_ids[0], skip_special_tokens=True)
prompt_part, marker, tail = raw.partition("### Response:")
if marker:
    answer = tail.partition("### Instruction:")[0].partition("### Input:")[0].strip()
else:
    answer = raw.strip()

print("Response:", answer)


Response: I'm going to the gym for a workout. Any plans for the night?


In [None]:
# Example: answer a food question with a taco emoji as context.
FastLanguageModel.for_inference(inference_model)  # Enable native 2× faster inference

emoji = '🌮'
meaning = get_name_dict(emoji) or "unknown"  # fall back when the emoji is not in the lookup table
instruction = f"Use this emoji as context and answer: {emoji} meaning '{meaning}'. And keep the chat conversational"

# Fill the Alpaca template positionally: instruction, input, empty response slot.
prompt_text = alpaca_prompt.format(instruction, "What do you like to eat?", "")
inputs = inference_tokenizer([prompt_text], return_tensors="pt").to("cuda")

# Deterministic decoding (sampling disabled), no streaming.
output_ids = inference_model.generate(
    **inputs,
    max_new_tokens=64,
    do_sample=False,
    eos_token_id=inference_tokenizer.eos_token_id,
    pad_token_id=inference_tokenizer.pad_token_id,
)

# Decode the whole output sequence, then keep only the text after the first
# "### Response:" marker, cut at any further template header.
raw = inference_tokenizer.decode(output_ids[0], skip_special_tokens=True)
prompt_part, marker, tail = raw.partition("### Response:")
if marker:
    answer = tail.partition("### Instruction:")[0].partition("### Input:")[0].strip()
else:
    answer = raw.strip()

print("Response:", answer)


Response: I love tacos! They're so delicious and easy to make. What's your go-to taco recipe?


# Creating a Function to call in the UI

In [None]:
FastLanguageModel.for_inference(inference_model)  # Enable native 2× faster inference

def emoji_suggestion(emoji, question):
    """Generate a conversational answer to `question`, using `emoji` as context.

    The emoji's name is looked up with `get_name_dict` ("unknown" when it is not
    found), placed into the instruction slot of `alpaca_prompt`, and the
    fine-tuned model samples up to 64 new tokens.

    Args:
        emoji: Emoji string supplied as context for the answer.
        question: The question to answer; goes into the template's input slot.

    Returns:
        The text following the first "### Response:" marker of the decoded
        output (cut at any later "### Instruction:" / "### Input:" header), or
        the whole decoded text when the marker is missing. The same text is
        also printed, prefixed with "Response:", so notebook cells keep showing it.
    """
    meaning = get_name_dict(emoji) or "unknown"

    instruction = f"Use this emoji as context and answer: {emoji} meaning '{meaning}'. Understand the sentiment and answer yes or no questions, if the sentiment is negative clearly give negative response. And keep the chat conversational and don't add anymore data that isn't explicitly asked in the question"

    # Fill the Alpaca template positionally: instruction, input, empty response slot.
    inputs = inference_tokenizer(
        [alpaca_prompt.format(instruction, question, "")],
        return_tensors="pt",
    ).to("cuda")

    # Sampling is enabled here, so repeated calls may give different answers.
    output_ids = inference_model.generate(
        **inputs,
        max_new_tokens=64,
        do_sample=True,
        eos_token_id=inference_tokenizer.eos_token_id,
        pad_token_id=inference_tokenizer.pad_token_id,
    )

    # Decode the whole output sequence, then isolate the first response block.
    raw = inference_tokenizer.decode(output_ids[0], skip_special_tokens=True)
    if "### Response:" in raw:
        after = raw.split("### Response:", 1)[1]
        answer = after.split("### Instruction:", 1)[0] \
                      .split("### Input:", 1)[0] \
                      .strip()
    else:
        answer = raw.strip()

    print("Response:", answer)
    # Return the answer as well so a UI callback can use it directly.
    return answer


# Examples of Function calls

In [None]:
emoji_suggestion(emoji='🏋️‍♂️', question="Where are you going this evening?")

Response: I'm going to the gym for a workout. Where are you going?


In [None]:
emoji_suggestion(emoji='🇦🇺', question="where did you go on a trip?")

Response: I went to Australia on a trip. It was amazing! I highly recommend it to anyone who's thinking about going.


In [None]:
emoji_suggestion(emoji='👎', question="Did you like the movie you watched last week?")

Response: I didn't like it. It was boring and predictable. I prefer movies with interesting stories and surprising twists.


In [None]:
emoji_suggestion(emoji='🕔', question="what is the time now?")

Response: It's 5:00 PM in Buffalo, and I'm ready for a relaxing evening. Any plans for the rest of the day?


# Creating Complete chain

In [None]:
from unsloth import FastLanguageModel
from pydantic import BaseModel
from typing import Any, List, Optional
import torch

from langchain.llms.base import LLM
from langchain import PromptTemplate
from langchain.chains import ConversationChain
from langchain.memory import ConversationBufferMemory

# 1) Load the merged 4-bit model + tokenizer
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name     = "MPTarun/llama_aac_model",
    max_seq_length = 4096,
    dtype          = None,
    load_in_4bit   = True,
)

# 2) Define a LangChain-compatible LLM wrapper
class FastLLM(LLM, BaseModel):
    """LangChain `LLM` wrapper around an already-loaded causal LM and its tokenizer.

    Attributes are declared as fields so the wrapper can be built with
    `FastLLM(model=..., tokenizer=...)`.
    """

    model: Any
    tokenizer: Any
    max_seq_length: int = 4096
    _identifying_params: dict = {}

    class Config:
        arbitrary_types_allowed = True

    def __init__(self, **data: Any):
        super().__init__(**data)
        # Record the model's name/path (when its config exposes one) as the identifying parameter.
        self._identifying_params = {
            "model_name": getattr(self.model.config, "name_or_path", None)
        }

    def _call(
        self,
        prompt: str,
        stop: Optional[List[str]] = None,
        run_manager: Any = None,
        **kwargs: Any,
    ) -> str:
        """Sample a completion for `prompt` and return only the newly generated text.

        Args:
            prompt: Fully formatted prompt string.
            stop: Optional stop sequences; the completion is cut at the first
                occurrence of each one.
            run_manager: Accepted for compatibility with LangChain's `_call`
                signature; not used.
            **kwargs: Extra call-time arguments; accepted and ignored.

        Returns:
            The decoded continuation, truncated at the first role marker or
            stop sequence found. When nothing is truncated the text is returned
            as decoded (including any leading whitespace).
        """
        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
        input_len = inputs.input_ids.shape[-1]

        out = self.model.generate(
            **inputs,
            max_new_tokens=256,
            do_sample=True,
            temperature=0.7,
            repetition_penalty=1.2,
            eos_token_id=self.tokenizer.eos_token_id,
            pad_token_id=self.tokenizer.pad_token_id,
        )

        # Drop the prompt tokens so only the continuation is decoded.
        new_ids = out[0, input_len:]
        text = self.tokenizer.decode(new_ids, skip_special_tokens=True)

        # Role prefixes that signal the model has started writing another turn.
        # Checked in a fixed order; "Assistant:" is included because the prompt
        # template itself uses that capitalised prefix.
        turn_markers = (
            "User:", "user:",
            "Assistant:", "assistant:",
            "human:", "Human:",
            "AI:",
        )
        for marker in turn_markers:
            if marker in text:
                text = text.split(marker)[0].strip()

        # Honour caller-supplied stop sequences (empty strings are skipped).
        for stop_seq in stop or ():
            if stop_seq and stop_seq in text:
                text = text.split(stop_seq)[0].strip()

        return text

    @property
    def _llm_type(self) -> str:
        return "fast"

# 3) Instantiate the wrapper (one per conversation chain; all share the same model and tokenizer)
fast_llm1 = FastLLM(model=model, tokenizer=tokenizer)
fast_llm2 = FastLLM(model=model, tokenizer=tokenizer)
fast_llm3 = FastLLM(model=model, tokenizer=tokenizer)

# 4) Set up a prompt template
prompt = PromptTemplate.from_template(
    "The following is a conversation between a friend and Tarun.\n\n"
    "{history}\n"
    "User: {input}\n"
    "Assistant:"
)

# 5) Create a conversational memory buffer (last 20 messages)
# ConversationBufferMemory has no `k` setting, so the window has to come from
# ConversationBufferWindowMemory: k=10 keeps the last 10 exchanges (20 messages).
from langchain.memory import ConversationBufferWindowMemory


def _new_window_memory():
    """Build one windowed memory whose role prefixes match the prompt template."""
    return ConversationBufferWindowMemory(
        memory_key="history",
        return_messages=False,
        k=10,
        # Render past turns as "User:" / "Assistant:" so the history uses the
        # same role labels as the current turn in `prompt`.
        human_prefix="User",
        ai_prefix="Assistant",
    )


memory1 = _new_window_memory()
memory2 = _new_window_memory()
memory3 = _new_window_memory()

# 6) Build a ConversationChain that uses that buffer
conv_chain1 = ConversationChain(
    llm=fast_llm1,
    prompt=prompt,
    memory=memory1,
    verbose=True
)
conv_chain2 = ConversationChain(
    llm=fast_llm2,
    prompt=prompt,
    memory=memory2,
    verbose=True
)
conv_chain3 = ConversationChain(
    llm=fast_llm3,
    prompt=prompt,
    memory=memory3,
    verbose=True
)

# 7) Example conversation
response = conv_chain1.predict(input="Hi, I am Charan.")


==((====))==  Unsloth 2025.4.8: Fast Llama patching. Transformers: 4.51.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 2.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.30. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.35G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/54.7k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/97.3M [00:00<?, ?B/s]

Unsloth 2025.4.8 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.
  memory1 = ConversationBufferMemory(
  conv_chain1 = ConversationChain(




[1m> Entering new ConversationChain chain...[0m
Prompt after formatting:
[32;1m[1;3mThe following is a conversation between a friend and Tarun.


User: Hi, I am Charan.
Assistant:[0m

[1m> Finished chain.[0m


In [None]:
print(response)

 Nice to meet you. How are you?


In [None]:
# Continue on `conv_chain1`: no bare `conv_chain` is defined in the cells above,
# so the old name raises NameError on a fresh kernel.
print(conv_chain1.predict(input="Nice to meet you—how are you today?"))



[1m> Entering new ConversationChain chain...[0m
Prompt after formatting:
[32;1m[1;3mThe following is a conversation between a friend and Tarun.

Human: Hi, I am Charan.
AI:  Nice to meet you! Do you come from a big family?
User: Nice to meet you—how are you today?
Assistant:[0m

[1m> Finished chain.[0m
I'm doing great, thank you for asking. And yes, one younger brother, Varun and elder brother Ayush. Growing up with them was always fun.


In [None]:
# Continue on `conv_chain1`: no bare `conv_chain` is defined in the cells above,
# so the old name raises NameError on a fresh kernel.
print(conv_chain1.predict(input="Which university are you studying in?"))



[1m> Entering new ConversationChain chain...[0m
Prompt after formatting:
[32;1m[1;3mThe following is a conversation between a friend and Tarun.

Human: Hi, I am Charan.
AI:  Nice to meet you! Do you come from a big family?
Human: Nice to meet you—how are you today?
AI: I'm doing great, thank you for asking. And yes, one younger brother, Varun and elder brother Ayush. Growing up with them was always fun.
User: Which university are you studying in?
Assistant:[0m

[1m> Finished chain.[0m
 That's the University at Buffalo, where we have one of the best AI programs in the country. I'm majoring in Computer Science, with a focus on Artificial Intelligence.


In [None]:
# Continue on `conv_chain1`: no bare `conv_chain` is defined in the cells above,
# so the old name raises NameError on a fresh kernel.
print(conv_chain1.predict(input="How are your studies going?"))



[1m> Entering new ConversationChain chain...[0m
Prompt after formatting:
[32;1m[1;3mThe following is a conversation between a friend and Tarun.

Human: Hi, I am Charan.
AI:  Nice to meet you! Do you come from a big family?
Human: Nice to meet you—how are you today?
AI: I'm doing great, thank you for asking. And yes, one younger brother, Varun and elder brother Ayush. Growing up with them was always fun.
Human: Which university are you studying in?
AI:  That's the University at Buffalo, where we have one of the best AI programs in the country. I'm majoring in Computer Science, with a focus on Artificial Intelligence.
User: How are your studies going?
Assistant:[0m

[1m> Finished chain.[0m
Great! I wrapped up my first semester with a 3.83 GPA. The second one is in full swing, but I love the challenge. One more semester to go!


In [None]:
# Continue on `conv_chain1`: no bare `conv_chain` is defined in the cells above,
# so the old name raises NameError on a fresh kernel.
print(conv_chain1.predict(input="What is my name again?"))



[1m> Entering new ConversationChain chain...[0m
Prompt after formatting:
[32;1m[1;3mThe following is a conversation between a friend and Tarun.

Human: Hi, I am Charan.
AI:  Nice to meet you! Do you come from a big family?
Human: Nice to meet you—how are you today?
AI: I'm doing great, thank you for asking. And yes, one younger brother, Varun and elder brother Ayush. Growing up with them was always fun.
Human: Which university are you studying in?
AI:  That's the University at Buffalo, where we have one of the best AI programs in the country. I'm majoring in Computer Science, with a focus on Artificial Intelligence.
Human: How are your studies going?
AI: Great! I wrapped up my first semester with a 3.83 GPA. The second one is in full swing, but I love the challenge. One more semester to go!
User: What is my name again?
Assistant:[0m

[1m> Finished chain.[0m
Ah, it's Charan! A beautiful name. (Smiling)


# UI - Gradio (Finetuned Model + RAG + Emoji based context input + Sentence Completion)

In [None]:
!pip install gradio

Collecting gradio
  Downloading gradio-5.29.0-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<25.0,>=22.0 (from gradio)
  Downloading aiofiles-24.1.0-py3-none-any.whl.metadata (10 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.10.0 (from gradio)
  Downloading gradio_client-1.10.0-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.9.3 (from gradio)
  Downloading ruff-0.11.8-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)
Collecting safehttpx<0.2.0,>=0.1.6

In [None]:
# !pip install transformers datasets peft bitsandbytes sentence-transformers faiss-cpu unsloth gradio

import os, json, torch, gradio as gr
from unsloth import FastLanguageModel

# ===== Load Model =====
# Device used for tokenizer outputs: GPU when available, CPU otherwise.
device = "cuda" if torch.cuda.is_available() else "cpu"

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="MPTarun/llama_aac_model",
    max_seq_length=4096,
    load_in_4bit=True,
)
model.eval()

# ===== Chat History =====
chat_log_path = "chat_log.json"   # file the displayed history is written to
chat_log, model_log = [], []      # HTML lines for display / clean "Role: text" lines for prompts

# ===== Helper Functions =====
def format_chat(history):
    """Render the most recent chat lines as one HTML string.

    Only the last 20 entries of `history` are used; they are joined with two
    `<br>` tags and inserted as-is (no escaping happens here).
    """
    recent_lines = history[-20:]
    separator = "<br><br>"
    return separator.join(recent_lines)

def generate(prompt, max_new_tokens=60):
    """Sample one completion for `prompt` and return the decoded text.

    Args:
        prompt: Text fed to the tokenizer.
        max_new_tokens: Upper bound on generated tokens (default 60).

    Returns:
        The first output sequence decoded in full, with special tokens removed.
    """
    encoded = tokenizer(prompt, return_tensors="pt").to(device)
    sampling_options = dict(do_sample=True, temperature=0.8, top_k=50, top_p=0.95)
    with torch.inference_mode():
        sequences = model.generate(
            **encoded,
            max_new_tokens=max_new_tokens,
            pad_token_id=tokenizer.pad_token_id,
            **sampling_options,
        )
    first_sequence = sequences[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

def handle_tarun_input(tarun_text, history):
    """Record an incoming message and produce three candidate replies.

    Args:
        tarun_text: Text from the first textbox.
        history: List of HTML-formatted chat lines (mutated in place).

    Returns:
        A 5-tuple: (history, reply 1, reply 2, reply 3, rendered chat HTML).
        For blank input nothing is recorded and the three replies are "".
    """
    if tarun_text.strip() == "":
        return history, "", "", "", format_chat(history)

    history.append(f"<b>Tarun:</b> {tarun_text}")
    model_log.append(f"User: {tarun_text}")

    # Prompt = the last six clean log lines followed by an open assistant turn.
    recent_turns = model_log[-6:]
    prompt = "\n".join(recent_turns) + "\nAssistant:"

    def clean_response(text):
        # Keep what follows the last "Assistant:" (the whole text if absent),
        # then stop at the first "User:".
        after_assistant = text.rpartition("Assistant:")[2]
        return after_assistant.partition("User:")[0].strip()

    # Three independent sampled generations from the same prompt.
    replies = []
    for _attempt in range(3):
        replies.append(clean_response(generate(prompt)))

    first, second, third = replies
    model_log.append(f"Assistant: {first}")  # Only add 1st response to context

    return history, first, second, third, format_chat(history)

def handle_user_reply(reply, history):
    """Record the reply that was actually chosen or typed, and persist the chat.

    Args:
        reply: Reply text (a suggestion button's value or the free-text box).
        history: List of HTML-formatted chat lines (mutated in place).

    Returns:
        (history, rendered chat HTML). Blank replies change nothing.
    """
    if not reply.strip():
        return history, format_chat(history)

    history.append(f"<b>You:</b> {reply}")

    chosen_turn = f"Assistant: {reply}"
    # handle_tarun_input already stores its first suggestion as an
    # "Assistant:" line. Appending again would leave two consecutive assistant
    # turns in the prompt context, so the trailing one is replaced with the
    # reply that was really sent. Assumes one reply per incoming message.
    if model_log and model_log[-1].startswith("Assistant:"):
        model_log[-1] = chosen_turn
    else:
        model_log.append(chosen_turn)

    # Overwrite the log file with the full displayed history.
    with open(chat_log_path, "w") as f:
        json.dump(history, f)
    return history, format_chat(history)

def autocomplete_next_words(partial_text, history):
    """Suggest up to four continuations for the text typed so far.

    Args:
        partial_text: Current content of the free-text reply box.
        history: List of HTML-formatted chat lines; the last five give context.

    Returns:
        A list of exactly four strings. Each non-empty entry is only the newly
        generated continuation (never the typed text itself), so it can be
        appended to the input directly. Missing suggestions are "".
    """
    if not partial_text.strip():
        return ["", "", "", ""]

    # Rebuild a plain-text context from the HTML history lines.
    context = " ".join([
        line.replace("<b>Tarun:</b>", "User:").replace("<b>You:</b>", "Assistant:")
        for line in history[-5:]
    ])
    prompt = f"{context}\nUser: {partial_text}"

    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    prompt_len = inputs["input_ids"].shape[-1]
    with torch.inference_mode():
        outputs = model.generate(
            **inputs,
            max_new_tokens=30,
            num_return_sequences=4,
            do_sample=True,
            top_k=50,
            top_p=0.95,
            pad_token_id=tokenizer.pad_token_id
        )

    suggestions = []
    for sequence in outputs:
        # Every returned sequence starts with the prompt tokens; decode only
        # what was generated after them so the typed text is not echoed back.
        continuation = tokenizer.decode(sequence[prompt_len:], skip_special_tokens=True)
        # Stop where the model starts writing another turn.
        for marker in ("User:", "Assistant:", "<User"):
            continuation = continuation.split(marker)[0]
        continuation = continuation.strip()
        if continuation:
            suggestions.append(continuation)

    # Pad (or trim) to the four suggestion buttons.
    return (suggestions + ["", "", "", ""])[:4]

def apply_suggestion(suggestion, current_input):
    """Append `suggestion` to the stripped current input, separated by one space."""
    typed_so_far = current_input.strip()
    return " ".join((typed_so_far, suggestion))

# ===== Gradio Interface =====
# Layout (top to bottom): incoming-message row, chat transcript, three suggested
# replies, free-text reply box, four autocomplete buttons, submit button.
with gr.Blocks() as demo:
    gr.Markdown("## 🧠 Conversational AAC Assistant")
    # Session state holding the displayed chat lines; starts from `chat_log`.
    state = gr.State(chat_log)

    # Incoming message from the conversation partner.
    with gr.Row():
        tarun_input = gr.Textbox(label="Charan says...")
        send_btn = gr.Button("Send")

    # Transcript rendered as HTML by format_chat.
    chat_display = gr.HTML(format_chat(chat_log))

    # Suggested replies; handle_tarun_input writes its three replies into these buttons.
    with gr.Row():
        reply1 = gr.Button()
        reply2 = gr.Button()
        reply3 = gr.Button()

    with gr.Row():
        user_input = gr.Textbox(label="Or type your own...", interactive=True)

    # Autocomplete suggestions for the free-text box.
    with gr.Row():
        sugg1 = gr.Button("", visible=True)
        sugg2 = gr.Button("", visible=True)
        sugg3 = gr.Button("", visible=True)
        sugg4 = gr.Button("", visible=True)

    submit_btn = gr.Button("Submit Reply")

    # Button events
    send_btn.click(handle_tarun_input,
                   [tarun_input, state],
                   [state, reply1, reply2, reply3, chat_display])

    # Each suggested-reply button is passed as its own input
    # (presumably the handler receives the button's current text — confirm).
    for btn in [reply1, reply2, reply3]:
        btn.click(handle_user_reply, [btn, state], [state, chat_display])

    submit_btn.click(handle_user_reply, [user_input, state], [state, chat_display])

    # Re-run autocomplete whenever the free-text box changes.
    user_input.change(fn=autocomplete_next_words,
                      inputs=[user_input, state],
                      outputs=[sugg1, sugg2, sugg3, sugg4])

    # Clicking an autocomplete button appends its text to the free-text box.
    for sugg_btn in [sugg1, sugg2, sugg3, sugg4]:
        sugg_btn.click(fn=apply_suggestion,
                       inputs=[sugg_btn, user_input],
                       outputs=[user_input])

# share=True requests a public share link (the cell output shows a gradio.live URL).
demo.launch(share=True)

==((====))==  Unsloth 2025.4.8: Fast Llama patching. Transformers: 4.51.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 2.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.30. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://8227094480ce89eedd.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




# References:



*   https://www.kaggle.com/code/danielhanchen/kaggle-llama-3-2-1b-3b-unsloth-notebook
*   https://github.com/vossenwout/llm-finetuning-resources
*   https://www.kaggle.com/datasets/subinium/emojiimage-dataset
*   https://www.gradio.app/guides/creating-a-chatbot-fast



