In [None]:
# Upgrade pip, wheel and setuptools before installing the packages in the cells below.
!pip install --upgrade pip wheel setuptools



In [None]:
!pip install gradio flask ninja torch accelerate transformers

Collecting gradio
  Downloading gradio-5.16.2-py3-none-any.whl.metadata (16 kB)
Collecting ninja
  Downloading ninja-1.11.1.3-py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (5.3 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.8-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.7.1 (from gradio)
  Downloading gradio_client-1.7.1-py3-none-any.whl.metadata (7.1 kB)
Collecting markupsafe~=2.0 (from gradio)
  Downloading MarkupSafe-2.1.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Co

In [None]:
# Install flash-attn from its source distribution (see the .tar.gz download in the log below).
# --no-build-isolation makes pip build in the current environment instead of an isolated one,
# so the build sees the torch installed above (the log prints the torch version it found).
# MAX_JOBS=12 is exported to the build; presumably it caps parallel compile jobs — TODO confirm
# against the flash-attn install notes and lower it if the machine runs out of RAM.
!MAX_JOBS=12 python -m pip -v install flash-attn --no-build-isolation  --use-pep517


Using pip 25.0.1 from /usr/local/lib/python3.11/dist-packages/pip (python 3.11)
Collecting flash-attn
  Downloading flash_attn-2.7.4.post1.tar.gz (6.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.0/6.0 MB[0m [31m77.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Running command Preparing metadata (pyproject.toml)


  torch.__version__  = 2.5.1+cu124


  running dist_info
  creating /tmp/pip-modern-metadata-4vvhywer/flash_attn.egg-info
  writing /tmp/pip-modern-metadata-4vvhywer/flash_attn.egg-info/PKG-INFO
  writing dependency_links to /tmp/pip-modern-metadata-4vvhywer/flash_attn.egg-info/dependency_links.txt
  writing requirements to /tmp/pip-modern-metadata-4vvhywer/flash_attn.egg-info/requires.txt
  writing top-level names to /tmp/pip-modern-metadata-4vvhywer/flash_attn.egg-info/top_level.txt
  writing manifest file '/tmp/pip-modern-metadata-4vvhywer/flash_attn.egg-info/SOURCES.txt'
  reading manifest file '/tmp/pip-modern-metadata-4vvhywer/flash_attn.egg-info/SOU

In [None]:
import torch
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, logging
from transformers import set_seed
from peft import PeftModel

# Set a fixed random seed for reproducibility.
set_seed(0)
# Only show errors from transformers; info/warning messages are suppressed.
logging.set_verbosity_error()
# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ---------------- Sentiment Analysis ----------------
# Load a sentiment-analysis pipeline using DistilBERT.
# respond() reads the "label" and "score" fields of its first result.
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english"
)

# ---------------- Model Loading ----------------
# Load the Phi-3.5-mini-instruct model and tokenizer.
model_name = "microsoft/Phi-3.5-mini-instruct"
# Load the model (trust_remote_code must be True for this model).
# NOTE(review): trust_remote_code=True lets the model repository supply Python
# code that is executed locally — only use it with repositories you trust.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    # Place the weights on the detected device instead of a hard-coded "cuda",
    # so loading does not fail on a machine without a GPU.
    device_map=device.type,
    torch_dtype="auto",
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# Create the text-generation pipeline using our model.
# (This pipeline expects a list of message dictionaries as input.)
chatbot = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    framework="pt"
)

# ---------------- Response Function ----------------
def respond(message: str,
            history: list,
            system_message: str,
            max_tokens: int,
            temperature: float,
            top_p: float):
    """
    Generate the assistant's reply to ``message`` given the earlier chat turns.

    The function builds a fresh conversation (a list of ``{"role", "content"}``
    dicts) and hands it to the ``chatbot`` text-generation pipeline:

      1. ``history`` is copied turn by turn, keeping only the "role" and
         "content" keys. The caller's list is never mutated.
      2. If the conversation does not already start with a system turn,
         ``system_message`` is inserted at the front. This happens on every
         call, so the system prompt is still present on follow-up turns where
         ``history`` is non-empty.
      3. Sentiment analysis runs on ``message``; when the label is "NEGATIVE"
         with a score above 0.85 the user turn is prefixed with "Angry: ".
      4. The conversation is passed to the pipeline and the stripped generated
         text is returned.

    Args:
        message: The new user message.
        history: Earlier turns as a list of dicts with "role" and "content"
            keys; may be ``None`` or empty.
        system_message: System prompt used when ``history`` has no leading
            system turn.
        max_tokens: Upper bound on newly generated tokens (coerced to ``int``).
        temperature: Sampling temperature. Values <= 0 switch to greedy
            decoding, because sampling requires a strictly positive value.
        top_p: Nucleus-sampling probability mass; only used when sampling.

    Returns:
        The assistant reply as a string with surrounding whitespace removed.

    Troubleshooting suggestions:
      - If the responses seem hallucinated or off-topic, try adjusting
        temperature (higher for more creative, lower for more deterministic)
        or top_p.
      - Print ``conversation`` inside this function to verify its structure.
    """
    # Work on a copy so the list owned by the UI is left untouched, and keep
    # only the two keys the chat pipeline is documented (above) to need.
    conversation = [
        {"role": turn["role"], "content": turn["content"]}
        for turn in (history or [])
    ]

    # Guarantee a leading system turn on every call, not just the first one.
    if not conversation or conversation[0]["role"] != "system":
        conversation.insert(0, {"role": "system", "content": system_message})

    # Tag strongly negative messages so the model can adapt its tone.
    sentiment = sentiment_pipeline(message)[0]
    is_angry = sentiment["label"] == "NEGATIVE" and sentiment["score"] > 0.85
    user_content = "Angry: " + message if is_angry else message
    conversation.append({"role": "user", "content": user_content})

    generation_args = {
        "max_new_tokens": int(max_tokens),
        "return_full_text": False,
    }
    if temperature > 0:
        generation_args["do_sample"] = True
        generation_args["temperature"] = temperature
        generation_args["top_p"] = top_p
    else:
        # A non-positive temperature is invalid for sampling; decode greedily
        # instead of letting generation raise.
        generation_args["do_sample"] = False

    # Pass the full conversation (a list of dicts) directly to the pipeline.
    output = chatbot(conversation, **generation_args)
    return output[0]['generated_text'].strip()

# ---------------- Gradio Chat Interface ----------------
# Chat UI around respond(). The four additional inputs are passed to respond()
# after (message, history), in this order: system_message, max_tokens,
# temperature, top_p.
demo = gr.ChatInterface(
    fn=respond,
    type="messages",  # Conversation history is a list of message dictionaries.
    title="Phi-3.5-mini Chatbot",
    description=(
        "A chatbot powered by microsoft/Phi-3.5-mini-instruct. "
        "It uses sentiment analysis to tag angry messages and accepts conversation history as a list of dicts. "
        "Adjust parameters below to test and troubleshoot responses."
    ),
    additional_inputs=[
        gr.Textbox(value="You are a helpful AI assistant.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=500, step=1, label="Max new tokens"),
        # The minimum is 0.1 rather than 0.0: a temperature of exactly 0 is not a
        # valid sampling temperature and makes generation fail.
        gr.Slider(minimum=0.1, maximum=4.0, value=1.0, step=0.1, label="Temperature"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)")
    ]
)

if __name__ == "__main__":
    # debug=True keeps the cell running so errors and logs stay visible;
    # share=True publishes a temporary public URL (see the cell output) —
    # anyone with the link can use the model while the cell is running.
    demo.launch(debug=True, share=True)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://c391b8dc2707e3de69.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://c391b8dc2707e3de69.gradio.live
