In [1]:
!pip install --upgrade pip wheel

Collecting pip
  Downloading pip-25.0.1-py3-none-any.whl.metadata (3.7 kB)
Downloading pip-25.0.1-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m39.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-25.0.1


In [2]:
!pip install gradio flask requests tqdm faiss-cpu transformers torch sentence-transformers textblob gensim numba accelerate ninja

Collecting gradio
  Downloading gradio-5.17.1-py3-none-any.whl.metadata (16 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Collecting ninja
  Downloading ninja-1.11.1.3-py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (5.3 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.8-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.7.1 (from gradio)
  Downloading gradio_client-1.7.1-py3-none-any.whl.metadata (7.1 kB)
Collecting huggingface-hub>=0.28.1 (from gradio)
  Downloading huggingface_hub-0.29.1-py3-none-any.whl.metadata (13 kB)
Collecting markupsafe~=2.0 (from gradio)
  Downloading MarkupSafe-2.1.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.

In [3]:
# Install flash-attn (downloaded as a source tarball, see the log below) with verbose pip output.
# MAX_JOBS=12 is exported to the build; presumably it caps parallel compile jobs -- confirm
# against the flash-attn README. --no-build-isolation makes pip build in the current
# environment instead of an isolated one, so the already-installed torch is used.
!MAX_JOBS=12 python -m pip -v install flash-attn --no-build-isolation  --use-pep517


Using pip 25.0.1 from /usr/local/lib/python3.11/dist-packages/pip (python 3.11)
Collecting flash-attn
  Downloading flash_attn-2.7.4.post1.tar.gz (6.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.0/6.0 MB[0m [31m86.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Running command Preparing metadata (pyproject.toml)


  torch.__version__  = 2.5.1+cu121


  running dist_info
  creating /tmp/pip-modern-metadata-8txuktjh/flash_attn.egg-info
  writing /tmp/pip-modern-metadata-8txuktjh/flash_attn.egg-info/PKG-INFO
  writing dependency_links to /tmp/pip-modern-metadata-8txuktjh/flash_attn.egg-info/dependency_links.txt
  writing requirements to /tmp/pip-modern-metadata-8txuktjh/flash_attn.egg-info/requires.txt
  writing top-level names to /tmp/pip-modern-metadata-8txuktjh/flash_attn.egg-info/top_level.txt
  writing manifest file '/tmp/pip-modern-metadata-8txuktjh/flash_attn.egg-info/SOURCES.txt'
  reading manifest file '/tmp/pip-modern-metadata-8txuktjh/flash_attn.egg-info/SOU

In [None]:
import os
import numpy as np
import requests
import faiss
import json
from pathlib import Path
import torch
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, logging, AutoModel, pipeline

#-----------------Constants---------------------
# Local cache of the chunked documents (parsed with json.load further down).
CHUNKED_DOCUMENTS_PATH = Path("./chunked_documents.json")
# Remote copy that is downloaded when the local chunk file is missing.
CHUNKED_DOCUMENTS_URL = "https://www.dropbox.com/scl/fi/07wd0zwvz2xcq80hy5f91/chunked_documents.json?rlkey=jwvfpczo4zeyke9j74cdphovi&st=oeqmcfi8&dl=1"
# Local cache of the FAISS index (a plain str, read with faiss.read_index further down).
INDEX_PATH = "./faiss_index.idx"
# Remote copy that is downloaded when the local index file is missing.
FAISS_INDEX_URL = "https://www.dropbox.com/scl/fi/05ez2886nz5fkkcqsv6hs/faiss_index.idx?rlkey=yil6ollju5smk04upluenqot4&st=yu0oji49&dl=1"
# NOTE(review): not referenced by any other code visible in this cell.
dimension = 384  # Embedding size from MiniLM model


#------------------Load chunks-------------------

# Load the document chunks, downloading them first when no local copy exists.
if CHUNKED_DOCUMENTS_PATH.exists():
    print("Loading existing chunked_documents.json...")
else:
    print("chunked_documents.json does not exist. Trying to download from remote URL...")
    # (connect, read) timeouts so a stalled connection cannot hang the cell forever.
    response = requests.get(CHUNKED_DOCUMENTS_URL, allow_redirects=True, timeout=(10, 300))
    response.raise_for_status()
    # Write to a temporary file and rename it into place, so an interrupted write
    # never leaves a truncated file that the next run would treat as a valid cache.
    _chunks_tmp_path = CHUNKED_DOCUMENTS_PATH.with_suffix(".json.part")
    _chunks_tmp_path.write_bytes(response.content)
    _chunks_tmp_path.replace(CHUNKED_DOCUMENTS_PATH)
    print("Successfully downloaded chunked_documents.json from remote URL.")

# Single load path shared by the cached and the freshly downloaded case.
with open(CHUNKED_DOCUMENTS_PATH, "r", encoding="utf-8") as f:
    chunked_documents = json.load(f)
print(f"Total document chunks available: {len(chunked_documents)}")

#------------------FAISS------------------------

# Load the FAISS index, downloading it first when no local copy exists.
if os.path.exists(INDEX_PATH):
    print("Loading existing FAISS index from disk...")
else:
    print("FAISS index does not exist. Trying to download from remote URL...")
    # (connect, read) timeouts so a stalled connection cannot hang the cell forever.
    response = requests.get(FAISS_INDEX_URL, allow_redirects=True, timeout=(10, 300))
    response.raise_for_status()
    # Write to a temporary file and rename it into place, so an interrupted write
    # never leaves a truncated index that the next run would try to read.
    _index_tmp_path = f"{INDEX_PATH}.part"
    with open(_index_tmp_path, "wb") as f:
        f.write(response.content)
    os.replace(_index_tmp_path, INDEX_PATH)
    print("Successfully downloaded FAISS index from remote URL.")

# Single load path shared by the cached and the freshly downloaded case.
index = faiss.read_index(INDEX_PATH)
print(f"Total embeddings indexed: {index.ntotal}")

# Retrieval maps FAISS row ids straight onto positions in chunked_documents and
# searches with query vectors of size `dimension`, so surface mismatches early.
if index.d != dimension:
    print(f"WARNING: FAISS index dimension ({index.d}) differs from the expected embedding size ({dimension}).")
if index.ntotal != len(chunked_documents):
    print(f"WARNING: FAISS index holds {index.ntotal} vectors but {len(chunked_documents)} document chunks were loaded.")

#-----------------Chatbot-----------------------

# Silence transformers warnings; only errors are reported.
logging.set_verbosity_error()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

#----------------Retrieval and Generative models----------

# Retrieval encoder: kept on the CPU and in eval mode.
retrieval_tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
retrieval_model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
retrieval_model.cpu().eval()

model_name = "microsoft/Phi-3.5-mini-instruct"
# Honour the CPU fallback computed above: the placement and the attention kernel
# follow `device` instead of being hard-coded to CUDA. On a CUDA machine this is
# the same configuration as before (device_map="cuda" + flash_attention_2).
_use_cuda = device.type == "cuda"
generative_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map=device.type,
    torch_dtype="auto",
    # NOTE: trust_remote_code executes model code downloaded from the Hub.
    trust_remote_code=True,
    attn_implementation="flash_attention_2" if _use_cuda else "eager",
)
# device_map already placed the weights, so no extra .to(device) is needed.
generative_model.eval()
generative_tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
chatbot = pipeline("text-generation", model=generative_model, tokenizer=generative_tokenizer, framework="pt")

#-----------------Functions----------------------

# Function to generate embeddings for a new query
def get_query_embedding(query):
    """Embed a query (or list of queries) with the retrieval model.

    The text is tokenized with padding and truncation, run through
    ``retrieval_model`` without gradients, and the token vectors are mean-pooled
    into one vector per input.

    Padding positions are excluded from the mean via the attention mask. For a
    single query there is no padding, so the result equals a plain mean over
    the sequence; for a batch of queries of different lengths this keeps pad
    tokens from skewing the shorter entries.

    Args:
        query: A string, or a list of strings, to embed.

    Returns:
        A NumPy array with one row per input query. The vectors are not
        normalized.
    """
    with torch.no_grad():
        inputs = retrieval_tokenizer(query, return_tensors="pt", padding=True, truncation=True)
        outputs = retrieval_model(**inputs)
        hidden = outputs.last_hidden_state
        # 1.0 for real tokens, 0.0 for padding; broadcast over the hidden dimension.
        mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
        token_counts = mask.sum(dim=1).clamp(min=1e-9)  # guard against division by zero
        embedding = ((hidden * mask).sum(dim=1) / token_counts).cpu().numpy()
    return embedding

# Function to retrieve relevant documents based on the query
def retrieve_documents(query, top_k=4):
    """Return the document chunks nearest to ``query`` in the FAISS index.

    Args:
        query: The user query to embed and search for.
        top_k: Maximum number of chunks to return.

    Returns:
        A list of entries from ``chunked_documents`` in the order FAISS ranked
        them. It can be shorter than ``top_k``: FAISS pads its result with ``-1``
        when it finds fewer neighbours, and those slots (as well as any id
        outside ``chunked_documents``) are skipped rather than being allowed to
        index the list from the end.
    """
    query_embedding = get_query_embedding(query).astype("float32")
    _distances, indices = index.search(query_embedding, top_k)
    total_chunks = len(chunked_documents)
    return [chunked_documents[idx] for idx in indices[0] if 0 <= idx < total_chunks]

# Function to generate a response using retrieved context
def generate_response(history, max_new_tokens=100, temperature=0.7, top_p=0.95):
    """Generate the assistant's next reply for a chat history.

    Args:
        history: Conversation passed straight to the ``chatbot`` pipeline
            (a list of ``{"role": ..., "content": ...}`` dicts).
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature. A value of ``0`` (or ``None``)
            switches to greedy decoding, because sampling requires a strictly
            positive temperature.
        top_p: Nucleus-sampling threshold; only used when sampling.

    Returns:
        The generated text with surrounding whitespace stripped.
    """
    eos_id = generative_tokenizer.eos_token_id
    generation_args = {
        "max_new_tokens": int(max_new_tokens),  # UI sliders may deliver floats
        "return_full_text": False,
        "pad_token_id": eos_id,
        "eos_token_id": eos_id,
        "use_cache": True,
        "num_return_sequences": 1,
        "num_beams": 1,
    }
    # `early_stopping` is a beam-search option and is meaningless with num_beams=1,
    # so it is not passed.
    if temperature is not None and temperature > 0:
        generation_args.update(do_sample=True, temperature=temperature, top_p=top_p)
    else:
        # Greedy decoding: sampling-only knobs are left out.
        generation_args["do_sample"] = False

    output = chatbot(history, **generation_args)
    assistant_reply = output[0]['generated_text'].strip()
    return assistant_reply

# ---------------- Response Function ----------------
def respond(message: str,
            history: list,
            system_message: str,
            max_tokens: int,
            temperature: float,
            top_p: float):
    """Answer ``message`` using retrieved document context and the chat model.

    Steps:
      1. Copy the prior turns from ``history`` (keeping only "role" and
         "content") into a fresh list, so the caller's list is not mutated.
      2. Make sure the conversation starts with the system message. This is done
         on every call, because the history handed to this callback on later
         turns is not guaranteed to contain a system entry.
      3. Retrieve the most similar document chunks for ``message`` and attach
         them to the user turn as context.
      4. Generate and return the assistant reply.

    Args:
        message: The new user message.
        history: Prior conversation as a list of dicts with "role" and
            "content" keys; may be ``None`` or empty.
        system_message: System prompt placed at the start of the conversation.
        max_tokens: Maximum number of new tokens to generate.
        temperature: Sampling temperature forwarded to generation.
        top_p: Nucleus-sampling threshold forwarded to generation.

    Returns:
        The assistant's reply as a string.
    """
    messages = [
        {"role": turn["role"], "content": turn["content"]}
        for turn in (history or [])
    ]
    if not messages or messages[0]["role"] != "system":
        messages.insert(0, {"role": "system", "content": system_message})

    similar_documents = retrieve_documents(message)
    # Assumes each retrieved chunk is a plain string -- confirm against chunked_documents.json.
    retrieved_text = " ".join(similar_documents)  # Concatenate retrieved documents as context
    input_text = f"User query: {message}\n\nContext:\n{retrieved_text}"
    messages.append({"role": "user", "content": input_text})

    return generate_response(messages, max_new_tokens=max_tokens, temperature=temperature, top_p=top_p)

# ---------------- Gradio Chat Interface ----------------
# This interface uses additional inputs for system message and generation parameters.
# Chat UI wired to `respond`; the additional inputs are passed to it, in order,
# after the message and history arguments.
demo = gr.ChatInterface(
    fn=respond,
    type="messages",  # Conversation history is a list of message dictionaries.
    title="Phi-3.5-mini Chatbot",
    description=(
        "A retrieval-augmented chatbot powered by microsoft/Phi-3.5-mini-instruct. "
        "Each question is answered using the most relevant document chunks retrieved from a FAISS index. "
        "Adjust parameters below to test and troubleshoot responses."
    ),
    additional_inputs=[
        gr.Textbox(value="You are a helpful AI assistant.", label="System message"),
        # step=1 so the default of 250 lies on the slider grid (it did not with minimum=1, step=10).
        gr.Slider(minimum=1, maximum=2048, value=250, step=1, label="Max new tokens"),
        # The minimum is above zero because sampling needs a strictly positive temperature.
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)")
    ]
)

# Start the Gradio app when this cell/script is executed directly.
# share=True publishes a temporary public *.gradio.live URL; debug=True keeps the
# cell running so errors and logs stay visible (see the launch output below).
if __name__ == "__main__":
    demo.launch(debug=True, share=True)


chunked_documents.json does not exist. Trying to download from remote URL...
Successfully downloaded chunked_documents.json from remote URL.
Total document chunks available: 12272
FAISS index does not exist. Trying to download from remote URL...
Successfully downloaded FAISS index from remote URL.
Total embeddings indexed: 12272


tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/3.45k [00:00<?, ?B/s]

configuration_phi3.py:   0%|          | 0.00/11.2k [00:00<?, ?B/s]

modeling_phi3.py:   0%|          | 0.00/73.8k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/16.3k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/195 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/3.98k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/306 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://a9cf51ebcb7675d661.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


