In [1]:
!pip install transformers accelerate bitsandbytes gradio openai-whisper tts datasets torch torchaudio
!apt install -y ffmpeg

Collecting bitsandbytes
  Downloading bitsandbytes-0.45.3-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting gradio
  Downloading gradio-5.21.0-py3-none-any.whl.metadata (16 kB)
Collecting openai-whisper
  Downloading openai-whisper-20240930.tar.gz (800 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m800.5/800.5 kB[0m [31m24.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting tts
  Downloading TTS-0.22.0-cp311-cp311-manylinux1_x86_64.whl.metadata (21 kB)
Collecting datasets
  Downloading datasets-3.4.0-py3-none-any.whl.metadata (19 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.11-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
 

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, BitsAndBytesConfig
import gradio as gr
import whisper
import torch

# LLM with bitsandbytes quantization
model_name = "meta-llama/Llama-3.1-8B-Instruct"
# model_name = "microsoft/phi-4"
# model_name = "mistralai/Mistral-7B-Instruct-v0.3"
bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16)
model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=bnb_config, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_name)
model.eval() #As we are only using it for evaluation

config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((409

In [4]:
d_system_prompts = {
  "Impromptu Speaking": "You are a public speaking coach specializing in impromptu speeches. You will provide a topic, and the user will respond with a short speech. Evaluate the response for structure (clear introduction, body, and conclusion), fluency, coherence, and delivery. Offer constructive feedback on clarity, confidence, and how well the user stays on topic. Keep suggestions actionable and encourage improvement.",
  "Storytelling": "You are a storytelling expert analyzing the user’s ability to craft engaging and compelling narratives. The user will share a short story, and you will evaluate its structure, character development, emotional engagement, and flow. Provide feedback on how to improve the story’s impact, coherence, and audience engagement. Offer practical tips on making the narrative more vivid and captivating.",
  "Conflict Resolution": "You are a conflict resolution specialist guiding users in managing difficult conversations. The user will respond to a simulated conflict scenario, and you will assess their diplomatic approach, empathy, and effectiveness in de-escalating tension. Provide feedback on emotional intelligence, clarity, and persuasive communication. Offer alternative phrasing or strategies to handle conflicts more effectively."
}

In [6]:
import os
os.environ.get("LLM_MODEL")

In [7]:
from functools import lru_cache
import re

classifier_system_prompt = """
You are a classifier that determines if a user is requesting content to be recited or spoken aloud. Analyze each message and output ONLY:

<recite>Y</recite>

If the user:
- Uses words like "recite," "read," "say," "speak," or "tell me" in a way that requests vocal delivery
- Asks for information to be presented as if it were being spoken aloud
- Requests you to verbally deliver any content (facts, information, poems, quotes, etc.)
- Phrases their request as if expecting an audio response

Output ONLY:

<recite>N</recite>

If the user:
- Is discussing the concept of recitation without requesting it
- Wants written explanations, analysis, or information without implied vocal delivery
- Is asking questions about how to recite something themselves

Focus on detecting the implied mode of delivery (spoken vs. written), not the content type. Examples like "Can you recite what the capital of France is?" should be Y, while "What is recitation?" should be N.

Do not include any explanations or additional text in your response.
"""

llama = pipeline("text-generation", model=model, tokenizer=tokenizer, device_map="auto")

@lru_cache(maxsize=128)
def extract_first_recite_value(text):
    # This regex pattern looks for the first content between <recite> and </recite> tags
    pattern = r'<recite>(.*?)</recite>'

    # Find the first match in the text
    match = re.search(pattern, text)

    # Return the matched value if found, otherwise return None
    if match:
        return match.group(1)
    else:
        return 'N'

@lru_cache(maxsize=128)
def classify(user_msg):
  messages = [
    {'role': 'system', 'content': classifier_system_prompt},
    {'role': 'user', 'content': user_msg}
  ]
  input_text = tokenizer.apply_chat_template(messages, tokenize=False)
  output = llama(input_text)
  assistant_reply = output[0]['generated_text'].split('<|eot_id|>assistant\n\n')[-1].split('<|eot_id|>')[0]
  return extract_first_recite_value(assistant_reply)

Device set to use cuda:0


In [8]:
device = "cuda" if torch.cuda.is_available() else "cpu"

In [9]:
import gradio as gr
from TTS.api import TTS
import tempfile
import os
import soundfile as sf
import numpy as np
from collections import OrderedDict
import io
import time

# Initialize TTS model
tts = TTS("tts_models/en/ljspeech/tacotron2-DDC", gpu=True if 'device' in globals() and device=='cuda' else False)

# Improved TTS Cache implementation
class TTSCache:
    def __init__(self, max_size=100):
        self.cache = OrderedDict()
        self.max_size = max_size
        self.hits = 0
        self.misses = 0

    def get(self, text):
        if text not in self.cache:
            self.misses += 1
            return None

        # Move the accessed item to the end (mark as most recently used)
        value = self.cache.pop(text)
        self.cache[text] = value
        self.hits += 1

        # Deserialize the cached value with explicit dtype preservation
        sample_rate, audio_dtype, audio_bytes = value
        audio_buffer = io.BytesIO(audio_bytes)
        audio = np.load(audio_buffer, allow_pickle=False)

        # Ensure correct dtype is restored
        if audio_dtype != str(audio.dtype):
            audio = audio.astype(np.dtype(audio_dtype))

        return (sample_rate, audio.copy())

    def put(self, text, value):
        # If text exists, remove it
        if text in self.cache:
            self.cache.pop(text)

        # If cache is full, remove the least recently used item (first item)
        if len(self.cache) >= self.max_size:
            self.cache.popitem(last=False)

        # Explicitly preserve dtype along with data
        sample_rate, audio = value
        audio_dtype = str(audio.dtype)

        # Serialize with highest precision
        audio_bytes = io.BytesIO()
        np.save(audio_bytes, audio, allow_pickle=False)
        audio_bytes = audio_bytes.getvalue()

        # Store serialized version with dtype information
        self.cache[text] = (sample_rate, audio_dtype, audio_bytes)

    def stats(self):
        return {
            "size": len(self.cache),
            "max_size": self.max_size,
            "hits": self.hits,
            "misses": self.misses,
            "hit_ratio": self.hits / (self.hits + self.misses) if (self.hits + self.misses) > 0 else 0
        }

# Create a cache instance
tts_cache = TTSCache(max_size=100)

def generate_speech(text):
    """Generate speech with improved caching support"""
    # Check if this text is in the cache
    cached_result = tts_cache.get(text)
    if cached_result is not None:
        print(f"Cache hit for text: {text[:30]}..." if len(text) > 30 else f"Cache hit for text: {text}")
        return cached_result

    print(f"Cache miss for text: {text[:30]}..." if len(text) > 30 else f"Cache miss for text: {text}")

    # Create a temporary file
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
        temp_filename = fp.name

    # Generate speech
    tts.tts_to_file(text=text, file_path=temp_filename)

    # Read the audio file with explicit parameters
    speech, sample_rate = sf.read(temp_filename, dtype='float32')

    # Clean up the temporary file
    os.unlink(temp_filename)

    # Store in cache
    result = (int(sample_rate), speech.copy())
    tts_cache.put(text, result)

    # Print cache stats
    print(f"Cache stats: {tts_cache.stats()}")

    return result



 > Downloading model to /root/.local/share/tts/tts_models--en--ljspeech--tacotron2-DDC


 95%|█████████▍| 107M/113M [00:01<00:00, 84.8MiB/s] 

 > Model's license - apache 2.0
 > Check https://choosealicense.com/licenses/apache-2.0/ for more info.
 > Downloading model to /root/.local/share/tts/vocoder_models--en--ljspeech--hifigan_v2



100%|██████████| 113M/113M [00:02<00:00, 48.8MiB/s]


 > Model's license - apache 2.0
 > Check https://choosealicense.com/licenses/apache-2.0/ for more info.
 > Using model: Tacotron2
 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > log_func:np.log
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:20
 | > fft_size:1024
 | > power:1.5
 | > preemphasis:0.0
 | > griffin_lim_iters:60
 | > signal_norm:False
 | > symmetric_norm:True
 | > mel_fmin:0
 | > mel_fmax:8000.0
 | > pitch_fmin:1.0
 | > pitch_fmax:640.0
 | > spec_gain:1.0
 | > stft_pad_mode:reflect
 | > max_norm:4.0
 | > clip_norm:True
 | > do_trim_silence:True
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:2.718281828459045
 | > hop_length:256
 | > win_length:1024
 > Model's reduction rate `r` is set to: 1
 > Vocoder Model: hifigan
 > Setting up Audio Processor...
 | > sample

In [10]:
import gradio as gr
from transformers import pipeline, TextIteratorStreamer
import numpy as np
from threading import Thread

transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-base.en")

@lru_cache(maxsize=128)
def cached_transcribe(audio_bytes, sr):
    y = np.frombuffer(audio_bytes, dtype=np.float32)

    max_duration_sec = 30  # optimal chunk duration for Whisper
    max_chunk_samples = int(sr * max_duration_sec)
    transcript_parts = []

    for start_idx in range(0, len(y), max_chunk_samples):
        chunk = y[start_idx:start_idx + max_chunk_samples]
        result = transcriber({"sampling_rate": sr, "raw": chunk})
        transcript_parts.append(result["text"].strip())

    return " ".join(transcript_parts).strip()

def transcribe(audio):
    if audio is None:
        return "", gr.Button(interactive=False)

    sr, y = audio

    # Convert stereo to mono
    if y.ndim > 1:
        y = y.mean(axis=1)

    y = y.astype(np.float32)
    y /= np.max(np.abs(y))

    audio_bytes = y.tobytes()
    full_transcript = cached_transcribe(audio_bytes, sr)

    return full_transcript, gr.Button(interactive=True)

config.json:   0%|          | 0.00/1.94k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/290M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/1.53k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/805 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.41M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.83k [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

Device set to use cuda:0


In [11]:
import json
import uuid
import copy
import time

# Session management
SESSIONS_FILE = "sessions.json"

def load_sessions():
    if os.path.exists(SESSIONS_FILE):
        with open(SESSIONS_FILE, 'r') as f:
            return json.load(f)
    return {"sessions": {}}


def save_sessions(sessions_data):
    """Save sessions data to a JSON file, handling non-serializable objects."""
    # Create a copy to avoid modifying the original
    sessions_copy = copy.deepcopy(sessions_data)

    # Process each session
    for session_id, session in sessions_copy["sessions"].items():
        if "history" in session:
            # Process history to make it JSON serializable
            serializable_history = []
            for msg in session["history"]:
                # Convert ChatMessage objects to dict
                if isinstance(msg, gr.ChatMessage):
                    serializable_history.append({
                        "role": msg.role,
                        "content": msg.content if isinstance(msg.content, str) else msg.metadata["text"]
                    })
                else:
                    serializable_history.append(msg)

            # Replace original history with serializable version
            session["history"] = serializable_history

    # Save to file
    with open(SESSIONS_FILE, 'w') as f:
        json.dump(sessions_copy, f, indent=2)


# def save_sessions(sessions_data):
#     with open(SESSIONS_FILE, 'w') as f:
#         json.dump(sessions_data, f, indent=2)

def user(user_message, history, current_session_name, sessions_data):
    if not user_message.strip():
        return "", history, sessions_data, gr.Radio(interactive=True)

    current_session = next(
        (session for session in sessions_data["sessions"].values()
         if session["name"] == current_session_name),
        None
    )

    if not current_session:
        print(f"Session not found: {current_session_name}")
        return user_message, history, sessions_data, gr.Radio(interactive=True)

    history.append({"role": "user", "content": user_message})
    current_session["history"] = history

    radio_interactive = True
    if len(history) == 1:
        current_session["mode_locked"] = True
        radio_interactive = False
        print(f"Locking mode for session: {current_session_name}")

    save_sessions(sessions_data)

    return "", history, sessions_data, gr.Radio(interactive=radio_interactive)


def predict(current_session_name, history, sessions_data):
    # Find session by name
    current_session = None
    for sid, session in sessions_data["sessions"].items():
        if session["name"] == current_session_name:
            current_session = session
            break

    if not current_session:
        return history, sessions_data

    # Check if history is empty
    if not history:
        print("Error: History is empty")
        return history, sessions_data

    current_mode = current_session["mode"]

    prompt_template = [{'role': 'system', 'content': d_system_prompts[current_mode]}] + history
    input_text = tokenizer.apply_chat_template(prompt_template, tokenize=False)
    inputs = tokenizer(input_text, return_tensors="pt").to(device)
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = {
        "input_ids": inputs.input_ids,
        "max_new_tokens": 1024,
        "do_sample": True,
        "temperature": 0.7,
        "streamer": streamer
    }
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    history.append({"role": "assistant", "content": ""})
    for text in streamer:
        cleaned_text = text.replace("assistant\n\n", "")
        history[-1]['content'] += cleaned_text

        # Update session history
        current_session["history"] = history
        save_sessions(sessions_data)

        yield history, sessions_data, None


def create_new_session(mode_selection, sessions_data):
    # Generate a unique session ID
    session_id = str(uuid.uuid4())
    timestamp = time.strftime("%Y-%m-%d %H:%M:%S")
    session_name = f"{mode_selection} - {timestamp}"
    update_current_session(session_name)

    # Create a new session
    sessions_data["sessions"][session_id] = {
        "id": session_id,
        "name": session_name,
        "mode": mode_selection,
        "mode_locked": False,
        "created_at": timestamp,
        "history": []
    }

    # Save sessions data
    save_sessions(sessions_data)

    # Update session dropdown
    session_choices = [(s["name"], s["name"]) for s in sessions_data["sessions"].values()]

    return (
        gr.Dropdown(choices=session_choices, value=session_name),
        sessions_data,
        gr.Column(visible=True),
        gr.Radio(value=mode_selection, interactive=False if sessions_data["sessions"][session_id]["mode_locked"] else True),
        gr.Chatbot(value=sessions_data["sessions"][session_id]["history"], visible=True),
        gr.Textbox(visible=True, value=""),
        gr.Button(visible=True, interactive=False),
        gr.Audio(visible=False, interactive=False),
        gr.Markdown(visible=True)
    )

def session_changed(session_name, sessions_data):

    update_current_session(session_name)
    # Find session by name
    selected_session = None
    session_id = None

    for sid, session in sessions_data["sessions"].items():
        if session["name"] == session_name:
            selected_session = session
            session_id = sid
            break

    if not selected_session:
        # explicitly hide all components including audio_output
        return (
            gr.Column(visible=False),
            gr.Radio(interactive=True, visible=False),
            gr.Chatbot(value=[], visible=False),
            gr.Textbox(visible=False),
            gr.Audio(visible=False),
            gr.Button(visible=False),
            gr.Audio(visible=False),
            gr.Markdown(visible=False)
        )

    mode_locked = selected_session["mode_locked"]

    if selected_session["mode"] == 'Impromptu Speaking':
        placeholder = "Practice impromptu speaking skills"
    elif selected_session["mode"] == 'Storytelling':
        placeholder = "Develop storytelling abilities"
    else:
        placeholder = "Learn conflict resolution techniques"

    # Explicitly set audio_output visibility to True when loading existing session
    return (
        gr.Column(visible=True),
        gr.Radio(value=selected_session["mode"], interactive=not mode_locked, visible=True),
        gr.Chatbot(value=selected_session["history"], visible=True),
        gr.Textbox(placeholder=placeholder, visible=True, value=""),
        gr.Audio(visible=True, interactive=True, value=None),
        gr.Button(visible=True, interactive=False),
        gr.Audio(visible=False, interactive=False, value=None),
        gr.Markdown(visible=True)
    )


def update_mode(mode_selection, current_session_name, sessions_data):
    # Find session by name
    current_session = None
    current_session_id = None

    for sid, session in sessions_data["sessions"].items():
        if session["name"] == current_session_name:
            current_session = session
            current_session_id = sid
            break

    if not current_session:
        return sessions_data, gr.Dropdown()

    # Update mode only if it's not locked
    if not current_session["mode_locked"]:
        old_name = current_session["name"]
        current_session["mode"] = mode_selection

        # Update the session name
        timestamp = current_session["created_at"]
        new_name = f"{mode_selection} - {timestamp}"
        current_session["name"] = new_name

        # Save sessions data
        save_sessions(sessions_data)

        # Update session dropdown
        session_choices = [(s["name"], s["name"]) for s in sessions_data["sessions"].values()]
        return sessions_data, gr.Dropdown(choices=session_choices, value=new_name)

    return sessions_data, gr.Dropdown()

def change_button(text_input):
    if text_input:
        return gr.Button(interactive=True)
    else:
        return gr.Button(interactive=False)

import sqlite3
import json
from datetime import datetime

# Database setup function - call this once at the start of your app
def setup_feedback_db():
    """Create the SQLite database and table if they don't exist"""
    db_path = 'feedback.db'
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()

    # Create table if it doesn't exist
    cursor.execute('''
    CREATE TABLE IF NOT EXISTS message_feedback (
        session_id TEXT,
        message_index INTEGER,
        liked INTEGER,
        feedback_time TIMESTAMP,
        message_content TEXT,
        PRIMARY KEY (session_id, message_index)
    )
    ''')

    conn.commit()
    conn.close()
    print(f"Feedback database initialized at {db_path}")
    return db_path

# Global variable to track current session
current_session = None

# Function to update current session (call this in your session_changed function)
def update_current_session(session_name):
    global current_session
    current_session = session_name
    print(f"Current session updated to: {current_session}")

# Updated like function that saves to the database
def like(evt: gr.LikeData):
    """
    Save user feedback to SQLite database

    Args:
        evt: Gradio LikeData event containing index, liked status, and value
    """
    try:
        # Use the global current session variable
        global current_session
        session_id = current_session or "unknown_session"

        print(f"User {'liked' if evt.liked else 'unliked'} the response at index {evt.index}")
        print(f"Session ID: {session_id}")

        # Connect to database
        conn = sqlite3.connect('feedback.db')
        cursor = conn.cursor()

        # Current timestamp
        timestamp = datetime.now().isoformat()

        # Message content (truncate if too long)
        message_content = json.dumps(evt.value)[:1000] if evt.value else ""

        # Check if record already exists
        cursor.execute(
            "SELECT * FROM message_feedback WHERE session_id = ? AND message_index = ?",
            (session_id, evt.index)
        )
        existing = cursor.fetchone()

        if existing:
            # Update existing record
            cursor.execute(
                "UPDATE message_feedback SET liked = ?, feedback_time = ? WHERE session_id = ? AND message_index = ?",
                (1 if evt.liked else 0, timestamp, session_id, evt.index)
            )
            print(f"Updated feedback for session {session_id}, message {evt.index}")
        else:
            # Insert new record
            cursor.execute(
                "INSERT INTO message_feedback (session_id, message_index, liked, feedback_time, message_content) VALUES (?, ?, ?, ?, ?)",
                (session_id, evt.index, 1 if evt.liked else 0, timestamp, message_content)
            )
            print(f"Saved new feedback for session {session_id}, message {evt.index}")

        conn.commit()
        conn.close()

    except Exception as e:
        print(f"Error saving feedback to database: {str(e)}")

def listen(evt: gr.SelectData):
  if evt.index % 2 == 1:
    audio = gr.Audio(value=generate_speech(evt.value), autoplay=True)
    return audio
  else:
    return None

In [10]:
# assistant_reply = """
# So, when I mentioned "point 3, use sensory details," I was talking about incorporating more sensory language into your narrative. You see, when you describe a scene or a character, you want to engage the reader's senses, making them feel like they're experiencing it firsthand.

# Let me give you an example. If you're describing a hot cross bun, instead of just saying "it's a sweet and savory treat," you could say:

# "The warm, golden-brown bun emerged from the oven, releasing a sweet, spicy aroma that filled the kitchen. The soft, fluffy interior gave way to a slightly crunchy crust, and the tangy glaze dripped down the sides like a miniature waterfall. As I took a bite, the sweetness hit my taste buds first, followed by a subtle hint of spice that left me wanting more."

# You see, I'm using sensory language to describe the hot cross bun, engaging the reader's sense of smell, touch, taste, and even sight. This helps the reader imagine and connect with the experience on a deeper level.

# When you incorporate sensory details, you can describe:

# What the character sees: the colors, textures, and shapes around them
# What the character hears: the sounds, music, or voices that create a certain atmosphere
# What the character smells: the scents, aromas, and fragrances that evoke emotions or memories
# What the character tastes: the flavors, spices, and sensations that delight or disgust
# What the character feels: the textures, temperatures, and sensations that create a physical response
# By using sensory language, you can transport your readers to the world you're creating, making them feel like they're part of the story.

# Now, would you like to share your thoughts or try revising your narrative with sensory details?
# """
# audio = gr.Audio(value=generate_speech(assistant_reply))

In [11]:
# with gr.Blocks() as demo:
#     text = gr.Textbox()
#     audio = gr.Audio()
#     submit = gr.Button("Submit")
#     submit.click(fn=generate_speech, inputs=text, outputs=audio)
# demo.launch(debug=True)

In [12]:
db_path = setup_feedback_db()
with gr.Blocks(theme="soft",css="""
    footer {display: none !important;}
    /* Increased max-width and set width percentage */
    .gradio-container {max-width: 1400px !important; margin: 0 auto; width: 95% !important;}
    .gr-button {border-radius: 8px !important;}
    .gr-box {border-radius: 10px !important; box-shadow: 0 1px 3px rgba(0,0,0,0.1) !important;}
    /* Force full width on Gradio container elements */
    #component-0 {width: 100% !important;}
    #component-0 > div {width: 100% !important;}
""") as demo:
    # Add professional HTML header
    gr.HTML("""
        <div style="text-align: center; margin-bottom: 1rem; background: linear-gradient(90deg, #3a7bd5, #2d65b9); padding: 1.5rem; border-radius: 10px; box-shadow: 0 4px 6px rgba(0,0,0,0.1);">
            <h1 style="color: white; font-size: 2.5rem; margin-bottom: 0.5rem; font-weight: 600;">SpeechCoach AI</h1>
            <p style="color: #e0e0e0; font-size: 1.2rem; max-width: 800px; margin: 0 auto; line-height: 1.5;">
                Your personal AI-powered speech training assistant. Practice anywhere, anytime, and receive instant feedback to perfect your communication skills.
            </p>
        </div>
    """)

    # Initialize sessions data and get default session
    initial_sessions = load_sessions()
    sessions_data = gr.State(initial_sessions)

    # Determine default session if available
    default_session = None
    if initial_sessions["sessions"] and len(initial_sessions["sessions"]) > 0:
        # Get the first session as default
        default_session = list(initial_sessions["sessions"].values())[0]["name"]

    # Session management
    with gr.Row(equal_height=True):
        with gr.Column(scale=3):
            session_dropdown = gr.Dropdown(
                label="Select Session",
                choices=[(s["name"], s["name"]) for s in initial_sessions["sessions"].values()] if initial_sessions["sessions"] else [],
                allow_custom_value=False,
                value=default_session  # Set the default value
            )
        with gr.Column(scale=1):
            new_session_btn = gr.Button("Create New Session")

    # Get initial visibility state based on whether there are existing sessions
    initial_visibility = default_session is not None

    # Mode selection (initially hidden until a session is created)
    with gr.Column(visible=initial_visibility) as mode_column:
        # If there's a default session, get its mode
        default_mode = None
        if default_session is not None and default_session in [s["name"] for s in initial_sessions["sessions"].values()]:
            for session in initial_sessions["sessions"].values():
                if session["name"] == default_session:
                    default_mode = session.get("mode", "Impromptu Speaking")
                    break

        radio = gr.Radio(
            choices=["Impromptu Speaking", "Storytelling", "Conflict Resolution"],
            label="Training Module",
            info="Select mode for your new session",
            value=default_mode or "Impromptu Speaking"
        )

    # Chatbox and inputs - set visibility based on default session
    chatbox = gr.Chatbot(
        type="messages",
        show_copy_button=True,
        height=400,
        visible=initial_visibility,
        avatar_images=("user.webp","agent.webp"),
        label="🔊 AI Chatbot: Tap Assistant's Reply to Hear It!",
        show_label=True
    )

    help_markdown = gr.Markdown("""
  ## 📝 How to Use
✨ **Type your prompt** in the textbox **OR**
🎤 **Record/Upload audio** – it will be transcribed automatically!

👉 **Don't forget to click "Send"** for the chatbot to process your message!
  """, visible=False)

    with gr.Row(equal_height=True):
        audio_input = gr.Audio(
            label="Speak/Upload your response",
            sources=["upload", "microphone"],
            visible=initial_visibility
        )
        text_input = gr.Textbox(
            label="Enter your response:",
            visible=initial_visibility
        )

    with gr.Row(equal_height=True):
        with gr.Column(scale=3):
            audio_output = gr.Audio(visible=initial_visibility)
        with gr.Column(scale=1):
            submit = gr.Button("Send", visible=initial_visibility)

    # Create new session
    new_session_btn.click(
        create_new_session,
        [radio, sessions_data],
        [session_dropdown, sessions_data, mode_column, radio, chatbox, text_input, submit, audio_output, help_markdown]
    )

    # Session selection handling
    session_dropdown.change(
        session_changed,
        [session_dropdown, sessions_data],
        [mode_column, radio, chatbox, text_input, audio_input, submit, audio_output, help_markdown]
    )

    # Mode update handling
    radio.change(
        update_mode,
        [radio, session_dropdown, sessions_data],
        [sessions_data, session_dropdown]
    )

    # Audio transcription
    audio_input.change(transcribe, audio_input, [text_input, submit])

    # Text input handling
    text_input.change(change_button, text_input, submit)

    # Message submission
    submit.click(
        user,
        [text_input, chatbox, session_dropdown, sessions_data],
        [text_input, chatbox, sessions_data, radio],
        queue=False
    ).then(
        predict,
        [session_dropdown, chatbox, sessions_data],
        [chatbox, sessions_data, audio_output]
    ).then(
        lambda: None,
        None,
        audio_input
    )

    # Listen to assistant's reply
    chatbox.select(listen, None, audio_output)

    # Feedback
    chatbox.like(like)

    # Load default session on app initialization if one exists
    if default_session is not None:
        demo.load(
            fn=session_changed,
            inputs=[session_dropdown, sessions_data],
            outputs=[mode_column, radio, chatbox, text_input, audio_input, submit, audio_output, help_markdown]
        )

demo.launch(debug=True)

Feedback database initialized at feedback.db
Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://1011f7cc8078bcb685.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Current session updated to: Impromptu Speaking - 2025-03-15 13:52:49


  gr.Chatbot(value=sessions_data["sessions"][session_id]["history"], visible=True),


Current session updated to: Impromptu Speaking - 2025-03-15 13:52:49


  gr.Chatbot(value=selected_session["history"], visible=True),
  torch.from_numpy(inputs), in_sampling_rate, self.feature_extractor.sampling_rate


Locking mode for session: Impromptu Speaking - 2025-03-15 13:52:49


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://1011f7cc8078bcb685.gradio.live




In [38]:
import pandas as pd
conn = sqlite3.connect('feedback.db')
query = "SELECT * FROM message_feedback ORDER BY feedback_time DESC"
df = pd.read_sql_query(query, conn)
conn.close()

In [39]:
df

Unnamed: 0,session_id,message_index,liked,feedback_time,message_content
0,Storytelling - 2025-03-15 07:04:49,3,1,2025-03-15T07:59:24.579615,"[""Here's a concise version:\n\n**Strengths:**\..."
1,Conflict Resolution - 2025-03-15 06:30:54,1,0,2025-03-15T07:58:49.644513,"[""Welcome to our conflict resolution simulatio..."
2,Impromptu Speaking - 2025-03-15 06:41:40,3,1,2025-03-15T07:50:37.854917,"[""**Overall Impression:** Your response is wel..."
3,Conflict Resolution - 2025-03-15 06:30:54,3,0,2025-03-15T07:49:31.758357,"[""Excellent approach! You've demonstrated seve..."


In [5]:
history = [{'role': 'user', 'content': 'Hey, How have you been!'}]