<a href="https://colab.research.google.com/github/abhirup84/genaipoc/blob/main/Transcript_Refactoring.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
#drive.mount('/content/drive')
#!ffmpeg -i '/content/drive/MyDrive/audio_for_poc/AUD-20250718-WA0004.mp3' -acodec pcm_s16le -ar 16000 -ac 1 'clean1.wav'
!ffmpeg -i 'AUD-20250718-WA0004.mp3' -acodec pcm_s16le -ar 16000 -ac 1 'clean1.wav'

In [None]:
!pip install openai-Whisper
!pip install speechbrain

Collecting openai-Whisper
  Downloading openai_whisper-20250625.tar.gz (803 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/803.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m798.7/803.2 kB[0m [31m27.3 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m803.2/803.2 kB[0m [31m19.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: openai-Whisper
  Building wheel for openai-Whisper (pyproject.toml) ... [?25l[?25hdone
  Created wheel for openai-Whisper: filename=openai_whisper-20250625-py3-none-any.whl size=803979 sha256=e6bcaedf5f481a71e5f53775c1ef37546c85f728979a533fd316a5187cbd7602
  Stored in directory: /root/.cache/pip/wheels/61/d2/20/09ec9bef734d126cba375b

In [None]:
# diarize_cpu_optimized.py
!pip install openai-Whisper

import time
import warnings
import numpy as np
import torch
import torchaudio
from pydub import AudioSegment
import whisper
from speechbrain.inference.speaker import EncoderClassifier
from sklearn.cluster import KMeans

# --------------------------
# Suppress warnings
# --------------------------
# NOTE: this silences every Python warning for the rest of the session,
# including ones that could point at a real problem.
warnings.filterwarnings("ignore")
torch.set_num_threads(2)  # Limit threads to reduce CPU overhead

# --------------------------
# Initialize models
# --------------------------
# Both models are loaded once at module level and shared by the helper
# functions below; t0 only times this loading step.
print("Loading models...")
t0 = time.time()
whisper_model = whisper.load_model("small")  # CPU-friendly
# Speaker-embedding encoder; its files are placed under the "tmpdir" directory.
encoder = EncoderClassifier.from_hparams(source="speechbrain/spkrec-ecapa-voxceleb", savedir="tmpdir")
print(f"[Init] Models loaded in {time.time()-t0:.2f} sec\n")

# Role names indexed by the 0/1 label computed in assign_speakers: index 0 is
# given to the cluster of the first segment that has an embedding.
ROLES = ["Executive", "Customer"]

# --------------------------
# Helpers
# --------------------------
def audiosegment_to_tensor(aud_seg: AudioSegment):
    """Convert an audio segment into a scaled, single-channel float tensor.

    The raw samples are read as float32, two-channel input is down-mixed by
    averaging each interleaved left/right pair, and the result is divided by
    ``2 ** (8 * sample_width - 1)``.

    Args:
        aud_seg: Object exposing ``get_array_of_samples()``, ``channels`` and
            ``sample_width`` (bytes per sample).
            NOTE(review): the scaling assumes signed integer samples — confirm.

    Returns:
        A ``torch.Tensor`` of shape ``(1, num_samples)``.
    """
    samples = np.asarray(aud_seg.get_array_of_samples(), dtype=np.float32)

    # Only exactly-two-channel audio is down-mixed; anything else passes through.
    is_stereo = aud_seg.channels == 2
    if is_stereo:
        samples = samples.reshape(-1, 2).mean(axis=1)

    # Magnitude of the most negative value representable at this sample width.
    full_scale = 1 << (8 * aud_seg.sample_width - 1)
    return torch.from_numpy(samples)[None, :] / full_scale

def transcribe_chunk(chunk):
    """Transcribe one audio chunk with the module-level Whisper model.

    Args:
        chunk: Audio segment accepted by ``audiosegment_to_tensor``.
            NOTE(review): no resampling happens here; presumably the audio is
            already at the rate Whisper expects — confirm against the caller.

    Returns:
        A list of ``{"start", "end", "text"}`` dicts, one per Whisper segment,
        with times relative to the start of ``chunk`` and ``text`` stripped of
        surrounding whitespace.
    """
    # Whisper takes a 1-D float array, so drop the leading channel dimension.
    waveform = audiosegment_to_tensor(chunk).squeeze(0).numpy()
    whisper_out = whisper_model.transcribe(waveform, fp16=False)
    return [
        {"start": piece["start"], "end": piece["end"], "text": piece["text"].strip()}
        for piece in whisper_out.get("segments", [])
    ]

def extract_embeddings(segments, audio, min_duration_ms=500, target_rate=16000):
    """Compute one speaker embedding per sufficiently long segment.

    Args:
        segments: Sequence of dicts with ``"start"`` and ``"end"`` in seconds.
        audio: Full recording, sliceable by millisecond (``audio[a:b]``).
        min_duration_ms: Segments whose audio slice is shorter than this are
            skipped (default 500, the original hard-coded value).
        target_rate: Sample rate the encoder is fed; slices at any other rate
            are resampled to it (default 16000).

    Returns:
        ``(embeddings, valid_indices)``: ``embeddings`` is a 2-D array with one
        row per embedded segment (shape ``(0, 192)`` when nothing was
        embedded), and ``valid_indices`` lists the positions in ``segments``
        those rows belong to.
    """
    embeddings = []
    valid_indices = []
    # One Resample transform per source rate, built on first use and reused,
    # instead of constructing a new transform for every segment.
    resamplers = {}

    for idx, seg in enumerate(segments):
        start_ms, end_ms = int(seg["start"] * 1000), int(seg["end"] * 1000)
        seg_audio = audio[start_ms:end_ms]
        if len(seg_audio) < min_duration_ms:  # Skip too short segments
            continue

        tensor = audiosegment_to_tensor(seg_audio)
        rate = seg_audio.frame_rate
        if rate != target_rate:
            resampler = resamplers.get(rate)
            if resampler is None:
                resampler = torchaudio.transforms.Resample(rate, target_rate)
                resamplers[rate] = resampler
            tensor = resampler(tensor)

        with torch.no_grad():
            emb = encoder.encode_batch(tensor)
        emb_np = emb.squeeze(0).cpu().numpy()
        # Collapse a remaining leading axis so every embedding is a 1-D vector.
        if emb_np.ndim == 2:
            emb_np = emb_np.mean(axis=0)
        embeddings.append(emb_np)
        valid_indices.append(idx)

    if not embeddings:
        # 192 is the embedding width this code assumes for the encoder.
        return np.zeros((0, 192)), valid_indices
    return np.vstack(embeddings), valid_indices

def assign_speakers(segments, embeddings, valid_indices, roles=None):
    """Attach a speaker role to every transcript segment.

    Segments that have an embedding are split into two clusters with KMeans.
    The cluster of the first embedded segment becomes role 0 and the other
    cluster role 1. Segments without an embedding inherit the role of the
    nearest embedded segment by list position.

    Args:
        segments: Sequence of dicts with ``"start"``, ``"end"`` and ``"text"``.
        embeddings: 2-D array with one row per entry of ``valid_indices``.
        valid_indices: Positions in ``segments`` that have an embedding.
        roles: Optional two-element sequence of role names; defaults to the
            module-level ``ROLES``.

    Returns:
        A list of ``{"start", "end", "speaker", "text"}`` dicts in the same
        order as ``segments``. When no segment has an embedding, every segment
        is given role 0 instead of raising ``IndexError``.
    """
    if roles is None:
        roles = ROLES

    if not valid_indices:
        # Nothing to cluster (empty transcript or only very short segments):
        # fall back to a single speaker rather than indexing an empty list.
        full_labels = [0] * len(segments)
    else:
        if embeddings.shape[0] < 2:
            # KMeans needs at least two samples for two clusters.
            labels = np.zeros(len(valid_indices), dtype=int)
        else:
            labels = KMeans(n_clusters=2, random_state=42).fit_predict(embeddings)

        # Heuristic: whoever speaks first (among embedded segments) is role 0.
        exec_label = labels[0]
        full_labels = [-1] * len(segments)
        for lbl, idx in zip(labels, valid_indices):
            full_labels[idx] = 0 if lbl == exec_label else 1

        # Propagate labels to segments that had no embedding.
        for i, lbl in enumerate(full_labels):
            if lbl == -1:
                nearest = min(valid_indices, key=lambda x: abs(x - i))
                full_labels[i] = full_labels[nearest]

    return [
        {
            "start": seg["start"],
            "end": seg["end"],
            "speaker": roles[lbl],
            "text": seg["text"],
        }
        for seg, lbl in zip(segments, full_labels)
    ]

def merge_adjacent(transcript, threshold=0.25):
    """Join consecutive entries spoken by the same speaker into single turns.

    An entry is folded into the running turn when the speaker matches and the
    entry starts no later than ``threshold`` seconds after the turn's current
    end. The turn then takes the entry's end time and appends its text after a
    single space.

    Args:
        transcript: Sequence of dicts with ``"start"``, ``"end"``,
            ``"speaker"`` and ``"text"``.
        threshold: Largest gap, in seconds, still treated as the same turn.

    Returns:
        A new list of merged dicts. Each turn starts from a copy of its first
        entry, so the input dicts are not modified.
    """
    combined = []
    current = None
    for entry in transcript:
        continues_turn = (
            bool(current)
            and current["speaker"] == entry["speaker"]
            and entry["start"] <= current["end"] + threshold
        )
        if continues_turn:
            current["end"] = entry["end"]
            current["text"] = current["text"] + " " + entry["text"]
            continue

        # Speaker changed or the gap is too large: close the running turn.
        if current:
            combined.append(current)
        current = entry.copy()

    if current:
        combined.append(current)
    return combined

# --------------------------
# Main diarization
# --------------------------
def diarize(audio_file):
    """Transcribe ``audio_file`` and label each turn with a speaker role.

    Pipeline: load the audio, cut it into consecutive 20-second chunks,
    transcribe each chunk, shift the chunk-relative timestamps onto the
    recording's timeline, embed and cluster the segments into speakers, then
    merge adjacent segments from the same speaker. Progress and per-step wall
    times are printed along the way.

    Args:
        audio_file: Path (or file-like object) passed to
            ``AudioSegment.from_file``.

    Returns:
        ``(merged, times)``: the merged transcript as a list of
        ``{"start", "end", "speaker", "text"}`` dicts, and a dict of elapsed
        seconds keyed by ``"load_audio"``, ``"split_chunks"``,
        ``"transcribe"``, ``"embeddings"``, ``"merge"`` and ``"total"``.
    """
    timings = {}
    run_started = time.time()

    # Step 1: decode the whole recording.
    tick = time.time()
    audio = AudioSegment.from_file(audio_file)
    timings["load_audio"] = time.time() - tick
    print(f"[Step 1] Load audio: {timings['load_audio']:.2f} sec")

    # Step 2: fixed-length, back-to-back chunks (the last one may be shorter).
    tick = time.time()
    chunk_ms = 20000  # 20 sec chunks
    chunks = []
    for begin in range(0, len(audio), chunk_ms):
        chunks.append(audio[begin:begin + chunk_ms])
    timings["split_chunks"] = time.time() - tick
    print(f"[Step 2] Split into {len(chunks)} chunks: {timings['split_chunks']:.2f} sec")

    # Step 3: transcribe chunk by chunk; `offset` is the running start time of
    # the current chunk in seconds, used to make timestamps absolute.
    tick = time.time()
    segments = []
    offset = 0.0
    for chunk in chunks:
        for seg in transcribe_chunk(chunk):
            seg["start"] += offset
            seg["end"] += offset
            segments.append(seg)
        offset += len(chunk) / 1000.0
    timings["transcribe"] = time.time() - tick
    print(f"[Step 3] Whisper transcription: {timings['transcribe']:.2f} sec ({len(segments)} segments)")

    # Step 4: one speaker embedding per usable segment.
    tick = time.time()
    embeddings, valid_indices = extract_embeddings(segments, audio)
    timings["embeddings"] = time.time() - tick
    print(f"[Step 4] Speaker embeddings: {timings['embeddings']:.2f} sec (valid {len(valid_indices)})")

    # Step 5: label segments with roles, then fuse same-speaker neighbours.
    tick = time.time()
    labelled = assign_speakers(segments, embeddings, valid_indices)
    merged = merge_adjacent(labelled)
    timings["merge"] = time.time() - tick
    print(f"[Step 5] Merge transcript: {timings['merge']:.2f} sec")

    timings["total"] = time.time() - run_started
    print(f"[Total] Finished in {timings['total']:.2f} sec\n")
    return merged, timings

# --------------------------
# Run
# --------------------------
if __name__ == "__main__":
    # Diarize the converted recording and save it as plain text.
    audio_file = "clean1.wav"
    transcript, step_times = diarize(audio_file)

    # One "<speaker>: <text>" line per merged turn.
    with open("output_cpu.txt", "w", encoding="utf-8") as f:
        f.writelines(
            f"{turn['speaker']}: {turn['text']}\n" for turn in transcript
        )

    print("✅ Transcript saved to output_cpu.txt")
    print("⏱ Step times:", step_times)



  m = re.match('([su]([0-9]{1,2})p?) \(([0-9]{1,2}) bit\)$', token)
  m2 = re.match('([su]([0-9]{1,2})p?)( \(default\))?$', token)
  elif re.match('(flt)p?( \(default\))?$', token):
  elif re.match('(dbl)p?( \(default\))?$', token):
  available_backends = torchaudio.list_audio_backends()
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint save hook for _speechbrain_save
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint load hook for _speechbrain_load
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint save hook for save
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint load hook for load
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint save hook for _save
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint load hook for _recover


Loading models...


100%|███████████████████████████████████████| 461M/461M [00:26<00:00, 18.0MiB/s]
INFO:speechbrain.utils.fetching:Fetch hyperparams.yaml: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached


hyperparams.yaml: 0.00B [00:00, ?B/s]

DEBUG:speechbrain.utils.fetching:Fetch: Local file found, creating symlink '/root/.cache/huggingface/hub/models--speechbrain--spkrec-ecapa-voxceleb/snapshots/0f99f2d0ebe89ac095bcc5903c4dd8f72b367286/hyperparams.yaml' -> '/content/tmpdir/hyperparams.yaml'
INFO:speechbrain.utils.fetching:Fetch custom.py: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint save hook for _save
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint load hook for _load
DEBUG:speechbrain.utils.checkpoints:Registered parameter transfer hook for _load
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint save hook for save
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint load hook for load_if_possible
DEBUG:speechbrain.utils.parameter_transfer:Collecting files (or symlinks) for pretraining in tmpdir.
INFO:speechbrain.utils.fetching:Fetch embedding_model.ckpt: Fetching from HuggingFace Hub 'speechbrain/spkrec-ec

embedding_model.ckpt:   0%|          | 0.00/83.3M [00:00<?, ?B/s]

DEBUG:speechbrain.utils.fetching:Fetch: Local file found, creating symlink '/root/.cache/huggingface/hub/models--speechbrain--spkrec-ecapa-voxceleb/snapshots/0f99f2d0ebe89ac095bcc5903c4dd8f72b367286/embedding_model.ckpt' -> '/content/tmpdir/embedding_model.ckpt'
DEBUG:speechbrain.utils.parameter_transfer:Set local path in self.paths["embedding_model"] = /content/tmpdir/embedding_model.ckpt
INFO:speechbrain.utils.fetching:Fetch mean_var_norm_emb.ckpt: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached


mean_var_norm_emb.ckpt:   0%|          | 0.00/1.92k [00:00<?, ?B/s]

DEBUG:speechbrain.utils.fetching:Fetch: Local file found, creating symlink '/root/.cache/huggingface/hub/models--speechbrain--spkrec-ecapa-voxceleb/snapshots/0f99f2d0ebe89ac095bcc5903c4dd8f72b367286/mean_var_norm_emb.ckpt' -> '/content/tmpdir/mean_var_norm_emb.ckpt'
DEBUG:speechbrain.utils.parameter_transfer:Set local path in self.paths["mean_var_norm_emb"] = /content/tmpdir/mean_var_norm_emb.ckpt
INFO:speechbrain.utils.fetching:Fetch classifier.ckpt: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached


classifier.ckpt:   0%|          | 0.00/5.53M [00:00<?, ?B/s]

DEBUG:speechbrain.utils.fetching:Fetch: Local file found, creating symlink '/root/.cache/huggingface/hub/models--speechbrain--spkrec-ecapa-voxceleb/snapshots/0f99f2d0ebe89ac095bcc5903c4dd8f72b367286/classifier.ckpt' -> '/content/tmpdir/classifier.ckpt'
DEBUG:speechbrain.utils.parameter_transfer:Set local path in self.paths["classifier"] = /content/tmpdir/classifier.ckpt
INFO:speechbrain.utils.fetching:Fetch label_encoder.txt: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached


label_encoder.txt: 0.00B [00:00, ?B/s]

DEBUG:speechbrain.utils.fetching:Fetch: Local file found, creating symlink '/root/.cache/huggingface/hub/models--speechbrain--spkrec-ecapa-voxceleb/snapshots/0f99f2d0ebe89ac095bcc5903c4dd8f72b367286/label_encoder.txt' -> '/content/tmpdir/label_encoder.ckpt'
DEBUG:speechbrain.utils.parameter_transfer:Set local path in self.paths["label_encoder"] = /content/tmpdir/label_encoder.ckpt
INFO:speechbrain.utils.parameter_transfer:Loading pretrained files for: embedding_model, mean_var_norm_emb, classifier, label_encoder
DEBUG:speechbrain.utils.parameter_transfer:Redirecting (loading from local path): embedding_model -> /content/tmpdir/embedding_model.ckpt
DEBUG:speechbrain.utils.parameter_transfer:Redirecting (loading from local path): mean_var_norm_emb -> /content/tmpdir/mean_var_norm_emb.ckpt
DEBUG:speechbrain.utils.parameter_transfer:Redirecting (loading from local path): classifier -> /content/tmpdir/classifier.ckpt
DEBUG:speechbrain.utils.parameter_transfer:Redirecting (loading from local

[Init] Models loaded in 34.40 sec

[Step 1] Load audio: 0.00 sec
[Step 2] Split into 10 chunks: 0.00 sec
[Step 3] Whisper transcription: 18.89 sec (41 segments)
[Step 4] Speaker embeddings: 42.65 sec (valid 41)
[Step 5] Merge transcript: 0.04 sec
[Total] Finished in 61.59 sec

✅ Transcript saved to output_cpu.txt
⏱ Step times: {'load_audio': 0.0032873153686523438, 'split_chunks': 0.001203298568725586, 'transcribe': 18.889604568481445, 'embeddings': 42.64998149871826, 'merge': 0.04393649101257324, 'total': 61.588536500930786}


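These options refer to the `ffmpeg` conversion command, which has the form `ffmpeg -i your_audio.mp3 -acodec pcm_s16le -ar 16000 -ac 1 your_audio.wav`. Replace `your_audio.mp3` with the name of your input MP3 file and `your_audio.wav` with the desired name for the output WAV file.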

* `-i your_audio.mp3`: specifies the input file.
* `-acodec pcm_s16le`: specifies the audio codec for the output WAV file. `pcm_s16le` is signed 16-bit little-endian PCM, a common uncompressed audio format.
* `-ar 16000`: sets the audio sample rate to 16000 Hz.
* `-ac 1`: sets the number of audio channels to 1 (mono).

In [None]:
!pip install transformers
!pip install accelerate
!pip install -U bitsandbytes
# If you encounter a PackageNotFoundError for bitsandbytes after running this cell,
# try restarting the Colab runtime (Runtime > Restart runtime) and then run the cells again.

Collecting bitsandbytes
  Downloading bitsandbytes-0.47.0-py3-none-manylinux_2_24_x86_64.whl.metadata (11 kB)
Downloading bitsandbytes-0.47.0-py3-none-manylinux_2_24_x86_64.whl (61.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.3/61.3 MB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.47.0


In [None]:
import torch
import warnings
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Load the transcript from the file
# (output_cpu.txt holds one "<speaker>: <text>" line per turn, as written by
# the diarization cell).
with open("output_cpu.txt", "r", encoding="utf-8") as file:
    transcript = file.read()

# NOTE: suppresses all Python warnings from this point on.
warnings.filterwarnings("ignore")

# Configure quantization for efficient loading
# (4-bit NF4 weights with double quantization; computation runs in bfloat16).
nf4_config = BitsAndBytesConfig(
   load_in_4bit=True,
   bnb_4bit_quant_type="nf4",
   bnb_4bit_use_double_quant=True,
   bnb_4bit_compute_dtype=torch.bfloat16
)

# Tokenizer and model are module-level globals read by analyze_transcript_llama.
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# NOTE(review): no device_map is passed; presumably the quantized model is
# placed on the GPU automatically — confirm.
model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=nf4_config)

# Define a function to query the transcript with different analytical questions using Llama
def analyze_transcript_llama(query):
    """Ask the loaded language model one question about the call transcript.

    The prompt embeds the module-level ``transcript`` followed by ``query`` and
    an ``Answer:`` marker. Only the tokens generated after the prompt are
    decoded, so the result cannot be confused by an ``Answer:`` marker that
    happens to appear inside the transcript or the query.

    Args:
        query: The question to put to the model.

    Returns:
        The model's answer with surrounding whitespace removed. Sampling is
        enabled, so repeated calls may return different text.

    NOTE(review): the prompt is not truncated; a long transcript may exceed the
    model's context window — confirm for long calls.
    """
    prompt = f"""You are an expert call analyst. Analyze the transcript and answer the user's query.

Transcript:
{transcript}

Query: {query}

Answer:
"""
    # Send the inputs to wherever the model actually lives instead of guessing
    # from CUDA availability.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # Generate response
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=500,
            num_return_sequences=1,
            do_sample=True,
            top_k=50,
            top_p=0.95,
            temperature=0.3
        )

    # The returned sequence starts with the prompt tokens; keep only what the
    # model added after them.
    prompt_length = inputs["input_ids"].shape[1]
    generated_tokens = outputs[0][prompt_length:]
    return tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()

# Call-quality rubric: each entry is one yes/no check about the call, together
# with the score the model is asked to report for "yes" (0 otherwise).
# The "yes" scores of the 16 checks add up to 36.
queries = [
    "Did Executive Uses appropriate greeting. Also if yes set score =2 else =0.",
    "Did Executive Obtains and verifies customer contact information. Also if yes set score =1 else =0.",
    "Did Executive Understood and clarified the issue raised by Customer. Also if yes set score =4 else =0.",
    "Did Executive Actively listen and avoided the interruption in call. Also if yes set score =3 else =0.",
    "Did Executive used proper grammar & terms. Also if yes set score =4 else =0.",
    "Did Executive speak clearly and audibly. Also if yes set score =1 else =0.",
    "Was the Executive enthusiastic in the call. Also if yes set score =3 else =0.",
    "Was the Executive empathetic to the Customer in the call. Also if yes set score =4 else =0.",
    "Did Executive take ownership of call. Also if yes set score =1 else =0.",
    "Was the call duration suitable? Also if yes set score =2 else =0.",
    "Did the Customer confirm the resolution or expressed satisfaction. Also if yes set score =2 else =0.",
    "Did Executive appropriately use of hold/Dead air. Also if yes set score =1 else =0.",
    "Did Executive check/triage and proper probing. Also if yes set score =3 else =0.",
    "Did Executive follow correct procedure. Also if yes set score =1 else =0.",
    "Did Executive share the reference number of the call. Also if yes set score =2 else =0.",
    "Did Executive offer Additional help before closing the call. Also if yes set score =2 else =0."
]

# Run analysis for each query using Llama
# Each query is answered on its own; the whole transcript is included in every
# prompt, and the scores are only printed, not parsed or totalled.
for q in queries:
    print(f"\nQuery: {q}")
    print("Response:", analyze_transcript_llama(q))

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]


Query: Did Executive Uses appropriate greeting. Also if yes set score =2 else =0.
Response: Yes, Executive used appropriate greeting.

Query: Did Executive Obtains and verifies customer contact information. Also if yes set score =1 else =0.
Response: Yes, Executive obtained and verified customer contact information. The score for this question is 1.

Query: Did Executive Understood and clarified the issue raised by Customer. Also if yes set score =4 else =0.
Response: Yes, Executive Understood and clarified the issue raised by Customer. The score of the question is 4.

Query: Did Executive Actively listen and avoided the interruption in call. Also if yes set score =3 else =0.
Response: Yes, Executive actively listened and avoided the interruption in the call. The score for this question is 3.

Query: Did Executive used proper grammar & terms. Also if yes set score =4 else =0.
Response: Yes, Executive used proper grammar & terms. The score for this question is 4.

Query: Did Executive 