In [6]:
import os
import sys
import librosa
import whisper
import noisereduce as nr
import soundfile as sf
from simple_diarizer.diarizer import Diarizer
from config import MONGODB_URI, MONGODB_DATABASE_NAME, MONGODB_COLLECTION_NAME
import numpy as np
from pymongo import MongoClient

# Load the Whisper model for transcription
whisper_model = whisper.load_model("medium")

def process_audio(file_path):
    # Load and denoise the audio file
    audio, sr = librosa.load(file_path, sr=None)
    reduced_noise_audio = nr.reduce_noise(y=audio, sr=sr, prop_decrease=0.9, stationary=True)
    
    # Save the denoised audio to a temporary file
    denoised_audio_file = '/tmp/denoised_audio.wav'
    sf.write(denoised_audio_file, reduced_noise_audio, sr)

    # Perform speaker diarization using simple_diarizer
    diarization = Diarizer(embed_model='xvec', cluster_method='sc')
    segments = diarization.diarize(denoised_audio_file, num_speakers=2)

    # Process each speaker segment and transcribe
    speaker_transcriptions = []
    current_speaker = None

    # Reload the denoised audio data for segmentation
    audio, sr = librosa.load(denoised_audio_file, sr=None)

    # Connect to MongoDB
    client = MongoClient(MONGODB_URI)
    db = client[MONGODB_DATABASE_NAME]
    collection = db[MONGODB_COLLECTION_NAME]

    for segment in segments:
        start_time = segment['start']
        end_time = segment['end']
        speaker_label = segment['label']  # The speaker label (0, 1, etc.)

        # Convert the start and end times from seconds to sample indices
        start_sample = int(start_time * sr)
        end_sample = int(end_time * sr)

        # Extract the audio segment corresponding to the current speaker's time frame
        segment_audio = audio[start_sample:end_sample]

        # Save the extracted audio segment to a temporary file for Whisper to transcribe
        temp_segment_path = f"/tmp/temp_speaker_{speaker_label}_{int(start_time)}.wav"
        sf.write(temp_segment_path, segment_audio, sr)

        # Transcribe the audio segment using Whisper, forcing it to use English language
        transcription_result = whisper_model.transcribe(temp_segment_path, language="en")
        transcription_text = transcription_result["text"]

        # Handle the first transcription block
        if current_speaker is None or current_speaker != speaker_label:
            # Add the new transcription block for the first speaker
            transcription_document = {
                "speaker": int(speaker_label),  # Convert numpy.int32 to native int
                "text": transcription_text,
                "start_time": float(start_time),  # Convert to float for MongoDB
                "end_time": float(end_time)       # Convert to float for MongoDB
            }
            # Insert the transcription document into MongoDB
            result = collection.insert_one(transcription_document)
            print(f"Inserted document with ID: {result.inserted_id}")

            # Add the transcription block to the list
            speaker_transcriptions.append({
                "speaker": speaker_label,
                "text": transcription_text,
                "start_time": start_time,
                "end_time": end_time
            })
        else:
            # If the same speaker is continuing, append the transcription and extend the end time
            speaker_transcriptions[-1]["text"] += " " + transcription_text
            speaker_transcriptions[-1]["end_time"] = float(end_time)

        # Delete the temporary file after processing
        os.remove(temp_segment_path)

        # Update the current speaker for the next iteration
        current_speaker = speaker_label

    # Delete the denoised audio file after processing
    os.remove(denoised_audio_file)

    # Output the speaker transcriptions with timestamps
    for block in speaker_transcriptions:
        print(f"Speaker {block['speaker']}:")
        print(f"Text: {block['text']}")
        print(f"Timestamp: {block['start_time']:.2f}s to {block['end_time']:.2f}s\n")

    return speaker_transcriptions

# Add the if __name__ == "__main__" block here:
if __name__ == "__main__":
    # Get the file path from the command-line arguments
    if len(sys.argv) < 2:
        print("Usage: python script.py <file_path>")
        sys.exit(1)
    
    file_path = '/Users/junjie/mmRag/backend/SIT_NVIDIA_MEETING_PART1.wav' # Get the file path from the first argument
    process_audio(file_path)


  checkpoint = torch.load(fp, map_location=device)
Using cache found in /Users/junjie/.cache/torch/hub/snakers4_silero-vad_master
INFO:speechbrain.utils.fetching:Fetch hyperparams.yaml: Fetching from HuggingFace Hub 'speechbrain/spkrec-xvect-voxceleb' if not cached
INFO:speechbrain.utils.fetching:Fetch custom.py: Fetching from HuggingFace Hub 'speechbrain/spkrec-xvect-voxceleb' if not cached
INFO:speechbrain.utils.fetching:Fetch embedding_model.ckpt: Fetching from HuggingFace Hub 'speechbrain/spkrec-xvect-voxceleb' if not cached
INFO:speechbrain.utils.fetching:Fetch mean_var_norm_emb.ckpt: Fetching from HuggingFace Hub 'speechbrain/spkrec-xvect-voxceleb' if not cached
INFO:speechbrain.utils.fetching:Fetch classifier.ckpt: Fetching from HuggingFace Hub 'speechbrain/spkrec-xvect-voxceleb' if not cached
INFO:speechbrain.utils.fetching:Fetch label_encoder.txt: Fetching from HuggingFace Hub 'speechbrain/spkrec-xvect-voxceleb' if not cached
INFO:speechbrain.utils.parameter_transfer:Loading p

Running VAD...
Splitting by silence found 60 utterances
Extracting embeddings...


Utterances: 100%|██████████| 60/60 [00:00<00:00, 102.37it/s]
  adjacency = check_symmetric(adjacency)


Clustering to 2 speakers...
Cleaning up output...
Done!
Inserted document with ID: 676a839fc7c0f5b9c5fd5fad




Inserted document with ID: 676a83a5c7c0f5b9c5fd5fae




Inserted document with ID: 676a83e6c7c0f5b9c5fd5faf




Inserted document with ID: 676a83f1c7c0f5b9c5fd5fb0




Inserted document with ID: 676a8454c7c0f5b9c5fd5fb1




Inserted document with ID: 676a8466c7c0f5b9c5fd5fb2




Inserted document with ID: 676a8471c7c0f5b9c5fd5fb3




Inserted document with ID: 676a8477c7c0f5b9c5fd5fb4




Inserted document with ID: 676a847dc7c0f5b9c5fd5fb5




Inserted document with ID: 676a8488c7c0f5b9c5fd5fb6




Inserted document with ID: 676a8493c7c0f5b9c5fd5fb7




Inserted document with ID: 676a8499c7c0f5b9c5fd5fb8




Speaker 1:
Text:  So as mentioned, audio preprocessing, speech diarization, then as I mentioned audio preprocessing.
Timestamp: 4.10s to 8.49s

Speaker 0:
Text:  do it like a front end.  So, and you also provided me with a research paper to work on.  Um...  So for this...  We should have created like a React front then. So.  I have two portions, one for file upload.  Uh...  and one for all.  Shutting.  Yeah.  This is the end of the video. Thanks for watching.
Timestamp: 8.49s to 39.29s

Speaker 1:
Text:  to ask questions.  So let's see.
Timestamp: 39.59s to 42.08s

Speaker 0:
Text:  If I  I blew it.  I'll foul now over here.  And I try. Hope it works. Yep.  Okay, so.   I forgot my IP.  It should reflect here accordingly. So this is just part of the pipeline.  Yeah.  So once I do this, it should propagate to...  Bye.  It should basically process the file.  And then.  propagate all the way to no sequel.  But for that I haven't linked to the...  vector db.  So that.
Timestamp: 42.31s to 9

In [None]:
# This portion i want to use openai gpt 4o mini to output me the text in the following format which i will then use to store in mongodb



## ZZ


In [None]:
import os
import sys
import librosa
import whisper
import noisereduce as nr
import soundfile as sf
from simple_diarizer.diarizer import Diarizer
from pymongo import MongoClient
import os
from config import MONGODB_URI, MONGODB_DATABASE_NAME, MONGODB_COLLECTION_NAME, OPENAI_API_KEY
from openai import OpenAI  # You need to install openai library for GPT-3 or GPT-4 integration
from dotenv import load_dotenv


load_dotenv()
client = OpenAI()
# OPENAI_API_KEY_4O = os.getenv(OPENAI_API_KEY)
# Load the Whisper model for transcription
whisper_model = whisper.load_model("medium")


# OpenAI API Key Configuration (Replace with your actual key)
openai.api_key = OPENAI_API_KEY

def generate_summary_and_action_items(transcription_text):
    # Define the prompt to extract summary, action items, and decisions
    prompt = f"""
    You are given a transcript of a meeting, with dialogue between different speakers. Your task is to summarize the content covered in the meeting and extract the action items in the format below. 
    Please focus on the key topics discussed, decisions made, and action items that need to be followed up on. The format should include:
    
    Meeting Summary: A high-level summary of the key topics discussed in the meeting.
    Action Items: A list of actionable tasks, including who is responsible for each task and when it needs to be completed.
    Decisions Made: A list of any decisions that were made during the meeting.
    
    The input for this prompt will be in the format here:

    {transcription_text}
    """
    
    # Call OpenAI GPT model to generate structured content (summary, action items, decisions)
    response = client.chat.completions.create(
        
        model="gpt-4o-mini",  # Use "gpt-3.5-turbo" or "gpt-4" if available
        messages=[{"role": "user", "content": prompt}],
        max_tokens=16384,  # You can adjust the token limit based on your needs
        temperature=0
    )
    
    return response.choices[0].message.content

def store_meeting_data(file_id, file_name, file_content):
    # Connect to MongoDB
    client = MongoClient(MONGODB_URI)
    db = client[MONGODB_DATABASE_NAME]
    collection = db[MONGODB_COLLECTION_NAME]
    
    # Create the document to store in MongoDB
    meeting_data = {
        "file_id": file_id,
        "file_name": file_name,
        "file_content": file_content,
        "timestamp": os.path.getmtime(file_name)  # You can use the file modification time as a timestamp
    }
    
    # Insert the meeting data into MongoDB
    result = collection.insert_one(meeting_data)
    print(f"Inserted document with ID: {result.inserted_id}")

def process_audio(file_path):
    # Load and denoise the audio file
    audio, sr = librosa.load(file_path, sr=None)
    reduced_noise_audio = nr.reduce_noise(y=audio, sr=sr, prop_decrease=0.9, stationary=True)
    
    # Save the denoised audio to a temporary file
    denoised_audio_file = '/tmp/denoised_audio.wav'
    sf.write(denoised_audio_file, reduced_noise_audio, sr)

    # Perform speaker diarization using simple_diarizer
    diarization = Diarizer(embed_model='xvec', cluster_method='sc')
    segments = diarization.diarize(denoised_audio_file, num_speakers=2)

    # Process each speaker segment and transcribe
    speaker_transcriptions = []
    current_speaker = None

    # Reload the denoised audio data for segmentation
    audio, sr = librosa.load(denoised_audio_file, sr=None)

    # Generate the transcription text
    transcription_text = ""
    for segment in segments:
        start_time = segment['start']
        end_time = segment['end']
        speaker_label = segment['label']

        # Convert the start and end times from seconds to sample indices
        start_sample = int(start_time * sr)
        end_sample = int(end_time * sr)

        # Extract the audio segment corresponding to the current speaker's time frame
        segment_audio = audio[start_sample:end_sample]

        # Save the extracted audio segment to a temporary file for Whisper to transcribe
        temp_segment_path = f"/tmp/temp_speaker_{speaker_label}_{int(start_time)}.wav"
        sf.write(temp_segment_path, segment_audio, sr)

        # Transcribe the audio segment using Whisper, forcing it to use English language
        transcription_result = whisper_model.transcribe(temp_segment_path, language="en")
        transcription_text += transcription_result["text"] + " "

        # Delete the temporary file after processing
        os.remove(temp_segment_path)

        # Update the current speaker for the next iteration
        current_speaker = speaker_label

    # Delete the denoised audio file after processing
    os.remove(denoised_audio_file)

    # Generate the structured summary and action items using the transcription text
    structured_content = generate_summary_and_action_items(transcription_text)

    # Prepare to store the meeting data in MongoDB
    file_id = os.path.basename(file_path)  # You can use file name or generate a unique ID
    file_name = file_path
    store_meeting_data(file_id, file_name, structured_content)

# Add the if __name__ == "__main__" block here:
if __name__ == "__main__":
    # Get the file path from the command-line arguments
    if len(sys.argv) < 2:
        print("Usage: python script.py <file_path>")
        sys.exit(1)
    
    file_path = '/Users/junjie/mmRag/backend/PART2.wav'  # Get the file path from the first argument
    process_audio(file_path)


  checkpoint = torch.load(fp, map_location=device)
Using cache found in /Users/junjie/.cache/torch/hub/snakers4_silero-vad_master
INFO:speechbrain.utils.fetching:Fetch hyperparams.yaml: Fetching from HuggingFace Hub 'speechbrain/spkrec-xvect-voxceleb' if not cached
INFO:speechbrain.utils.fetching:Fetch custom.py: Fetching from HuggingFace Hub 'speechbrain/spkrec-xvect-voxceleb' if not cached
INFO:speechbrain.utils.fetching:Fetch embedding_model.ckpt: Fetching from HuggingFace Hub 'speechbrain/spkrec-xvect-voxceleb' if not cached
INFO:speechbrain.utils.fetching:Fetch mean_var_norm_emb.ckpt: Fetching from HuggingFace Hub 'speechbrain/spkrec-xvect-voxceleb' if not cached
INFO:speechbrain.utils.fetching:Fetch classifier.ckpt: Fetching from HuggingFace Hub 'speechbrain/spkrec-xvect-voxceleb' if not cached
INFO:speechbrain.utils.fetching:Fetch label_encoder.txt: Fetching from HuggingFace Hub 'speechbrain/spkrec-xvect-voxceleb' if not cached
INFO:speechbrain.utils.parameter_transfer:Loading p

Running VAD...
Splitting by silence found 8 utterances
Extracting embeddings...


Utterances: 100%|██████████| 8/8 [00:00<00:00, 96.71it/s]

Clustering to 2 speakers...
Cleaning up output...
Done!



  adjacency = check_symmetric(adjacency)
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Inserted document with ID: 676a900fc7c0f5b9c5fd5fbc


In [4]:
from qdrant_client import QdrantClient
from qdrant_client.http.models import PointStruct, VectorParams, Distance
from sentence_transformers import SentenceTransformer
import uuid

client = QdrantClient(url="http://localhost:6333")

client.create_collection(
    collection_name="mm_collection",
    vectors_config=VectorParams(size=4, distance=Distance.DOT),
)

  from .autonotebook import tqdm as notebook_tqdm


True

In [11]:
from qdrant_client import QdrantClient, models

client = QdrantClient(url="http://localhost:6333")

client.create_collection(
    collection_name="mm_collection",
    vectors_config=models.VectorParams(size=384, distance=models.Distance.COSINE),
)

INFO:httpx:HTTP Request: PUT http://localhost:6333/collections/mm_collection "HTTP/1.1 200 OK"


True

In [7]:
from qdrant_client import QdrantClient
from llama_index.vector_stores.qdrant import QdrantVectorStore

client = QdrantClient(url="http://localhost:6333")

# construct vector store
vector_store = QdrantVectorStore(
    client=client,
    collection_name='mm_collection',
)

In [11]:
from sentence_transformers import SentenceTransformer
from qdrant_client.http.models import PointStruct, VectorParams, Distance
import uuid

# Load the pre-trained sentence transformer model
model = SentenceTransformer('all-MiniLM-L6-v2')


client = QdrantClient(url="http://localhost:6333")

# Example documents to embed
documents = [
    "Meeting 1 summary: We discussed project deadlines and assigned tasks.",
    "Meeting 2 summary: The budget was approved, and the project manager was assigned.",
    # Add your other meeting documents here
]

# Generate embeddings (vectors) for each document
embeddings = model.encode(documents)

# Insert vectors into Qdrant
for i, embedding in enumerate(embeddings):
    point = PointStruct(
        id=str(uuid.uuid4()),  # Unique ID for each document
        vector=embedding.tolist(),  # Convert embedding to list format
        payload={"text": documents[i]},  # Store the original document as payload
    )
    client.upsert(
        collection_name="mm_collection",
        points=[point]
    )


In [6]:
import json

from openai import OpenAI
from config import OPENAI_API_KEY, WHISPER_MODEL, DIARIZER
openai_client = OpenAI()

def semantic_chunker(transcription_text):
    """
    This function sends the transcription text to an LLM, asking it to break the text into 
    semantically meaningful chunks, and returns the output in a JSON format.
    
    Parameters:
        transcription_text (str): The text of the meeting transcription.
    
    Returns:
        str: A JSON string containing the semantically chunked text with metadata.
    """

    prompt = f"""
    You are given a transcript of a meeting with dialogue between different speakers. 
    Your task is to break the conversation into meaningful chunks and provide the output in JSON format.
    Each chunk should have a "text" field containing the chunk content and a "metadata" field with relevant details like category (e.g., Meeting Summary, Action Items, etc.).

    The format should be:
    [
        {{
            "text": "chunk text here",
            "metadata": {{
                "category": "category name here"
            }}
        }},
        ...
    ]
    
    Here's the meeting transcription text:

    {transcription_text}
    """

    try:
        # Send the prompt to the LLM model (assuming it's set up for edge functions)
        response = openai_client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": prompt}],
            max_tokens=16384,  # Adjust the token limit based on your needs
            temperature=0
        )

        refined_text = response.choices[0].message.content
        return refined_text
        
    except Exception as e:
        print(f"Error occurred: {e}")
        return json.dumps({"error": "An error occurred during semantic chunking."})

# Example usage
transcription_text = """
Meeting Summary: The meeting focused on the quality of audio files and their impact on processing accuracy.

Action Items:
1. Investigate methods to improve audio file quality and reduce noise.
2. Review the current denoising techniques being used and assess their effectiveness.

Decisions Made: It was agreed that audio file quality is critical for achieving accurate results.
"""

print(semantic_chunker(transcription_text))


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


```json
[
    {
        "text": "The meeting focused on the quality of audio files and their impact on processing accuracy.",
        "metadata": {
            "category": "Meeting Summary"
        }
    },
    {
        "text": "Investigate methods to improve audio file quality and reduce noise.",
        "metadata": {
            "category": "Action Items"
        }
    },
    {
        "text": "Review the current denoising techniques being used and assess their effectiveness.",
        "metadata": {
            "category": "Action Items"
        }
    },
    {
        "text": "It was agreed that audio file quality is critical for achieving accurate results.",
        "metadata": {
            "category": "Decisions Made"
        }
    }
]
```


In [10]:
import json

def clean_file_content(file_content):
    """
    Cleans the given file_content by removing unnecessary characters 
    and extracting the JSON data.

    Parameters:
        file_content (str): The raw file_content with extra characters.

    Returns:
        dict: A cleaned dictionary containing the parsed JSON data.
    """
    try:
        # Remove the enclosing ```json and backticks
        cleaned_content = file_content.strip('```json').strip('```')
        
        # Remove newlines and excess whitespace
        cleaned_content = cleaned_content.replace("\n", "").replace("\\n", "").strip()
        
        # Parse the cleaned JSON string into a Python dictionary
        parsed_content = json.loads(cleaned_content)
        
        return parsed_content
    except Exception as e:
        print(f"Error cleaning file_content: {e}")
        return None


# Example usage
file_content = """```json\n[\n    {\n        \"text\": \"Okay, basically pre-processing. I looked at denoising which you mentioned. I also looked at... I learned some new terms. Something about amplitude. Like you want to scale it or something. Yeah, and another thing called normalization. So these are the three things I tried to do.\",\n        \"metadata\": {\n            \"category\": \"Discussion\",\n            \"speaker\": \"Speaker 0\",\n            \"filename\": \"/tmp/PART5.wav\",\n            \"timestamp\": \"2025-01-03\"\n        }\n    },\n    {\n        \"text\": \"You mentioned denoiser library I tried that but I don't know whether it's because I did it wrongly. So the results...\",\n        \"metadata\": {\n            \"category\": \"Discussion\",\n            \"speaker\": \"Speaker 0\",\n            \"filename\": \"/tmp/PART5.wav\",\n            \"timestamp\": \"2025-01-03\"\n        }\n    },\n    {\n        \"text\": \"The audio file I eventually got, which I'm not gonna play here because it destroys my ears. Basically it's not good. I tried with two other libraries, notably is noise reduced.\",\n        \"metadata\": {\n            \"category\": \"Discussion\",\n            \"speaker\": \"Speaker 0\",\n            \"filename\": \"/tmp/PART5.wav\",\n            \"timestamp\": \"2025-01-03\"\n        }\n    },\n    {\n        \"text\": \"Where's my foul? I... I don't know it's my fault.\",\n        \"metadata\": {\n            \"category\": \"Clarification\",\n            \"speaker\": \"Speaker 1\",\n            \"filename\": \"/tmp/PART5.wav\",\n            \"timestamp\": \"2025-01-03\"\n        }\n    }\n]\n```"""

cleaned_data = clean_file_content(file_content)

# Print cleaned JSON data
print(json.dumps(cleaned_data, indent=4))


[
    {
        "text": "Okay, basically pre-processing. I looked at denoising which you mentioned. I also looked at... I learned some new terms. Something about amplitude. Like you want to scale it or something. Yeah, and another thing called normalization. So these are the three things I tried to do.",
        "metadata": {
            "category": "Discussion",
            "speaker": "Speaker 0",
            "filename": "/tmp/PART5.wav",
            "timestamp": "2025-01-03"
        }
    },
    {
        "text": "You mentioned denoiser library I tried that but I don't know whether it's because I did it wrongly. So the results...",
        "metadata": {
            "category": "Discussion",
            "speaker": "Speaker 0",
            "filename": "/tmp/PART5.wav",
            "timestamp": "2025-01-03"
        }
    },
    {
        "text": "The audio file I eventually got, which I'm not gonna play here because it destroys my ears. Basically it's not good. I tried with two other li