<a href="https://colab.research.google.com/github/amal-lahchim-cntxt/new/blob/main/peaker_diarization_and_transcription_using_Whisper_and_Pyannote_within_Label_Studio.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install openai-whisper

In [None]:
!pip install transformers

In [None]:
!pip install pyannote.audio

In [None]:
import os
import tempfile

from label_studio_sdk import Client
import requests
from pyannote.audio import Pipeline
from tqdm import tqdm
import whisper

# Credentials and endpoints are read from environment variables when they are
# set; the placeholder literals are only a fallback and must be replaced (or
# the variables exported) before a real run.
API_KEY = os.environ.get("LABEL_STUDIO_API_KEY", "your_api_key_here")
# Strip a trailing slash so that paths appended below do not produce "//".
LABEL_STUDIO_URL = os.environ.get(
    "LABEL_STUDIO_URL", "https://your_label_studio_url_here"
).rstrip("/")
HF_AUTH_TOKEN = os.environ.get("HF_TOKEN", "your_huggingface_auth_token_here")

client = Client(url=LABEL_STUDIO_URL, api_key=API_KEY)

# Sanity-check the URL/token pair by listing projects. A failure is reported
# but does not stop the script (best-effort check, as in the original flow).
try:
    response = requests.get(
        f"{LABEL_STUDIO_URL}/api/projects/",
        headers={'Authorization': f'Token {API_KEY}'},
        timeout=30,  # never block forever on an unreachable server
    )
    response.raise_for_status()
    print("Connection successful. Projects data:", response.json())
except requests.exceptions.RequestException as e:
    print("Failed to connect to Label Studio API:", e)

# Load diarization pipeline
diarization_pipeline = Pipeline.from_pretrained(
    "pyannote/speaker-diarization",
    use_auth_token=HF_AUTH_TOKEN
)

# Load Whisper model
whisper_model = whisper.load_model("large")

# Define project ID (replace with your actual project ID)
project_id = 939
project = client.get_project(project_id)
tasks = project.get_tasks()

def _dominant_speaker(start, end, turns):
    """Return the speaker label whose turn overlaps [start, end] the most.

    Args:
        start: Segment start time.
        end: Segment end time.
        turns: Iterable of ``(turn_start, turn_end, label)`` tuples.

    Returns:
        The label of the turn with the largest overlap. Turns that merely
        touch the segment at a boundary (zero-length overlap) are accepted
        only when nothing overlaps more. ``'Unknown'`` when no turn
        intersects the segment at all.
    """
    best_label = 'Unknown'
    best_overlap = None
    for turn_start, turn_end, label in turns:
        # Length of the intersection; negative when the intervals are disjoint.
        overlap = min(end, turn_end) - max(start, turn_start)
        if overlap < 0:
            continue
        if best_overlap is None or overlap > best_overlap:
            best_label, best_overlap = label, overlap
    return best_label


for task in tqdm(tasks):
    task_id = task['id']
    url = f'{LABEL_STUDIO_URL}{task["data"]["audio"]}'

    # Everything for one task runs inside the try so that a single failure
    # (network, decoding, model, upload) is reported and the batch continues.
    try:
        # The audio lives in a throwaway directory that is removed as soon as
        # both models have consumed the file.
        with tempfile.TemporaryDirectory() as tmp_dir:
            audio_path = os.path.join(tmp_dir, "temp_audio.mp3")

            with requests.get(
                url,
                headers={'Authorization': f'Token {API_KEY}'},
                stream=True,
                timeout=60,
            ) as response:
                if response.status_code != 200:
                    print(f"Failed to fetch audio for task {task_id}. Status code: {response.status_code}")
                    continue
                # Stream to disk in chunks instead of buffering the whole
                # body in memory.
                with open(audio_path, "wb") as f:
                    for chunk in response.iter_content(chunk_size=1 << 20):
                        f.write(chunk)

            # Perform diarization and transcription on the downloaded file
            diarization_result = diarization_pipeline({"uri": task_id, "audio": audio_path})
            transcription_result = whisper_model.transcribe(audio_path)

        # Collect the speaker turns once per task rather than re-iterating
        # the diarization tracks for every transcription segment.
        turns = [
            (turn.start, turn.end, spk)
            for turn, _, spk in diarization_result.itertracks(yield_label=True)
        ]

        # NOTE(review): 'speaker_transcription' / 'audio' presumably match the
        # control and object names in the project's labeling config - confirm.
        segments = []
        for segment in transcription_result['segments']:
            start, end, text = segment['start'], segment['end'], segment['text']
            segments.append({
                'from_name': 'speaker_transcription',
                'to_name': 'audio',
                'type': 'labels',
                'value': {
                    'start': start,
                    'end': end,
                    'labels': [_dominant_speaker(start, end, turns)],
                    'text': [text]
                }
            })

        project.create_prediction(
            task_id=task_id,
            result=segments,
            score=1.0,
            model_version="whisper_diarization_combined"
        )

    except Exception as e:
        print(f"Error processing audio for task {task_id}: {e}")

