<a href="https://colab.research.google.com/github/arunpiyush25/Practice-Project/blob/main/speaker_recognition.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
!pip install requests pydub



In [12]:
# Step 2: Import libraries
import getpass
import io
import json
import os
import time

import requests
from google.colab import files
from IPython.display import Audio, display


In [13]:
# Step 3: AssemblyAI Transcriber Class
class AssemblyAITranscriber:
    """Minimal client for AssemblyAI's v2 REST API with speaker diarization.

    The workflow is: upload a local audio file, submit a transcription job
    with speaker labels enabled, then poll the job until it finishes.

    Attributes:
        api_key: Key sent in the ``authorization`` header.
        base_url: Root URL of the AssemblyAI v2 API.
        headers: Headers attached to every request.
        request_timeout: Seconds to wait on any single HTTP call.
    """

    # Without a timeout a stalled connection would hang the notebook cell forever.
    request_timeout = 60

    def __init__(self, api_key):
        """Store the API key and derive the request headers from it."""
        self.api_key = api_key
        self.base_url = "https://api.assemblyai.com/v2"
        self.headers = {"authorization": api_key}

    def transcribe_with_diarization(self, audio_file_path, speakers_expected=2,
                                    poll_interval=10, max_wait=3600):
        """Upload an audio file and return its finished transcript.

        Args:
            audio_file_path: Path of the local audio file to transcribe.
            speakers_expected: Number of speakers hinted to the service.
                Pass ``None`` to omit the hint from the request.
            poll_interval: Seconds to sleep between status checks while the
                job is queued or processing.
            max_wait: Upper bound, in seconds, on the total polling time.
                Pass ``None`` to wait indefinitely.

        Returns:
            The decoded JSON body of the transcript once its status is
            ``"completed"``.

        Raises:
            RuntimeError: If the upload, the job submission or a status
                check returns a non-200 response, or the job ends in error.
            TimeoutError: If the job is still unfinished after ``max_wait``.
        """
        audio_url = self._upload(audio_file_path)
        transcript_id = self._request_transcript(audio_url, speakers_expected)
        return self._wait_for_transcript(transcript_id, poll_interval, max_wait)

    def _upload(self, audio_file_path):
        """Send the file's bytes to the upload endpoint and return its URL."""
        print("📤 Uploading audio file...")

        with open(audio_file_path, 'rb') as audio:
            # The upload endpoint takes the raw bytes as the request body;
            # `data=` streams the file instead of wrapping it in a
            # multipart form the way `files=` does.
            upload_response = requests.post(
                f"{self.base_url}/upload",
                headers=self.headers,
                data=audio,
                timeout=self.request_timeout,
            )

        if upload_response.status_code != 200:
            raise RuntimeError(f"Upload failed: {upload_response.text}")

        audio_url = upload_response.json()["upload_url"]
        print("✅ Audio uploaded successfully!")
        return audio_url

    def _request_transcript(self, audio_url, speakers_expected):
        """Submit a diarized transcription job and return its transcript id."""
        print("🔄 Starting transcription...")
        transcript_request = {"audio_url": audio_url, "speaker_labels": True}
        if speakers_expected is not None:
            transcript_request["speakers_expected"] = speakers_expected

        response = requests.post(
            f"{self.base_url}/transcript",
            json=transcript_request,
            headers=self.headers,
            timeout=self.request_timeout,
        )

        if response.status_code != 200:
            raise RuntimeError(f"Transcription request failed: {response.text}")

        transcript_id = response.json()["id"]
        print(f"📋 Transcript ID: {transcript_id}")
        return transcript_id

    def _wait_for_transcript(self, transcript_id, poll_interval, max_wait):
        """Poll the transcript until it completes, fails or times out."""
        print("⏳ Waiting for transcription to complete...")
        deadline = None if max_wait is None else time.monotonic() + max_wait

        while True:
            response = requests.get(
                f"{self.base_url}/transcript/{transcript_id}",
                headers=self.headers,
                timeout=self.request_timeout,
            )
            if response.status_code != 200:
                # Surface API errors directly instead of failing later on a
                # missing "status" key.
                raise RuntimeError(f"Polling failed: {response.text}")

            result = response.json()
            status = result.get("status")

            if status == "completed":
                print("✅ Transcription completed!")
                return result
            if status == "error":
                raise RuntimeError(
                    f"Transcription failed: {result.get('error', 'Unknown error')}"
                )

            if status in ("queued", "processing"):
                print(f"🔄 Status: {status}")
                delay = poll_interval
            else:
                # Unrecognised statuses are re-checked sooner (half interval).
                print(f"⚠️ Unknown status: {status}")
                delay = poll_interval / 2

            if deadline is not None and time.monotonic() >= deadline:
                raise TimeoutError(
                    f"Transcription {transcript_id} did not finish within "
                    f"{max_wait} seconds (last status: {status})"
                )
            time.sleep(delay)

In [14]:
# Step 4: Main function
def _iter_utterances(result):
    """Yield ``(speaker, text, start_s, end_s)`` for each utterance in ``result``.

    Timestamps are divided by 1000 to convert them to seconds; a missing or
    null ``utterances`` entry yields nothing.
    """
    for utterance in result.get("utterances") or []:
        yield (
            utterance.get('speaker', 'Unknown'),
            utterance.get('text', ''),
            (utterance.get('start') or 0) / 1000,
            (utterance.get('end') or 0) / 1000,
        )


def transcribe_audio(api_key=None):
    """Interactively transcribe an uploaded audio file with speaker labels.

    Prompts for an upload through Colab's ``files`` helper, transcribes the
    first uploaded file, prints the transcript, writes it to
    ``transcript_<name>.txt`` in the working directory and triggers a
    download of that file. Errors raised while transcribing or saving are
    printed rather than propagated, so the cell does not end in a traceback.

    Args:
        api_key: AssemblyAI API key. When omitted, the
            ``ASSEMBLYAI_API_KEY`` environment variable is used, and if that
            is unset the key is requested with a hidden prompt.

    Returns:
        None.
    """
    print("🎤 Audio Transcription with Speaker Diarization")
    print("=" * 50)

    # SECURITY: never hard-code the key in the notebook; any key that was
    # previously committed here should be revoked and regenerated.
    api_key = api_key or os.environ.get("ASSEMBLYAI_API_KEY")
    if not api_key:
        api_key = getpass.getpass("🔑 AssemblyAI API key: ")
    if not api_key:
        print("❌ No API key provided!")
        return

    # Upload file
    print("📁 Please upload your audio file:")
    uploaded = files.upload()

    if not uploaded:
        print("❌ No file uploaded!")
        return

    # Only the first uploaded file is processed.
    audio_file = next(iter(uploaded))
    print(f"📄 Processing file: {audio_file}")

    try:
        transcriber = AssemblyAITranscriber(api_key)
        result = transcriber.transcribe_with_diarization(audio_file)

        utterances = list(_iter_utterances(result))
        # `text` may be present but null, so fall back on any falsy value.
        fallback_text = result.get('text') or 'No transcription available'

        # Display results
        print("\n🎯 TRANSCRIPTION RESULTS:")
        print("=" * 50)

        if utterances:
            for i, (speaker, text, start, end) in enumerate(utterances, 1):
                print(f"{i}. Speaker {speaker} [{start:.2f}s - {end:.2f}s]:")
                print(f"   {text}")
                print()
        else:
            print("Full Transcript:")
            print(fallback_text)

        # Strip only the final extension so dotted names such as
        # "my.talk.mp3" keep their full stem.
        stem = os.path.splitext(os.path.basename(audio_file))[0]
        output_file = f"transcript_{stem}.txt"

        # Explicit UTF-8 so non-ASCII transcripts do not depend on the locale.
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write("AUDIO TRANSCRIPTION RESULTS\n")
            f.write("=" * 40 + "\n\n")

            if utterances:
                for speaker, text, start, end in utterances:
                    f.write(f"Speaker {speaker} [{start:.2f}s - {end:.2f}s]: {text}\n")
            else:
                f.write(fallback_text)

        print(f"💾 Results saved to: {output_file}")

        # Download the results
        files.download(output_file)

    except Exception as e:
        # Deliberate best-effort: report the failure in the cell output.
        print(f"❌ Error: {str(e)}")


In [15]:
# Execute this cell to start the process: it asks for an audio upload,
# transcribes the file with speaker labels, prints the transcript and
# downloads it as a text file.
transcribe_audio()

🎤 Audio Transcription with Speaker Diarization
📁 Please upload your audio file:


Saving mc1.mp3 to mc1.mp3
📄 Processing file: mc1.mp3
📤 Uploading audio file...
✅ Audio uploaded successfully!
🔄 Starting transcription...
📋 Transcript ID: 8cae1c63-e160-4d0b-917e-925b03de2974
⏳ Waiting for transcription to complete...
🔄 Status: processing
✅ Transcription completed!

🎯 TRANSCRIPTION RESULTS:
1. Speaker A [0.32s - 1.12s]:
   I'm Jenny.

2. Speaker B [1.68s - 5.36s]:
   I'm Jason. Nice to meet you. Nice to meet you, too.

💾 Results saved to: transcript_mc1.txt


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>