## OCI Speech - Transcription 

Helpful links:
- https://github.com/oracle/oci-python-sdk/tree/22fd62c8dbbd1aaed6b75754ec1ba8a3c16a4e5a/src/oci/ai_speech
- https://docs.oracle.com/en-us/iaas/Content/speech/home.htm
- Slack channels: #oci_speech_service_users or #igiu-innovation-lab
- If you hit errors while running the sample code, ask for help in #igiu-ai-learning

## Import Libraries

In [2]:
from oci.ai_speech import AIServiceSpeechClient
from oci.ai_speech.models import *
from oci.config import from_file
from oci.signer import load_private_key_from_file
import oci
from oci.object_storage import ObjectStorageClient
import json,os,io,time

## Set input variables

In [3]:
#####
# Make sure your sandbox.json file is set up for your environment. You might have to
# specify the full path, depending on your current working directory (`cwd`).
# You can also make the Jupyter cwd match your workspace root in VS Code:
#   Settings > Extensions > Jupyter > Notebook File Root
#   change the value from ${fileDirname} to ${workspaceFolder}
#####

# Path to the sandbox settings file (a leading `~` is expanded when the file is opened).
# Example of a full path: "~/work/code/python/workshop/sandbox.json"
SANDBOX_CONFIG_FILE = "sandbox.json"

# Local audio file that is uploaded and transcribed by the cells below.
FILE_TO_ANALYZE = "./speech/voice_sample1.mp3"

# Read the config file

In [4]:
# Load the sandbox settings (OCI profile, bucket and compartment) from the JSON file.
scfg = None
sandbox_path = os.path.expanduser(SANDBOX_CONFIG_FILE)
with open(sandbox_path, 'r') as sandbox_fh:
    scfg = json.load(sandbox_fh)

# Build the OCI SDK configuration from the config file / profile named in the sandbox file.
oci_section = scfg["oci"]
oci_cfg = oci.config.from_file(
    os.path.expanduser(oci_section["configFile"]),
    oci_section["profile"],
)

# Bucket coordinates used for both the uploaded audio and the transcription output.
bucket_cfg = scfg["bucket"]
namespace = bucket_cfg["namespace"]
bucketName = bucket_cfg["bucketName"]
prefix = bucket_cfg['prefix']

# Object name of the audio file inside the bucket (without its local directory).
filename = os.path.basename(FILE_TO_ANALYZE)

# Compartment in which the transcription job is created.
compartmentId = oci_section["compartment"]


## Upload file  

This is an optional step. If the file is already uploaded, no need to upload it again.

In [5]:
# Upload the local audio file to "<prefix>/<basename>" in the sandbox bucket.
object_storage_client = ObjectStorageClient(oci_cfg)
upload_object_name = f"{bucket_cfg['prefix']}/{os.path.basename(FILE_TO_ANALYZE)}"

print(f"Uploading file {FILE_TO_ANALYZE} ...")
# Open the file in a `with` block so the handle is closed once the upload
# returns (or fails) instead of being leaked.
with open(FILE_TO_ANALYZE, 'rb') as audio_file:
    object_storage_client.put_object(
        bucket_cfg["namespace"],
        bucket_cfg["bucketName"],
        upload_object_name,
        audio_file,
    )
print("Upload completed !")

Uploading file ./speech/voice_sample1.mp3 ...
Upload completed !


## Create AI service Speech client

In [6]:
# API-key signer built from the values of the loaded OCI profile.
api_key_signer = oci.signer.Signer(
    tenancy=oci_cfg["tenancy"],
    user=oci_cfg["user"],
    fingerprint=oci_cfg["fingerprint"],
    private_key_file_location=oci_cfg["key_file"],
)

# Speech client pinned to the Phoenix regional endpoint.
# The endpoint must not contain leading/trailing whitespace: the original
# literal started with a space, which makes the URL malformed.
speech_client = AIServiceSpeechClient(
    config=oci_cfg,
    signer=api_key_signer,
    service_endpoint="https://speech.aiservice.us-phoenix-1.oci.oraclecloud.com",
)

## Set the input location

In [7]:
# The audio object to transcribe is addressed as "<prefix>/<filename>" in the bucket.
input_object_names = [f"{prefix}/{filename}"]
object_location = oci.ai_speech.models.ObjectLocation(
    namespace_name=namespace,
    bucket_name=bucketName,
    object_names=input_object_names,
)

# Inline list of object locations that the transcription job reads as its input.
input_location = oci.ai_speech.models.ObjectListInlineInputLocation(
    location_type="OBJECT_LIST_INLINE_INPUT_LOCATION",
    object_locations=[object_location],
)

## Set the output location

In [8]:
# The job writes its results to the same bucket, under the configured prefix.
output_location = oci.ai_speech.models.OutputLocation(
    namespace_name=namespace,
    bucket_name=bucketName,
    prefix=prefix,
)

## Set up the input features
You can specify which features the transcription job should use. Note that not all features are supported by every model.


**Run only ONE of the two cells below: either the Oracle model or the Whisper model.**
### Oracle 

In [9]:
# Oracle model: punctuation normalization plus speaker diarization.
sample_normalization = oci.ai_speech.models.TranscriptionNormalization(
    is_punctuation_enabled=True,
)

# number_of_speakers is left unset (per the original note, it is auto-detected).
speaker_diarization = oci.ai_speech.models.Diarization(is_diarization_enabled=True)
transcription_settings = oci.ai_speech.models.TranscriptionSettings(
    diarization=speaker_diarization,
)

model_details = oci.ai_speech.models.TranscriptionModelDetails(
    model_type="ORACLE",
    language_code="en-US",
    domain="GENERIC",  # only the generic domain is supported for now
    transcription_settings=transcription_settings,
)


**Run only ONE of the two model cells: either the Oracle cell above or the Whisper cell below.**

### Whisper

In [10]:
# Whisper model: punctuation normalization plus speaker diarization.
sample_normalization = oci.ai_speech.models.TranscriptionNormalization(
    is_punctuation_enabled=True,
)

# number_of_speakers is left unset (per the original note, it is auto-detected).
speaker_diarization = oci.ai_speech.models.Diarization(is_diarization_enabled=True)
transcription_settings = oci.ai_speech.models.TranscriptionSettings(
    diarization=speaker_diarization,
)

model_details = oci.ai_speech.models.TranscriptionModelDetails(
    model_type="WHISPER_MEDIUM",
    language_code="en",
    domain="GENERIC",  # only the generic domain is supported for now
    transcription_settings=transcription_settings,
)


## Create Speech Analysis details

In [13]:
# Bundle everything the Speech service needs to create the transcription job:
# where to read the audio, which model/settings to use and where to write results.
transcription_job_details = oci.ai_speech.models.CreateTranscriptionJobDetails(
    display_name="WorkshopTest",
    description="Testing Oracle Speech Speech to Text",
    compartment_id=compartmentId,
    input_location=input_location,
    output_location=output_location,
    model_details=model_details,
    normalization=sample_normalization,
    # Also produce an SRT caption file next to the default output.
    additional_transcription_formats=["SRT"],
)

## Run the job


In [14]:
# Submit the transcription job and remember its OCID for the polling cell.
transcription_job = None
# Reset the id first so a failed submission can never leave a stale id from a
# previous run for the polling cell to pick up.
transcribe_job_id = None
try:
    transcription_job = speech_client.create_transcription_job(
        create_transcription_job_details=transcription_job_details
    )
except Exception as e:
    # Show the service error, then re-raise: the following cells cannot work
    # without a job id, so continuing would only produce a confusing error later.
    print(e)
    raise
else:
    transcribe_job_id = transcription_job.data.id
    print(f"Transcription Job ID: {transcribe_job_id}.")

Transcription Job ID: ocid1.aispeechtranscriptionjob.oc1.phx.amaaaaaaghwivzaarvo2xdwyfdgjnxbzrlcujkloolvxobd5ekcmkhzdm7vq.


## Poll the job till it completes

In [15]:
# Poll the job until it reaches a terminal state.
# `data` holds the finished job on success and stays None otherwise.
POLL_INTERVAL_SECONDS = 5  # pause between two status checks

data = None
while True:
    transcribtion_job = speech_client.get_transcription_job(transcribe_job_id)
    job_status = transcribtion_job.data.lifecycle_state
    # Pad the status so a shorter value fully overwrites a longer one when the
    # line is redrawn with '\r' (otherwise leftovers like "SUCCEEDEDSS" appear).
    print(f"Current Status: {str(job_status).ljust(12)}", end='\r')

    if job_status == "SUCCEEDED":
        print("\nTranscription job completed successfully!")
        data = transcribtion_job.data
        break
    elif job_status == "FAILED":
        print("\nTranscription job failed.")
        break
    elif job_status == "CANCELED":
        # A canceled job never changes state again; stop instead of looping forever.
        print("\nTranscription job was canceled.")
        break
    else:
        time.sleep(POLL_INTERVAL_SECONDS)  # wait before checking again

Current Status: SUCCEEDEDSS
Transcription job completed successfully!


## Retrieve the transcribed files

Because we also asked for an SRT file, we retrieve all files in the subdirectory created by the job.

In [18]:
# Download every object the job wrote under its output prefix and store it next
# to the input audio as "<FILE_TO_ANALYZE><extension>" (e.g. ".json", ".srt").
if data is None:
    # `data` is only set by the polling cell when the job SUCCEEDED.
    raise RuntimeError(
        "No successful transcription job available; run the polling cell "
        "until the job reports SUCCEEDED before downloading results."
    )

# Created again here because the upload cell is optional and may not have run.
object_storage_client = ObjectStorageClient(oci_cfg)

# iterate through all files in the prefix
list_objects_response = object_storage_client.list_objects(
    namespace_name=namespace,
    bucket_name=bucketName,
    prefix=data.output_location.prefix,
)
for obj in list_objects_response.data.objects:
    response = object_storage_client.get_object(namespace, bucketName, obj.name)
    _, file_extension = os.path.splitext(obj.name)
    # Use a dedicated name: reassigning the global `filename` would break the
    # input-location cell when it is re-run after this one.
    output_path = f"{FILE_TO_ANALYZE}{file_extension}"
    # Write the raw bytes so the result does not depend on a guessed decoding
    # or on the platform's default text encoding.
    with open(output_path, "wb") as f:
        f.write(response.data.content)
    print (f"saved {output_path}")

saved ./speech/voice_sample1.mp3.json
saved ./speech/voice_sample1.mp3.srt


## Exercise : transcription

1. Create an app that takes in an audio file
    * Uses diarization to transcribe
    * Compare the Oracle & Whisper models
    * Compare with the original question (from the TTS exercise)

1. Take a Zoom recording
    * Transcribe
      * With captions
    * Summarize using an LLM

