Import modules and define folder paths

In [2]:
import os
import os
import sys
import logging
from dotenv import load_dotenv
from pathlib import Path
import glob

# Llama Index imports for LMM 
from llama_index import (
    ServiceContext,  set_global_service_context
)
from llama_index.llms import AzureOpenAI
from llama_index.multi_modal_llms.azure_openai import AzureOpenAIMultiModal
from llama_index.embeddings import AzureOpenAIEmbedding
from llama_index.readers import SimpleDirectoryReader

# Import Azure Cognitive Services speech SDK for TTS
import azure.cognitiveservices.speech as speechsdk

# Load settings (API keys, endpoints) from a local .env file into the process
# environment. override=True is passed so values from .env replace variables
# that are already set.
load_dotenv(override=True)

# Send log records to the notebook's stdout at INFO level; switch the level to
# logging.DEBUG for more verbose output.
# basicConfig already attaches a stdout StreamHandler to the root logger, so no
# second handler is added here: an extra one makes every record print twice and
# stacks up yet another copy each time this cell is re-run.
logging.basicConfig(stream=sys.stdout, level=logging.INFO)


# Root directory that holds one sub-folder per presentation.
root_folder_path = Path("C:/Users/61052067/data/pptread")

# Sub-folder to process in this run. Other folders used previously:
#   "life_sciences_regulatory_101", "life_sciences_regulatory_101_sample1"
input_folder_name = "life_sciences_regulatory_101_test"

# Folder whose *.jpg slide images are narrated further down.
process_folder_path = root_folder_path.joinpath(input_folder_name)

Create a service context with an LLM, a multimodal model and an embedding model. Only the multimodal model is used in this implementation. 
The LLM and the embedding model will be needed if a future implementation requires an index to be created, e.g. a summary index to create a high-level overview of the presentation content.

In [4]:
# Settings for the text-only Azure OpenAI resource (shared by the LLM and the
# embedding model), read from the environment.
api_key, azure_endpoint, api_version = (
    os.environ.get(var_name)
    for var_name in (
        "OPENAI_API_KEY",
        "AZURE_OPENAI_API_ENDPOINT",
        "AZURE_OPENAI_API_VERSION",
    )
)

# The multimodal model reads its own, separately named settings.
api_key_lmm, azure_endpoint_lmm, api_version_lmm = (
    os.environ.get(var_name)
    for var_name in (
        "OPENAI_API_KEY_LMM",
        "AZURE_OPENAI_API_ENDPOINT_LMM",
        "AZURE_OPENAI_API_VERSION_LMM",
    )
)

# Text LLM. A "gpt-35-turbo-16k" engine/model pair was tried here previously.
llm = AzureOpenAI(
    model="gpt-4",
    engine="gpt-4",
    api_key=api_key,
    azure_endpoint=azure_endpoint,
    api_version=api_version,
    temperature=0.0,
)

# Vision-capable model: the only model the narration cells actually call.
multi_modal_llm = AzureOpenAIMultiModal(
    model="gpt-4-vision-preview",
    engine="gpt-4",
    api_key=api_key_lmm,
    azure_endpoint=azure_endpoint_lmm,
    api_version=api_version_lmm,
    temperature=0.0,
    max_tokens=4096,
)

# Embedding model, configured against the same resource as the text LLM.
embed_model = AzureOpenAIEmbedding(
    model="text-embedding-ada-002",
    deployment_name="text-embedding-ada-002",
    api_key=api_key,
    azure_endpoint=azure_endpoint,
    api_version=api_version,
    chunk_size=1,
)

# Bundle the LLM and embedding model and register them as the global default.
service_context = ServiceContext.from_defaults(embed_model=embed_model, llm=llm)
set_global_service_context(service_context)

Setup and configure Azure Speech Service

In [5]:
# Azure Speech credentials are read from the environment.
speech_key = os.environ.get("AZURE_SPEECH_SERVICE_KEY")
service_region = os.environ.get("AZURE_SPEECH_SERVICE_REGION")

speech_config = speechsdk.SpeechConfig(region=service_region, subscription=speech_key)

# Output format for synthesized audio (MP3, matching the .mp3 files written later).
speech_config.set_speech_synthesis_output_format(
    speechsdk.SpeechSynthesisOutputFormat.Audio16Khz32KBitRateMonoMp3
)
# Voice used for the narration.
speech_config.speech_synthesis_voice_name = "en-US-AvaNeural"

# To play straight to the speaker instead of writing a file, a synthesizer can
# be built from the config alone:
#   speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config)

One shot example for content generation

In [6]:
# Sample narration that is interpolated into the prompt as the single
# (one-shot) example of the kind of text the model should produce.
# The literal is sent verbatim, line breaks and trailing spaces included.
one_shot_example = """In the multifaceted ecosystem of life sciences, the organization is structured into several critical business entities, 
each with a distinct role in the journey from concept to market. The Research & Development arm spearheads this voyage with Drug Discovery, 
focusing on patent searches and candidate finalization, followed by Drug Development which encompasses analytical processes and packaging. 
Preclinical Studies and Clinical Trials form the bedrock of product validation. Parallel to these, Manufacturing & Supply Chain Operations 
ensure GMP compliance and seamless material handling, while Quality Control and Assurance uphold the product standards. In the commercial arena, 
Sales and Marketing Operations engage both digital and conventional marketing strategies to promote products and services, 
manage new product launches, and handle adverse events and complaints. Underpinning these specialized domains are the Corporate & Finance, 
Regulatory & IPR, HR, and IT departments, which provide the essential infrastructure to support domain diversity and business breadth. 
Together, these entities illustrate the intricate tapestry of operations within life sciences, where technology solutions are the connective 
tissue that binds these diverse areas into a cohesive, functioning whole."""

# Subject area named in the prompt's "subject matter expert in ..." sentence.
topic = "Life Sciences"

Initial version of the prompt. There is a lot of room for improvement

In [7]:
# Instruction text sent to the multimodal model together with each slide image.
# This is an f-string: `topic` and `one_shot_example` are interpolated when the
# cell runs, so re-run this cell after changing either of them.
# The second sentence previously read "You will precise ..." (missing verb).
prompt = f"""You are a subject matter expert in {topic}. Your task is to compose an accurate and engaging narrative for this slide, intended to be read out to an audience during presentation.
You will be precise while referring to the contents of the slide.
You will not use a first person voice.
Do not use any salutations. 
Here is an example: {one_shot_example}
"""

Generate the text content using Multimodal Model and convert it to voice using the TTS model

In [8]:
# Function for processing a single file
def generate_narrative(folder_path, slide_name, multi_modal_llm, speech_config, prompt):
    """Create a narration script and an MP3 voice-over for one slide image.

    Reads ``<folder_path>/<slide_name>.jpg``, asks the multimodal model to
    narrate it, and writes two files next to the image:
    ``<slide_name>.txt`` (the generated text) and ``<slide_name>.mp3``
    (the synthesized speech).

    Args:
        folder_path: Directory containing the slide image (str or Path).
        slide_name: Image file name without the ``.jpg`` extension.
        multi_modal_llm: Model object exposing
            ``complete(prompt=..., image_documents=...)`` whose result has a
            ``.text`` attribute.
        speech_config: ``speechsdk.SpeechConfig`` used to build the synthesizer.
        prompt: Instruction text sent to the model along with the image.

    Returns:
        None. Results are written to disk. A canceled or failed speech
        synthesis is logged as an error rather than raised, so a batch run
        continues with the remaining slides.
    """
    image_file_path = f"{folder_path}/{slide_name}.jpg"

    # Load the slide image and ask the multimodal model for the narration.
    image_documents = SimpleDirectoryReader(input_files=[image_file_path]).load_data()
    response = multi_modal_llm.complete(
        prompt=prompt,
        image_documents=image_documents,
    )
    narrative = response.text

    # Save the generated text. UTF-8 is set explicitly because the platform
    # default encoding (e.g. cp1252 on Windows) cannot represent every
    # character a model may emit and would raise UnicodeEncodeError.
    text_file_path = f"{folder_path}/{slide_name}.txt"
    with open(text_file_path, "w", encoding="utf-8") as file:
        file.write(narrative)

    # Synthesize the narration into an audio file next to the image.
    voice_file_name = f"{folder_path}/{slide_name}.mp3"
    file_config = speechsdk.audio.AudioOutputConfig(filename=voice_file_name)
    speech_synthesizer = speechsdk.SpeechSynthesizer(
        speech_config=speech_config, audio_config=file_config
    )
    result = speech_synthesizer.speak_text_async(narrative).get()

    # Surface synthesis failures instead of silently leaving a missing or
    # empty audio file behind.
    if result.reason == speechsdk.ResultReason.Canceled:
        cancellation_details = result.cancellation_details
        logging.error(
            "Speech synthesis canceled for %s: %s",
            voice_file_name,
            cancellation_details.reason,
        )
        if (
            cancellation_details.reason == speechsdk.CancellationReason.Error
            and cancellation_details.error_details
        ):
            logging.error("Error details: %s", cancellation_details.error_details)

def generate_narrative_for_all_images(folder_path, multi_modal_llm, speech_config, prompt):
    """Run ``generate_narrative`` for every ``*.jpg`` image in a folder.

    Args:
        folder_path: Directory to scan for slide images (str or Path).
        multi_modal_llm: Passed through to ``generate_narrative``.
        speech_config: Passed through to ``generate_narrative``.
        prompt: Passed through to ``generate_narrative``.

    Returns:
        None.
    """
    # Sorted so slides are processed in a stable, repeatable order; glob itself
    # makes no ordering promise.
    for image_file_path in sorted(glob.glob(f"{folder_path}/*.jpg")):
        # Strip only the final extension. A plain replace('.jpg', '') would also
        # remove '.jpg' occurring inside the name and would leave an upper-case
        # '.JPG' in place, producing a wrong path downstream.
        slide_name, _extension = os.path.splitext(os.path.basename(image_file_path))
        generate_narrative(folder_path, slide_name, multi_modal_llm, speech_config, prompt)

# Narrate every slide image in the selected presentation folder.
generate_narrative_for_all_images(
    process_folder_path,
    multi_modal_llm,
    speech_config,
    prompt,
)


INFO:httpx:HTTP Request: POST https://entaipoc3.openai.azure.com//openai/deployments/gpt-4/chat/completions?api-version=2023-12-01-preview "HTTP/1.1 200 OK"
HTTP Request: POST https://entaipoc3.openai.azure.com//openai/deployments/gpt-4/chat/completions?api-version=2023-12-01-preview "HTTP/1.1 200 OK"
HTTP Request: POST https://entaipoc3.openai.azure.com//openai/deployments/gpt-4/chat/completions?api-version=2023-12-01-preview "HTTP/1.1 200 OK"


Error handling for text to speech conversion. Should be incorporated for production implementation. 

In [None]:
# if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
#     print("Speech synthesized to speaker for text [{}]".format(response.text))
# elif result.reason == speechsdk.ResultReason.Canceled:
#     cancellation_details = result.cancellation_details
#     print("Speech synthesis canceled: {}".format(cancellation_details.reason))
#     if cancellation_details.reason == speechsdk.CancellationReason.Error:
#         if cancellation_details.error_details:
#             print("Error details: {}".format(cancellation_details.error_details))
#     print("Did you update the subscription info?")