In [56]:
import logging
import os
import sys
from pathlib import Path

import win32com.client
from dotenv import load_dotenv

# Combining imports from the same module/package
from llama_index import (
    VectorStoreIndex, ServiceContext,  set_global_service_context
)
from llama_index.agent import OpenAIAgent
from llama_index.embeddings import AzureOpenAIEmbedding
from llama_index.llms import AzureOpenAI
from llama_index.multi_modal_llms.azure_openai import AzureOpenAIMultiModal
from llama_index.node_parser import UnstructuredElementNodeParser
from llama_index.readers import SimpleDirectoryReader
from llama_index.readers.file.flat_reader import FlatReader
from llama_index.schema import IndexNode
from llama_index.tools import QueryEngineTool, ToolMetadata

# Load settings from a local .env file; override=True is passed so values from
# the file take precedence over variables already set in the environment.
load_dotenv(override=True)

# The repository root is taken to be two directory levels above the current
# working directory.
current_dir = os.getcwd()
root_dir = os.path.dirname(os.path.dirname(current_dir))

inputFilePath = os.path.join(root_dir, 'data', 'input', 'Life Sciences Regulatory 101.pptx')
output_folder = os.path.join(root_dir, 'data', 'output')

# basicConfig(stream=...) already attaches a stdout handler to the root logger.
# Do not add a second StreamHandler on top of it: every record would then be
# emitted twice, plus one more copy for each time this cell is re-run.
# Switch to level=logging.DEBUG for more verbose output.
logging.basicConfig(stream=sys.stdout, level=logging.INFO)


In [None]:
# Disabled alternative: render slide thumbnails with Aspose.Slides instead of
# PowerPoint COM automation. Kept for reference only.
# import aspose.slides as slides
# import aspose.pydrawing as drawing

# with slides.Presentation(inputFilePath) as presentation:
#     for slide in presentation.slides:
#         slide.get_thumbnail(2, 2).save("presentation_slide_{0}.jpg".format(str(slide.slide_number)), drawing.imaging.ImageFormat.jpeg)

In [15]:
import os
from pathlib import Path

# Export every slide of the input deck to a JPEG file by driving PowerPoint
# through COM automation (requires a local PowerPoint installation).
powerpoint = win32com.client.Dispatch("PowerPoint.Application")

# Folder that receives the exported JPEG files. os.path.join is used instead of
# a literal backslash inside an f-string, which produced the invalid escape
# sequence "\i".
image_folder = os.path.join(output_folder, "images")
os.makedirs(image_folder, exist_ok=True)
print(image_folder)

presentation = None
try:
    presentation = powerpoint.Presentations.Open(inputFilePath)

    # Show the application window while the export runs.
    powerpoint.Visible = True

    # Slides are numbered from 1 in the output file names.
    for slide_number, slide in enumerate(presentation.Slides, start=1):
        slide_name = f"slide_{slide_number}.jpg"
        slide.Export(os.path.join(image_folder, slide_name), "JPG")
finally:
    # Always shut PowerPoint down, even when opening or exporting fails, so no
    # orphaned PowerPoint process keeps the deck open.
    if presentation is not None:
        presentation.Close()
    powerpoint.Quit()


c:\Users\61052067\repos\ppt-to-voice\data\output\images


In [16]:
# Connection settings shared by the text LLM and the embedding model.
api_key = os.getenv("OPENAI_API_KEY")
azure_endpoint = os.getenv("AZURE_OPENAI_API_ENDPOINT")
api_version = os.getenv("AZURE_OPENAI_API_VERSION")

# Separate connection settings for the multimodal (vision) model.
api_key_lmm = os.getenv("OPENAI_API_KEY_LMM")
azure_endpoint_lmm = os.getenv("AZURE_OPENAI_API_ENDPOINT_LMM")
api_version_lmm = os.getenv("AZURE_OPENAI_API_VERSION_LMM")

# Text-only chat model. `engine` is presumably the Azure deployment name and
# `model` the underlying model id -- confirm against the Azure resource.
llm = AzureOpenAI(
    engine="gpt-4",
    model="gpt-4",
    # engine="gpt-35-turbo-16k",
    # model="gpt-35-turbo-16k",
    temperature=0.0,
    azure_endpoint=azure_endpoint,
    api_key=api_key,
    api_version=api_version,
)

# Vision-capable model used to describe slide images. AzureOpenAIMultiModal is
# imported from llama_index.multi_modal_llms.azure_openai in the notebook's
# import cell; without that import this cell fails with a NameError on a
# fresh kernel.
multi_modal_llm = AzureOpenAIMultiModal(
    engine="gpt-4",
    model="gpt-4-vision-preview",
    # engine="gpt-35-turbo-16k",
    # model="gpt-35-turbo-16k",
    temperature=0.0,
    api_key=api_key_lmm,
    azure_endpoint=azure_endpoint_lmm,
    api_version=api_version_lmm,
    max_tokens=4096,
)

# Embedding model; it reuses the text LLM's key, endpoint and API version.
embed_model = AzureOpenAIEmbedding(
    model="text-embedding-ada-002",
    deployment_name="text-embedding-ada-002",
    api_key=api_key,
    azure_endpoint=azure_endpoint,
    api_version=api_version,
    chunk_size=1,
)

# Register the text LLM and embedding model as the global llama_index defaults.
service_context = ServiceContext.from_defaults(
    llm=llm,
    embed_model=embed_model,
)

set_global_service_context(service_context)

In [60]:
# Turn one exported slide image into a spoken-style paragraph: load the JPEG as
# an image document and send it, together with the prompt, to the multimodal LLM.
image_path = f"{image_folder}/slide_5.jpg"
slide_reader = SimpleDirectoryReader(input_files=[image_path])
image_documents = slide_reader.load_data()

narration_prompt = "Please make a paragraph based on this slide that can be read out to the audience during the presentation. Do not include any salutations. Use 3rd person voice"
response = multi_modal_llm.complete(
    image_documents=image_documents,
    prompt=narration_prompt,
)
print(response)


INFO:httpx:HTTP Request: POST https://entaipoc3.openai.azure.com//openai/deployments/gpt-4/chat/completions?api-version=2023-12-01-preview "HTTP/1.1 200 OK"
HTTP Request: POST https://entaipoc3.openai.azure.com//openai/deployments/gpt-4/chat/completions?api-version=2023-12-01-preview "HTTP/1.1 200 OK"
HTTP Request: POST https://entaipoc3.openai.azure.com//openai/deployments/gpt-4/chat/completions?api-version=2023-12-01-preview "HTTP/1.1 200 OK"
HTTP Request: POST https://entaipoc3.openai.azure.com//openai/deployments/gpt-4/chat/completions?api-version=2023-12-01-preview "HTTP/1.1 200 OK"
The organization of Life Sciences Business Entities is structured to encompass both domain diversity and business breadth. On the domain diversity side, we have Research and Discovery, which includes target identification and drug development, as well as Preclinical Studies and Analytical/Process Development. The Manufacturing and Supply Chain Operations include GMP Manufacturing, Quality Control and M

In [57]:
import azure.cognitiveservices.speech as speechsdk
# Azure Speech credentials come from the environment; each value is None when
# the corresponding variable is not set.
speech_key = os.environ.get("AZURE_SPEECH_SERVICE_KEY")
service_region = os.environ.get("AZURE_SPEECH_SERVICE_REGION")

In [58]:
# Build the synthesizer used for direct playback. No audio_config is passed, so
# the SDK picks the output itself (presumably the default speaker -- TODO confirm).
voice_name = "en-US-AvaNeural"
speech_config = speechsdk.SpeechConfig(region=service_region, subscription=speech_key)
speech_config.speech_synthesis_voice_name = voice_name
speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config)

In [59]:
# Start synthesizing the generated narration, then fetch the outcome from the
# returned future.
synthesis_future = speech_synthesizer.speak_text_async(response.text)
result = synthesis_future.get()

In [None]:
def speech_synthesis_to_mp3_file(text=None, file_name=None):
    """Synthesize speech for a piece of text and save it as an MP3 file.

    Args:
        text: Text to speak. When omitted, ``response.text`` of the
            notebook-level ``response`` object is used.
        file_name: Destination path of the MP3 file. When omitted, the file
            ``outputaudio.mp3`` inside the notebook-level ``output_folder``
            is used.

    Reads ``speech_key`` and ``service_region`` from the notebook namespace.
    The outcome is printed; nothing is returned.
    """
    if text is None:
        # The synthesizer is given the plain text of the completion, matching
        # the direct-playback call elsewhere in the notebook.
        text = response.text
    if file_name is None:
        file_name = f"{output_folder}/outputaudio.mp3"

    # Creates an instance of a speech config with specified subscription key and service region.
    speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)
    # Sets the synthesis output format.
    # The full list of supported format can be found here:
    # https://docs.microsoft.com/azure/cognitive-services/speech-service/rest-text-to-speech#audio-outputs
    speech_config.set_speech_synthesis_output_format(speechsdk.SpeechSynthesisOutputFormat.Audio16Khz32KBitRateMonoMp3)
    speech_config.speech_synthesis_voice_name = "en-US-AvaNeural"

    # Creates a speech synthesizer using file as audio output.
    file_config = speechsdk.audio.AudioOutputConfig(filename=file_name)
    speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=file_config)

    # .get() has to be *called*: referencing it without parentheses leaves
    # `result` bound to a method object, and `result.reason` below then fails.
    result = speech_synthesizer.speak_text_async(text).get()

    # Check result
    if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
        print("Speech synthesized audio was saved to [{}]".format(file_name))
    elif result.reason == speechsdk.ResultReason.Canceled:
        cancellation_details = result.cancellation_details
        print("Speech synthesis canceled: {}".format(cancellation_details.reason))
        if cancellation_details.reason == speechsdk.CancellationReason.Error:
            print("Error details: {}".format(cancellation_details.error_details))

In [None]:
# Report how the speaker synthesis stored in `result` ended.
synthesis_reason = result.reason
if synthesis_reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
    print("Speech synthesized to speaker for text [{}]".format(response.text))
elif synthesis_reason == speechsdk.ResultReason.Canceled:
    cancellation_details = result.cancellation_details
    print("Speech synthesis canceled: {}".format(cancellation_details.reason))
    # Error details are printed only for error cancellations that carry some.
    failed_with_error = cancellation_details.reason == speechsdk.CancellationReason.Error
    if failed_with_error and cancellation_details.error_details:
        print("Error details: {}".format(cancellation_details.error_details))
    # This hint is printed for every cancellation, not only for errors.
    print("Did you update the subscription info?")