In [5]:
from google.cloud import speech

client = speech.SpeechClient()

gcs_uri = "gs://cloud-samples-data/speech/brooklyn_bridge.raw"

audio = speech.RecognitionAudio(uri=gcs_uri)

config = speech.RecognitionConfig(
    encoding = speech.RecognitionConfig.AudioEncoding.LINEAR16,
    sample_rate_hertz = 16000,
    language_code="en-US",
)

response = client.recognize(config = config, audio = audio)

for result in response.results:
    print("Transcript: {}".format(result.alternatives[0].transcript))

I0000 00:00:1723354289.821774    2075 check_gcp_environment.cc:61] BIOS data file does not exist or cannot be opened.


Transcript: how old is the Brooklyn Bridge


In [7]:
len(response.results)

1

## Using LangChain

In [2]:
from langchain_core.messages import HumanMessage
from langchain_google_genai import ChatGoogleGenerativeAI
from google.generativeai.types.safety_types import HarmBlockThreshold, HarmCategory
import json

## Using VertexAI

In [4]:
import vertexai
from vertexai.generative_models import GenerativeModel, Part
import json
from google.cloud import speech

In [24]:
with open("environments/env.json") as f:
    env = json.load(f)

client = speech.SpeechClient()
gcs_uri = env["sample_audio_file"]
audio = speech.RecognitionAudio(uri=gcs_uri)

config = speech.RecognitionConfig(
    encoding = speech.RecognitionConfig.AudioEncoding.LINEAR16,
    sample_rate_hertz = 16000,
    language_code="en-US",
)

s2t_response = client.recognize(config = config, audio = audio)

vertexai.init(project = env["project_id"], location=env["location"])
multimodal_model = GenerativeModel("gemini-1.5-flash-001")
for result in s2t_response.results:
    response = multimodal_model.generate_content(
        [
            result.alternatives[0].transcript
        ]
    )
    print("Transcript: {}".format(result.alternatives[0].transcript))
    print("Answer: {}".format(response.candidates[0].content.parts[0]))

Transcript: how old is the Brooklyn Bridge
Answer: text: "The Brooklyn Bridge was completed in **1883**, so it\'s currently **140 years old** (as of 2023). \n"



In [4]:
import base64
import vertexai
from vertexai.generative_models import GenerativeModel, Part, SafetySetting, FinishReason
import vertexai.preview.generative_models as generative_models
import json

def generate(project_id, location, human_text):
    vertexai.init(project=project_id, location=location)
    safety_settings = [
        SafetySetting(
            category=SafetySetting.HarmCategory.HARM_CATEGORY_HATE_SPEECH,
            threshold=SafetySetting.HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE
        ),
        SafetySetting(
            category=SafetySetting.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
            threshold=SafetySetting.HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE
        ),
        SafetySetting(
            category=SafetySetting.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
            threshold=SafetySetting.HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE
        ),
        SafetySetting(
            category=SafetySetting.HarmCategory.HARM_CATEGORY_HARASSMENT,
            threshold=SafetySetting.HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE
        ),
    ]
    instrcution = """You are English professional teacher. Please point out uncorrect word and grammer. 
And also please generate three to five recommended useful phrase instead of unnatural tone."""
    generation_config = {
        "max_output_tokens": 8192,
        "temperature": 1,
        "top_p": 0.95,
    }

    model = GenerativeModel(
        "gemini-1.5-flash-001",
        system_instruction=[instrcution]
    )
    response = model.generate_content(
        [
            human_text
        ],
        generation_config=generation_config,
        safety_settings=safety_settings,
        stream=False,
    )

    return response

with open("environments/env.json") as f:
    env = json.load(f)

human_text = """I'm Yusuke, and living in Tokyo, and working as IT engineer at the company.
In my free time, I like to go to gym to workout, watching Netfilx, go hiking, and playing video game.
Thank you, and nice to meet you."""

response = generate(env["project_id"], env["location"], human_text)

print("Transcript: {}".format(human_text))
print("Answer: {}".format(response.candidates[0].content.parts[0]))



I0000 00:00:1723358958.255696    3911 check_gcp_environment.cc:61] BIOS data file does not exist or cannot be opened.


Transcript: I'm Yusuke, and living in Tokyo, and working as IT engineer at the company.
In my free time, I like to go to gym to workout, watching Netfilx, go hiking, and playing video game.
Thank you, and nice to meet you.
Answer: text: "Let\'s break down the sentence and make it sound more natural:\n\n**Incorrect Word/Grammar:**\n\n* **\"living in Tokyo\"**:  This is grammatically correct but can be improved.\n* **\"working as IT engineer at the company\"**:  This is a bit awkward. It\'s better to mention the company name if you\'re comfortable sharing.\n* **\"watching Netfilx\"**: The correct spelling is \"Netflix\".\n* **\"go hiking\"**: This should be \"hiking\".\n* **\"playing video game\"**:  This should be \"playing video games\".\n\n**Recommended Phrases:**\n\nHere are some more natural ways to phrase your introduction:\n\n1. **\"Hi, I\'m Yusuke. I live in Tokyo and work as an IT engineer at [company name].\"** \n2. **\"My name is Yusuke, and I\'m an IT engineer based in Tokyo.

In [3]:
print(response)

<generator object _GenerativeModel._generate_content_streaming at 0x7fc73fed4900>
