In [1]:
from google.cloud import speech

client = speech.SpeechClient()

gcs_uri = "gs://cloud-samples-data/speech/brooklyn_bridge.raw"

audio = speech.RecognitionAudio(uri=gcs_uri)

config = speech.RecognitionConfig(
    encoding = speech.RecognitionConfig.AudioEncoding.LINEAR16,
    sample_rate_hertz = 16000,
    language_code="en-US",
)

response = client.recognize(config = config, audio = audio)

# for result in response.results:
#     print("Transcript: {}".format(result.alternatives[0].transcript))

I0000 00:00:1723812620.996535    5723 check_gcp_environment.cc:61] BIOS data file does not exist or cannot be opened.


In [3]:
response.results[0].alternatives[0].transcript

'how old is the Brooklyn Bridge'

## Using LangChain

In [2]:
from langchain_core.messages import HumanMessage
from langchain_google_genai import ChatGoogleGenerativeAI
from google.generativeai.types.safety_types import HarmBlockThreshold, HarmCategory
import json

## Using VertexAI

In [1]:
import vertexai
from vertexai.generative_models import GenerativeModel, Part
import json
from google.cloud import speech

In [24]:
with open("environments/env.json") as f:
    env = json.load(f)

client = speech.SpeechClient()
gcs_uri = env["sample_audio_file"]
audio = speech.RecognitionAudio(uri=gcs_uri)

config = speech.RecognitionConfig(
    encoding = speech.RecognitionConfig.AudioEncoding.LINEAR16,
    sample_rate_hertz = 16000,
    language_code="en-US",
)

s2t_response = client.recognize(config = config, audio = audio)

vertexai.init(project = env["project_id"], location=env["location"])
multimodal_model = GenerativeModel("gemini-1.5-flash-001")
for result in s2t_response.results:
    response = multimodal_model.generate_content(
        [
            result.alternatives[0].transcript
        ]
    )
    print("Transcript: {}".format(result.alternatives[0].transcript))
    print("Answer: {}".format(response.candidates[0].content.parts[0]))

Transcript: how old is the Brooklyn Bridge
Answer: text: "The Brooklyn Bridge was completed in **1883**, so it\'s currently **140 years old** (as of 2023). \n"



In [4]:
import base64
import vertexai
from vertexai.generative_models import GenerativeModel, Part, SafetySetting, FinishReason
import vertexai.preview.generative_models as generative_models
import json

def generate(project_id, location, human_text):
    vertexai.init(project=project_id, location=location)
    safety_settings = [
        SafetySetting(
            category=SafetySetting.HarmCategory.HARM_CATEGORY_HATE_SPEECH,
            threshold=SafetySetting.HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE
        ),
        SafetySetting(
            category=SafetySetting.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
            threshold=SafetySetting.HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE
        ),
        SafetySetting(
            category=SafetySetting.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
            threshold=SafetySetting.HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE
        ),
        SafetySetting(
            category=SafetySetting.HarmCategory.HARM_CATEGORY_HARASSMENT,
            threshold=SafetySetting.HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE
        ),
    ]
    instrcution = """
            ## Condition
            - You are English professional teacher. 
            - Point out uncorrect word and grammer.
            - Generate three to five recommended useful phrase instead of unnatural tone.
            """
    generation_config = {
        "max_output_tokens": 8192,
        "temperature": 1,
        "top_p": 0.95,
    }

    model = GenerativeModel(
        "gemini-1.5-flash-001",
        system_instruction=[instrcution]
    )
    response = model.generate_content(
        [
            human_text
        ],
        generation_config=generation_config,
        safety_settings=safety_settings,
        stream=False,
    )

    return response

with open("environments/env.json") as f:
    env = json.load(f)

human_text = """I'm Yusuke, and living in Tokyo, and working as IT engineer at the company.
In my free time, I like to go to gym to workout, watching Netfilx, go hiking, and playing video game.
Thank you, and nice to meet you."""

response = generate(env["project_id"], env["location"], human_text)

print("Transcript: {}".format(human_text))
print("Answer: {}".format(response.candidates[0].content.parts[0]))



I0000 00:00:1723443168.305417   26301 check_gcp_environment.cc:61] BIOS data file does not exist or cannot be opened.


Transcript: I'm Yusuke, and living in Tokyo, and working as IT engineer at the company.
In my free time, I like to go to gym to workout, watching Netfilx, go hiking, and playing video game.
Thank you, and nice to meet you.
Answer: text: "Let\'s take a look at your introduction and make it even better! \n\n**Uncorrected Sentence:** I\'m Yusuke, and living in Tokyo, and working as IT engineer at the company.\n\n**Correction:** I\'m Yusuke, and I live in Tokyo. I work as an IT engineer at [Name of Company].\n\n**Uncorrected Sentence:** In my free time, I like to go to gym to workout, watching Netfilx, go hiking, and playing video game.\n\n**Correction:**  In my free time, I enjoy going to the gym to work out, watching Netflix, hiking, and playing video games.\n\n**Recommended Phrases:** \n\n1. **Instead of \"Thank you, and nice to meet you,\" try:**  \"It\'s a pleasure to meet you!\"\n2. **To add a touch of personality:**  \"I\'m really enjoying life in Tokyo.\" \n3. **To emphasize your i

In [16]:
import vertexai
from vertexai.generative_models import GenerativeModel, Part, SafetySetting, FinishReason, Part
import vertexai.preview.generative_models as generative_models
import json

def generate_from_audio(project_id, location, audio_file_uri):
    vertexai.init(project=project_id, location=location)
    instrcution = """
            ## Condition
            - You are English professional.
            - Audio's situation is online English conversation class.
            - Man voice is student and woman voice is teacher.
            - You should summarize student's mistake of grammar and words.
            - You should summarize teacher's pointed out.
            - Generate advanced words and phrases related with this conversation.
            """
    audio_file = Part.from_uri(audio_file_uri, mime_type="audio/mpeg")

    model = GenerativeModel("gemini-1.5-flash-001")
    response = model.generate_content(
        [
            audio_file,
            instrcution,
        ],
        # generation_config=generation_config,
        # safety_settings=safety_settings,
        # stream=False,
    )

    return response

with open("environments/env.json") as f:
    env = json.load(f)

audio_file_uri = "gs://english-learning-feedback/first_half.mp3"

response = generate_from_audio(env["project_id"], env["location"], audio_file_uri)

print("Answer: {}".format(response.text))


Answer: text: "## Summary of Student\'s Mistakes\n\nThe student made several grammatical errors throughout the conversation:\n\n* **Incorrect verb tense:** He often used the past tense when the present tense was required, like saying \"cannot stop to watch\" instead of \"cannot stop watching\" and \"is able to prevent\" instead of \"able to prevent\".\n* **Incorrect word choice:** He misused words like \"promote\" and \"regret\" when he meant \"encourage\" and \"ignore\".\n* **Incomplete sentences:** He sometimes left sentences unfinished, such as when discussing the reasons for bullying.\n\n## Summary of Teacher\'s Corrections\n\nThe teacher corrected the student\'s grammar and word choices, highlighting the importance of precision in language. She also clarified the difference between \"promoting\" and \"facilitating,\" emphasizing that promoting bullying would be actively encouraging it, while facilitating it would simply make it easier to occur.\n\n## Advanced Words and Phrases\n\n

In [17]:
import vertexai
from vertexai.generative_models import GenerativeModel, Part

# TODO(developer): Update and un-comment below lines
# project_id = "PROJECT_ID"

vertexai.init(project="ksst-genai-app", location="asia-northeast1")

model = GenerativeModel("gemini-1.5-flash-001")

prompt = """
Can you transcribe this interview, in the format of timecode, speaker, caption.
Use speaker A, speaker B, etc. to identify speakers.
"""

# prompt = """
# ## Condition
# - You are English professional.
# - Audio's situation is online English conversation class.
# - Man voice is student and woman voice is teacher.
# - You should summarize student's mistake of grammar and words.
# - You should summarize teacher's pointed out.
# - Generate advanced words and phrases related with this conversation.
# """

audio_file_uri = "gs://english-learning-feedback/first_half.mp3"
audio_file = Part.from_uri(audio_file_uri, mime_type="audio/mpeg")

contents = [audio_file, prompt]

response = model.generate_content(contents)
print(response.text)

0:00 Speaker A Hello.
0:03 Speaker B Hi.
0:06 Speaker A Hello.
0:08 Speaker B Hi.
0:11 Speaker A Uh, can you hear me and see me?
0:15 Speaker B Yeah, I can hear you but I can't I can't see you.
0:24 Speaker A Okay.
0:26 Speaker B Uh, one second.
0:29 Speaker A Okay.
0:35 Speaker A Can you see me now?
0:38 Speaker B Yeah, that's fine. Yeah, I'm okay.
0:42 Speaker A All right, how are you today?
0:46 Speaker B Yeah, I'm I'm good, and uh Yeah. Sorry, I have I have good. Yes.
0:58 Speaker A Okay, that's good. Uh, do you want to start with self-introduction?
1:06 Speaker B Yes. Uh, I'll let me introduce myself briefly.
1:17 Speaker A Okay.
1:20 Speaker B Okay. Uh, my name is Yusuke, and now I'm 31 years old and I'm living in Tokyo and working at the company as an IT engineer. And uh, in my free time I like to go to gym to work out. Uh, watching Netflix, go to uh go hiking. And uh, playing video game. So, that's my introduction. Nice to meet you.
1:59 Speaker A Thank you. Nice to meet you, t

In [10]:
from pydub import AudioSegment

# Load the audio file
audio = AudioSegment.from_file("data/lesson.webm")

# Calculate the half point of the audio
half_point = len(audio) // 2

# Split the audio into two halves
first_half = audio[:half_point]
second_half = audio[half_point:]

# Export the two halves as MP3 files
first_half.export("first_half.mp3", format="mp3")
second_half.export("second_half.mp3", format="mp3")

<_io.BufferedRandom name='second_half.mp3'>