In [1]:
from google.cloud import speech

client = speech.SpeechClient()

gcs_uri = "gs://cloud-samples-data/speech/brooklyn_bridge.raw"

audio = speech.RecognitionAudio(uri=gcs_uri)

config = speech.RecognitionConfig(
    encoding = speech.RecognitionConfig.AudioEncoding.LINEAR16,
    sample_rate_hertz = 16000,
    language_code="en-US",
)

response = client.recognize(config = config, audio = audio)

# for result in response.results:
#     print("Transcript: {}".format(result.alternatives[0].transcript))

I0000 00:00:1723812620.996535    5723 check_gcp_environment.cc:61] BIOS data file does not exist or cannot be opened.


In [3]:
response.results[0].alternatives[0].transcript

'how old is the Brooklyn Bridge'

## Using LangChain

In [2]:
from langchain_core.messages import HumanMessage
from langchain_google_genai import ChatGoogleGenerativeAI
from google.generativeai.types.safety_types import HarmBlockThreshold, HarmCategory
import json

## Using VertexAI

In [1]:
import vertexai
from vertexai.generative_models import GenerativeModel, Part
import json
from google.cloud import speech

In [24]:
with open("environments/env.json") as f:
    env = json.load(f)

client = speech.SpeechClient()
gcs_uri = env["sample_audio_file"]
audio = speech.RecognitionAudio(uri=gcs_uri)

config = speech.RecognitionConfig(
    encoding = speech.RecognitionConfig.AudioEncoding.LINEAR16,
    sample_rate_hertz = 16000,
    language_code="en-US",
)

s2t_response = client.recognize(config = config, audio = audio)

vertexai.init(project = env["project_id"], location=env["location"])
multimodal_model = GenerativeModel("gemini-1.5-flash-001")
for result in s2t_response.results:
    response = multimodal_model.generate_content(
        [
            result.alternatives[0].transcript
        ]
    )
    print("Transcript: {}".format(result.alternatives[0].transcript))
    print("Answer: {}".format(response.candidates[0].content.parts[0]))

Transcript: how old is the Brooklyn Bridge
Answer: text: "The Brooklyn Bridge was completed in **1883**, so it\'s currently **140 years old** (as of 2023). \n"



In [4]:
import base64
import vertexai
from vertexai.generative_models import GenerativeModel, Part, SafetySetting, FinishReason
import vertexai.preview.generative_models as generative_models
import json

def generate(project_id, location, human_text):
    vertexai.init(project=project_id, location=location)
    safety_settings = [
        SafetySetting(
            category=SafetySetting.HarmCategory.HARM_CATEGORY_HATE_SPEECH,
            threshold=SafetySetting.HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE
        ),
        SafetySetting(
            category=SafetySetting.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
            threshold=SafetySetting.HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE
        ),
        SafetySetting(
            category=SafetySetting.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
            threshold=SafetySetting.HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE
        ),
        SafetySetting(
            category=SafetySetting.HarmCategory.HARM_CATEGORY_HARASSMENT,
            threshold=SafetySetting.HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE
        ),
    ]
    instrcution = """
            ## Condition
            - You are English professional teacher. 
            - Point out uncorrect word and grammer.
            - Generate three to five recommended useful phrase instead of unnatural tone.
            """
    generation_config = {
        "max_output_tokens": 8192,
        "temperature": 1,
        "top_p": 0.95,
    }

    model = GenerativeModel(
        "gemini-1.5-flash-001",
        system_instruction=[instrcution]
    )
    response = model.generate_content(
        [
            human_text
        ],
        generation_config=generation_config,
        safety_settings=safety_settings,
        stream=False,
    )

    return response

with open("environments/env.json") as f:
    env = json.load(f)

human_text = """I'm Yusuke, and living in Tokyo, and working as IT engineer at the company.
In my free time, I like to go to gym to workout, watching Netfilx, go hiking, and playing video game.
Thank you, and nice to meet you."""

response = generate(env["project_id"], env["location"], human_text)

print("Transcript: {}".format(human_text))
print("Answer: {}".format(response.candidates[0].content.parts[0]))



I0000 00:00:1723443168.305417   26301 check_gcp_environment.cc:61] BIOS data file does not exist or cannot be opened.


Transcript: I'm Yusuke, and living in Tokyo, and working as IT engineer at the company.
In my free time, I like to go to gym to workout, watching Netfilx, go hiking, and playing video game.
Thank you, and nice to meet you.
Answer: text: "Let\'s take a look at your introduction and make it even better! \n\n**Uncorrected Sentence:** I\'m Yusuke, and living in Tokyo, and working as IT engineer at the company.\n\n**Correction:** I\'m Yusuke, and I live in Tokyo. I work as an IT engineer at [Name of Company].\n\n**Uncorrected Sentence:** In my free time, I like to go to gym to workout, watching Netfilx, go hiking, and playing video game.\n\n**Correction:**  In my free time, I enjoy going to the gym to work out, watching Netflix, hiking, and playing video games.\n\n**Recommended Phrases:** \n\n1. **Instead of \"Thank you, and nice to meet you,\" try:**  \"It\'s a pleasure to meet you!\"\n2. **To add a touch of personality:**  \"I\'m really enjoying life in Tokyo.\" \n3. **To emphasize your i

In [8]:
import vertexai
from vertexai.generative_models import GenerativeModel, Part, SafetySetting, FinishReason, Part
import vertexai.preview.generative_models as generative_models
import json

audio_file_uri = "gs://english-learning-feedback/first_five_min.mp3"

def generate_from_audio(project_id, location, audio_file_uri):
    vertexai.init(project=project_id, location=location)
    instrcution = """
            Can you transcribe this English conversation class, in the format of timecode, speaker, caption.
            Speakers are teacher, who is woman and high toned voice, and student, who is man and low toned voice.
            Please follow this format:
            00:00 Teacher Hello.
            00:02 Student Hi.
            ...
            """
    audio_file = Part.from_uri(audio_file_uri, mime_type="audio/mpeg")

    model = GenerativeModel("gemini-1.5-flash-001")
    response = model.generate_content(
        [
            audio_file,
            instrcution,
        ],
    )

    return response

with open("environments/env.json") as f:
    env = json.load(f)

response = generate_from_audio(env["project_id"], env["location"], audio_file_uri)

print("Answer: {}".format(response.text))


Answer: 00:00 Teacher Hello.
00:02 Student Hi.
00:04 Teacher Hello.
00:06 Student Hi.
00:08 Teacher Uh can you hear me and see me?
00:13 Student Yeah, I can hear you but I can, I can't see you.
00:20 Teacher Oh okay. Uh one second.
00:25 Student Okay.
00:27 Teacher Can you see me now?
00:30 Student Yeah, that's fine. Yeah, I'm okay.
00:34 Teacher Alright, how are you this day?
00:38 Student Yeah, I'm, I'm good and uh, yeah, sorry, I am good, yes.
00:46 Teacher Okay, that's good. Uh do you want to start with self-introduction?
00:52 Student Yes, uh let me introduce myself briefly.
00:57 Teacher Okay.
01:01 Student Okay, uh my name is Yusuke and now I'm 31 years old, and I'm living in Tokyo and working at the company as an IT engineer, and uh in my free time, I like to go to gym, to workout, uh watching Netflix, go to uh go hiking, and uh playing video games. So, that's my introduction. Nice to meet you.
01:54 Teacher Thank you, nice to meet you too, thanks for sharing. Um okay, well, uh

In [9]:
with open("assets/output.txt", "w", encoding="utf-8") as file:
  file.write(response.text)

In [6]:
import vertexai
from vertexai.generative_models import GenerativeModel, Part

vertexai.init(project="ksst-genai-app", location="asia-northeast1")

model = GenerativeModel("gemini-1.5-flash-001")

prompt = """
Can you transcribe this English conversation class, in the format of timecode, speaker, caption.
Speakers are woman, who is teacher and high toned voice, and man, who is student and low toned voice.
Output should follow this JSON format without code-block:
{
    "Teacher": {
        "0:00": ["I", "am", "Mike"],
        "0:01": ["How", "are", "you"],
        ...
    },
    "Student": {
        "0:02": ["I'm", "good"],
        "0:03": ["How", "about", "you"],
        ...
    },
}
"""

audio_file_uri = "gs://english-learning-feedback/first_five_min.mp3"
audio_file = Part.from_uri(audio_file_uri, mime_type="audio/mpeg")

contents = [audio_file, prompt]

response = model.generate_content(contents)
print(response.text)

{
    "Teacher": {
        "0:00": ["Hello"],
        "0:03": ["Hello"],
        "0:06": ["Uh", "can", "you", "hear", "me", "and", "see", "me"],
        "0:17": ["Oh", "okay", "one", "second"],
        "0:24": ["Okay"],
        "0:28": ["Can", "you", "see", "me", "now"],
        "0:35": ["Alright", "how", "are", "you", "today"],
        "0:45": ["Okay", "that's", "good"],
        "0:51": ["Uh", "do", "you", "want", "to", "start", "with", "self", "introduction"],
        "1:06": ["Okay"],
        "1:11": ["Okay", "well", "uh", "my", "name", "is", "Ray"],
        "1:22": ["Uh", "I", "live", "in", "Cairo", "Egypt"],
        "1:33": ["And", "um", "I", "teach", "English", "full", "time"],
        "1:45": ["Uh", "in", "my", "free", "time", "I", "like", "uh", "to", "read", "books", "listen", "to", "podcasts", "and", "watch", "movies", "and", "TV", "shows", "and", "things", "like", "that"],
        "2:15": ["That's", "all", "thank", "you"],
        "2:24": ["Thank", "you", "too", "and", "let's

In [4]:
print(response)

candidates {
  content {
    role: "model"
    parts {
      text: "```json\n{\n    \"Speaker A\": {\n        \"0:00\": [\"Hallo\"],\n        \"0:02\": [\"Hallo\"],\n        \"0:04\": [\"Uh\", \"can\", \"you\", \"hear\", \"me\", \"and\", \"see\", \"me\"],\n        \"0:12\": [\"Okay\"],\n        \"0:18\": [\"Can\", \"you\", \"see\", \"me\", \"now\"],\n        \"0:23\": [\"All\", \"right\", \"how\", \"are\", \"you\", \"today\"],\n        \"0:32\": [\"Okay\", \"that\'s\", \"good\"],\n        \"0:38\": [\"Uh\", \"do\", \"you\", \"want\", \"to\", \"start\", \"with\", \"self\", \"introduction\"],\n        \"0:51\": [\"Okay\"],\n        \"0:56\": [\"Okay\", \"well\", \"uh\", \"my\", \"name\", \"is\", \"Ray\"],\n        \"1:06\": [\"Uh\", \"I\", \"live\", \"in\", \"Cairo\", \"Egypt\"],\n        \"1:17\": [\"And\", \"um\", \"I\", \"teach\", \"English\", \"full\", \"time\"],\n        \"1:31\": [\"Uh\", \"in\", \"my\", \"free\", \"time\", \"I\", \"like\", \"uh\", \"to\", \"read\", \"books\", \"li

In [4]:
from pydub import AudioSegment

# Load the audio file
audio = AudioSegment.from_file("assets/lesson.webm")

# Calculate the half point of the audio
five_min = len(audio) // 6

# # Split the audio into two halves
first_five_min = audio[:five_min]
# second_half = audio[half_point:]

# # Export the two halves as MP3 files
# first_half.export("assets/first_half.mp3", format="mp3")
# second_half.export("second_half.mp3", format="mp3")

first_five_min.export("assets/first_five_min.mp3", format="mp3")

<_io.BufferedRandom name='assets/first_five_min.mp3'>

In [None]:
print()
