
### **<h1 align ="middle"><b>Speech to Text & Qdrant Vector Database</b></h1>**

---

> - 1. Speech to Text using Whisper
> - 2. Store / Query transcribed texts (from ADW)
> - 3. Encode Texts (embeddings) and push to Qdrant Vector DB

---

## **Imports**

In [None]:
import os
import ocifs
import numpy as np
import torch
import pandas as pd
import whisper
import torchaudio
from tqdm.notebook import tqdm
from sentence_transformers import SentenceTransformer
import faiss
import pickle
import ffmpeg
import ocifs
from json import loads, dumps
from qdrant_client import models, QdrantClient
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"


# **1. Speech to Text using Whisper**

## **1.1. Fetch recording from Bucket**

In [None]:
# For the demo: delete the entire local input folder (recursively) so the
# download cell below starts from a fresh copy of the bucket contents.
!rm -r /home/datascience/input_recording

In [None]:
# Source bucket and the local folder the recordings are copied into.
bucket = "oci://West_BP@frqap2zhtzbe/"
path_input_locally = "/home/datascience/input_recording/"

# Create the local folder when it is missing. A failure is only reported,
# not raised, so the cell keeps running afterwards.
if not os.path.exists(path_input_locally):
    try:
        os.makedirs(path_input_locally)
    except OSError:
        print ('Error: Creating directory of input recording')

# Copy the recordings from the bucket to the local folder:
# list the bucket, drop the cached listing, then download everything.
fs = ocifs.OCIFileSystem()
bucket_listing = fs.ls(bucket)
print(bucket_listing)
fs.invalidate_cache(bucket)
fs.get(bucket, path_input_locally, recursive=True, refresh=True)

## **1.2 Running Whisper**

### **1.2.1 Detect Language (incorporated in model)**

In [None]:
# Load the Whisper "medium" model; the same `model` object is reused below
# for both language detection and transcription.
model = whisper.load_model("medium")

In [None]:
# Detect the spoken language of every .mp3 recording in the local folder.
for recording in os.listdir(path_input_locally):
    # Skip anything that is not an .mp3 file.
    if not recording.endswith(".mp3"):
        continue

    audio_recording = os.path.join(path_input_locally, recording)

    # Load the audio, then pad/trim it to fit 30 seconds.
    audio = whisper.pad_or_trim(whisper.load_audio(audio_recording))

    # Build the log-Mel spectrogram on the same device as the model.
    mel = whisper.log_mel_spectrogram(audio).to(model.device)

    # The model returns per-language probabilities; report the most likely one.
    _, probs = model.detect_language(mel)
    detected_language = max(probs, key=probs.get)
    print(f"Detected language: {detected_language}")

### **1.2.2 Load model and Run Transcription**

In [None]:
output = []

##ffmpeg
!wget -O - -q  https://github.com/yt-dlp/FFmpeg-Builds/releases/download/latest/ffmpeg-master-latest-linux64-gpl.tar.xz | xz -qdc| tar -x
    
for recording in os.listdir(path_input_locally):
        if (recording.endswith(".mp3")):
            
            audio_recording = os.path.join(path_input_locally, recording)
            
            #transcribe recording
            result = model.transcribe(audio_recording)
            
            #append result for each recording to list
            output.append(result['text'])
                       
            print(recording + " is transcribed")

#all in dataframe
df_transcriptions = pd.DataFrame(output, columns=['text'])

In [None]:
# Show the transcriptions: lift pandas' column-width limit so the full text
# of each row is visible, then display the DataFrame as the cell output.
pd.set_option('display.max_colwidth', None)
df_transcriptions

# **2. Qdrant**

## **2.1 Encode Sentences, Transform into JSON package, and Load into Vector DB**

In [None]:
# Sentence-embedding model used below to turn each transcription, and each search query, into a vector
encoder = SentenceTransformer('multi-qa-distilbert-cos-v1')

In [None]:
## Transform df_transcriptions into the format used for Qdrant:
## serialise the rows to a JSON string, then parse that string back into
## a list of dicts (one dict per row; each has a "text" key).
texts = df_transcriptions.to_json(orient = 'records')
documents = loads(texts)

In [None]:
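# Example of the expected format: a list of dicts, one dict per record.
# (The transcriptions above produce dicts with a single "text" key; the book records below are only an illustration.)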

# documents = [
#   { "name": "The Time Machine", "description": "A man travels through time and witnesses the evolution of humanity.", "author": "H.G. Wells", "year": 1895 },
#   { "name": "Ender's Game", "description": "A young boy is trained to become a military leader in a war against an alien race.", "author": "Orson Scott Card", "year": 1985 },
#   { "name": "Brave New World", "description": "A dystopian society where people are genetically engineered and conditioned to conform to a strict social hierarchy.", "author": "Aldous Huxley", "year": 1932 }
# ]

In [None]:
# Connect to the Qdrant vector database server ("xxxx" is a placeholder for the host).
qdrant = QdrantClient("xxxx", port=6333)

# (Re)create the collection that stores the transcription embeddings.
# The vector size is dictated by the embedding model; similarity is cosine.
embedding_size = encoder.get_sentence_embedding_dimension()
vector_params = models.VectorParams(
    size=embedding_size,
    distance=models.Distance.COSINE,
)
qdrant.recreate_collection(
    collection_name="collection_v1",
    vectors_config=vector_params,
)

# Encode the text of every document; the document itself is stored as the
# payload and its position in the list is used as the record id.
records_to_upload = [
    models.Record(
        id=idx,
        vector=encoder.encode(doc["text"]).tolist(),
        payload=doc,
    )
    for idx, doc in enumerate(documents)
]

# Push all records to the collection.
qdrant.upload_records(
    collection_name="collection_v1",
    records=records_to_upload,
)

## **2.2 Example Search**

In [None]:
# Semantic search: embed a free-text question with the same encoder and
# ask Qdrant for the 3 closest records in the collection.
question = "What are people saying about mortgages services?"
question_vector = encoder.encode(question).tolist()

hits = qdrant.search(
    collection_name="collection_v1",
    query_vector=question_vector,
    limit=3,
)

# Print the stored payload and the similarity score of every hit.
for hit in hits:
    print(hit.payload, "score:", hit.score)

In [None]:
# Specific filter

# hits = qdrant.search(
#     collection_name="collection_v1",
#     query_vector=encoder.encode("Tyranic society").tolist(),
#     query_filter=models.Filter(
#         must=[
#             models.FieldCondition(
#                 key="year",
#                 range=models.Range(
#                     gte=2000
#                 )
#             )
#         ]
#     ),
#     limit=3
# )
# for hit in hits:
#     print(hit.payload, "score:", hit.score)

# **4. Create one file, one Job**

## **4.1 Create one .py File**

In [None]:
%%writefile ./job_artifact/job_v1.py

import os
import ocifs
import numpy as np
import torch
import pandas as pd
import whisper
import torchaudio
from tqdm.notebook import tqdm
from sentence_transformers import SentenceTransformer
import faiss
import pickle
import ffmpeg
import ocifs
from json import loads, dumps
from qdrant_client import models, QdrantClient
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Fetch environment variables:
# read "recording_name", falling back to "recording" when it is not set.
recording_name = os.environ.get("recording_name", "recording")
# NOTE: this prints the variable's value; the value is not used by the steps below.
print("Fetching environment variable called " + recording_name)


######## ----------------------------------------------------- #################
# Step 1
######## ----------------------------------------------------- #################

# Source bucket prefix and the local folder (inside the job) the recordings are copied into.
bucket = "oci://West_BP@frqap2zhtzbe/xx/"
path_input_locally = "/home/datascience/input_recording/"

# Create the local folder when it is missing. A failure is only reported,
# not raised, so the script keeps running afterwards.
if not os.path.exists(path_input_locally):
    try:
        os.makedirs(path_input_locally)
    except OSError:
        print ('Error: Creating directory of input recording')

# Copy the recordings from the bucket to the local folder:
# list the bucket contents, then download everything recursively.
fs = ocifs.OCIFileSystem()
bucket_listing = fs.ls(bucket)
print(bucket_listing)
fs.get(bucket, path_input_locally, recursive=True, refresh=True)

######## ----------------------------------------------------- #################
# Step 2 - Load model and Detect languages
######## ----------------------------------------------------- #################

# Load the Whisper "base" model; it is reused for detection and transcription.
model = whisper.load_model("base")

# Detect the spoken language of every .mp3 recording in the local folder.
for recording in os.listdir(path_input_locally):
    # Skip anything that is not an .mp3 file.
    if not recording.endswith(".mp3"):
        continue

    audio_recording = os.path.join(path_input_locally, recording)

    # Load the audio, then pad/trim it to fit 30 seconds.
    audio = whisper.pad_or_trim(whisper.load_audio(audio_recording))

    # Build the log-Mel spectrogram on the same device as the model.
    mel = whisper.log_mel_spectrogram(audio).to(model.device)

    # The model returns per-language probabilities; report the most likely one.
    _, probs = model.detect_language(mel)
    detected_language = max(probs, key=probs.get)
    print(f"Detected language: {detected_language}")




######## ----------------------------------------------------- #################
# Step 3 - Transcribe recordings
######## ----------------------------------------------------- #################

# Download a static ffmpeg build and unpack it into the current working directory
# (NOTE(review): presumably needed by Whisper to decode the audio files - confirm).
os.system("wget -O - -q  https://github.com/yt-dlp/FFmpeg-Builds/releases/download/latest/ffmpeg-master-latest-linux64-gpl.tar.xz | xz -qdc| tar -x")

# Transcribed text of every recording, in os.listdir() order.
output = []

for recording in os.listdir(path_input_locally):
    # Only .mp3 files are transcribed.
    if not recording.endswith(".mp3"):
        continue

    audio_recording = os.path.join(path_input_locally, recording)

    # Transcribe the recording and keep only the plain text of the result.
    result = model.transcribe(audio_recording)
    output.append(result['text'])

    print(recording + " is transcribed")

# Collect all transcriptions in a single-column DataFrame.
df_transcriptions = pd.DataFrame(output, columns=['text'])

######## ----------------------------------------------------- #################
# Step 4 - Create embeddings and push them to Qdrant
######## ----------------------------------------------------- #################

# Sentence-embedding model used to turn each transcription into a vector.
encoder = SentenceTransformer('multi-qa-distilbert-cos-v1')

# Transform df_transcriptions into the format used for Qdrant:
# serialise the rows to JSON, then parse them back into a list of dicts
# (one dict per row; each has a "text" key).
texts = df_transcriptions.to_json(orient = 'records')
documents = loads(texts)

# Establish the connection to the Qdrant vector database server
# (a remote host reached over port 6333, not an in-memory instance).
qdrant = QdrantClient("138.3.241.32", port=6333)

# (Re)create the collection that stores the transcription embeddings.
# NOTE(review): recreate_collection presumably drops an existing collection
# with the same name first - confirm before running against data worth keeping.
qdrant.recreate_collection(
    collection_name="collection_v1",
    vectors_config=models.VectorParams(
        size=encoder.get_sentence_embedding_dimension(), # Vector size is defined by used model
        distance=models.Distance.COSINE
    )
)

# Vectorize every transcription and upload it to Qdrant; the document itself
# is stored as the payload and its list position is used as the record id.
# Fix: the closing parenthesis of enumerate(...) was missing, which made the
# whole generated script fail with a SyntaxError.
qdrant.upload_records(
    collection_name="collection_v1",
    records=[
        models.Record(
            id=idx,
            vector=encoder.encode(doc["text"]).tolist(),
            payload=doc
        ) for idx, doc in enumerate(documents)
    ]
)

# Visible end-of-run marker in the job log.
print("-------------------------------------------------------")
print("-------------------------------------------------------")
print("Encoded Text / Embeddings are pushed to Qdrant")
print("-------------------------------------------------------")
print("-------------------------------------------------------")

## **4.2 Create and Trigger Job**

In [None]:
# # # publish conda, as we added new/custom packages
# !odsc conda init -b conda_environment_yolov5 -n frqap2zhtzbe -a resource_principal
# !odsc conda publish -s pytorch20_p39_gpu_v1 --force

In [None]:
# Imports for defining and running an OCI Data Science Job with the ADS SDK
from ads.common.oci_logging import OCILogGroup, OCILog
from ads.jobs import Job, DataScienceJob, PythonRuntime
from datetime import datetime, timedelta
from ads import set_auth

# Authentication: let ADS authenticate using the resource principal
set_auth(auth='resource_principal')

In [None]:
## Create the job

# Infrastructure: where the job runs (log group for the run outputs,
# compute shape and block storage size).
job_infrastructure = (
    DataScienceJob()
    .with_log_group_id("ocid1.loggroup.oc1.eu-frankfurt-1.amaaaaaangencdyajxalcuggjaug57r3ugare7olsk44ts2shyv7azqbxf4q")
    .with_shape_name("VM.Standard2.4")
    #.with_shape_config_details(memory_in_gbs=16, ocpus=5)
    .with_block_storage_size(200)
)

# Runtime: what the job runs (custom conda environment stored in a bucket,
# the source folder, a default environment variable and the entrypoint script).
job_runtime = (
    PythonRuntime()
    .with_custom_conda("oci://conda_environment_yolov5@frqap2zhtzbe/conda_environments/gpu/PyTorch 2.0 for GPU on Python 3.9/1.0/pytorch20_p39_gpu_v1")
    .with_source("/home/datascience/job_artifact")
    .with_environment_variable(recording_name="Default variable")
    .with_entrypoint("job_artifact/job_v1.py")
)

# Assemble the job definition from both parts and create it.
job = (
    Job(name="job_v6")
    .with_infrastructure(job_infrastructure)
    .with_runtime(job_runtime)
)

job.create()

In [None]:
# Start a run of the job created above, passing a "recording_name"
# environment variable for this run.
job_run_env = job.run(
    name="job_run_vxx",
    env_var={"recording_name": "An example environment variable. Could passed to this Job"}
)

# Watch the run.
# NOTE(review): presumably streams the job run's logs until it finishes - confirm against the ADS docs.
job_run_watch = job_run_env.watch()