# **<h1 align="center"><b>Speech to Text, Generating Embeddings, FAISS</b></h1>**

---

> # **Whisper, Embeddings, FAISS**

> - 1. Speech to Text using Whisper
> - (2a. Store / Query transcribed texts from ADW)
> - 2b. Creating Embeddings
> - 3. Apply FAISS for index optimisation
> - 4. Save Index and Embeddings as Pickle

---

## **Imports**

In [None]:
# Use the TensorFlow conda environment

In [None]:
#!git clone https://github.com/kstathou/vector_engine
#!pip install -r ./vector_engine/requirements.txt

In [None]:
import os
import ocifs
import numpy as np
import torch
import pandas as pd
import whisper
import torchaudio
from tqdm.notebook import tqdm
from sentence_transformers import SentenceTransformer
import faiss
import pickle
import ffmpeg
from vector_engine.vector_engine.utils import vector_search, id2details
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# !git clone https://huggingface.co/spaces/openai/whisper
# %cd whisper
# !pip install -r requirements.txt
#!pip install jiwer
# !pip install torchaudio
#!pip install git+https://github.com/openai/whisper.git 
# #!pip install sentence_transformers
# #!pip install faiss-cpu
# !pip install ffmpeg
# !pip install ffmpeg-python

# **1. Speech to Text using Whisper**

## **1.1. Fetch recording from Bucket**

In [None]:
# for demo delete entire local folder
!rm -r /home/datascience/input_recording

In [None]:
# Create the local folder that will receive the recordings (same path as in the job).
path_input_locally = "/home/datascience/input_recording/"

try:
    # exist_ok=True replaces the separate os.path.exists() check, so there is no
    # window between testing for the folder and creating it.
    os.makedirs(path_input_locally, exist_ok=True)
except OSError as err:
    # Best effort, as before: report the problem (now including its cause) and carry on.
    print('Error: Creating directory of input recording')
    print(err)

# Copy every .mp3 recording from the bucket to the local folder.
bucket_recordings = "oci://West_BP@frqap2zhtzbe/*.mp3"
fs = ocifs.OCIFileSystem()
# Invalidate the filesystem cache for the bucket pattern before downloading.
fs.invalidate_cache(bucket_recordings)
fs.get(bucket_recordings, path_input_locally, recursive=True, refresh=True)

## **1.2 Running Whisper**

### **1.2.1 Detect Language (incorporated in model)**

In [None]:
# Load the "medium" Whisper checkpoint; the cells below use it for language
# detection and transcription.
# NOTE: the name `model` is rebound to a SentenceTransformer in section 2, so
# re-run this cell before re-running any of the Whisper cells.
model = whisper.load_model("medium")

In [None]:
# Report the most probable spoken language for every .mp3 in the local folder.
for file_name in os.listdir(path_input_locally):
    # Skip anything that is not an .mp3 recording.
    if not file_name.endswith(".mp3"):
        continue

    recording_path = os.path.join(path_input_locally, file_name)

    # Load the audio and pad/trim it to fit 30 seconds, then build the
    # log-Mel spectrogram on the same device as the model.
    clip = whisper.pad_or_trim(whisper.load_audio(recording_path))
    mel = whisper.log_mel_spectrogram(clip).to(model.device)

    # detect_language yields per-language probabilities; print the most likely one.
    _, language_probs = model.detect_language(mel)
    best_language = max(language_probs, key=language_probs.get)
    print(f"Detected language: {best_language}")

### **1.2.2 Load model and Run Transcription**

In [None]:
# Transcribe every .mp3 recording, keeping the texts in directory-listing order.
mp3_files = [name for name in os.listdir(path_input_locally) if name.endswith(".mp3")]

output = []
for recording in mp3_files:
    # Whisper returns a dict; only the full transcription text is kept.
    transcription = model.transcribe(os.path.join(path_input_locally, recording))
    output.append(transcription['text'])

    print(recording + " is transcribed")

# One row per recording, with the transcription in the 'text' column.
df_transcriptions = pd.DataFrame(output, columns=['text'])

In [None]:
# Show the transcriptions; max_colwidth=None stops pandas from truncating long texts.
pd.set_option('display.max_colwidth', None)
df_transcriptions

# **2. Creating Embeddings**

In [None]:
# see: https://towardsdatascience.com/how-to-build-a-semantic-search-engine-with-transformers-and-faiss-dcbea307a0e8

# look into language-agnostic embeddings: https://ai.googleblog.com/2020/08/language-agnostic-bert-sentence.html

In [None]:
# Create an explicit id column from the row index; these ids are later passed
# to the FAISS index as the vector ids.
df_transcriptions['id_index'] = df_transcriptions.index

In [None]:
# Sentence-level DistilBERT encoder used to embed the transcriptions.
# NOTE: this rebinds `model`, which held the Whisper model until now.
model = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')

# Move the encoder to the GPU when CUDA is available, then show where it lives.
use_gpu = torch.cuda.is_available()
if use_gpu:
    model = model.to(torch.device("cuda"))
print(model.device)

# Encode every transcription text into an embedding vector.
transcription_texts = df_transcriptions.text.to_list()
embeddings = model.encode(transcription_texts, show_progress_bar=True)

In [None]:
# Inspect the embeddings' shape (expected: one row per transcription — confirm).
embeddings.shape

# **3. Apply FAISS**

In [None]:
# Step 1: Change data type. Convert to a contiguous float32 array in one call
# instead of rebuilding the array row by row.
embeddings = np.ascontiguousarray(embeddings, dtype="float32")

# Fail early with a clear message when there is nothing to index (e.g. no .mp3
# files were transcribed); otherwise `embeddings.shape[1]` below raises an
# opaque IndexError.
if embeddings.ndim != 2 or embeddings.shape[0] == 0:
    raise ValueError(
        f"Nothing to index: expected a non-empty 2-D embeddings array, got shape {embeddings.shape}"
    )

# Step 2: Instantiate the index (L2 distances over vectors of this dimensionality)
index = faiss.IndexFlatL2(embeddings.shape[1])

# Step 3: Pass the index to IndexIDMap so vectors can be stored under custom ids
index = faiss.IndexIDMap(index)

# Step 4: Add vectors and their IDs (ids are made explicitly int64 for FAISS)
vector_ids = df_transcriptions.id_index.values.astype("int64")
index.add_with_ids(embeddings, vector_ids)

print(f"Number of vectors in the Faiss index: {index.ntotal}")

## **3.1 Save embeddings and index as pickle**

In [None]:
# `pickle` is already imported in the imports cell; this re-import is harmless.
import pickle

# Persist the float32 embeddings array to disk.
with open('doc_embedding.pickle', 'wb') as pkl:
    pickle.dump(embeddings, pkl)

In [None]:
# Persist the FAISS index to disk.
# NOTE(review): this pickles the index object directly; whether that works
# presumably depends on the installed faiss build. `faiss.serialize_index(index)`
# is the alternative that yields a plain array — confirm which form downstream
# readers of doc_index.pickle expect before changing it.
with open('doc_index.pickle', 'wb') as pkl:
    pickle.dump(index, pkl)

In [None]:
# #open embeddings from pickle
# with open('doc_embedding.pickle', 'rb') as pkl:
#     doc_embedding = pickle.load(pkl)

In [None]:
# #open index from pickle
# with open('doc_index.pickle', 'rb') as pkl:
#     doc_index = pickle.load(pkl)

## **3.2 Try an input example**

In [None]:
#set an example number and number of nearest neighbours
# NOTE(review): neither `x` nor `kx` is referenced anywhere else in the visible
# notebook — confirm they are still needed.
x = 100
kx = 10

In [None]:
## test
# Example free-text query used to try out the semantic search below.
user_query = "I want to make a complaint about the marketing platform"

In [None]:
def vector_search(query, model, index, num_results=10):
    """Encode ``query`` and return its nearest neighbours from ``index``.

    Args:
        query (str | list[str]): A single query string, or a list of query
            strings. A bare string is treated as ONE query; previously
            ``list(query)`` split it into individual characters, each of which
            was searched as its own query.
        model: Encoder exposing ``encode(list_of_str)``, e.g. a
            ``sentence_transformers.SentenceTransformer``.
        index: Object exposing ``search(vectors, k=...)``, e.g. a FAISS index.
        num_results (int): Number of results to return per query.

    Returns:
        D: Distances between results and query, as returned by ``index.search``.
        I: IDs of the results, as returned by ``index.search``.
    """
    # Normalise to a list of sentences so a plain string is encoded as a whole.
    queries = [query] if isinstance(query, str) else list(query)
    # The encoder output is converted to float32 before searching, as before.
    vectors = np.asarray(model.encode(queries), dtype="float32")
    D, I = index.search(vectors, k=num_results)
    return D, I


def id2details(df_transcriptions, I, column):
    """Look up ``column`` for every id in the first row of ``I``.

    Args:
        df_transcriptions: DataFrame with an ``id_index`` column.
        I: Indexable result ids; only ``I[0]`` is used.
        column: Name of the column whose values are returned.

    Returns:
        A list with one entry per id in ``I[0]``: the list of ``column``
        values from the rows whose ``id_index`` equals that id (empty when no
        row matches).
    """
    details = []
    for idx in I[0]:
        matching_rows = df_transcriptions.id_index == idx
        details.append(list(df_transcriptions.loc[matching_rows, column]))
    return details

# Querying the index: ask for the single closest transcription to the example query.
# The query is wrapped in a list so that it is encoded as one sentence.
D, I = vector_search([user_query], model, index, num_results=1)

In [None]:
# Map the returned ids back to the transcription texts and display them.
list_output = id2details(df_transcriptions, I, 'text')
list_output

# **4. Create one file, one Job**

## **4.1 Create one .py File**

In [None]:
%%writefile ./run_me_v1.py

import os
import ocifs
import numpy as np
import torch
import pandas as pd
import whisper
import torchaudio
from tqdm.notebook import tqdm
from sentence_transformers import SentenceTransformer
import faiss
import pickle
import ffmpeg
from vector_engine.vector_engine.utils import vector_search, id2details                           ## this is different for Job vs notebook because of folder structure
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Fetch environment variables: `recording_name` falls back to "recording" when unset.
recording_name = os.environ.get("recording_name", "recording")
print("Fetching environment variable called " + recording_name)

# ffmpeg: copy the ffmpeg/ffprobe binaries from the working directory into
# ./usr/bin, then change their permissions. The explanatory comment that used
# to sit inside one of the command strings now lives here, and a non-zero exit
# status is reported instead of being silently ignored.
# NOTE(review): `./usr/bin` is relative to the working directory — confirm that
# `/usr/bin` was not intended.
for shell_command in (
    "cp ./ffmpeg ./usr/bin/ffmpeg",
    "cp ./ffprobe ./usr/bin/ffprobe",
    "chmod +rwx ./usr/bin/ffprobe",
    "chmod +rwx ./usr/bin/ffmpeg",
):
    exit_status = os.system(shell_command)
    if exit_status != 0:
        # Keep going (best effort, as before) but make the failure visible in the job log.
        print(f"Warning: '{shell_command}' exited with status {exit_status}")


######## ----------------------------------------------------- #################
# Step 1
######## ----------------------------------------------------- #################

# Create the local folder (in the job) that will receive the recordings.
path_input_locally = "/home/datascience/input_recording/"

try:
    # exist_ok=True replaces the separate os.path.exists() check, so there is no
    # window between testing for the folder and creating it.
    os.makedirs(path_input_locally, exist_ok=True)
except OSError as err:
    # Best effort, as before: report the problem (now including its cause) and carry on.
    print('Error: Creating directory of input recording')
    print(err)

# Copy every .mp3 recording from the bucket to the local folder.
bucket_recordings = "oci://West_BP@frqap2zhtzbe/*.mp3"
fs = ocifs.OCIFileSystem()
# Invalidate the filesystem cache for the bucket pattern before downloading.
fs.invalidate_cache(bucket_recordings)
fs.get(bucket_recordings, path_input_locally, recursive=True, refresh=True)

######## ----------------------------------------------------- #################
# Step 2 - Load model and Detect languages
######## ----------------------------------------------------- #################

# Load the Whisper model ("base" checkpoint) used for detection and transcription.
model = whisper.load_model("base")

# Report the most probable spoken language for every .mp3 in the local folder.
for file_name in os.listdir(path_input_locally):
    # Skip anything that is not an .mp3 recording.
    if not file_name.endswith(".mp3"):
        continue

    recording_path = os.path.join(path_input_locally, file_name)

    # Load the audio and pad/trim it to fit 30 seconds, then build the
    # log-Mel spectrogram on the same device as the model.
    clip = whisper.pad_or_trim(whisper.load_audio(recording_path))
    mel = whisper.log_mel_spectrogram(clip).to(model.device)

    # detect_language yields per-language probabilities; print the most likely one.
    _, language_probs = model.detect_language(mel)
    best_language = max(language_probs, key=language_probs.get)
    print(f"Detected language: {best_language}")




######## ----------------------------------------------------- #################
# Step 3 Transcribe recordings
######## ----------------------------------------------------- #################

# Transcribe every .mp3 recording, keeping the texts in directory-listing order.
mp3_files = [name for name in os.listdir(path_input_locally) if name.endswith(".mp3")]

output = []
for recording in mp3_files:
    # Whisper returns a dict; only the full transcription text is kept.
    transcription = model.transcribe(os.path.join(path_input_locally, recording))
    output.append(transcription['text'])

    print(recording + " is transcribed")

# One row per recording, with the transcription in the 'text' column.
df_transcriptions = pd.DataFrame(output, columns=['text'])

######## ----------------------------------------------------- #################
# Step 3 Create embeddings
######## ----------------------------------------------------- #################

# Give every transcription an explicit id (its row index); the FAISS step uses
# these as the vector ids.
df_transcriptions['id_index'] = df_transcriptions.index

# Sentence-level DistilBERT encoder used to embed the transcriptions.
# NOTE: this rebinds `model`, which held the Whisper model until now.
model = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')

# Move the encoder to the GPU when CUDA is available, then log where it lives.
use_gpu = torch.cuda.is_available()
if use_gpu:
    model = model.to(torch.device("cuda"))
print(model.device)

# Encode every transcription text into an embedding vector.
transcription_texts = df_transcriptions.text.to_list()
embeddings = model.encode(transcription_texts, show_progress_bar=True)

######## ----------------------------------------------------- #################
# Step 4 Apply FAISS
######## ----------------------------------------------------- #################

# Step 1: Change data type. Convert to a contiguous float32 array in one call
# instead of rebuilding the array row by row.
embeddings = np.ascontiguousarray(embeddings, dtype="float32")

# Fail early with a clear message when there is nothing to index (e.g. no .mp3
# files were transcribed); otherwise `embeddings.shape[1]` below raises an
# opaque IndexError.
if embeddings.ndim != 2 or embeddings.shape[0] == 0:
    raise ValueError(
        f"Nothing to index: expected a non-empty 2-D embeddings array, got shape {embeddings.shape}"
    )

# Step 2: Instantiate the index (L2 distances over vectors of this dimensionality)
index = faiss.IndexFlatL2(embeddings.shape[1])

# Step 3: Pass the index to IndexIDMap so vectors can be stored under custom ids
index = faiss.IndexIDMap(index)

# Step 4: Add vectors and their IDs (ids are made explicitly int64 for FAISS)
vector_ids = df_transcriptions.id_index.values.astype("int64")
index.add_with_ids(embeddings, vector_ids)

print(f"Number of vectors in the Faiss index: {index.ntotal}")

print("-------------------------------------------------------")
print("-------------------------------------------------------")
print("Embeddings are created and FAISS is applied. Pickle files or embeddings can be pushed to Vector DB or anything else")
print("-------------------------------------------------------")
print("-------------------------------------------------------")

## **4.2 Create and Trigger Job**

In [None]:
# # publish conda, as we added new/custom packages
# !odsc conda init -b conda_environment_yolov5 -n frqap2zhtzbe -a resource_principal
# !odsc conda publish -s tensorflow28_p38_gpu_v1 --force

In [None]:
#imports
from ads.common.oci_logging import OCILogGroup, OCILog
from ads.jobs import Job, DataScienceJob, PythonRuntime
from datetime import datetime, timedelta
from ads import set_auth

# Authentication: use resource-principal auth for the ADS calls below.
set_auth(auth='resource_principal')

In [None]:
## create the job

# Infrastructure: log group, compute shape and block storage size for the job.
infrastructure = (
    DataScienceJob()
    # Configure logging for getting the job run outputs.
    .with_log_group_id("ocid1.loggroup.oc1.eu-frankfurt-1.amaaaaaangencdyajxalcuggjaug57r3ugare7olsk44ts2shyv7azqbxf4q")
    .with_shape_name("VM.Standard2.4")
    #.with_shape_config_details(memory_in_gbs=16, ocpus=5)
    .with_block_storage_size(200)
)

# Runtime: conda environment, source code, environment variables and entrypoint.
runtime = (
    PythonRuntime()
    # Custom conda environment, referenced by its Object Storage URI.
    .with_custom_conda("oci://conda_environment_yolov5@frqap2zhtzbe/conda_environments/gpu/TensorFlow 2.8 for GPU on Python 3.8/1.0/tensorflow28_p38_gpu_v1")
    # Source code of the job, can be local or remote.
    .with_source("/home/datascience/")
    # Default value for the environment variable read by run_me_v1.py.
    .with_environment_variable(recording_name="Default variable")
    .with_entrypoint("./run_me_v1.py")
)

# Assemble the job from both parts and register it.
job = Job(name="job_v1").with_infrastructure(infrastructure).with_runtime(runtime)

job.create()

In [None]:
# Start a run of the job, passing a run-specific value for the `recording_name`
# environment variable that run_me_v1.py reads.
job_run_env = job.run(
    name="job_run_v1",
    env_var={"recording_name": "An example environment variable. Could passed to this Job"}
)

# Watch the run (presumably streams its logs until it finishes — confirm against the ADS docs).
job_run_watch = job_run_env.watch()