## Utilities

In [None]:
!pip install vosk
!wget https://alphacephei.com/kaldi/models/vosk-model-en-us-0.21.zip
!unzip /content/vosk-model-en-us-0.21.zip

Collecting vosk
  Downloading vosk-0.3.37-py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.9 MB)
[K     |████████████████████████████████| 6.9 MB 4.8 MB/s 
Installing collected packages: vosk
Successfully installed vosk-0.3.37
--2022-04-28 12:48:07--  https://alphacephei.com/kaldi/models/vosk-model-en-us-0.21.zip
Resolving alphacephei.com (alphacephei.com)... 188.40.21.16, 2a01:4f8:13a:279f::2
Connecting to alphacephei.com (alphacephei.com)|188.40.21.16|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1752830350 (1.6G) [application/zip]
Saving to: ‘vosk-model-en-us-0.21.zip’


2022-04-28 12:49:12 (25.9 MB/s) - ‘vosk-model-en-us-0.21.zip’ saved [1752830350/1752830350]

Archive:  /content/vosk-model-en-us-0.21.zip
   creating: vosk-model-en-us-0.21/
   creating: vosk-model-en-us-0.21/am/
  inflating: vosk-model-en-us-0.21/am/final.mdl  
  inflating: vosk-model-en-us-0.21/am/tree  
   creating: vosk-model-en-us-0.21/ivector/
  inflating: vosk-model-en-u

In [None]:
import wave
import pandas as pd
import os
import json
from vosk import Model, KaldiRecognizer, SetLogLevel

## Modelling

### Word Class

In [None]:
class Word:
    ''' One recognized word taken from the JSON output of the vosk speech recognition API '''

    def __init__(self, dict):
        '''
        Copy the fields of a single JSON entry onto the instance.

        Parameters:
          dict (dict) entry from the JSON output, containing:
            conf (float): degree of confidence, from 0 to 1
            end (float): time at which the word ends, in seconds
            start (float): time at which the word starts, in seconds
            word (str): recognized word
        '''
        # `dict` shadows the built-in; the name is kept for callers and aliased locally
        entry = dict
        self.conf, self.end = entry["conf"], entry["end"]
        self.start, self.word = entry["start"], entry["word"]

    def to_string(self):
        ''' Returns a one-line description: padded word, start/end times and confidence in percent '''
        template = "{:20} from {:.2f} sec to {:.2f} sec, confidence is {:.2f}%"
        confidence_pct = self.conf*100
        return template.format(self.word, self.start, self.end, confidence_pct)

### Getting Data from Video-Recommendation-System.git

In [None]:
# Fetch the repository whose "Data/Audio Files" directory is listed by the next cell
!git clone https://github.com/atishaye/Video-Recommendation-System.git

Cloning into 'Video-Recommendation-System'...
remote: Enumerating objects: 44, done.[K
remote: Counting objects: 100% (14/14), done.[K
remote: Compressing objects: 100% (8/8), done.[K
remote: Total 44 (delta 1), reused 11 (delta 0), pack-reused 30[K
Unpacking objects: 100% (44/44), done.
Checking out files: 100% (30/30), done.


In [None]:
# Build the sorted list of full paths for every entry in the audio directory
import os
audio_dir='/content/Video-Recommendation-System/Data/Audio Files'
transcripts=sorted(audio_dir+'/'+name for name in os.listdir(audio_dir))
display(transcripts)
# number of files found
dataset_size=len(transcripts)
print(dataset_size)

['/content/Video-Recommendation-System/Data/Audio Files/00_astronomy.wav',
 '/content/Video-Recommendation-System/Data/Audio Files/01_white_dwarf.wav',
 '/content/Video-Recommendation-System/Data/Audio Files/02_chimp.wav',
 '/content/Video-Recommendation-System/Data/Audio Files/03_crypto.wav',
 '/content/Video-Recommendation-System/Data/Audio Files/04_crypto_tax.wav',
 '/content/Video-Recommendation-System/Data/Audio Files/05_crypto_means.wav',
 '/content/Video-Recommendation-System/Data/Audio Files/06_digital_token.wav',
 '/content/Video-Recommendation-System/Data/Audio Files/07_cryptocurrency.wav',
 '/content/Video-Recommendation-System/Data/Audio Files/08_dodo.wav',
 '/content/Video-Recommendation-System/Data/Audio Files/09_dolphin.wav',
 '/content/Video-Recommendation-System/Data/Audio Files/10_elephant.wav',
 '/content/Video-Recommendation-System/Data/Audio Files/11_fd_mutual_finance.wav',
 '/content/Video-Recommendation-System/Data/Audio Files/12_finance.wav',
 '/content/Video-Re

25


### Timestamping Transcripts

In [None]:
model_path = "./vosk-model-en-us-0.21"

i=22

audio_filename = transcripts[i]
print('Timestamping for', transcripts[i], 'begin!')
model = Model(model_path)
wf = wave.open(audio_filename, "rb")
rec = KaldiRecognizer(model, wf.getframerate())
rec.SetWords(True)

# get the list of JSON dictionaries
results = []
# recognize speech using vosk model
while True:
    data = wf.readframes(4000)
    if len(data) == 0:
        break
    if rec.AcceptWaveform(data):
        part_result = json.loads(rec.Result())
        results.append(part_result)
part_result = json.loads(rec.FinalResult())
results.append(part_result)

# convert list of JSON dictionaries to list of 'Word' objects
list_of_Words = []
for sentence in results:
    if len(sentence) == 1:
        # sometimes there are bugs in recognition 
        # and it returns an empty dictionary
        # {'text': ''}
        continue
    for obj in sentence['result']:
        w = Word(obj)  # create custom Word object
        list_of_Words.append(w)  # and add it to list

wf.close()  # close audiofile

# output to the screen
# for word in list_of_Words:
#     print(word.to_string())

master_list=[]
for j in range(len(list_of_Words)):
  l=list_of_Words[j].to_string().split(' ')
  while "" in l:
    l.remove("")
  master_list.append(l)

df=pd.DataFrame(master_list)
df=df.drop([1,3,4,6,7,8],axis=1)
transcript_id=[i for _ in range(len(master_list))]
df.insert(0,'transcript_id',transcript_id)
df.columns=['transcript_id','word', 'start_timestamp(sec)', 'end_timestamp(sec)', 'confidence']
print('Timestamping for Transcript',i,'done!')

master_df=pd.read_csv('timestamped_transcripts.csv')
master_df=master_df.append(df,ignore_index=True)
# master_df.to_csv('timestamped_transcripts.csv')
# master_df=pd.read_csv('timestamped_transcripts.csv')
# master_df.drop(master_df.columns[[0, 1]], axis = 1, inplace = True)
master_df.to_csv('timestamped_transcripts.csv', index=False)
from google.colab import files
files.download('timestamped_transcripts.csv') 
master_df
!rm -r /content/vosk-model-en-us-0.21
!rm -r /content/vosk-model-en-us-0.21.zip

Timestamping for /content/Video-Recommendation-System/Data/Audio Files/22_super_nova.wav begin!
Timestamping for Transcript 22 done!


FileNotFoundError: ignored

### Final Retrieved Timestamps

In [None]:
# Final clean-up of the master CSV: strip the '%' suffix from the confidence
# values, switch to the final column names, then save and download the file.
# Each step is guarded so that re-running the cell on an already cleaned file
# leaves it unchanged instead of failing with KeyError / AttributeError.
master_df=pd.read_csv('/content/timestamped_transcripts.csv')
if 'confidence' in master_df.columns and not pd.api.types.is_numeric_dtype(master_df['confidence']):
    # still text such as '97.50%': remove the percent sign
    master_df['confidence'] = master_df['confidence'].str.strip('%')
# rename by label rather than by position so no column can be mislabelled
master_df = master_df.rename(columns={
    'word': 'keyword',
    'start_timestamp(sec)': 'start_timestamp(seconds)',
    'end_timestamp(sec)': 'end_timestamp(seconds)',
    'confidence': 'confidence_score(percentage)',
})
master_df.to_csv('timestamped_transcripts.csv', index=False)
from google.colab import files
files.download('timestamped_transcripts.csv') 
master_df

### Getting Another Speech-to-Text Transcription

In [None]:
import pandas as pd
# Reload the saved word-level timestamps from disk; the next cell reads `master_df`
master_df=pd.read_csv('/content/timestamped_transcripts.csv')
master_df

In [None]:
import os
import pandas as pd

# Rebuild one plain-text transcript per transcript id by joining its keywords
# in row order, save each one to its own .txt file, and collect them all in
# `raw_transcripts`.
num_transcripts = 25  # ids 0..24 are processed
output_dir = '/content/Raw Transcripts using Vosk'
# open(..., "w") does not create missing directories, so make sure it exists
os.makedirs(output_dir, exist_ok=True)

records = []
for i in range(num_transcripts):
  words = master_df.loc[master_df['transcript_id']==i, 'keyword']
  # every word is followed by a single space, including the last one
  content = ''.join(word + ' ' for word in words)
  print(content)
  outfile = output_dir+'/transcript'+str(i)+'.txt'
  with open(outfile, "w") as text_file:
    text_file.write(content)
  records.append({'transcript_id': i, 'raw_transcript': content})

# Build the frame once from the collected rows: DataFrame.append was removed in
# pandas 2.0 and copied the whole frame again on every iteration.
raw_transcripts = pd.DataFrame(records, columns=['transcript_id', 'raw_transcript'])
raw_transcripts

In [None]:
!zip -r /content/Raw Transcripts_using_Vosk.zip /content/Raw\ Transcripts\ using\ Vosk

In [None]:
from google.colab import files
# NOTE(review): this downloads /content/sample_data/README.md, not a transcript;
# it looks like a trial run of files.download — confirm it is still needed.
files.download('/content/sample_data/README.md')

In [None]:
import os
# Collect the full path of every saved raw transcript and show the list
transcript_dir='/content/Raw Transcripts using Vosk'
transcripts=[transcript_dir+'/'+name for name in os.listdir(transcript_dir)]
print(transcripts)
from google.colab import files
# download the files one by one
for path in transcripts:
  files.download(path)

In [None]:
# Save the combined raw transcripts; index=False is not passed, so the row index is written as the first CSV column
raw_transcripts.to_csv('raw_transcripts_using_vosk.csv')

### Random Testing

In [None]:
# Scratch check: display whatever an earlier cell last assigned to `df`
df

In [None]:
# Scratch version of the append step: add the rows of `df` to the saved master CSV
master_df=pd.read_csv('timestamped_transcripts.csv')
# pd.concat replaces DataFrame.append, which pandas 2.0 removed
master_df=pd.concat([master_df,df],ignore_index=True)
# index=False is not passed, so the row index is written as an extra first
# column; the next cell reads the file back and drops that column.
master_df.to_csv('timestamped_transcripts.csv')
master_df

In [None]:
# Reload the saved CSV, discard its first column and write it back without an index
csv_path='timestamped_transcripts.csv'
master_df=pd.read_csv(csv_path)
master_df=master_df.drop(columns=master_df.columns[0])
master_df.to_csv(csv_path,index=False)
master_df

In [None]:
# Scratch: this bare type(...) is not the cell's last expression, so it is evaluated but not displayed
type(list_of_Words)
# Wrap the list in a DataFrame as-is (this reassigns `df`)
df=pd.DataFrame(list_of_Words)
df

In [None]:
# Split each word's description on single spaces and discard the empty
# strings left behind by runs of consecutive spaces
master_list=[
  [token for token in entry.to_string().split(' ') if token!=""]
  for entry in list_of_Words
]

master_list

In [None]:
# Turn the token lists into a frame, discard the token positions listed below,
# prepend a constant transcript id of 1 and label the remaining columns
unwanted_positions=[1,3,4,6,7,8]
df=pd.DataFrame(master_list).drop(columns=unwanted_positions)
df.insert(0,'transcript_id',[1]*len(master_list))
df.columns=['transcript_id','word', 'start_timestamp(sec)', 'end_timestamp(sec)', 'confidence']
df