In [None]:
from IPython.display import clear_output

In [None]:
# Install the notebook's dependencies. `datasets` and `transformers` are
# installed straight from their GitHub repositories (no branch/tag given), and
# none of the packages is version-pinned, so a re-run may resolve differently.
!pip install git+https://github.com/huggingface/datasets.git
!pip install git+https://github.com/huggingface/transformers.git
!pip install -U sentence-transformers
!pip install torchaudio
!pip install librosa
!pip install jiwer
!pip install hazm
!pip install yake
!pip install multi_rake
!pip install fasttext
# Wipe the installer logs from the cell output once everything has run.
clear_output()

In [None]:
# Download the gzipped fastText `cc.fa.300` binary vectors and decompress them
# in the working directory (leaves `cc.fa.300.bin`).
# NOTE(review): none of the cells visible in this notebook loads this file —
# confirm it is still needed before paying for the download.
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.fa.300.bin.gz
!gzip -d cc.fa.300.bin.gz
clear_output()

In [None]:
import librosa
import os
import gc
import json
import torch
import hazm
import yake
import numpy as np
import IPython.display as ipd
from multi_rake import Rake
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
from sentence_transformers import models, SentenceTransformer, util

## ASR & Keyword Extraction Section


In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Processor and CTC model come from the same checkpoint. Both, together with
# `device`, are module-level globals that `transcribe_dataset` reads directly.
processor = Wav2Vec2Processor.from_pretrained("m3hrdadfi/wav2vec2-large-xlsr-persian")
model = Wav2Vec2ForCTC.from_pretrained("m3hrdadfi/wav2vec2-large-xlsr-persian").to(device)
# Hide the download/progress output produced while loading.
clear_output()

In [None]:
def transcribe_dataset(dataset):
    """Transcribe every chunk of every audio file referenced by ``dataset``.

    Relies on the module-level ``processor``, ``model`` and ``device``.

    Args:
        dataset: Mapping from an audio file path to a list of chunk dicts.
            Every chunk must provide ``"start_time"`` and ``"end_time"``;
            both are multiplied by the sample rate to obtain sample indices,
            i.e. they are interpreted as seconds.

    Returns:
        The same ``dataset`` object, mutated in place: each chunk gains a
        ``"transcription"`` key. Chunks whose time range selects no samples
        (e.g. it lies past the end of the file) get an empty transcription
        instead of being sent to the model.
    """
    for utterance, chunks in dataset.items():
        # The file is decoded once and resampled to 16 kHz; the same rate is
        # then reported to the processor for every chunk.
        waveform, sample_rate = librosa.load(utterance, sr=16000)
        for chunk in chunks:
            start_sample = int(chunk["start_time"] * sample_rate)
            end_sample = int(chunk["end_time"] * sample_rate)
            audio_segment = waveform[start_sample:end_sample]

            # An empty slice cannot be run through the model; record an empty
            # transcription so later stages still find the key.
            if len(audio_segment) == 0:
                chunk["transcription"] = ""
                continue

            input_values = processor(
                audio_segment, sampling_rate=sample_rate, return_tensors="pt"
            ).input_values.to(device)

            # Inference only: skip autograd bookkeeping to save memory.
            with torch.no_grad():
                logits = model(input_values).logits

            # Greedy CTC decoding: most likely token id per frame.
            predicted_ids = torch.argmax(logits, dim=-1)[0].cpu().numpy()
            chunk["transcription"] = processor.decode(predicted_ids)

    return dataset

In [None]:
def extract_keywords_yake(transcription):
    """Extract keywords from ``transcription`` with YAKE after Hazm cleanup.

    The text is normalized and word-tokenized with Hazm, tokens found in
    Hazm's stopword list are dropped, and the remaining tokens are re-joined
    with single spaces before being handed to a default-configured
    ``yake.KeywordExtractor``.

    Args:
        transcription: The text to extract keywords from.

    Returns:
        A list of keyword strings (the first element of every item YAKE
        returns), with zero-width non-joiners (U+200C) removed.
    """
    normalizer = hazm.Normalizer()
    tokenizer = hazm.WordTokenizer()

    # Load the stopword list once; a set keeps the per-token check O(1)
    # instead of re-reading the list for every token.
    stopwords = set(hazm.stopwords_list())
    tokens = [
        token
        for token in tokenizer.tokenize(normalizer.normalize(transcription))
        if token not in stopwords
    ]

    # Extract keywords using YAKE
    extractor = yake.KeywordExtractor()
    scored_keywords = extractor.extract_keywords(' '.join(tokens))

    return [item[0].replace('\u200c', '') for item in scored_keywords]


def extract_keywords_rake(transcription):
    """Extract keywords from ``transcription`` with multi_rake.

    Hazm's stopword list is supplied to RAKE explicitly. Previously the list
    was loaded but never passed on, so RAKE was constructed with
    ``stopwords=None``.

    Args:
        transcription: The text to extract keywords from.

    Returns:
        A list of keyword strings (the first element of every item RAKE
        returns), with zero-width non-joiners (U+200C) removed.
    """
    # multi_rake documents `stopwords` as a set of words.
    stopwords = set(hazm.stopwords_list())
    rake = Rake(
        min_chars=3,
        max_words=3,
        min_freq=1,
        language_code=None,
        stopwords=stopwords,
        lang_detect_threshold=50,
        max_words_unknown_lang=2,
        generated_stopwords_percentile=80,
        generated_stopwords_max_len=3,
        generated_stopwords_min_freq=2,
    )
    keywords = rake.apply(transcription)

    return [kw[0].replace('\u200c', '') for kw in keywords]

In [None]:
def generate_dataset_keywords(dataset_file):
    """Transcribe a dataset, attach YAKE keywords, and save the result.

    Args:
        dataset_file: Path to a JSON file mapping audio paths to lists of
            chunk dicts (the structure ``transcribe_dataset`` consumes).

    Returns:
        The dataset with ``"transcription"`` and ``"keywords"`` set on every
        chunk. The same data is written as UTF-8 JSON to a file named
        ``transcribed_<original file name>`` in the same directory as
        ``dataset_file``.
    """
    # Read as UTF-8 explicitly so the result does not depend on the locale.
    with open(dataset_file, 'r', encoding='utf-8') as f:
        dataset = json.load(f)

    dataset = transcribe_dataset(dataset)
    for chunks in dataset.values():
        for chunk in chunks:
            chunk["keywords"] = extract_keywords_yake(chunk["transcription"])

    # Prefix only the file name, so a path with a directory component still
    # points into that directory. For a bare file name this is identical to
    # 'transcribed_' + dataset_file.
    directory, filename = os.path.split(dataset_file)
    transcribed_dataset_file = os.path.join(directory, 'transcribed_' + filename)
    with open(transcribed_dataset_file, "w", encoding='utf-8') as outFile:
        json.dump(dataset, outFile, ensure_ascii=False, indent=1)

    return dataset

## Query and Dataset Comparison Section

In [None]:
def cosine_similarity(vec1, vec2):
    """Return the cosine similarity of two equally sized 1-D vectors.

    Args:
        vec1: Sequence or array of numbers.
        vec2: Sequence or array of numbers with the same number of elements.

    Returns:
        A Python float: ``dot(vec1, vec2) / (|vec1| * |vec2|)``. When either
        vector has zero norm the similarity is undefined and ``0.0`` is
        returned, so callers that sort by score never receive NaN or a
        division error.

    Raises:
        ValueError: If the vectors do not have the same number of elements.
    """
    # Accumulate in float64 regardless of the input dtype.
    a = np.asarray(vec1, dtype=np.float64).ravel()
    b = np.asarray(vec2, dtype=np.float64).ravel()
    if a.shape != b.shape:
        raise ValueError(
            "vectors must have the same length, got %d and %d" % (a.size, b.size)
        )

    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0.0:
        return 0.0
    return float(np.dot(a, b) / denominator)

In [None]:
def load_st_model(model_name_or_path):
    """Build a SentenceTransformer that mean-pools a transformer's token embeddings.

    Args:
        model_name_or_path: Identifier or path handed to ``models.Transformer``.

    Returns:
        A ``SentenceTransformer`` made of two modules: the transformer itself
        followed by a pooling layer with only mean-token pooling enabled.
    """
    transformer = models.Transformer(model_name_or_path)
    embedding_dim = transformer.get_word_embedding_dimension()

    # Only mean pooling is switched on; CLS and max pooling are disabled.
    pooling_flags = {
        "pooling_mode_mean_tokens": True,
        "pooling_mode_cls_token": False,
        "pooling_mode_max_tokens": False,
    }
    mean_pooling = models.Pooling(embedding_dim, **pooling_flags)

    return SentenceTransformer(modules=[transformer, mean_pooling])


def get_results(dataset_file, query, N):
    """Rank the chunks of a transcribed dataset by similarity to ``query``.

    Args:
        dataset_file: Path to a UTF-8 JSON file mapping audio paths to lists
            of chunks; each chunk needs ``"start_time"``, ``"end_time"``,
            ``"transcription"`` and ``"keywords"`` (a list of strings).
        query: Free-text query. It is embedded as-is, without keyword
            extraction.
        N: Number of top-scoring chunks to return.

    Returns:
        Up to ``N`` tuples ``(utterance, start_time, end_time, transcription,
        score)``, ordered by descending score, where ``score`` is the cosine
        similarity between the query embedding and the embedding of the
        chunk's space-joined keywords.
    """
    # The transcribed dataset is written as UTF-8 with non-ASCII characters
    # kept verbatim, so read it back as UTF-8 rather than the locale default.
    with open(dataset_file, 'r', encoding='utf-8') as f:
        dataset = json.load(f)

    st_model = load_st_model('m3hrdadfi/bert-fa-base-uncased-wikitriplet-mean-tokens')

    # Embed the raw query text once; it is compared against every chunk.
    query_embedding = st_model.encode(query)

    similarity_scores = []
    for utterance, chunks in dataset.items():
        for chunk in chunks:
            chunk_keywords = ' '.join(chunk['keywords'])
            chunk_embedding = st_model.encode(chunk_keywords)
            score = cosine_similarity(query_embedding, chunk_embedding)
            similarity_scores.append(
                (utterance, chunk['start_time'], chunk['end_time'],
                 chunk['transcription'], score)
            )

    # Sort similarity scores in descending order
    similarity_scores.sort(key=lambda record: record[4], reverse=True)

    # Get top N most similar records
    return similarity_scores[:N]

In [None]:
def usage(dataset, query, N):
    """Return the top ``N`` chunks of ``dataset`` for ``query``.

    The transcribed/keyword-annotated copy of the dataset is generated only
    when it does not exist yet, then queried.

    Args:
        dataset: Path to the dataset JSON file.
        query: Free-text query forwarded to ``get_results``.
        N: Number of results to return.

    Returns:
        Whatever ``get_results`` returns for the transcribed dataset file.
    """
    # The transcribed copy sits next to the dataset with a 'transcribed_'
    # file-name prefix. For a bare file name this is 'transcribed_' + dataset,
    # resolved against the current working directory.
    directory, filename = os.path.split(dataset)
    transcribed_dataset = os.path.join(directory, 'transcribed_' + filename)

    if not os.path.isfile(transcribed_dataset):
        print("Generating Dataset Keywords...")
        # Process the dataset that was actually requested, not a fixed name.
        generate_dataset_keywords(dataset)

    return get_results(transcribed_dataset, query, N)

In [None]:
# Example run: fetch the 3 chunks of dataset.json that score highest for the
# query. The bare last expression makes the notebook display the result list.
query = "جهان آغازین"
usage('dataset.json', query, 3)

Generating Dataset Keywords...


[('sina-farsi.wav',
  14.5,
  23,
  'ستاما جهان آغازین فضایی عجیبتر با ستارههای غولپیکرد بود که به سرعت زندگی میکردند و در سن جوانی میمردند',
  0.5871606050725281),
 ('sina-farsi.wav',
  0.0,
  14.5,
  'بر اساس پژبوهشی جدید اولین ستارههای کیهان تا بیش از ده هزار برابر جرم خورشید رشد کردند و هزار برابر بزرگتر از بزرگترین ستارههای کنونی بودند امروزه جرم بزرگترین ستارهها صد جرم خورشید',
  0.3525935654811955),
 ('sina-farsi.wav',
  23,
  30,
  'با مرگ این الماسهاید رخشان دیگر هرگز شرایط شکلگیری آنها فراهم نشد',
  0.25459224942676356)]