In [12]:
import math
import os
import shutil

import librosa
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import soundfile as sf
import torch
from torch_audiomentations import Compose, AddBackgroundNoise
from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer

In [20]:
# Load the pretrained wav2vec2 speech-to-text tokenizer and CTC model.
# Both are fetched from the same checkpoint name.
stt_checkpoint = "facebook/wav2vec2-base-960h"
tokenizer = Wav2Vec2Tokenizer.from_pretrained(stt_checkpoint)
model = Wav2Vec2ForCTC.from_pretrained(stt_checkpoint)

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [45]:
# Augmentation pipeline with a single transform: background noise is applied
# with probability 1.0, at an SNR drawn from the [-2.5, 2.5] dB range, using
# recordings found under the background_noise folder.
background_noise = AddBackgroundNoise(
    background_paths='../../data/noise_resources/background_noise/',
    min_snr_in_db=-2.5,
    max_snr_in_db=2.5,
    p=1.0,
)
apply_augmentations = Compose(transforms=[background_noise])

In [46]:
# Pick the CUDA device when one is available, otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
torch_device = torch.device(device_name)

In [47]:
# Number of test utterances to process. The generation loops below load the
# audio files numbered 0 .. atis_samples_count - 1.
atis_samples_count = 893

In [48]:
# Load the clean ATIS test split (tab-separated) and keep only the label
# columns; the transcribed TEXT column is added later by the generation loop.
original_data = pd.read_csv('../../data/ATIS/experiments/clean/test/test.tsv', sep='\t')
# .copy() makes this an independent frame, so assigning a new column to it
# later does not raise pandas' SettingWithCopyWarning.
original_label = original_data[['ID', 'INTENT', 'INTENT_ID']].copy()

In [49]:
# Smoke test on a single file: add background noise to one utterance and
# transcribe the noisy signal with the wav2vec2 model.
audio, rate = librosa.load("./output_wavernn.wav", sr=16000)
audio = torch.from_numpy(audio)

# The augmentation is fed a 3-D tensor (presumably batch x channels x samples
# -- confirm against torch_audiomentations), so add two leading singleton
# dimensions before the call and strip them again afterwards.
new_audio = audio.unsqueeze(0).unsqueeze(0)
perturbed_audio_samples = apply_augmentations(new_audio, sample_rate=16000)
perturbed_audio_samples = perturbed_audio_samples.squeeze(0).squeeze(0)

# Inference only: disable autograd so no computation graph is built or kept.
with torch.no_grad():
    input_values = tokenizer(perturbed_audio_samples, return_tensors="pt").input_values
    logits = model(input_values).logits

# Greedy CTC decoding: most likely token per frame, then collapse to text.
prediction = torch.argmax(logits, dim=-1)
transcription = tokenizer.batch_decode(prediction)[0]

In [50]:
# Display the transcription of the noise-perturbed sample.
transcription

'I WOULD LIKE TO FIGHT A FLIGHT FROM CHARLON TO THE LAZAN EGUS THAT MAKES AND STOCK AN SNINGISH'

In [None]:
# Generate five noisy versions of the ATIS test set. For every version each
# utterance is loaded, mixed with background noise, transcribed with the
# wav2vec2 model, and the transcriptions are written next to the labels.
label_columns = ["ID", "TEXT", 'INTENT', "INTENT_ID"]

for idx in range(5):
    print('processing version {}'.format(idx))
    noisy_samples = []
    # reading all the audio samples
    for jdx in range(atis_samples_count):
        audio, rate = librosa.load(
            "../../data/ATIS/raw/test/ASR_voice_samples/voice/{}.aiff".format(jdx),
            sr=16000,
        )
        audio = torch.from_numpy(audio)

        # The augmentation is fed a 3-D tensor (presumably batch x channels x
        # samples -- confirm); add two singleton dims and strip them afterwards.
        new_audio = audio.unsqueeze(0).unsqueeze(0)
        perturbed_audio_samples = apply_augmentations(new_audio, sample_rate=16000)
        perturbed_audio_samples = perturbed_audio_samples.squeeze(0).squeeze(0)

        # Inference only: without no_grad every forward pass would build an
        # autograd graph, wasting memory over thousands of utterances.
        with torch.no_grad():
            input_values = tokenizer(perturbed_audio_samples, return_tensors="pt").input_values
            logits = model(input_values).logits

        prediction = torch.argmax(logits, dim=-1)
        transcription = tokenizer.batch_decode(prediction)[0]
        noisy_samples.append(transcription)

    # assign() returns a new frame, so the TEXT column is (re)written without
    # mutating a slice of another frame (no SettingWithCopyWarning).
    original_label = original_label.assign(TEXT=noisy_samples)[label_columns]
    original_label.to_csv(
        '../../data/ATIS/experiments/BG_noise/test/test_{}.csv'.format(idx),
        sep='\t',
        index=False,
    )

processing version 0
processing version 1
processing version 2


In [None]:
# Generate five noisy versions of the SN test set, mirroring the ATIS cell:
# load each utterance, mix in background noise, transcribe it, and store the
# transcriptions next to the SN labels.
#
# The previous version of this cell loaded the SN labels but still read the
# ATIS audio (with the ATIS sample count) and wrote into the ATIS output
# folder, overwriting the ATIS results.
# NOTE(review): the SN audio and output locations below are inferred by
# mirroring the ATIS directory layout -- confirm they match the data on disk.
original_data = pd.read_csv('../../data/SN/experiments/clean/test/test.tsv', sep='\t')
# .copy() gives an independent frame (avoids SettingWithCopyWarning later).
original_label = original_data[['ID', 'INTENT', 'INTENT_ID']].copy()

# One audio file is expected per label row; deriving the count from the frame
# guarantees the TEXT column has the same length as the labels.
sn_samples_count = len(original_label)
label_columns = ["ID", "TEXT", 'INTENT', "INTENT_ID"]

for idx in range(5):
    print('processing version {}'.format(idx))
    noisy_samples = []
    # reading all the audio samples
    for jdx in range(sn_samples_count):
        audio, rate = librosa.load(
            "../../data/SN/raw/test/ASR_voice_samples/voice/{}.aiff".format(jdx),
            sr=16000,
        )
        audio = torch.from_numpy(audio)

        # The augmentation is fed a 3-D tensor (presumably batch x channels x
        # samples -- confirm); add two singleton dims and strip them afterwards.
        new_audio = audio.unsqueeze(0).unsqueeze(0)
        perturbed_audio_samples = apply_augmentations(new_audio, sample_rate=16000)
        perturbed_audio_samples = perturbed_audio_samples.squeeze(0).squeeze(0)

        # Inference only: skip building an autograd graph for every utterance.
        with torch.no_grad():
            input_values = tokenizer(perturbed_audio_samples, return_tensors="pt").input_values
            logits = model(input_values).logits

        prediction = torch.argmax(logits, dim=-1)
        transcription = tokenizer.batch_decode(prediction)[0]
        noisy_samples.append(transcription)

    original_label = original_label.assign(TEXT=noisy_samples)[label_columns]
    # Same on-disk format as the ATIS cell: tab-separated, no index column.
    original_label.to_csv(
        '../../data/SN/experiments/BG_noise/test/test_{}.csv'.format(idx),
        sep='\t',
        index=False,
    )

# Selecting noise samples from the ESC-50 dump

In [None]:
# Read the ESC-50 metadata table and preview its first rows.
# NOTE(review): this path starts with '../data' while the cells above use
# '../../data' -- confirm which prefix is right for this notebook's location.
esc50_meta_path = '../data/noise_resources/ESC-50/meta/esc50.csv'
meta = pd.read_csv(esc50_meta_path)
meta.head()

In [None]:
# Keep only the clip file name and its numeric class target, then preview.
wanted_columns = ['filename', 'target']
file_target = meta.loc[:, wanted_columns]
file_target.head()

In [None]:
# ESC-50 target ids whose clips are used as background noise. Only class 10 is
# active (presumably rain, given the 'rain/' destination folder -- confirm).
# Previously listed candidates: 5, 10, 16, 20, 26, 35, 36, 42, 43.
selected_class_label = [10]

In [None]:
# Filter the metadata down to the chosen classes and collect their file names.
is_selected = file_target['target'].isin(selected_class_label)
selected_files = file_target[is_selected]
selected_filename = selected_files['filename'].tolist()

In [None]:
# Source folder holding the ESC-50 audio clips and the destination folder
# for the selected background-noise clips.
# NOTE(review): both paths start with '../data' while the augmentation above
# reads noise from '../../data/noise_resources/background_noise/' -- confirm
# which prefix is right for this notebook's location.
src_path = '../data/noise_resources/ESC-50/audio/'
dest_path = '../data/noise_resources/background_noise/rain/'

In [None]:
# Copy every selected ESC-50 clip into the background-noise folder.
# shutil.copy does not create missing directories, so make sure the
# destination exists first (no-op when it is already there).
os.makedirs(dest_path, exist_ok=True)
for files in selected_filename:
    shutil.copy(os.path.join(src_path, files), os.path.join(dest_path, files))