In [19]:
import soundfile as sf
import audiomentations as A
from pathlib import Path

def augment_audio(input_path) -> Path:
    """Run a fixed augmentation chain over an audio file and save the result.

    Every transform in the chain is constructed with ``p=1``. In order:
    Gaussian noise, clipping distortion, gain, peaking filter and MP3
    compression. No random seed is set here, so the output is not expected
    to be reproducible between calls.

    Args:
        input_path: Path (``str`` or ``pathlib.Path``) of the audio file to
            read with ``soundfile``.

    Returns:
        The path of the written file: ``<stem>-aug.wav`` placed next to
        ``input_path``.
    """
    # Define augmentations
    augmentations = A.Compose([
        A.AddGaussianNoise(max_amplitude=0.01, p=1),
        A.ClippingDistortion(p=1),
        A.Gain(p=1),
        A.PeakingFilter(p=1),
        A.Mp3Compression(p=1),
    ])

    # Load audio file as float32, the dtype audiomentations operates on
    # (a float64 array would be down-cast by the library with a warning).
    audio, sr = sf.read(input_path, dtype="float32")

    # soundfile lays multichannel audio out as (frames, channels) while
    # audiomentations expects (channels, samples); swap the axes on the way
    # in and again on the way out. Mono audio is 1-D and needs no change.
    is_multichannel = audio.ndim == 2
    if is_multichannel:
        audio = audio.T.copy()  # copy() -> contiguous array for the transforms

    # Apply augmentations
    augmented_audio = augmentations(samples=audio, sample_rate=sr)

    if is_multichannel:
        augmented_audio = augmented_audio.T

    # Save augmented audio next to the source file
    source = Path(input_path)
    output_path = source.with_name(f"{source.stem}-aug.wav")
    sf.write(output_path, augmented_audio, samplerate=sr)

    return output_path

# Example usage: augment one file from the TIMIT directory tree.
# `input_path` and `output_path` are module-level names that the
# transcription cells further down read, so keep them bound here.
input_path = "../TIMIT/TRAIN/DR1/FCJF0/SA1.WAV"
output_path = augment_audio(input_path)
print(f"Augmented audio saved to {output_path}")


Augmented audio saved to ../TIMIT/TRAIN/DR1/FCJF0/SA1-aug.wav




In [29]:
import whisper

# Load the Whisper 'large-v3' model once; the transcription cells below reuse `model`.
model = whisper.load_model('large-v3')

In [30]:
# Transcribe the augmented file written by augment_audio.
# output_path is a pathlib.Path, hence the str() conversion.
result = model.transcribe(str(output_path))
result  # bare expression so the notebook displays the returned dict

{'text': ' He has the best suit and he can walk around all year.',
 'segments': [{'id': 0,
   'seek': 0,
   'start': 0.0,
   'end': 2.92,
   'text': ' He has the best suit and he can walk around all year.',
   'tokens': [50365,
    634,
    575,
    264,
    1151,
    5722,
    293,
    415,
    393,
    1792,
    926,
    439,
    1064,
    13,
    50511],
   'temperature': 0.0,
   'avg_logprob': -0.44004854559898376,
   'compression_ratio': 0.9137931034482759,
   'no_speech_prob': 0.035187095403671265}],
 'language': 'en'}

In [31]:
# Transcribe the original (un-augmented) file for comparison with the augmented run.
# NOTE(review): this rebinds `result`, so the augmented transcription is no longer
# available afterwards — consider distinct names if both are needed later.
result = model.transcribe(str(input_path))
result  # bare expression so the notebook displays the returned dict

{'text': ' She had your duck suit and greasy wash water all year.',
 'segments': [{'id': 0,
   'seek': 0,
   'start': 0.0,
   'end': 2.7800000000000002,
   'text': ' She had your duck suit and greasy wash water all year.',
   'tokens': [50365,
    1240,
    632,
    428,
    12482,
    5722,
    293,
    36401,
    5675,
    1281,
    439,
    1064,
    13,
    50504],
   'temperature': 0.0,
   'avg_logprob': -0.2915722211201986,
   'compression_ratio': 0.9,
   'no_speech_prob': 0.008044268004596233}],
 'language': 'en'}