In [None]:
# Mount Google Drive (Colab only) so the /content/drive/MyDrive paths used by
# the later cells resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
requirements_path = '/content/drive/MyDrive/requirements.txt'

# One requirement per entry; the entries are joined with newlines when written.
requirements = [
    'librosa',
    'SpeechRecognition',
    'jiwer',
    'matplotlib',
    'gtts',
    'torch',
    'torchvision',
    'torchaudio',
    # NOTE(review): the install log shows this resolving to a ~5 kB PyPI sdist
    # (ffmpeg-1.4), not the FFmpeg binary - confirm it is actually needed.
    'ffmpeg',
    'openai-whisper',
    # NOTE(review): `tensor` is not imported by any visible cell - confirm it
    # is the intended package.
    'tensor',
    'noisereduce',
    'scikit-learn',
    'transformers',
    'datasets',
    'evaluate',
    # Imported directly by later cells but missing from the original list.
    'numpy',
    'pandas',
    'soundfile',
    'joblib',
]

with open(requirements_path, 'w') as file:
    # End with a newline so the last requirement is a complete line.
    file.write('\n'.join(requirements) + '\n')

In [None]:
!pip install -r /content/drive/MyDrive/requirements.txt

Collecting SpeechRecognition (from -r /content/drive/MyDrive/requirements.txt (line 2))
  Downloading SpeechRecognition-3.14.1-py3-none-any.whl.metadata (31 kB)
Collecting jiwer (from -r /content/drive/MyDrive/requirements.txt (line 3))
  Downloading jiwer-3.1.0-py3-none-any.whl.metadata (2.6 kB)
Collecting gtts (from -r /content/drive/MyDrive/requirements.txt (line 5))
  Downloading gTTS-2.5.4-py3-none-any.whl.metadata (4.1 kB)
Collecting ffmpeg (from -r /content/drive/MyDrive/requirements.txt (line 9))
  Downloading ffmpeg-1.4.tar.gz (5.1 kB)
  Preparing metadata (setup.py) ... done
Collecting openai-whisper (from -r /content/drive/MyDrive/requirements.txt (line 10))
  Downloading openai-whisper-20240930.tar.gz (800 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 800.5/800.5 kB 14.8 MB/s eta 0:00:00
  Installing build dependencies ... done
  Getting requirements to build wheel ... done
  Preparing met

In [None]:
import os
import librosa
import numpy as np
import noisereduce as nr
import pandas as pd
from sklearn.model_selection import train_test_split
import soundfile as sf
from datasets import Dataset
import evaluate
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

In [None]:
# Target sampling rate in Hz; clips recorded at another rate are resampled to
# this value during preprocessing.
SAMPLE_RATE = 16000
# Google Drive folders holding the CAPTCHA images and their audio clips.
IMAGE_DIR = "/content/drive/MyDrive/captchaDatabase/captchas/images"
AUDIO_DIR = "/content/drive/MyDrive/captchaDatabase/captchas/audio"
# CSV with one row per CAPTCHA (columns include image_file and image_text).
# NOTE(review): this path uses "captch_dataset" while the media folders use
# "captchaDatabase" - confirm both spellings are intentional.
CSV_PATH = "/content/drive/MyDrive/captch_dataset/extracted_image_captcha_data.csv"

In [None]:
# The CSV was saved together with its row index, which pandas reads back as a
# redundant "Unnamed: 0" column. Drop it (if present) so only the real data
# columns remain and the default RangeIndex is kept.
df = pd.read_csv(CSV_PATH).drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
# Preview the first five rows (five is the default for head()).
df.head()

Unnamed: 0,image_file,image_text
0,captcha_9023.png,SfPEys
1,captcha_9024.png,mTiB49
2,captcha_9025.png,cFSrnk
3,captcha_9026.png,t52ejf
4,captcha_9027.png,X4AJ70


In [None]:
# Collect every .wav clip in AUDIO_DIR, ordered lexicographically by file name.
audio_files = sorted(f for f in os.listdir(AUDIO_DIR) if f.endswith('.wav'))

# Rows and clips are paired purely by position, so the counts must agree.
# NOTE(review): sorted() compares names as strings (e.g. "captcha_10000.wav"
# sorts before "captcha_9023.wav"); verify that this order matches the CSV row
# order, or pair by file stem instead.
assert len(df) == len(audio_files), (
    f"Row/clip count mismatch: {len(df)} CSV rows vs "
    f"{len(audio_files)} .wav files in {AUDIO_DIR}"
)
df['audio_file'] = [os.path.join(AUDIO_DIR, f) for f in audio_files]

In [None]:
# Ordered 80/20 split: with shuffle=False the first 80% of rows become the
# training set and the remainder the test set (random_state is then unused).
train_df, test_df = train_test_split(
    df,
    train_size=0.8,
    shuffle=False,
    random_state=42,
)

In [None]:
import torchaudio
from joblib import Parallel, delayed

SAMPLE_RATE = 16000

def preprocess_audio(audio_file, apply_augmentation=True, seed=None):
    """Load one audio clip and return a cleaned, peak-normalised mono waveform.

    Pipeline: load -> resample to SAMPLE_RATE -> mix down to mono -> noise
    reduction -> trim quiet edges -> peak-normalise -> optionally apply one
    randomly chosen augmentation.

    Args:
        audio_file: Path of the audio file, passed to ``torchaudio.load``.
        apply_augmentation: If True, apply exactly one of additive Gaussian
            noise, time stretching, or pitch shifting, chosen at random.
        seed: Optional integer seed for the augmentation randomness. With the
            default ``None`` the global ``np.random`` state is used. Pass a
            different seed per file to get reproducible yet varied output.

    Returns:
        A 1-D NumPy array with the processed waveform at SAMPLE_RATE.
    """
    # RandomState exposes the same choice/normal/uniform/randint API as the
    # np.random module, so both can be used interchangeably below.
    rng = np.random if seed is None else np.random.RandomState(seed)

    waveform, sample_rate = torchaudio.load(audio_file)

    if sample_rate != SAMPLE_RATE:
        waveform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=SAMPLE_RATE)(waveform)

    # torchaudio returns (channels, frames). Average the channels rather than
    # flattening: flattening a multi-channel clip would concatenate the
    # channels end-to-end. Single-channel input is left unchanged.
    waveform = waveform.numpy().mean(axis=0)

    # The first 0.5 s of the clip is used as the noise profile.
    # NOTE(review): assumes each clip opens with non-speech audio - confirm.
    noise_estimate = waveform[:int(0.5 * SAMPLE_RATE)]
    waveform = nr.reduce_noise(y=waveform, sr=SAMPLE_RATE, y_noise=noise_estimate)

    # Trim quiet edges with a threshold that scales with the peak, capped at
    # 30 dB. A silent or non-positive-peak clip gives top_db <= 0, which leaves
    # no frame above the threshold; trimming is skipped then so the
    # normalisation below never receives an empty array.
    top_db = min(30, np.max(waveform) * 100)
    if top_db > 0:
        trimmed, _ = librosa.effects.trim(waveform, top_db=top_db)
        if trimmed.size:
            waveform = trimmed

    # Peak-normalise; the epsilon avoids division by zero on silent clips.
    waveform = waveform / (np.max(np.abs(waveform)) + 1e-7)

    if apply_augmentation:
        aug_type = rng.choice(['noise', 'stretch', 'pitch'])
        if aug_type == 'noise':
            waveform += rng.normal(0, 0.005, waveform.shape)
        elif aug_type == 'stretch':
            waveform = librosa.effects.time_stretch(waveform.astype(np.float32), rate=rng.uniform(0.9, 1.1))
        elif aug_type == 'pitch':
            # randint's upper bound is exclusive: 3 gives the symmetric range
            # -2..+2 semitones (the previous bound of 2 never produced +2).
            waveform = librosa.effects.pitch_shift(waveform.astype(np.float32), sr=SAMPLE_RATE, n_steps=rng.randint(-2, 3))

    return waveform

In [None]:
# Preprocess every clip across all available CPU cores (n_jobs=-1).
# Training clips are augmented; test clips are only cleaned.
train_jobs = (delayed(preprocess_audio)(path, True) for path in train_df['audio_file'])
train_waveforms = Parallel(n_jobs=-1)(train_jobs)

test_jobs = (delayed(preprocess_audio)(path, False) for path in test_df['audio_file'])
test_waveforms = Parallel(n_jobs=-1)(test_jobs)

In [None]:
import numpy as np

def _as_object_array(waveforms):
    """Pack a list of waveforms into a 1-D object array, one waveform per slot.

    ``np.array(waveforms, dtype=object)`` yields a 2-D object array whenever
    all waveforms happen to share the same length; filling the slots one by
    one keeps the saved layout the same regardless of the lengths.
    """
    packed = np.empty(len(waveforms), dtype=object)
    for i, waveform in enumerate(waveforms):
        packed[i] = waveform
    return packed


# Object arrays are stored via pickle, so reading them back requires
# np.load(..., allow_pickle=True).
np.save("/content/drive/MyDrive/captch_dataset/train_waveforms.npy", _as_object_array(train_waveforms))
np.save("/content/drive/MyDrive/captch_dataset/test_waveforms.npy", _as_object_array(test_waveforms))