In [6]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
import os

for i in range(7):
    filename = f"/content/drive/MyDrive/captch_dataset/train_df_chunk_{i}.pkl.gz"
    if not os.path.exists(filename):
        print(f"File not found: {filename}")
    else:
        print(f"File exists: {filename}")


File exists: /content/drive/MyDrive/captch_dataset/train_df_chunk_0.pkl.gz
File exists: /content/drive/MyDrive/captch_dataset/train_df_chunk_1.pkl.gz
File exists: /content/drive/MyDrive/captch_dataset/train_df_chunk_2.pkl.gz
File exists: /content/drive/MyDrive/captch_dataset/train_df_chunk_3.pkl.gz
File exists: /content/drive/MyDrive/captch_dataset/train_df_chunk_4.pkl.gz
File exists: /content/drive/MyDrive/captch_dataset/train_df_chunk_5.pkl.gz
File exists: /content/drive/MyDrive/captch_dataset/train_df_chunk_6.pkl.gz


In [8]:
import pandas as pd
import gzip
import pickle

chunks = []

for i in range(7):
    filename = f"/content/drive/MyDrive/captch_dataset/train_df_chunk_{i}.pkl.gz"

    try:
        with gzip.open(filename, "rb") as f:
            chunk = pickle.load(f)
        chunks.append(chunk)
    except:
        continue

if chunks:
    train_df = pd.concat(chunks, ignore_index=True)
    print(f"Final DataFrame shape: {train_df.shape}")
else:
    print("No valid chunks were loaded.")


Final DataFrame shape: (5000, 2)


In [9]:
train_df.head(7)

Unnamed: 0,preprocessed_audio,image_text
0,"[0.0002045962610282004, -0.005526872351765633,...",SfPEys
1,"[0.011279435828328133, 0.001882393378764391, 0...",mTiB49
2,"[-0.0034900393802672625, -0.001152913086116314...",cFSrnk
3,"[0.0007839182508178055, -0.0003784925211220979...",t52ejf
4,"[0.0029379581101238728, -0.0030803787522017956...",X4AJ70
5,"[0.0015496743144467473, 0.0005532102659344673,...",v9hawP
6,"[-0.00043018278665840626, -5.317210525390692e-...",qglfqo


In [10]:
filename = "/content/drive/MyDrive/captch_dataset/test_df_chunk_0.pkl.gz"
with gzip.open(filename, "rb") as f:
  val_df = pickle.load(f)

In [11]:
val_df.shape

(1000, 2)

In [12]:
val_df.head(5)

Unnamed: 0,preprocessed_audio,image_text
8000,"[-0.00030997328576631844, -0.00016097107436507...",VFHTd3
8001,"[-0.0002416751958662644, -0.000104720878880470...",gevzk
8002,"[-0.0003026940394192934, 0.0002227450459031388...",TzRfoQ
8003,"[0.0043272837065160275, -0.004092774353921413,...",YjEJCb
8004,"[-0.000901086546946317, 0.0002593033423181623,...",SEWv20


In [13]:
filename = "/content/drive/MyDrive/captch_dataset/test_df_chunk_1.pkl.gz"
with gzip.open(filename, "rb") as f:
  test_df = pickle.load(f)

In [14]:
!pip install -r /content/drive/MyDrive/requirements.txt

Collecting SpeechRecognition (from -r /content/drive/MyDrive/requirements.txt (line 2))
  Downloading SpeechRecognition-3.14.1-py3-none-any.whl.metadata (31 kB)
Collecting jiwer (from -r /content/drive/MyDrive/requirements.txt (line 3))
  Downloading jiwer-3.1.0-py3-none-any.whl.metadata (2.6 kB)
Collecting gtts (from -r /content/drive/MyDrive/requirements.txt (line 5))
  Downloading gTTS-2.5.4-py3-none-any.whl.metadata (4.1 kB)
Collecting ffmpeg (from -r /content/drive/MyDrive/requirements.txt (line 9))
  Downloading ffmpeg-1.4.tar.gz (5.1 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting openai-whisper (from -r /content/drive/MyDrive/requirements.txt (line 10))
  Downloading openai-whisper-20240930.tar.gz (800 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m800.5/800.5 kB[0m [31m22.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing met

In [15]:
import os
import librosa
import numpy as np
import noisereduce as nr
import pandas as pd
from sklearn.model_selection import train_test_split
import soundfile as sf
from datasets import Dataset
import evaluate
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

In [17]:
train_df = train_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

In [18]:
from datasets import Dataset, Features, Value, Sequence

features = Features({
    "preprocessed_audio": Sequence(Value("float32")),
    "image_text": Value("string"),
})

train_dataset = Dataset.from_pandas(train_df, features=features)
eval_dataset = Dataset.from_pandas(val_df, features=features)
test_dataset = Dataset.from_pandas(test_df, features=features)

In [19]:
train_df_view = train_dataset.to_pandas()
print(train_df_view.head())

                                  preprocessed_audio image_text
0  [0.00020459626, -0.0055268724, -0.0014617653, ...     SfPEys
1  [0.011279436, 0.0018823934, 0.0075927735, -0.0...     mTiB49
2  [-0.0034900394, -0.0011529131, -0.0022350624, ...     cFSrnk
3  [0.00078391825, -0.00037849252, -0.001753441, ...     t52ejf
4  [0.002937958, -0.0030803788, -0.0034206489, 0....     X4AJ70


In [20]:
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/159 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/163 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.60k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/378M [00:00<?, ?B/s]

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
def prepare_dataset(batch):

    audio_inputs = processor.feature_extractor(
        batch["preprocessed_audio"],
        sampling_rate=16000,
        return_tensors="np",
        padding=True
    )
    batch["input_values"] = audio_inputs["input_values"]

    if "attention_mask" in audio_inputs:
        batch["attention_mask"] = audio_inputs["attention_mask"]
    else:
        batch["attention_mask"] = np.ones_like(audio_inputs["input_values"])

    texts = batch["image_text"]
    if texts is None:
        texts = [""]
    else:
        texts = [text if text is not None else "" for text in texts]

    labels = [processor.tokenizer.encode(text) for text in texts]
    batch["labels"] = labels
    return batch


In [22]:
train_dataset = train_dataset.map(prepare_dataset, remove_columns=train_dataset.column_names)
eval_dataset = eval_dataset.map(prepare_dataset, remove_columns=test_dataset.column_names)
test_dataset = test_dataset.map(prepare_dataset, remove_columns=test_dataset.column_names)


Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [23]:
def replace_padding_with_negone(batch):
    batch["labels"] = [
        [-100 if token == processor.tokenizer.pad_token_id else token for token in label]
        for label in batch["labels"]
    ]
    return batch

In [24]:
train_dataset = train_dataset.map(replace_padding_with_negone)
eval_dataset = eval_dataset.map(replace_padding_with_negone)
test_dataset = test_dataset.map(replace_padding_with_negone)

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [99]:
print(test_dataset[0])

{'input_values': [[0.0052760690450668335, 0.007894177921116352, 0.00529860844835639, 0.0017455621855333447, 0.007303947117179632, 0.008825849741697311, 0.010565494187176228, 0.008944972418248653, 0.00845482386648655, 0.01096970122307539, 0.01307472214102745, 0.013642343692481518, 0.012859311886131763, 0.012918015941977501, 0.014519832096993923, 0.015679575502872467, 0.015405334532260895, 0.016559625044465065, 0.016932683065533638, 0.01684693619608879, 0.017011981457471848, 0.017623236402869225, 0.019068391993641853, 0.018692370504140854, 0.018614524975419044, 0.01873549446463585, 0.019489828497171402, 0.019607985392212868, 0.019888684153556824, 0.020070932805538177, 0.019818197935819626, 0.020225470885634422, 0.020242739468812943, 0.020600033923983574, 0.020362697541713715, 0.020560171455144882, 0.020834296941757202, 0.02033860981464386, 0.020565848797559738, 0.020902492105960846, 0.021026384085416794, 0.020132651552557945, 0.020449843257665634, 0.02107151225209236, 0.02071308903396129

In [38]:
print(train_dataset[0])

{'input_values': [[0.011365738697350025, -0.049618903547525406, -0.006364875473082066, 0.007962258532643318, 0.03551606461405754, -0.026222221553325653, 0.06354667991399765, -0.019589385017752647, 0.04381011053919792, 0.03334162384271622, 0.08380161970853806, 0.06200313940644264, -0.03899252414703369, -0.001037313137203455, 0.12900960445404053, 0.013148248195648193, 0.016608651727437973, 0.10316601395606995, 0.07313451915979385, 0.05513639375567436, 0.009908941574394703, 0.04861927404999733, 0.037125736474990845, 0.027697252109646797, 0.011592185124754906, 0.02768830768764019, -0.017753107473254204, -0.0357624851167202, 0.06382443010807037, 0.0765814334154129, 0.02336842566728592, 0.005881049204617739, 0.06661846488714218, -0.0011297928867861629, 0.028236307203769684, -0.02673535794019699, 0.005499477963894606, 0.054255496710538864, 0.03882378712296486, 0.1348838061094284, -0.01883561909198761, 0.06867292523384094, -0.062458235770463943, -0.08883325010538101, -0.013900971040129662, 0.0

In [26]:
import evaluate
wer_metric = evaluate.load("wer")

def compute_metrics(pred):
    pred_logits = pred.predictions
    pred_ids = np.argmax(pred_logits, axis=-1)
    pred_str = processor.batch_decode(pred_ids)
    label_ids = pred.label_ids
    label_ids[label_ids == -100] = processor.tokenizer.pad_token_id
    label_str = processor.batch_decode(label_ids, group_tokens=False)
    wer = wer_metric.compute(predictions=pred_str, references=label_str)
    return {"wer": wer}


Downloading builder script:   0%|          | 0.00/4.49k [00:00<?, ?B/s]

In [None]:
processor.tokenizer.pad_token = processor.tokenizer.eos_token

In [50]:
print(processor.feature_extractor(train_dataset[0]['input_values']))

It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


{'input_values': [array([ 0.0113658 , -0.04961918, -0.00636492, ...,  0.03539024,
       -0.03493632, -0.05654331], dtype=float32)]}


In [None]:
import torch
import torch.nn.utils.rnn as rnn_utils
from dataclasses import dataclass
from typing import Any, Dict, List, Union
from transformers import Trainer, TrainingArguments, EarlyStoppingCallback

@dataclass
class CustomDataCollatorCTCWithPadding:
    processor: Any
    padding: Union[bool, str] = True

    def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:

        input_values = [feature["input_values"][0] for feature in features]

        padded_inputs = self.processor.feature_extractor.pad(
            {"input_values": input_values}, padding=self.padding, return_tensors="pt"
        )["input_values"]

        padded_labels = None
        if "labels" in features[0]:
            label_tensors = []
            for feature in features:
                flat_labels = [item for sublist in feature["labels"] for item in sublist]
                label_tensors.append(torch.tensor(flat_labels, dtype=torch.long))

            padded_labels = rnn_utils.pad_sequence(label_tensors, batch_first=True, padding_value=-100)

        batch = {"input_values": padded_inputs}
        if padded_labels is not None:
            batch["labels"] = padded_labels

        return batch

In [None]:
data_collator = CustomDataCollatorCTCWithPadding(processor=processor)

training_args = TrainingArguments(
    output_dir="./wav2vec2-finetuned-captcha",
    per_device_train_batch_size=4,
    eval_strategy="steps",
    eval_steps=500,
    save_steps=500,
    save_total_limit=2,
    num_train_epochs=10,
    fp16=True,
    learning_rate=1e-4,
    weight_decay=0.005,
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    logging_steps=100,
)

In [None]:
class CTCTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):

        labels = inputs.get("labels")
        target_lengths = (labels != -100).sum(dim=-1)

        outputs = model(**inputs)
        logits = outputs.logits
        log_probs = logits.log_softmax(dim=-1).transpose(0, 1)

        batch_size = logits.shape[0]
        input_length = log_probs.shape[0]
        input_lengths = torch.full(
            (batch_size,), input_length, dtype=torch.long, device=log_probs.device
        )

        labels_flat = labels[labels != -100]
        blank_token_id = getattr(model.config, "ctc_blank_token_id", self.tokenizer.pad_token_id)

        loss = torch.nn.functional.ctc_loss(
            log_probs,
            labels_flat,
            input_lengths,
            target_lengths,
            blank=blank_token_id,
            reduction="mean",
            zero_infinity=model.config.ctc_zero_infinity,
        )

        return (loss, outputs) if return_outputs else loss

In [None]:
trainer = CTCTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    tokenizer=processor.tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

In [85]:
trainer.train()

  trainer = CTCTrainer(
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


Step,Training Loss,Validation Loss,Wer
500,17.5824,2.525609,1.0
1000,6.3603,2.9169,1.0
1500,4.6701,2.833299,1.0
2000,2.8393,2.755729,1.0


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Tr

TrainOutput(global_step=2000, training_loss=18.770116455078124, metrics={'train_runtime': 1472.5133, 'train_samples_per_second': 33.956, 'train_steps_per_second': 8.489, 'total_flos': 5.399668544304499e+17, 'train_loss': 18.770116455078124, 'epoch': 1.6})

In [86]:
predictions = trainer.predict(test_dataset)

Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Tr

In [87]:
import os

save_directory = "./wav2vec2_model"
model.save_pretrained(save_directory)
processor.save_pretrained(save_directory)

print(f"Model and processor saved to {save_directory}")

Model and processor saved to ./wav2vec2_model


In [88]:
import torch

logits = predictions.predictions
predicted_ids = torch.argmax(torch.tensor(logits), dim=-1)

transcriptions = processor.batch_decode(predicted_ids)

In [92]:
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
import torch

processor = Wav2Vec2Processor.from_pretrained("/content/wav2vec2_model")
model = Wav2Vec2ForCTC.from_pretrained("/content/wav2vec2_model")

model.eval()

Wav2Vec2ForCTC(
  (wav2vec2): Wav2Vec2Model(
    (feature_extractor): Wav2Vec2FeatureEncoder(
      (conv_layers): ModuleList(
        (0): Wav2Vec2GroupNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
          (activation): GELUActivation()
          (layer_norm): GroupNorm(512, 512, eps=1e-05, affine=True)
        )
        (1-4): 4 x Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
        (5-6): 2 x Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
      )
    )
    (feature_projection): Wav2Vec2FeatureProjection(
      (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (projection): Linear(in_features=512, out_features=768, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder)

In [98]:
test_results = trainer.predict(test_dataset)
print(test_results.metrics)

Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Tr

{'test_loss': 2.5418710708618164, 'test_wer': 1.0, 'test_runtime': 100.2366, 'test_samples_per_second': 9.976, 'test_steps_per_second': 1.247}
