In [8]:
# Mount Google Drive into the Colab filesystem at /content/drive; every data
# path used in this notebook lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import numpy as np
import pandas as pd

In [3]:
# Notebook-wide configuration constants.
# NOTE(review): CSV_PATH sits under "captch_dataset" while the media folders sit
# under "captchaDatabase" — confirm both folder names are intentional.
SAMPLE_RATE = 16000  # presumably the audio sampling rate in Hz — TODO confirm; not referenced in the cells shown here
IMAGE_DIR = "/content/drive/MyDrive/captchaDatabase/captchas/images"  # not referenced in the cells shown here
AUDIO_DIR = "/content/drive/MyDrive/captchaDatabase/captchas/audio"  # directory scanned for .wav files
CSV_PATH = "/content/drive/MyDrive/captch_dataset/extracted_image_captcha_data.csv"  # table loaded with pd.read_csv

In [4]:
import os
from sklearn.model_selection import train_test_split

# Load the metadata table.
df = pd.read_csv(CSV_PATH)

# Collect the .wav file names in lexicographic order and require one per CSV row.
# NOTE(review): rows are paired with audio purely by this sorted position —
# confirm the file names sort in the same order as the CSV rows.
audio_files = sorted(name for name in os.listdir(AUDIO_DIR) if name.endswith('.wav'))
assert len(df) == len(audio_files), "Mismatch in image and audio file counts"

# Store the full path of the paired audio file next to each row.
df['audio_file'] = [os.path.join(AUDIO_DIR, name) for name in audio_files]

# Ordered 80/20 split (no shuffling): the first 80% of rows go to train, the
# rest to test. NOTE(review): per the scikit-learn docs random_state only
# controls shuffling, so it presumably has no effect here — confirm.
train_df, test_df = train_test_split(
    df,
    train_size=0.8,
    random_state=42,
    shuffle=False,
)

In [5]:
# Load the stored waveform arrays for the train and test splits.
# NOTE(review): allow_pickle=True makes np.load unpickle Python objects, which
# can execute arbitrary code embedded in the file — only load .npy files that
# come from a trusted source.
train_waveforms = np.load("/content/drive/MyDrive/captch_dataset/train_waveforms.npy", allow_pickle=True)
test_waveforms = np.load("/content/drive/MyDrive/captch_dataset/test_waveforms.npy", allow_pickle=True)

In [None]:
# Attach the loaded waveforms to each split as a new "preprocessed_audio" column.
# Assumes the arrays hold one entry per row, in the same row order as
# train_df / test_df — TODO confirm against the code that produced the .npy files.
train_df["preprocessed_audio"] = train_waveforms
test_df["preprocessed_audio"] = test_waveforms

In [7]:
# Keep only the waveform column and the image_text column.
# The explicit .copy() makes each result an independent frame, so later
# assignments into these frames act on their own data and do not raise
# pandas' SettingWithCopyWarning.
train_df = train_df[["preprocessed_audio", "image_text"]].copy()
test_df = test_df[["preprocessed_audio", "image_text"]].copy()

In [9]:
import numpy as np

def convert_to_float32(row):
    """Round-trip ``row`` through a float32 NumPy array and return a Python list.

    Args:
        row: Any array-like accepted by ``np.array`` (list, ndarray, nested
            sequence of numbers).

    Returns:
        The result of ``ndarray.tolist()`` on the float32 array: a (possibly
        nested) list of Python floats whose values carry float32 precision.
    """
    as_float32 = np.array(row, dtype=np.float32)
    return as_float32.tolist()

# Convert every stored waveform to a float32-rounded Python list.
# Series.map applies the converter element-wise, replacing the row-by-row
# iterrows()/.at loop; assign() returns a new frame instead of writing into the
# existing one cell by cell, and does not rely on unique index labels.
train_df = train_df.assign(
    preprocessed_audio=train_df["preprocessed_audio"].map(convert_to_float32)
)
test_df = test_df.assign(
    preprocessed_audio=test_df["preprocessed_audio"].map(convert_to_float32)
)


In [10]:
# Only the last expression of a cell is rendered automatically, so the preview
# needs an explicit display() call (provided by the IPython/Colab kernel);
# otherwise head(4) is computed and silently discarded.
display(train_df.head(4))
len(train_df)

8000

In [11]:
import math
import pickle
import gzip
import gc

def save_dataframe_in_chunks(df, base_filename, chunk_size=1000, offset=0):
    """Pickle ``df`` to disk as consecutive gzip-compressed row chunks.

    Chunk ``i`` holds rows ``[i * chunk_size, (i + 1) * chunk_size)`` of ``df``
    and is written to ``f"{base_filename}_{i + offset}.pkl.gz"``. An empty
    ``df`` writes no files.

    Args:
        df: DataFrame to export.
        base_filename: Path prefix for the chunk files (no extension).
        chunk_size: Maximum number of rows per chunk; must be positive.
        offset: Number added to every chunk index in the file name and in the
            progress message. Use it to resume an interrupted export: pass the
            not-yet-saved tail of the frame together with the number of chunks
            already on disk. Defaults to 0, which reproduces the plain export.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``offset`` is negative.
    """
    # Fail loudly: chunk_size == 0 would otherwise surface as a bare
    # ZeroDivisionError and a negative value would silently write nothing.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")
    if offset < 0:
        raise ValueError(f"offset must be non-negative, got {offset!r}")

    n_chunks = math.ceil(len(df) / chunk_size)
    total_chunks = n_chunks + offset
    for i in range(n_chunks):
        chunk = df.iloc[i * chunk_size : (i + 1) * chunk_size]
        chunk_filename = f"{base_filename}_{i + offset}.pkl.gz"
        with gzip.open(chunk_filename, "wb") as f:
            pickle.dump(chunk, f, protocol=pickle.HIGHEST_PROTOCOL)
        print(f"Saved chunk {i + offset + 1}/{total_chunks} to {chunk_filename}")

        # Drop the reference to the slice and force a collection before the
        # next chunk is built, to keep peak memory down.
        del chunk
        gc.collect()

In [None]:

# Export train_df to Drive in 1000-row chunks named train_df_chunk_<i>.pkl.gz.
# NOTE(review): the output recorded under this cell stops after chunk 5/8, so
# this run presumably did not finish — confirm chunks 5-7 exist before use.
save_dataframe_in_chunks(train_df, "/content/drive/MyDrive/captch_dataset/train_df_chunk", chunk_size=1000)

Saved chunk 1/8 to /content/drive/MyDrive/captch_dataset/train_df_chunk_0.pkl.gz
Saved chunk 2/8 to /content/drive/MyDrive/captch_dataset/train_df_chunk_1.pkl.gz
Saved chunk 3/8 to /content/drive/MyDrive/captch_dataset/train_df_chunk_2.pkl.gz
Saved chunk 4/8 to /content/drive/MyDrive/captch_dataset/train_df_chunk_3.pkl.gz
Saved chunk 5/8 to /content/drive/MyDrive/captch_dataset/train_df_chunk_4.pkl.gz


In [18]:
import math, pickle, gzip, gc

def save_dataframe_in_chunks_with_offset(df, base_filename, chunk_size=1000, offset=0):
    """Pickle ``df`` as gzip-compressed row chunks, numbering files from ``offset``.

    Chunk ``i`` holds rows ``[i * chunk_size, (i + 1) * chunk_size)`` of ``df``
    and is written to ``f"{base_filename}_{i + offset}.pkl.gz"``, so passing the
    unsaved tail of a frame plus the number of chunks already on disk continues
    an earlier export. An empty ``df`` writes no files.

    Args:
        df: DataFrame (typically the remaining rows) to export.
        base_filename: Path prefix for the chunk files (no extension).
        chunk_size: Maximum number of rows per chunk; must be positive.
        offset: Number added to each chunk index in the file name and message.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    # Fail loudly: chunk_size == 0 would otherwise surface as a bare
    # ZeroDivisionError and a negative value would silently write nothing.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    n_chunks = math.ceil(len(df) / chunk_size)
    for i in range(n_chunks):
        chunk = df.iloc[i * chunk_size : (i + 1) * chunk_size]
        chunk_filename = f"{base_filename}_{i + offset}.pkl.gz"
        with gzip.open(chunk_filename, "wb") as f:
            pickle.dump(chunk, f, protocol=pickle.HIGHEST_PROTOCOL)
        print(f"Saved chunk {i + offset + 1} to {chunk_filename}")
        # Release the slice before building the next one to limit peak memory.
        del chunk
        gc.collect()

In [None]:
# Continue the train export from the first chunk index that is not yet saved:
# skip the rows covered by chunks 0-4 and number the new files from 5 upward.
rows_per_chunk = 1000
first_unsaved_chunk = 5
remaining_df = train_df.iloc[first_unsaved_chunk * rows_per_chunk:]
save_dataframe_in_chunks_with_offset(
    remaining_df,
    "/content/drive/MyDrive/captch_dataset/train_df_chunk",
    chunk_size=rows_per_chunk,
    offset=first_unsaved_chunk,
)

In [13]:
# Export test_df to Drive in 1000-row chunks named test_df_chunk_<i>.pkl.gz.
save_dataframe_in_chunks(test_df, "/content/drive/MyDrive/captch_dataset/test_df_chunk", chunk_size=1000)

Saved chunk 1/2 to /content/drive/MyDrive/captch_dataset/test_df_chunk_0.pkl.gz
Saved chunk 2/2 to /content/drive/MyDrive/captch_dataset/test_df_chunk_1.pkl.gz


In [19]:
# Re-export the test rows from position 1000 onward, numbering files from 1.
# NOTE(review): the recorded outputs show test_df_chunk_1.pkl.gz was already
# written by the full test export, so this cell rewrites that same file —
# confirm it is still needed.
rows_per_chunk = 1000
first_chunk_to_write = 1
remaining_df = test_df.iloc[first_chunk_to_write * rows_per_chunk:]

save_dataframe_in_chunks_with_offset(
    remaining_df,
    "/content/drive/MyDrive/captch_dataset/test_df_chunk",
    chunk_size=rows_per_chunk,
    offset=first_chunk_to_write,
)

Saved chunk 2 to /content/drive/MyDrive/captch_dataset/test_df_chunk_1.pkl.gz
