In [None]:
!pip install nlpaug
!pip install transformers
!pip install torch torchvision torchaudio


Collecting nlpaug
  Downloading nlpaug-1.1.11-py3-none-any.whl.metadata (14 kB)
Downloading nlpaug-1.1.11-py3-none-any.whl (410 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m410.5/410.5 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: nlpaug
Successfully installed nlpaug-1.1.11
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)


In [None]:
import pandas as pd
import nlpaug.augmenter.word as naw
from collections import Counter


In [None]:
# Read the transcription table from disk. Later cells rely on the
# `filename`, `transcription` and `label` columns being present.
csv_path = "/content/transcribed.csv"  # replace with your filename
df = pd.read_csv(csv_path)

# Preview the first rows as a sanity check.
df.head()

Unnamed: 0,filename,transcription,label
0,audio_90.wav,A situation in which there is a lot of compet...,3.0
1,audio_581.wav,"All right, I would say my goal, which I would...",3.0
2,audio_77.wav,All right. A crowded market is where lots of ...,2.5
3,audio_817.wav,My favorite hobby is playing cricket. I like ...,2.0
4,audio_694.wav,It is where a lot of money has come in to sto...,5.0


In [None]:
# Number of rows per label, ordered by label value rather than by frequency,
# to show how imbalanced the classes are before augmentation.
label_distribution = df['label'].value_counts().sort_index()
print(label_distribution)


label
1.0      1
1.5      3
2.0     70
2.5     40
3.0     87
3.5     23
4.0     52
4.5     58
5.0    110
Name: count, dtype: int64


In [None]:
import nlpaug.augmenter.word as naw

# Build the contextual word-substitution augmenter backed by
# `bert-base-uncased`.
# The device is detected at runtime: the GPU is used when PyTorch can see
# one, otherwise the model runs on the CPU (previously hard-coded to 'cpu').
import torch  # imported here because it is only needed for device detection

aug_device = 'cuda' if torch.cuda.is_available() else 'cpu'

aug = naw.ContextualWordEmbsAug(
    model_path='bert-base-uncased',
    action="substitute",
    device=aug_device,
)


In [None]:
# Drop rows with missing transcription or label.
# `.copy()` makes the filtered frame an independent object, so the column
# assignment below does not raise pandas' SettingWithCopyWarning (dropna can
# return an object that pandas still tracks as derived from the original).
df = df.dropna(subset=['transcription', 'label']).copy()

# Convert transcription column to string type so every value handed to the
# augmenter is a plain `str`.
df['transcription'] = df['transcription'].astype(str)


In [None]:
from collections import defaultdict

# Oversample under-represented labels by generating augmented transcriptions
# until each label has `target_count` rows (original + augmented).
#
# Reads:  `df` (columns `filename`, `transcription`, `label`) and `aug`.
# Writes: `augmented_rows`, a list of dicts with the same three keys, ready
#         to be turned into a DataFrame.
target_count = 200  # Desired samples per label
augmented_rows = []

label_counts = df['label'].value_counts()

for label, count in label_counts.items():
    if count >= target_count:
        continue  # label already has enough samples

    needed = target_count - count
    # Augmentations attempted per source row; the "+ 1" leaves slack so the
    # quota can still be reached when some augmentation calls fail.
    reps = (needed // count) + 1

    subset = df[df['label'] == label]

    # Augmented rows produced so far for this label. Doubles as the suffix
    # that keeps generated filenames unique within the label.
    generated = 0
    for _, row in subset.iterrows():
        # Stop visiting source rows once the quota is met. Breaking only the
        # inner loop (as before) let every remaining row add one more sample
        # and overshoot `target_count`.
        if generated >= needed:
            break

        for _attempt in range(reps):
            if generated >= needed:
                break
            try:
                aug_text = aug.augment(row['transcription'])
                # Newer nlpaug releases return a list of strings even for a
                # single input; older ones return the string itself. Unwrap
                # so the column always holds plain text.
                if isinstance(aug_text, (list, tuple)):
                    aug_text = aug_text[0]
                new_filename = f"{row['filename'].split('.')[0]}_aug{generated}.wav"
            except Exception as e:
                # Best effort: report the failure and move on to the next
                # attempt instead of aborting the whole run.
                print(f"Augmentation error: {e}")
                continue

            augmented_rows.append({
                'filename': new_filename,
                'transcription': aug_text,
                'label': row['label']
            })
            generated += 1


In [None]:
# Combine the original rows with the augmented ones and shuffle them.
# A fixed seed makes the shuffled order reproducible across re-runs.
SHUFFLE_SEED = 42

aug_df = pd.DataFrame(augmented_rows)
df_final = (
    pd.concat([df, aug_df], ignore_index=True)
    .sample(frac=1, random_state=SHUFFLE_SEED)
    .reset_index(drop=True)
)

# Per-label counts after augmentation, to verify the rebalancing.
print(df_final['label'].value_counts())


label
2.0    138
4.5    116
5.0    109
4.0    104
2.5     98
3.0     87
3.5     82
1.0     80
1.5     80
Name: count, dtype: int64
