In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
os.environ["WANDB_DISABLED"] = "true"  # Disable wandb logs


In [2]:
!pip install -q transformers datasets accelerate


In [7]:
import pandas as pd

df = pd.read_csv('/kaggle/input/final-18k-dataset/cleaned_shuffled_merged_dataset.csv')  # change path if needed
print(df.head())



                     filename  \
0         audio_668_aug29.wav   
1        audio_478_aug677.wav   
2         audio_947_aug60.wav   
3  audio_942_aug46_aug943.wav   
4       audio_445_aug1209.wav   

                                       transcription  label  
0  there follow so many people in the lane like v...    2.5  
1  i will pass through your house. i will let him...    4.5  
2  from a fan created very huge in circular area ...    2.0  
3  a playground looks above us clear and neat as ...    1.5  
4  my family playground were not really big. it w...    3.0  


In [8]:
df["label"].unique()

array([2.5, 4.5, 2. , 1.5, 3. , 4. , 3.5, 5. , 1. ])

In [5]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Load CSV (update path as needed)


# Check and clean
df = df[['transcription', 'label']].dropna()

# Convert float labels to string to preserve classes like '2.5', '3.5', etc.
df['label'] = df['label'].astype(str)

# Encode labels (e.g., '1.0' -> 0, '1.5' -> 1, ..., '5.0' -> 8)
le = LabelEncoder()
df['label'] = le.fit_transform(df['label'])

# Preview
print(df.head())
print("Label classes:", list(le.classes_))  # You can store this for decoding later


                                       transcription  label
0  it are so in all estimated. presentat gasoline...      6
1  The Del mercad trotzdem впrago Chshh Ill leave...      8
2  i value botany, i continuously gain immense of...      0
3  my favorite place is to visit mecca and medina...      5
4  Im in a public market in the Philippines and a...      7
Label classes: ['1.0', '1.5', '2.0', '2.5', '3.0', '3.5', '4.0', '4.5', '5.0']


In [6]:
from datasets import Dataset

# Convert to Hugging Face Dataset
dataset = Dataset.from_pandas(df)

# Split into train/test
dataset = dataset.train_test_split(test_size=0.01, seed=42)

# Optional: inspect a sample
print(dataset['train'][0])


{'transcription': 'hobbies are work which a person does no interest gets much satisfaction and amusement. it is a kind of recreation, shatter from the searching rays of the sun, and also get fruit from it. there are many kinds of hobbies and we select one of them here per our will and mindset. some our hobby of collecting post - all, also collecting interesting book of stories, some pet birds but my hobby is gardening. i am much interested in gardening since my childhood. i like to see the flowers, velvary grass, different color flowers and beautiful plants. so i have selected long piece of land in my house and planted different kind of trees. i further planted red rose, yellow and black rose plants which have arranged them one after another.', 'label': 5, '__index_level_0__': 353}


In [8]:
from transformers import AutoTokenizer

# Load DeBERTa v3 Large tokenizer
checkpoint = "/kaggle/input/deberta-v3-large-base"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Tokenization function
def tokenize_fn(example):
    return tokenizer(example["transcription"], truncation=True, padding="max_length", max_length=512)

# Apply to dataset
tokenized_dataset = dataset.map(tokenize_fn, batched=True)


Map:   0%|          | 0/884 [00:00<?, ? examples/s]

Map:   0%|          | 0/9 [00:00<?, ? examples/s]

In [9]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=9
)


# dont use these cell for finetuning


In [8]:
model.save_pretrained("./deberta-v3-large-base")
tokenizer.save_pretrained("./deberta-v3-large-base")

('./deberta-v3-large-base/tokenizer_config.json',
 './deberta-v3-large-base/special_tokens_map.json',
 './deberta-v3-large-base/spm.model',
 './deberta-v3-large-base/added_tokens.json',
 './deberta-v3-large-base/tokenizer.json')

In [13]:
import os
import json

dataset_name = "deberta-v3-large-base"
kaggle_username = "avinash1tiwari"  # 🔁 Replace this manually!

# Metadata content
metadata = {
    "title": dataset_name,
    "id": f"{kaggle_username}/{dataset_name}",
    "licenses": [{"name": "CC0-1.0"}]
}

# Save metadata file
with open(os.path.join(dataset_name, "dataset-metadata.json"), "w") as f:
    json.dump(metadata, f)


In [15]:
!mkdir -p ~/.kaggle
with open("/kaggle/input/kaggle-api/kaggle.json", "r") as src:
    with open("/root/.kaggle/kaggle.json", "w") as dst:
        dst.write(src.read())
!chmod 600 /root/.kaggle/kaggle.json


In [16]:
!kaggle datasets create -p deberta-v3-large-base --dir-mode zip


Starting upload for file config.json
100%|██████████████████████████████████████| 1.25k/1.25k [00:00<00:00, 1.39kB/s]
Upload successful: config.json (1KB)
Starting upload for file added_tokens.json
100%|█████████████████████████████████████████| 23.0/23.0 [00:01<00:00, 13.8B/s]
Upload successful: added_tokens.json (23B)
Starting upload for file special_tokens_map.json
100%|████████████████████████████████████████████| 286/286 [00:00<00:00, 330B/s]
Upload successful: special_tokens_map.json (286B)
Starting upload for file tokenizer_config.json
100%|████████████████████████████████████████| 1.28k/1.28k [00:01<00:00, 779B/s]
Upload successful: tokenizer_config.json (1KB)
Starting upload for file tokenizer.json
100%|██████████████████████████████████████| 8.26M/8.26M [00:03<00:00, 2.71MB/s]
Upload successful: tokenizer.json (8MB)
Starting upload for file spm.model
100%|███████████████████████████████████████| 2.35M/2.35M [00:02<00:00, 833kB/s]
Upload successful: spm.model (2MB)
Starting up

# start finetuning code from here


In [10]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


In [11]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [12]:
import evaluate

accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)
    return accuracy.compute(predictions=predictions, references=labels)


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [13]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./deberta-v3-large-finetuned(700-5epochs)",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="steps",
    logging_steps=10,
    learning_rate=2e-5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=4,  # Simulates effective batch size of 4
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",  # or "f1", depending on what compute_metrics returns
    save_total_limit=1,
    push_to_hub=False,
    fp16=True  # Most important memory-saving flag
)


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [14]:
import torch
torch.cuda.empty_cache()

from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)




  trainer = Trainer(


In [17]:
df.shape

(893, 2)

In [15]:
trainer.train()


Epoch,Training Loss,Validation Loss,Accuracy
1,1.1909,1.613279,0.222222
2,0.9669,1.413299,0.222222
3,0.7976,1.427943,0.444444
4,0.3873,1.336622,0.444444
5,0.2104,1.775167,0.333333


TrainOutput(global_step=1105, training_loss=0.9040224822937634, metrics={'train_runtime': 1541.6539, 'train_samples_per_second': 2.867, 'train_steps_per_second': 0.717, 'total_flos': 4119261858017280.0, 'train_loss': 0.9040224822937634, 'epoch': 5.0})

In [22]:
model.save_pretrained("/kaggle/working/deberta-v3-large-finetuned-700-5epochs")
tokenizer.save_pretrained("/kaggle/working/deberta-v3-large-finetuned-700-5epochs")


('/kaggle/working/deberta-v3-large-finetuned-700-5epochs/tokenizer_config.json',
 '/kaggle/working/deberta-v3-large-finetuned-700-5epochs/special_tokens_map.json',
 '/kaggle/working/deberta-v3-large-finetuned-700-5epochs/spm.model',
 '/kaggle/working/deberta-v3-large-finetuned-700-5epochs/added_tokens.json',
 '/kaggle/working/deberta-v3-large-finetuned-700-5epochs/tokenizer.json')

# save the finetuned data just by running this cell


In [23]:
import os
from kaggle.api.kaggle_api_extended import KaggleApi
import json

# Set Kaggle credentials as environment variables
os.environ['KAGGLE_USERNAME'] = "avinash1tiwari"
os.environ['KAGGLE_KEY'] = "21a0b93c002942f2e649b292681fc4cf"

# Set the directory and dataset name (renaming folder to avoid parentheses)
model_dir = "/kaggle/working/deberta-v3-large-finetuned-700-5epochs"  # renamed folder name
dataset_name = "deberta-v3-large-finetuned-700-5epochs"  # renamed dataset name
dataset_id = f"avinash1tiwari/{dataset_name}"

# Create metadata file in the directory
metadata = {
    "title": dataset_name,
    "id": dataset_id,
    "licenses": [{"name": "CC0-1.0"}]
}

# Save the metadata to a JSON file
with open(os.path.join(model_dir, "dataset-metadata.json"), "w") as f:
    json.dump(metadata, f)

# Authenticate with Kaggle API
api = KaggleApi()
api.authenticate()

# Create a new dataset (this just creates the dataset on Kaggle)
api.dataset_create_new(folder=model_dir, public=False)

# Now upload the model directory with subdirectories (use kaggle CLI command)
# Run the shell command to upload the folder using Kaggle's CLI tool
!kaggle datasets create -p {model_dir} --dir-mode

print("✅ Model uploaded successfully to Kaggle Datasets.")


Starting upload for file added_tokens.json


100%|██████████| 23.0/23.0 [00:00<00:00, 26.7B/s]


Upload successful: added_tokens.json (23B)
Starting upload for file special_tokens_map.json


100%|██████████| 970/970 [00:00<00:00, 1.14kB/s]


Upload successful: special_tokens_map.json (970B)
Starting upload for file config.json


100%|██████████| 1.26k/1.26k [00:00<00:00, 1.42kB/s]


Upload successful: config.json (1KB)
Starting upload for file spm.model


100%|██████████| 2.35M/2.35M [00:02<00:00, 1.17MB/s]


Upload successful: spm.model (2MB)
Starting upload for file model.safetensors


100%|██████████| 1.62G/1.62G [01:04<00:00, 27.0MB/s] 


Upload successful: model.safetensors (2GB)
Starting upload for file tokenizer_config.json


100%|██████████| 1.47k/1.47k [00:00<00:00, 1.76kB/s]


Upload successful: tokenizer_config.json (1KB)
Starting upload for file tokenizer.json


100%|██████████| 8.25M/8.25M [00:02<00:00, 3.68MB/s]


Upload successful: tokenizer.json (8MB)
usage: kaggle datasets create [-h] [-p FOLDER] [-u] [-q] [-t] [-r {skip,zip,tar}]
kaggle datasets create: error: argument -r/--dir-mode: expected one argument
✅ Model uploaded successfully to Kaggle Datasets.


In [30]:
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f"Total Parameters: {total_params/1e6:.2f}M")
print(f"Trainable Parameters: {trainable_params/1e6:.2f}M")

Total Parameters: 435.07M
Trainable Parameters: 435.07M


In [None]:
df=pd.read_csv("/kaggle/input/shl-dataset/transcribed.csv")

In [None]:
import pandas as pd

print("🔹 First 10 rows of the dataset:")
print(df.head(10))

print("\n🔹 Dataset info:")
print(df.info())

print("\n🔹 Missing values per column:")
print(df.isnull().sum())

print("\n🔹 Label value counts (encoded):")
print(df['label'].value_counts().sort_index())

print("\n🔹 Unique labels (encoded):", df['label'].nunique())
print("🔹 Total rows:", len(df))


In [20]:
from sklearn.metrics import mean_squared_error, r2_score
from scipy.stats import pearsonr
import numpy as np

# Get predictions on train set
train_predictions = trainer.predict(tokenized_dataset["train"])
y_pred = np.argmax(train_predictions.predictions, axis=1)
y_true = train_predictions.label_ids


In [19]:
corr, _ = pearsonr(y_true, y_pred)
print(f"📊 Pearson Correlation Coefficient (Train Set): {corr:.4f}")


📊 Pearson Correlation Coefficient (Train Set): 0.9408


In [None]:
output_dir = "/kaggle/working/deberta-v3-large-finetuned"

model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)


In [None]:
import os

output_dir = "/kaggle/working/deberta-v3-large-finetuned"
print("Files inside:", os.listdir(output_dir))

# Check file sizes
for f in os.listdir(output_dir):
    path = os.path.join(output_dir, f)
    print(f"{f} — {round(os.path.getsize(path)/1024/1024, 2)} MB")
