In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate every file beneath the Kaggle input mount, then echo each full path.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_file_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/indic-tts-deepfake-challenge/sample.csv


In [2]:
!pip install evaluate

import torch
from transformers import Trainer, TrainingArguments, AutoFeatureExtractor, AutoModelForAudioClassification
from datasets import load_dataset
from dataclasses import dataclass
from typing import Any, Dict, List, Union
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score
import pandas as pd
import numpy as np
from scipy.special import softmax
from tqdm import tqdm

# Pull the challenge data from the Hub; only the training split is shuffled (fixed seed).
dataset = load_dataset("SherryT997/IndicTTS-Deepfake-Challenge-Data")
train_dataset, test_dataset = dataset["train"].shuffle(seed=42), dataset["test"]

# Pretrained checkpoint: its feature extractor plus a two-label audio classification model.
model_id = "ntu-spml/distilhubert"
extractor_options = {"do_normalize": True, "return_attention_mask": True}
feature_extractor = AutoFeatureExtractor.from_pretrained(model_id, **extractor_options)
model = AutoModelForAudioClassification.from_pretrained(model_id, num_labels=2)  # binary classification

# Materialize the language column once instead of re-reading it from the
# dataset for every language.
language_column = list(train_dataset["language"])

# Get unique languages
unique_languages = set(language_column)

# Bucket row positions by language in a single pass; positions stay in
# ascending order within each bucket.
indices_by_language = {}
for row_index, row_language in enumerate(language_column):
    indices_by_language.setdefault(row_language, []).append(row_index)

# Take roughly 1/3rd of the dataset while maintaining language balance
sampled_indices = []
fraction = 0.33

# Iterate in sorted order: the iteration order of a set of strings can differ
# between interpreter runs, which would reorder `sampled_indices` and therefore
# change the seeded train/validation split built from the sampled dataset.
for lang in sorted(unique_languages, key=str):
    lang_indices = indices_by_language[lang]
    sample_size = max(1, int(len(lang_indices) * fraction))  # Ensure at least 1 sample per language
    # Take the head of each bucket (row order comes from the shuffled train split).
    sampled_indices.extend(lang_indices[:sample_size])

# Create new dataset with sampled indices
sampled_dataset = train_dataset.select(sampled_indices)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def prepare_sample(batch, expected_length=32000, sampling_rate=16000):
    """Convert one raw dataset row into fixed-length model inputs and an integer label.

    Args:
        batch: A single example. Reads ``batch["audio"]["array"]`` (the waveform)
            and ``batch["is_tts"]`` (the target).
        expected_length: Number of samples every clip is cropped or zero-padded
            to. The default of 32,000 is 2 seconds at 16 kHz.
        sampling_rate: Sampling rate reported to the feature extractor.
            NOTE(review): assumes the stored audio is 16 kHz — confirm against
            the dataset.

    Returns:
        The same ``batch`` mapping with ``input_values`` and ``labels`` added.
    """
    audio = np.asarray(batch["audio"]["array"])

    # Crop long clips; right-pad short clips with zeros.
    if len(audio) < expected_length:
        audio = np.pad(audio, (0, expected_length - len(audio)), mode='constant')
    else:
        audio = audio[:expected_length]

    # Extract features using the module-level feature_extractor; [0] drops the
    # batch dimension added for this single clip.
    inputs = feature_extractor(audio, sampling_rate=sampling_rate, return_tensors="pt", padding=True)
    batch["input_values"] = inputs.input_values[0]

    # Store the class index as a plain integer: this is a classification
    # target, and the collator builds the integer label tensor itself.
    batch["labels"] = int(batch["is_tts"])

    return batch

# Run prepare_sample over every sampled row, dropping the raw columns afterwards.
raw_columns = ["audio", "text", "id", "language", "is_tts"]
sampled_dataset = sampled_dataset.map(prepare_sample, remove_columns=raw_columns)

# Hold out 10% of the processed rows (shuffled, fixed seed) for evaluation.
dataset1 = sampled_dataset.train_test_split(test_size=0.1, shuffle=True, seed=42)

from transformers import DataCollatorWithPadding
from evaluate import load

@dataclass
class DataCollatorWithPadding:
    """Collate preprocessed audio examples into one padded batch.

    Each example's ``input_values`` is padded through ``processor.pad`` and the
    per-example ``labels`` are attached as a single ``torch.long`` tensor.

    NOTE: this local class shadows the ``DataCollatorWithPadding`` name imported
    from ``transformers`` just above it.
    """

    # Object exposing ``pad(...)``; the notebook passes the feature extractor.
    processor: AutoFeatureExtractor
    # Forwarded unchanged to ``processor.pad``.
    padding: Union[bool, str] = True

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Return the padded batch for ``features`` with a ``labels`` entry added."""
        waveforms = []
        targets = []
        for example in features:
            waveforms.append({"input_values": example["input_values"]})
            targets.append(example["labels"])

        collated = self.processor.pad(waveforms, padding=self.padding, return_tensors="pt")
        collated["labels"] = torch.tensor(targets, dtype=torch.long)
        return collated

# Collator instance that wraps the feature extractor for batch padding.
data_collator = DataCollatorWithPadding(processor=feature_extractor, padding=True)

# Load evaluation metrics, in order: accuracy, precision, recall, f1, roc_auc.
(
    accuracy_metric,
    precision_metric,
    recall_metric,
    f1_metric,
    roc_auc_metric,
) = (load(metric_name) for metric_name in ("accuracy", "precision", "recall", "f1", "roc_auc"))

def compute_metrics(pred):
    """Compute accuracy, precision, recall, F1 and ROC-AUC for one evaluation pass.

    Args:
        pred: Object exposing ``predictions`` (logits with the two classes on the
            last axis, or a tuple whose first element is those logits) and
            ``label_ids`` (reference class indices).

    Returns:
        Dict with the keys ``accuracy``, ``precision``, ``recall``, ``f1`` and
        ``roc_auc``.
    """
    pred_logits = pred.predictions
    # Predictions may arrive as a tuple when the model returns extra outputs;
    # in that case the logits are taken from the first element.
    if isinstance(pred_logits, tuple):
        pred_logits = pred_logits[0]
    labels = pred.label_ids

    # Probability of class 1 (synthetic speech)
    pred_probs = softmax(pred_logits, axis=-1)[:, 1]
    # Threshold once and hand integer class ids to the label-based metrics.
    # `> 0.5` matches the previous `.round()` (which maps exactly 0.5 to 0).
    pred_classes = (pred_probs > 0.5).astype(int)

    hard = {"predictions": pred_classes, "references": labels}
    accuracy = accuracy_metric.compute(**hard)["accuracy"]
    precision = precision_metric.compute(average="binary", **hard)["precision"]
    recall = recall_metric.compute(average="binary", **hard)["recall"]
    f1 = f1_metric.compute(average="binary", **hard)["f1"]

    # ROC-AUC is ranking-based, so it receives the raw probabilities.
    roc_auc = roc_auc_metric.compute(prediction_scores=pred_probs, references=labels)["roc_auc"]

    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "roc_auc": roc_auc
    }


Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


README.md:   0%|          | 0.00/2.81k [00:00<?, ?B/s]

Resolving data files:   0%|          | 0/35 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/35 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/35 [00:00<?, ?files/s]

train-00000-of-00035.parquet:   0%|          | 0.00/453M [00:00<?, ?B/s]

train-00001-of-00035.parquet:   0%|          | 0.00/461M [00:00<?, ?B/s]

train-00002-of-00035.parquet:   0%|          | 0.00/464M [00:00<?, ?B/s]

train-00003-of-00035.parquet:   0%|          | 0.00/443M [00:00<?, ?B/s]

train-00004-of-00035.parquet:   0%|          | 0.00/470M [00:00<?, ?B/s]

train-00005-of-00035.parquet:   0%|          | 0.00/475M [00:00<?, ?B/s]

train-00006-of-00035.parquet:   0%|          | 0.00/447M [00:00<?, ?B/s]

train-00007-of-00035.parquet:   0%|          | 0.00/516M [00:00<?, ?B/s]

train-00008-of-00035.parquet:   0%|          | 0.00/557M [00:00<?, ?B/s]

train-00009-of-00035.parquet:   0%|          | 0.00/521M [00:00<?, ?B/s]

train-00010-of-00035.parquet:   0%|          | 0.00/491M [00:00<?, ?B/s]

train-00011-of-00035.parquet:   0%|          | 0.00/426M [00:00<?, ?B/s]

train-00012-of-00035.parquet:   0%|          | 0.00/414M [00:00<?, ?B/s]

train-00013-of-00035.parquet:   0%|          | 0.00/473M [00:00<?, ?B/s]

train-00014-of-00035.parquet:   0%|          | 0.00/481M [00:00<?, ?B/s]

train-00015-of-00035.parquet:   0%|          | 0.00/467M [00:00<?, ?B/s]

train-00016-of-00035.parquet:   0%|          | 0.00/532M [00:00<?, ?B/s]

train-00017-of-00035.parquet:   0%|          | 0.00/510M [00:00<?, ?B/s]

train-00018-of-00035.parquet:   0%|          | 0.00/471M [00:00<?, ?B/s]

train-00019-of-00035.parquet:   0%|          | 0.00/501M [00:00<?, ?B/s]

train-00020-of-00035.parquet:   0%|          | 0.00/559M [00:00<?, ?B/s]

train-00021-of-00035.parquet:   0%|          | 0.00/541M [00:00<?, ?B/s]

train-00022-of-00035.parquet:   0%|          | 0.00/558M [00:00<?, ?B/s]

train-00023-of-00035.parquet:   0%|          | 0.00/599M [00:00<?, ?B/s]

train-00024-of-00035.parquet:   0%|          | 0.00/576M [00:00<?, ?B/s]

train-00025-of-00035.parquet:   0%|          | 0.00/547M [00:00<?, ?B/s]

train-00026-of-00035.parquet:   0%|          | 0.00/537M [00:00<?, ?B/s]

train-00027-of-00035.parquet:   0%|          | 0.00/421M [00:00<?, ?B/s]

train-00028-of-00035.parquet:   0%|          | 0.00/382M [00:00<?, ?B/s]

train-00029-of-00035.parquet:   0%|          | 0.00/287M [00:00<?, ?B/s]

train-00030-of-00035.parquet:   0%|          | 0.00/282M [00:00<?, ?B/s]

train-00031-of-00035.parquet:   0%|          | 0.00/688M [00:00<?, ?B/s]

train-00032-of-00035.parquet:   0%|          | 0.00/613M [00:00<?, ?B/s]

train-00033-of-00035.parquet:   0%|          | 0.00/309M [00:00<?, ?B/s]

train-00034-of-00035.parquet:   0%|          | 0.00/424M [00:00<?, ?B/s]

test-00000-of-00004.parquet:   0%|          | 0.00/356M [00:00<?, ?B/s]

test-00001-of-00004.parquet:   0%|          | 0.00/364M [00:00<?, ?B/s]

test-00002-of-00004.parquet:   0%|          | 0.00/410M [00:00<?, ?B/s]

test-00003-of-00004.parquet:   0%|          | 0.00/291M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/31102 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2635 [00:00<?, ? examples/s]

Loading dataset shards:   0%|          | 0/35 [00:00<?, ?it/s]

preprocessor_config.json:   0%|          | 0.00/214 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.30k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/94.0M [00:00<?, ?B/s]

Some weights of HubertForSequenceClassification were not initialized from the model checkpoint at ntu-spml/distilhubert and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/10261 [00:00<?, ? examples/s]

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.56k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.38k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.79k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/9.54k [00:00<?, ?B/s]

In [3]:

# Freeze every parameter under `model.hubert` (the whole pretrained backbone,
# not just its convolutional front end); parameters outside it are untouched.
model.hubert.requires_grad_(False)



# Training configuration: evaluate and log every 500 steps, checkpoint every
# 1000 steps, and reload the checkpoint with the best ROC-AUC when training ends.
training_args = TrainingArguments(
    output_dir="AsrTaskModel",
    group_by_length=True,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    eval_strategy="steps",  # current name of the deprecated `evaluation_strategy` argument
    eval_steps=500,
    logging_steps=500,
    save_strategy="steps",
    save_steps=1000,  # kept a multiple of eval_steps for load_best_model_at_end
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="roc_auc",  # pick the best checkpoint by ROC-AUC
    greater_is_better=True,  # higher ROC-AUC is better; stated explicitly
    num_train_epochs=25,
    # Mixed precision needs a GPU; fall back to full precision on CPU so the
    # notebook's CPU device fallback keeps working.
    fp16=torch.cuda.is_available(),
    gradient_checkpointing=True,  # trade compute for memory
    learning_rate=3e-5,
    weight_decay=0.01,
    warmup_steps=1000,
    report_to="none",
)



# Define trainer: 90% split for training, 10% held-out split for evaluation.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=dataset1["train"],
    eval_dataset=dataset1["test"],
    # `tokenizer=` is deprecated on Trainer (it triggers a warning on every
    # use); `processing_class=` is its replacement.
    processing_class=feature_extractor,
)

trainer.train()

# Inference & Submission


  trainer = Trainer(


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Roc Auc
500,0.6908,0.682522,0.665044,0.602136,0.907445,0.723917,0.808124
1000,0.6616,0.630097,0.774099,0.838875,0.65996,0.738739,0.847948
1500,0.5854,0.544489,0.788705,0.816742,0.726358,0.768903,0.860074
2000,0.5055,0.482551,0.806232,0.858173,0.71831,0.782037,0.88226
2500,0.4494,0.439026,0.819864,0.867925,0.740443,0.799131,0.898831
3000,0.407,0.399171,0.839338,0.862445,0.794769,0.827225,0.91353
3500,0.3755,0.369547,0.85297,0.863445,0.826962,0.84481,0.924384
4000,0.3524,0.348274,0.864654,0.862348,0.857143,0.859738,0.931571
4500,0.3339,0.331893,0.867575,0.89154,0.826962,0.858038,0.93829
5000,0.3301,0.317289,0.872444,0.857422,0.8833,0.870168,0.942265


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Tr

TrainOutput(global_step=14450, training_loss=0.3395753178052011, metrics={'train_runtime': 6136.3644, 'train_samples_per_second': 37.62, 'train_steps_per_second': 2.355, 'total_flos': 1.0499876206272e+18, 'train_loss': 0.3395753178052011, 'epoch': 25.0})

In [4]:
model.eval()

# Place inputs on the device the model's weights actually live on, rather than
# assuming it still matches the notebook-level `device`.
model_device = next(model.parameters()).device

submission_results = []

# NOTE(review): training clips are cropped/padded to a fixed 2-second window in
# `prepare_sample`, whereas this loop scores the full-length waveform — confirm
# that the mismatch is intended.
for sample in tqdm(test_dataset, desc="Scoring test set"):
    sample_id = sample["id"]
    audio_input = sample["audio"]["array"]

    inputs = feature_extractor(audio_input, sampling_rate=16000, return_tensors="pt", padding=True)
    inputs = {k: v.to(model_device) for k, v in inputs.items()}

    with torch.no_grad():
        logits = model(**inputs).logits

    # Column 1 is the probability of class 1 (is_tts); convert to a plain Python
    # float so the frame holds ordinary floats instead of NumPy scalars.
    probabilities = softmax(logits.float().cpu().numpy(), axis=-1)
    is_tts_prob = round(float(probabilities[0, 1]), 3)

    submission_results.append([sample_id, is_tts_prob])

submission_df = pd.DataFrame(submission_results, columns=["id", "is_tts"])
submission_df.to_csv("./submission.csv", index=False)

print("✅ Submission file saved correctly with original 'id' values!")


✅ Submission file saved correctly with original 'id' values!
