In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
for root_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(root_dir, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Install & Import Dependencies

In [None]:
!pip install transformers datasets librosa torch  scikit-learn tqdm evaluate

In [None]:
import inspect
from dataclasses import dataclass
from typing import Any, Dict, List, Union

import numpy as np
import pandas as pd
import torch
from datasets import load_dataset
from evaluate import load
from scipy.special import softmax
from tqdm import tqdm
from transformers import AutoFeatureExtractor, AutoModelForAudioClassification
from transformers import TrainingArguments, Trainer

# Loading the Dataset

In [None]:
# Load the challenge dataset by its repository id and keep a handle to each split.
speech_dataset = load_dataset("SherryT997/IndicTTS-Deepfake-Challenge-Data")
train_samples, test_samples = speech_dataset["train"], speech_dataset["test"]

# Loading the Pre-trained Model

In [None]:
# Checkpoint name shared by the feature extractor and the classifier.
model_name = "facebook/wav2vec2-large-xlsr-53"

# Feature extractor loaded from that checkpoint.
extractor = AutoFeatureExtractor.from_pretrained(model_name)

# Audio-classification model loaded from the same checkpoint with two output labels.
audio_model = AutoModelForAudioClassification.from_pretrained(
    model_name,
    num_labels=2,
)


# Creating a Language-Stratified Subset of the Dataset

In [None]:
# Draw a random, language-stratified sample of the training split:
# `portion` of the rows of every language, and at least one row per language.
portion = 0.25

# Read the language column once instead of once per language inside the loop.
languages = train_samples["language"]

# Get unique language identifiers
language_set = set(languages)

# Bucket row indices by language in a single pass. A dict keeps first-seen
# order, so the loop below visits languages in the same order on every run
# (iterating a set of strings does not guarantee that).
indices_by_language = {}
for idx, lang in enumerate(languages):
    indices_by_language.setdefault(lang, []).append(idx)

# Seeded generator so the same subset is drawn on every run.
rng = np.random.default_rng(42)

subset_indices = []
for lang, indices in indices_by_language.items():
    num_samples = max(1, int(len(indices) * portion))
    # Sample positions without replacement so the kept rows are genuinely
    # random rather than simply the first rows in dataset order.
    chosen_positions = rng.choice(len(indices), size=num_samples, replace=False)
    # Sorting keeps the selected rows in dataset order within each language.
    subset_indices.extend(indices[pos] for pos in sorted(chosen_positions))

# Create a sampled dataset
filtered_dataset = train_samples.select(subset_indices)


In [None]:
# Use the GPU when one is visible to torch, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Preprocessing the Audio

In [None]:
def process_audio(sample, target_length=32000, sampling_rate=16000):
    """Turn one dataset row into fixed-length model inputs plus a class label.

    Args:
        sample: Row holding the waveform under ``sample["audio"]["array"]``
            and the class label under ``sample["is_tts"]``.
        target_length: Number of waveform samples every clip is truncated or
            zero-padded to before feature extraction.
        sampling_rate: Sampling rate reported to the feature extractor.
            NOTE(review): the clip's own sampling rate is not checked here —
            confirm the dataset really is recorded at this rate.

    Returns:
        The same ``sample`` dict with ``input_values`` (extracted features of
        the length-adjusted clip) and ``labels`` (int class index) added.
    """
    signal = np.asarray(sample["audio"]["array"])

    # Adjust audio length: zero-pad short clips at the end, truncate long ones.
    missing = target_length - len(signal)
    if missing > 0:
        signal = np.pad(signal, (0, missing), mode="constant")
    else:
        signal = signal[:target_length]

    # Extract features
    processed_input = extractor(signal, sampling_rate=sampling_rate, return_tensors="pt", padding=True)
    sample["input_values"] = processed_input.input_values[0]
    # Store the label as a plain int class index; the collator builds the
    # label tensor per batch, so no tensor is needed at this stage.
    sample["labels"] = int(sample["is_tts"])
    return sample

In [None]:
# Run process_audio over every row and drop the raw columns listed below.
filtered_dataset = filtered_dataset.map(
    process_audio,
    remove_columns=["audio", "text", "id", "language", "is_tts"],
)


# Splitting dataset for training and evaluation

In [None]:
# Hold out 20% of the processed subset for evaluation, using a seeded shuffle.
data_splits = filtered_dataset.train_test_split(
    test_size=0.2,
    seed=42,
    shuffle=True,
)


# Creating Custom Data Collator

In [None]:
@dataclass
class CustomCollator:
    """Batch collator that pads the audio inputs and stacks the class labels.

    Attributes:
        extractor: Object exposing ``pad(features, padding=..., return_tensors=...)``.
        padding: Padding strategy forwarded unchanged to ``extractor.pad``.
    """

    # Annotated as Any because the concrete extractor class is not imported
    # here; any object with a compatible ``pad`` method works.
    extractor: Any
    padding: Union[bool, str] = True

    def __call__(self, batch: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Collate a list of samples into one padded batch.

        Args:
            batch: Samples, each with ``input_values`` and a numeric ``labels`` entry.

        Returns:
            The mapping returned by ``extractor.pad`` with an added ``labels``
            entry holding an int64 tensor of class indices.
        """
        inputs = [{"input_values": sample["input_values"]} for sample in batch]
        padded_batch = self.extractor.pad(inputs, padding=self.padding, return_tensors="pt")
        # Coerce each label to an int class index first, so labels stored as
        # ints, floats or 0-d tensors all produce the same int64 tensor.
        padded_batch["labels"] = torch.tensor([int(sample["labels"]) for sample in batch], dtype=torch.long)
        return padded_batch

# Collator instance handed to the Trainer; it pads with the notebook's feature extractor.
collator = CustomCollator(extractor=extractor, padding=True)

# Defining Evaluation Metric

In [None]:
# Define evaluation metrics

# Metric object, loaded on first use and reused afterwards so that every
# evaluation pass does not load it again.
_ROC_AUC_METRIC = None


def evaluate_metrics(predictions):
    """Compute ROC-AUC for the positive class from raw model logits.

    Args:
        predictions: Object exposing ``predictions`` (logits indexed as
            ``[sample, class]``) and ``label_ids`` (reference labels).

    Returns:
        Dict with a single ``"roc_auc"`` entry.
    """
    global _ROC_AUC_METRIC
    if _ROC_AUC_METRIC is None:
        _ROC_AUC_METRIC = load("roc_auc")

    pred_logits = predictions.predictions
    # Softmax over the class axis; column 1 is used as the score of the positive class.
    pred_probs = softmax(pred_logits, axis=-1)[:, 1]
    true_labels = predictions.label_ids

    return {
        "roc_auc": _ROC_AUC_METRIC.compute(prediction_scores=pred_probs, references=true_labels)["roc_auc"]
    }

# Defining Training Arguments

In [None]:
# Training configuration for the fine-tuning run.
training_config = TrainingArguments(
    output_dir="/output",
    group_by_length=True,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Evaluate and checkpoint once per epoch. Step-based save/eval intervals
    # are not set because they are unused under epoch strategies.
    eval_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=10,
    # Mixed precision needs a GPU; on a CPU-only session fall back to full
    # precision instead of failing.
    fp16=torch.cuda.is_available(),
    gradient_checkpointing=True,
    logging_steps=250,
    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_steps=500,
    save_total_limit=2,
    load_best_model_at_end=True,
    # Pick the best checkpoint by the ROC-AUC that compute_metrics reports
    # (higher is better) rather than by the default evaluation loss.
    metric_for_best_model="roc_auc",
    greater_is_better=True,
    report_to="none"
)


# Initializing the Trainer

In [None]:
# Initialize trainer
# Recent transformers releases take the feature extractor through the
# `processing_class` keyword, while older ones only accept `tokenizer`.
# Use whichever keyword the installed Trainer exposes.
_trainer_params = inspect.signature(Trainer.__init__).parameters
_processor_kwarg = "processing_class" if "processing_class" in _trainer_params else "tokenizer"

trainer_instance = Trainer(
    model=audio_model,
    args=training_config,
    data_collator=collator,
    compute_metrics=evaluate_metrics,
    train_dataset=data_splits["train"],
    eval_dataset=data_splits["test"],
    **{_processor_kwarg: extractor},
)


# Training the Model

In [None]:
# Run fine-tuning with the model, datasets and configuration given to the Trainer.
trainer_instance.train()

# Running Inference on the Test Data and Building the Submission

In [None]:
# Inference and submission
# Clip length in samples; must equal the target_length process_audio applies
# to the training clips so the model sees the same kind of input here.
clip_length = 32000

submission_list = []
# Make sure the model sits on the same device the inputs are moved to below.
audio_model.to(device)
audio_model.eval()
for entry in tqdm(test_samples):
    entry_id = entry["id"]
    input_audio = np.asarray(entry["audio"]["array"])

    # Same length adjustment as in training: zero-pad short clips at the end,
    # truncate long ones. This also bounds memory use on very long clips.
    if len(input_audio) < clip_length:
        input_audio = np.pad(input_audio, (0, clip_length - len(input_audio)), mode="constant")
    else:
        input_audio = input_audio[:clip_length]

    processed_audio = extractor(input_audio, sampling_rate=16000, return_tensors="pt", padding=True)
    processed_audio = {key: value.to(device) for key, value in processed_audio.items()}

    with torch.no_grad():
        pred_logits = audio_model(**processed_audio).logits

    pred_probs = softmax(pred_logits.cpu().numpy(), axis=-1)
    # Convert to a Python float before rounding so the stored value is a
    # plain float rather than a NumPy scalar.
    tts_probability = round(float(pred_probs[0, 1]), 3)
    submission_list.append([entry_id, tts_probability])


In [None]:
# Assemble the collected rows into a two-column table and write it out as CSV.
submission_columns = ["id", "is_tts"]
submission_df = pd.DataFrame(data=submission_list, columns=submission_columns)

submission_path = "./final_submission.csv"
submission_df.to_csv(submission_path, index=False)

print("Submission file successfully saved!")
