# Imports and Prerequisites

In [None]:
# ! pip install datasets[audio]
! pip -q install accelerate -U
! pip -q install librosa==0.9.2
! pip -q install numpy==1.23.5
! pip -q install datasets==2.15
from datasets import Dataset, Audio
import librosa
import librosa.display
import IPython.display as ipd
import warnings
warnings.filterwarnings('ignore')
import torch

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from tqdm import tqdm

# Kaggle Specific Working Directory

In [None]:
# WARNING: destructive. `rm -rf *` recursively deletes every file and directory
# in the current working directory so the notebook starts from an empty workspace.

!rm -rf *

# Create the working-tree directories that will receive the symlinked dataset files.

!mkdir /kaggle/working/datasets
!mkdir /kaggle/working/datasets/train
!mkdir /kaggle/working/datasets/test

#  reference original files without duplicating their content ...

def all_files_in_folder_symlink(source_dir, target_dir):
    """Create one symlink in ``target_dir`` for every entry of ``source_dir``.

    Each link keeps the name of the entry it points to, so the target folder
    mirrors the source folder without copying any file content. Progress is
    reported through ``tqdm``.

    Args:
        source_dir: Directory whose entries are linked.
        target_dir: Existing directory in which the links are created.

    Raises:
        OSError: Propagated from ``os.listdir`` / ``os.symlink`` (for example
            when a link with the same name already exists).
    """
    for entry_name in tqdm(os.listdir(source_dir)):
        os.symlink(
            os.path.join(source_dir, entry_name),
            os.path.join(target_dir, entry_name),
        )
# Mirror the competition audio folders into the working tree, one symlink per file.
for source_dir, target_dir in (
    ("/kaggle/input/biomed-datathon-bmefest2/train", "/kaggle/working/datasets/train"),
    ("/kaggle/input/biomed-datathon-bmefest2/test", "/kaggle/working/datasets/test"),
):
    all_files_in_folder_symlink(source_dir, target_dir)

# Link the metadata / CSV files individually.
for source_file, target_file in (
    ("/kaggle/input/biomed-datathon-bmefest2/additional_metadata.csv", "/kaggle/working/datasets/additional_metadata.csv"),
    ("/kaggle/input/biomed-datathon-bmefest2/sample_submission.csv", "/kaggle/working/datasets/sample_submission.csv"),
    ("/kaggle/input/biomed-datathon-bmefest2/test_files.csv", "/kaggle/working/datasets/test_files.csv"),
    ("/kaggle/input/biomed-datathon-bmefest2/train.csv", "/kaggle/working/datasets/train.csv"),
):
    os.symlink(source_file, target_file)

# Extra alias: expose 085_sit_Tri6_06.wav under the shorter name 085_sit_Tri.wav as well.
# NOTE(review): presumably train.csv refers to this recording by the shorter stem - confirm.
os.symlink(
    "/kaggle/input/biomed-datathon-bmefest2/train/085_sit_Tri6_06.wav",
    "/kaggle/working/datasets/train/085_sit_Tri.wav",
)

# Directories the audio loaders read from.
train_dir_path = "/kaggle/working/datasets/train"
test_dir_path = "/kaggle/working/datasets/test"

In [None]:
# train_dir_path = "train"
# test_dir_path = "test"

In [None]:
# List the directory tree (directories only) below the current working directory.
! tree -d

# Loading Data

In [None]:
# Load the training table from the symlinked working tree.
# (For a local run, read "train.csv" / "test_files.csv" from the working directory instead.)
data = pd.read_csv("/kaggle/working/datasets/train.csv")

# Hold out 10% of the rows for validation; the fixed random_state keeps the split reproducible.
from sklearn.model_selection import train_test_split
train_df, valid_df = train_test_split(data, test_size=0.1, random_state=42)

# preserve_index=False: after the shuffle the frames no longer have a plain
# RangeIndex, and from_pandas would otherwise carry that index into the Dataset
# as an extra `__index_level_0__` column.
train_data = Dataset.from_pandas(train_df, preserve_index=False)
valid_data = Dataset.from_pandas(valid_df, preserve_index=False)

# The test table is converted as-is (no split).
test_df = pd.read_csv("/kaggle/working/datasets/test_files.csv")
test_data = Dataset.from_pandas(test_df)

# File Path to Audio Mapping

In [None]:
def _load_recording(directory, path):
    """Load ``<directory>/<path>.wav`` and return it as an audio dict.

    Args:
        directory: Folder that contains the recording.
        path: File stem of the recording (without the ``.wav`` extension).

    Returns:
        dict with keys ``"array"`` (the samples returned by ``librosa.load``)
        and ``"sampling_rate"`` (the rate reported by ``librosa.load``).
    """
    wav_path = str(directory) + "/" + path + ".wav"
    # sr=None asks librosa to keep the file's own sampling rate instead of resampling on load.
    aud, sr = librosa.load(wav_path, sr=None)
    return {"array": aud, "sampling_rate": sr}


def from_train_path_to_audio(path):
    """Load the training recording named ``path`` from ``train_dir_path``.

    Args:
        path: File stem of the recording (without the ``.wav`` extension).

    Returns:
        dict with keys ``"array"`` and ``"sampling_rate"``.
    """
    return _load_recording(train_dir_path, path)


def from_test_path_to_audio(path):
    """Load the test recording named ``path`` from ``test_dir_path``.

    Args:
        path: File stem of the recording (without the ``.wav`` extension).

    Returns:
        dict with keys ``"array"`` and ``"sampling_rate"``.
    """
    return _load_recording(test_dir_path, path)


In [None]:
# Column names of the eight recordings stored per row, and the target label columns.
recording_columns = [f"recording_{position}" for position in range(1, 9)]
labels = ["AS", "AR", "MR", "MS", "N"]


def _replace_paths_with_audio(dataset, load_audio):
    """Replace the file stem in every recording column with the loaded audio dict.

    One ``map`` pass is run per recording column; ``load_audio`` receives the
    value currently stored in that column and its result takes the column's place.
    """
    for column_name in recording_columns:
        dataset = dataset.map(
            lambda row: {column_name: load_audio(row[column_name])},
            remove_columns=[column_name],
        )
    return dataset


# Train and validation rows both come from train.csv, so both read from the train folder.
train_data = _replace_paths_with_audio(train_data, from_train_path_to_audio)
valid_data = _replace_paths_with_audio(valid_data, from_train_path_to_audio)
test_data = _replace_paths_with_audio(test_data, from_test_path_to_audio)


# Resampling to 16KHz

In [None]:
def _resample_recordings(dataset):
    """Cast every recording column to an ``Audio`` feature with a 16 kHz sampling rate."""
    for column_name in recording_columns:
        dataset = dataset.cast_column(column_name, Audio(sampling_rate=16000))
    return dataset


# Each split is cast and then its first row is read once as a quick check
# (only the last expression of the cell is displayed).
train_data = _resample_recordings(train_data)
train_data[0]

valid_data = _resample_recordings(valid_data)
valid_data[0]

test_data = _resample_recordings(test_data)
test_data[0]

# Creating Recording Wise Datasets

In [None]:
def _one_dataset_per_recording(dataset):
    """Build one dataset per recording column.

    In the i-th result the i-th recording column is exposed as ``"audio"`` and
    all original recording columns are removed; every other column is kept.
    """
    per_recording = []
    for column_name in recording_columns:
        per_recording.append(
            dataset.map(
                lambda row: {"audio": row[column_name]},
                remove_columns=recording_columns,
            )
        )
    return per_recording


# Every row holds 8 recordings, so each split becomes a list of 8 single-recording datasets.
train_datasets = _one_dataset_per_recording(train_data)
valid_datasets = _one_dataset_per_recording(valid_data)
test_datasets = _one_dataset_per_recording(test_data)
test_datasets[0]

In [None]:
# Index <-> label-name lookups, following the order of `labels`.
id2label = dict(enumerate(labels))
label2id = {label_name: label_index for label_index, label_name in id2label.items()}
label2id

In [None]:
# These are aliases, not copies: each new_* name refers to the same list object
# as the corresponding *_datasets name, so item assignment through one is visible through the other.
new_train, new_valid, new_test = train_datasets, valid_datasets, test_datasets

# Audio Spectrogram Transformer: Feature Extraction

In [None]:
# Install transformers and create the Audio Spectrogram Transformer (AST) feature extractor.
! pip -q install transformers
from transformers import ASTFeatureExtractor, AutoModelForAudioClassification
# The extractor is loaded from the same checkpoint id that is later used to load the model.
feature_extractor = ASTFeatureExtractor.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593")

In [None]:
def prepare_dataset(examples):
    """Convert a batch of labelled rows into feature-extractor outputs plus labels.

    Args:
        examples: Batch (column name -> list of values) that contains an
            ``"audio"`` column of dicts with an ``"array"`` entry and one
            column per name in ``labels``.

    Returns:
        The feature extractor's output for the batch, with an added
        ``"labels"`` entry: one list of floats per row, ordered like ``labels``.
    """
    waveforms = [audio["array"] for audio in examples["audio"]]
    features = feature_extractor(
        waveforms,
        sampling_rate=feature_extractor.sampling_rate,
        truncation=True,
        return_attention_mask=True,
    )

    # One row per example, one column per label, filled column by column.
    label_matrix = np.zeros((len(waveforms), len(labels)))
    for column_index, label_name in enumerate(labels):
        label_matrix[:, column_index] = examples[label_name]

    features["labels"] = label_matrix.tolist()
    return features

# for dataset in datasets:
#     dataset = dataset.map(prepare_dataset)

In [None]:
def prepare_testset(examples):
    """Convert a batch of unlabelled rows into feature-extractor outputs.

    Same feature extraction as ``prepare_dataset``, but no ``"labels"`` entry
    is added.

    Args:
        examples: Batch (column name -> list of values) that contains an
            ``"audio"`` column of dicts with an ``"array"`` entry.

    Returns:
        The feature extractor's output for the batch.
    """
    waveforms = [audio["array"] for audio in examples["audio"]]
    return feature_extractor(
        waveforms,
        sampling_rate=feature_extractor.sampling_rate,
        truncation=True,
        return_attention_mask=True,
    )

# Batched Data Processing

In [None]:
# Extract features for every per-recording training dataset, replacing each list entry in place.
for dataset_index, recording_dataset in enumerate(new_train):
    new_train[dataset_index] = recording_dataset.map(
        prepare_dataset,
        batched=True,
        batch_size=100,
        num_proc=1,
        remove_columns=["audio", "AS", "AR", "MR", "MS", "N"],
    )

In [None]:
# Extract features for every per-recording validation dataset, replacing each list entry in place.
for dataset_index, recording_dataset in enumerate(new_valid):
    new_valid[dataset_index] = recording_dataset.map(
        prepare_dataset,
        batched=True,
        batch_size=100,
        num_proc=1,
        remove_columns=["audio", "AS", "AR", "MR", "MS", "N"],
    )

In [None]:
# Extract features for every per-recording test dataset (no label columns to drop here).
for dataset_index, recording_dataset in enumerate(new_test):
    new_test[dataset_index] = recording_dataset.map(
        prepare_testset,
        batched=True,
        batch_size=100,
        num_proc=1,
        remove_columns=["audio"],
    )

# Model and Hyperparameters

In [None]:
# Pretrained AST checkpoint that is fine-tuned below.
model_id = "MIT/ast-finetuned-audioset-10-10-0.4593"
num_labels = len(id2label)
model = AutoModelForAudioClassification.from_pretrained(
    model_id,
    num_labels=num_labels,
    label2id=label2id,
    id2label=id2label,
    # The targets are multi-hot float vectors and predictions are thresholded
    # per label with a sigmoid, so state the objective explicitly instead of
    # relying on it being inferred from the label dtype.
    problem_type="multi_label_classification",
    # Allows loading even though the checkpoint's classification head does not
    # match `num_labels`; the mismatched weights are re-initialised.
    ignore_mismatched_sizes=True)

In [None]:
from transformers import TrainingArguments

# Run configuration.
batch_size = 4
gradient_accumulation_steps = 1
num_train_epochs = 10 # originally used 50 and 100 in local setting, increasing after 20 might crash in kaggle

# Checkpoints are written to "<checkpoint name>-finetuned".
model_name = model_id.rsplit("/", 1)[-1]

training_args = TrainingArguments(
    output_dir=f"{model_name}-finetuned",
    # schedule
    num_train_epochs=num_train_epochs,
    learning_rate=5e-4,
    warmup_ratio=0.1,
    # batching
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    fp16=True,
    # evaluate and checkpoint once per epoch, then restore the checkpoint with the best "f1"
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    logging_steps=10,
)

# Metrics

In [None]:
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction
import torch
    
# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
def multi_label_metrics(predictions, labels, threshold=0.5):
    """Compute macro F1, macro ROC AUC and subset accuracy for multi-label logits.

    Args:
        predictions: Raw logits of shape (batch_size, num_labels).
        labels: Ground-truth multi-hot matrix of the same shape.
        threshold: Probability at or above which a label counts as predicted.

    Returns:
        dict with keys ``'f1'``, ``'roc_auc'`` and ``'accuracy'``.
    """
    # Logits -> per-label probabilities.
    probs = torch.sigmoid(torch.Tensor(predictions)).numpy()
    # Probabilities -> hard 0/1 decisions for the threshold-based metrics.
    y_pred = (probs >= threshold).astype(float)
    y_true = labels

    f1_macro_average = f1_score(y_true=y_true, y_pred=y_pred, average='macro')
    # ROC AUC is a ranking metric: score it on the probabilities, not on the
    # thresholded decisions, which would collapse each curve to a single point.
    roc_auc = roc_auc_score(y_true, probs, average='macro')
    accuracy = accuracy_score(y_true, y_pred)

    return {'f1': f1_macro_average,
            'roc_auc': roc_auc,
            'accuracy': accuracy}

def compute_metrics(p: EvalPrediction):
    """Adapter passed to ``Trainer``: unpack an ``EvalPrediction`` and score it.

    When ``p.predictions`` is a tuple, only its first element is scored.

    Returns:
        The metrics dict produced by ``multi_label_metrics``.
    """
    logits = p.predictions
    if isinstance(logits, tuple):
        logits = logits[0]
    return multi_label_metrics(predictions=logits, labels=p.label_ids)

# Training and Validation

A Weights & Biases account must be activated to complete the authorization step that is required before training can continue.

In [None]:
import gc
# Patient IDs of the test rows, read from the first per-recording test dataset;
# they become the `patient_id` column of the submission.
pt_ids = new_test[0]['patient_id']
pt_ids

In [None]:
from transformers import Trainer

def _train_and_predict(recording_index, threshold=0.5):
    """Fine-tune on one recording position, score it on validation and predict the test set.

    Args:
        recording_index: Index into ``new_train`` / ``new_valid`` / ``new_test``.
        threshold: Probability at or above which a label is predicted as 1.

    Returns:
        Tuple ``(trainer, eval_f1, predictions)`` where ``eval_f1`` is the
        ``'eval_f1'`` value reported by ``trainer.evaluate()`` and
        ``predictions`` is the integer 0/1 array of thresholded test predictions.
    """
    trainer = Trainer(
        model,
        training_args,
        train_dataset=new_train[recording_index],
        eval_dataset=new_valid[recording_index],
        tokenizer=feature_extractor,
        compute_metrics=compute_metrics,
    )
    trainer.train()

    eval_f1 = trainer.evaluate()['eval_f1']

    # Test logits -> sigmoid probabilities -> hard 0/1 decisions.
    logits = torch.tensor(trainer.predict(new_test[recording_index]).predictions)
    probs = torch.sigmoid(logits.squeeze().cpu())
    predictions = (probs >= threshold).numpy().astype(int)
    return trainer, eval_f1, predictions


# NOTE(review): every run receives the same global `model` object and the same
# `training_args` (hence the same output directory), so run k starts from the
# weights left behind by run k-1 rather than from the pretrained checkpoint.
# Confirm whether independent per-recording models were intended before
# weighting them against each other below.
recording_runs = [_train_and_predict(recording_index) for recording_index in range(8)]

# Keep the numbered names that the following cells refer to.
(trainer0, trainer1, trainer2, trainer3,
 trainer4, trainer5, trainer6, trainer7) = [run[0] for run in recording_runs]
(eval_f1_0, eval_f1_1, eval_f1_2, eval_f1_3,
 eval_f1_4, eval_f1_5, eval_f1_6, eval_f1_7) = [run[1] for run in recording_runs]
(predictions0, predictions1, predictions2, predictions3,
 predictions4, predictions5, predictions6, predictions7) = [run[2] for run in recording_runs]

# Predictions: Weighted Majority Voting

In [None]:
# Report the validation F1 obtained for each recording position.
for recording_index, eval_f1 in enumerate(
    (eval_f1_0, eval_f1_1, eval_f1_2, eval_f1_3, eval_f1_4, eval_f1_5, eval_f1_6, eval_f1_7)
):
    print(f"eval_f1_{recording_index}: {eval_f1}")

In [None]:
# Weighted majority vote: each recording's 0/1 predictions are weighted by that
# recording's validation F1, and a label is kept when the weighted share reaches 0.5.
recording_votes = (predictions0, predictions1, predictions2, predictions3,
                   predictions4, predictions5, predictions6, predictions7)
recording_weights = (eval_f1_0, eval_f1_1, eval_f1_2, eval_f1_3,
                     eval_f1_4, eval_f1_5, eval_f1_6, eval_f1_7)

# Accumulate left to right, one recording at a time.
weighted_sum = recording_votes[0] * recording_weights[0]
total_weight = recording_weights[0]
for votes, weight in zip(recording_votes[1:], recording_weights[1:]):
    weighted_sum = weighted_sum + votes * weight
    total_weight = total_weight + weight

predictions = (weighted_sum / total_weight >= 0.5).astype(int)

In [None]:
# Rows in which none of the first four labels was predicted get the fifth label
# (column 4, "N" in `labels`) set to 1. The array is updated in place.
none_of_first_four = predictions[:, :4].sum(axis=1) == 0
predictions[none_of_first_four, 4] = 1

In [None]:
# Display the final prediction matrix (one column per entry of `labels`).
predictions

In [None]:
# Assemble the submission table: patient_id first, followed by one column per label.
submission = (
    pd.DataFrame(predictions, columns=labels)
    .assign(patient_id=pt_ids)
    [["patient_id", "AS", "AR", "MR", "MS", "N"]]
)

# Write the submission file without the pandas index.
submission.to_csv("submission_AST_majority_voting_on_records.csv", index=False)

