In [13]:
import accelerate
import evaluate
import numpy as np
import pandas as pd
import torch
from datasets import Dataset, Audio, ClassLabel
from datasets import load_dataset
from PIL import Image
from torchvision import transforms
from transformers import ASTForAudioClassification, TrainingArguments, Trainer, ASTFeatureExtractor

In [14]:
# Load the CSV manifest (one row per audio clip) into a pandas DataFrame
df = pd.read_csv('../data/raw/1000_test_class.csv')

# Show the raw columns before they are renamed
print(df.head())

# Rename the CSV columns to the names used by the rest of the notebook.
# Reassigning (instead of inplace=True) keeps this cell safe to re-run.
df = df.rename(columns={'image_path': 'audio', 'class': 'label'})

# Class ids are handled as strings so they can serve as ClassLabel names
df['label'] = df['label'].astype(str)

# Create a Dataset from the DataFrame
dataset = Dataset.from_pandas(df)

# Decode the 'audio' column as 44.1 kHz audio
dataset = dataset.cast_column('audio', Audio(sampling_rate=44100))

# Encode 'label' as a ClassLabel. The names are sorted so that the integer id
# of each class does not depend on the order of the rows in the CSV, and so
# that it agrees with the sorted name list built later in the notebook.
label_names = sorted(df['label'].unique().tolist())
dataset = dataset.cast_column('label', ClassLabel(names=label_names))

# Verify the dataset structure
print(dataset)


                                          image_path  class
0  ../data/raw/1000_test/0ahmam4Hqa4hZD1QbUop13_s...      0
1  ../data/raw/1000_test/0ahmam4Hqa4hZD1QbUop13_s...      0
2  ../data/raw/1000_test/0ahmam4Hqa4hZD1QbUop13_s...      1
3  ../data/raw/1000_test/0ahmam4Hqa4hZD1QbUop13_s...      1
4  ../data/raw/1000_test/0ahmam4Hqa4hZD1QbUop13_s...      2


Casting the dataset:   0%|          | 0/10 [00:00<?, ? examples/s]

Dataset({
    features: ['audio', 'label'],
    num_rows: 10
})


In [15]:
# Show the ClassLabel feature (class names) attached to the 'label' column
print(dataset.features["label"])

ClassLabel(names=['0', '1', '2', '3', '4'], id=None)


In [16]:
from datasets import ClassLabel

# The 'label' column is already a ClassLabel, so take the class names from it
# rather than casting again. Re-casting an integer-encoded column against a
# separately built name list keeps the stored ids and only swaps the names,
# which can silently relabel rows when the two lists are ordered differently.
unique_labels = dataset.features["label"].names

# Now split into train/validation sets (fixed seed for a reproducible split)
split_dataset = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = split_dataset['train']
eval_dataset = split_dataset['test']

Casting the dataset:   0%|          | 0/10 [00:00<?, ? examples/s]

In [17]:
# Load the feature extractor from a pretrained AST model, overriding its
# sampling rate to match the 44.1 kHz audio decoded above.
#
# The previous `num_mels_bins=512` keyword was removed: the extractor's option
# is spelled `num_mel_bins`, so the misspelled name never took effect.
# NOTE(review): if a different mel resolution is really wanted, the model
# configuration presumably has to be changed to match - confirm before adding it.
feature_extractor = ASTFeatureExtractor.from_pretrained(
    "MIT/ast-finetuned-audioset-10-10-0.4593",
    sampling_rate=44100,
)

In [18]:
import os

# Inspect the decoded audio entry of the first example (despite the variable
# name, this is a dict with the path, the sample array and the sampling rate).
first_example = dataset[0]
audio_path = first_example["audio"]
print("First audio path:", audio_path)

First audio path: {'path': '../data/raw/1000_test/0ahmam4Hqa4hZD1QbUop13_segment_1.wav', 'array': array([-0.14781189, -0.14898682, -0.15419006, ...,  0.03382874,
        0.01242065, -0.00569153]), 'sampling_rate': 44100}


In [19]:
#def preprocess_function(batch):
#    # "batch['audio']" is a dictionary with keys: {"array", "path", "sampling_rate"} 
#    audio_array = batch["audio"]["array"]
#    sampling_rate = batch["audio"]["sampling_rate"]
#    
#    # Apply the feature extractor
#    # return_tensors='np' so we get NumPy arrays rather than PyTorch Tensors 
#    # (the Trainer will collate them into PyTorch tensors)
#    inputs = feature_extractor(
#        audio_array, 
#        sampling_rate=sampling_rate,
#        return_attention_mask=True,
#        return_tensors="np"
#    )
#    batch["input_values"] = inputs["input_values"][0]
#    batch["attention_mask"] = inputs["attention_mask"][0] if "attention_mask" in inputs else None
#    return batch
#
#train_dataset = train_dataset.map(preprocess_function)
#eval_dataset = eval_dataset.map(preprocess_function)
#

In [20]:
import librosa
import numpy as np

def preprocess_function(batch):
    """Turn one dataset example into model inputs.

    The waveform in ``batch["audio"]`` is resampled to 44.1 kHz when its
    sampling rate differs, zero-padded or truncated to exactly five seconds,
    and passed through the module-level ``feature_extractor``.

    Args:
        batch: A single example with an ``"audio"`` dict (keys ``"array"`` and
            ``"sampling_rate"``) and a ``"label"`` value.

    Returns:
        The same ``batch`` object with ``"input_values"`` and
        ``"attention_mask"`` added (the mask is ``None`` when the extractor
        does not return one) and ``"label"`` converted to a plain Python int.
    """
    clip_rate = 44100
    clip_samples = 5 * clip_rate  # five seconds at 44.1 kHz

    waveform = batch["audio"]["array"]
    source_rate = batch["audio"]["sampling_rate"]
    if source_rate != clip_rate:
        waveform = librosa.resample(waveform, orig_sr=source_rate, target_sr=clip_rate)

    # Pad short clips with trailing zeros; cut long clips at the target length.
    missing = clip_samples - len(waveform)
    if missing > 0:
        waveform = np.pad(waveform, (0, missing), mode="constant")
    else:
        waveform = waveform[:clip_samples]

    features = feature_extractor(
        waveform,
        sampling_rate=clip_rate,
        return_attention_mask=True,
        return_tensors="np",
    )

    # The extractor returns a batch of one; keep only that single item.
    batch["input_values"] = features["input_values"][0]
    if "attention_mask" in features:
        batch["attention_mask"] = features["attention_mask"][0]
    else:
        batch["attention_mask"] = None

    # Store the label as a scalar Python int.
    batch["label"] = np.int64(batch["label"]).item()

    return batch

# Run the preprocessing over both splits (train first, then validation).
train_dataset, eval_dataset = [
    split.map(preprocess_function) for split in (train_dataset, eval_dataset)
]

Map:   0%|          | 0/8 [00:00<?, ? examples/s]

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

In [21]:
# Sanity-check one label before training.
sample_label = train_dataset[0]["label"]
print("Sample label:", sample_label)
print("Label type:", type(sample_label))  # the dataset hands back a plain Python int


Sample label: 0
Label type: <class 'int'>


In [None]:
# One output unit per class name.
num_labels = len(unique_labels)

# Start from the pretrained AST checkpoint. Its classification head was sized
# for a different number of classes, so mismatched weights are re-initialised
# instead of raising an error.
model = ASTForAudioClassification.from_pretrained(
    "MIT/ast-finetuned-audioset-10-10-0.4593",
    ignore_mismatched_sizes=True,
    num_labels=num_labels,
)

In [21]:
# Fine-tuning configuration for the Trainer.
training_args = TrainingArguments(
    output_dir="./ast-finetune-output",
    # Optimisation
    num_train_epochs=3,
    learning_rate=1e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    # Evaluate and checkpoint once per epoch; reload the checkpoint with the
    # highest accuracy when training finishes.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
    # Logging
    logging_steps=10,
)



In [None]:
# Load the "accuracy" metric used by compute_metrics below
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Score a batch of predictions with the module-level ``accuracy`` metric.

    Args:
        eval_pred: A ``(logits, labels)`` pair; the predicted class of each row
            is the index of its largest logit along the last axis.

    Returns:
        Whatever ``accuracy.compute`` returns for those predictions and labels.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=-1)
    result = accuracy.compute(predictions=predicted_classes, references=references)
    return result


# Custom data collator that fixes the dtypes of the model inputs and labels
def data_collator(features):
    """Collate preprocessed examples into one batch of tensors.

    Args:
        features: A list of examples, each with ``"input_values"`` (an array or
            nested list; all examples must share one shape) and ``"label"``.

    Returns:
        A dict with ``"input_values"`` as a float32 tensor stacked along a new
        leading batch dimension and ``"labels"`` as an int64 tensor, the label
        dtype the classification loss requires.
    """
    # Build one contiguous NumPy array first: calling torch.tensor directly on
    # a list of arrays converts element by element and is very slow.
    stacked = np.array([f["input_values"] for f in features], dtype=np.float32)
    input_values = torch.as_tensor(stacked, dtype=torch.float32)
    labels = torch.tensor([f["label"] for f in features], dtype=torch.long)
    return {"input_values": input_values, "labels": labels}

In [22]:
# Assemble the Trainer from the pieces defined above.
trainer = Trainer(
    model=model,
    args=training_args,
    # Data
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,  # custom collator: labels are batched as int64
    # NOTE(review): newer transformers releases may expect a different keyword
    # than `tokenizer` for the feature extractor - confirm against the installed version.
    tokenizer=feature_extractor,
    # Evaluation
    compute_metrics=compute_metrics,
)

Some weights of ASTForAudioClassification were not initialized from the model checkpoint at MIT/ast-finetuned-audioset-10-10-0.4593 and are newly initialized because the shapes did not match:
- classifier.dense.bias: found shape torch.Size([527]) in the checkpoint and torch.Size([5]) in the model instantiated
- classifier.dense.weight: found shape torch.Size([527, 768]) in the checkpoint and torch.Size([5, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


In [23]:
# Run fine-tuning; the returned training summary is displayed as the cell output
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mbabisbabis[0m. Use [1m`wandb login --relogin`[0m to force relogin


  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 2.384998321533203, 'eval_accuracy': 0.0, 'eval_runtime': 0.7358, 'eval_samples_per_second': 2.718, 'eval_steps_per_second': 1.359, 'epoch': 1.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 2.31242036819458, 'eval_accuracy': 0.0, 'eval_runtime': 0.7088, 'eval_samples_per_second': 2.822, 'eval_steps_per_second': 1.411, 'epoch': 2.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 2.309208393096924, 'eval_accuracy': 0.0, 'eval_runtime': 0.8284, 'eval_samples_per_second': 2.414, 'eval_steps_per_second': 1.207, 'epoch': 3.0}
{'train_runtime': 75.2229, 'train_samples_per_second': 0.319, 'train_steps_per_second': 0.08, 'train_loss': 1.3194740613301594, 'epoch': 3.0}


TrainOutput(global_step=6, training_loss=1.3194740613301594, metrics={'train_runtime': 75.2229, 'train_samples_per_second': 0.319, 'train_steps_per_second': 0.08, 'total_flos': 1626831700623360.0, 'train_loss': 1.3194740613301594, 'epoch': 3.0})

###### end of my trial without blog help

In [24]:
# Define features with audio and label columns
# NOTE(review): the recorded output of this cell is
# "NameError: name 'Features' is not defined" - `Features` is not among the
# names imported in the visible import cell.
# NOTE(review): `class_labels` is not assigned anywhere in the visible notebook -
# confirm where it is meant to come from.
features = Features({
    "audio": Audio(),  # Define the audio feature
    "labels": class_labels  # Assign the class labels
})

# NOTE(review): "audio" receives the existing `dataset` object and "labels" a
# fixed two-item list; these look like placeholders carried over from an
# example - TODO confirm the intended inputs before reusing this cell.
dataset = Dataset.from_dict({
    "audio": dataset,
    "labels": [0, 1],  # Corresponding labels for the audio files
}, features=features)

NameError: name 'Features' is not defined

In [None]:
# Load the feature extractor with the checkpoint's default settings.
# This rebinds `feature_extractor`, replacing the instance created earlier in
# the notebook with sampling_rate=44100.
feature_extractor = ASTFeatureExtractor.from_pretrained('MIT/ast-finetuned-audioset-10-10-0.4593')

# Define a preprocessing function
def preprocess(batch):
    """Add AST input features to one dataset example.

    Args:
        batch: A single example whose ``"audio"`` entry is a dict with
            ``"array"`` and ``"sampling_rate"`` keys.

    Returns:
        The same ``batch`` object with ``"input_values"`` set to the first
        (only) item produced by the module-level ``feature_extractor``.
    """
    # Load audio (the original line was a truncated expression, which left
    # `audio` undefined and made every call raise NameError).
    audio = batch['audio']
    # Extract features
    inputs = feature_extractor(audio['array'], sampling_rate=audio['sampling_rate'], return_tensors='pt')
    batch['input_values'] = inputs.input_values[0]
    return batch

# Apply the preprocessing function to every example.
# Note that this rebinds `dataset` to the mapped result.
dataset = dataset.map(preprocess)


In [None]:
# Read the CSV manifest (adjust the path to your file) and keep its "train"
# split as a single Dataset object.
dataset = load_dataset(
    "csv",
    data_files="../data/processed/L1000dataset_5seg_valence.csv",
)["train"]

{'image_path': '../data/processed/1000dataset_5/specs\\000RDCYioLteXcutOjeweY_segment_2.png', 'class': 2}


In [10]:
# Choose the pretrained checkpoint and instantiate its feature extractor.
# This rebinds `feature_extractor` to an instance with the checkpoint defaults.
pretrained_model = "MIT/ast-finetuned-audioset-10-10-0.4593"
feature_extractor = ASTFeatureExtractor.from_pretrained(pretrained_model)

# Keep the model's first input name and the extractor's sampling rate for later use
model_input_name = feature_extractor.model_input_names[0]  # key -> 'input_values'
SAMPLING_RATE = feature_extractor.sampling_rate

In [5]:
# Preprocessing function for dataset
def preprocess_images(example):
    """Load the image referenced by one example and transform it.

    Args:
        example: A single example with an ``"image_path"`` entry.

    Returns:
        The same ``example`` object with ``"image"`` set to the result of the
        module-level ``image_transform`` applied to the RGB image.
    """
    # Open the file in a context manager so its handle is released as soon as
    # the pixels have been converted; otherwise a map over the whole dataset
    # leaves one open file per row. `convert` returns a new image, so the
    # result stays usable after the file is closed.
    with Image.open(example["image_path"]) as source_image:
        image = source_image.convert("RGB")
    example["image"] = image_transform(image)  # Apply transformations
    return example

# Transform every image, then rename 'class' to 'label' (the column name the
# Trainer expects).
dataset = dataset.map(preprocess_images).rename_column("class", "label")

# Return the "image" and "label" columns as PyTorch tensors
dataset.set_format(type="torch", columns=["image", "label"])

In [7]:
# Load the four evaluation metrics used by compute_metrics
accuracy, recall, precision, f1 = [
    evaluate.load(metric_name)
    for metric_name in ("accuracy", "recall", "precision", "f1")
]

def compute_metrics(eval_pred):
    """Compute accuracy plus macro-averaged precision, recall and F1.

    Args:
        eval_pred: A ``(logits, labels)`` pair; the predicted class of each row
            is the index of its largest logit along axis 1.

    Returns:
        A dict with the keys ``"accuracy"``, ``"precision"``, ``"recall"`` and
        ``"f1"``, each taken from the matching module-level metric object.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=1)

    scores = {
        "accuracy": accuracy.compute(predictions=predictions, references=labels)["accuracy"],
    }
    # The remaining metrics share the same call shape and use macro averaging.
    for key, metric in (("precision", precision), ("recall", recall), ("f1", f1)):
        scores[key] = metric.compute(predictions=predictions, references=labels, average="macro")[key]
    return scores

In [25]:
# Training configuration for this run. Per-epoch evaluation and
# load_best_model_at_end are left switched off, as in the original cell.
training_args = TrainingArguments(
    output_dir="./ast_finetuned",      # where checkpoints are written
    # Checkpointing
    save_strategy="epoch",             # save after every epoch
    save_total_limit=2,                # keep at most two checkpoints
    metric_for_best_model="f1",        # metric named for best-model selection
    # Optimisation
    learning_rate=3e-5,
    warmup_ratio=0.1,                  # warm up over the first 10% of training
    num_train_epochs=10,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=2,     # accumulate gradients over two steps per update
    # Logging
    logging_steps=10,                  # log every 10 steps
)

# Trainer
# No eval_dataset is passed (that line is commented out) and, unlike the
# earlier Trainer cell, no data_collator is passed either.
# NOTE(review): another visible cell rebinds `dataset` to its "train" split; if
# that cell runs first, dataset["train"] below no longer selects a split -
# confirm the intended execution order.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    #eval_dataset=dataset["test"],
    compute_metrics=compute_metrics,
)

# Ensure W&B is initialized with the desired project
#import wandb
#wandb.init(project="Audio_Class")

In [22]:
# Record the installed accelerate version for reproducibility
print(accelerate.__version__)

0.28.0


In [25]:
# Train the model
# NOTE(review): the recorded output of this cell ends in a RuntimeError saying
# nll_loss is not implemented for 'Int'; presumably the labels reach the loss
# as 32-bit integers and need to be int64 - confirm.
trainer.train()

# Save the fine-tuned model weights and config to ./ast_finetuned
model.save_pretrained("./ast_finetuned")

  0%|          | 0/6 [00:00<?, ?it/s]

RuntimeError: "nll_loss_forward_reduce_cuda_kernel_2d_index" not implemented for 'Int'