In [1]:
# Mount Google Drive; later cells read the dataset and the AST files from under this mount point.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:

# Root folder of the dataset on the mounted Drive; its entries are listed as genres
# and each one is globbed for *.wav files in the next cell.
base_path = "/content/drive/MyDrive/genres_original"





In [3]:
import glob
import os
# Genre names come from the sub-folders of base_path. Sorting keeps the
# genre -> class index mapping stable, and non-directory entries (stray files
# such as .DS_Store) are skipped so they never consume a class index.
genres = sorted(
    entry
    for entry in os.listdir(base_path)
    if os.path.isdir(os.path.join(base_path, entry))
)


filepaths = []
labels = []

for idx, genre in enumerate(genres):
    # glob yields files in arbitrary (filesystem) order; sorting makes the
    # file list, and therefore the seeded train/val/test split, reproducible.
    for f in sorted(glob.glob(f"{base_path}/{genre}/*.wav")):
        filepaths.append(f)
        labels.append(idx)

In [4]:
from sklearn.model_selection import train_test_split


# Hold out 15% of the tracks as the test set, stratified by genre label.
split_seed = 42
X_temp, X_test, y_temp, y_test = train_test_split(
    filepaths,
    labels,
    test_size=0.15,
    stratify=labels,
    random_state=split_seed,
)

# 0.1765 of the remaining 85% is roughly 15% of all tracks: the validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_temp,
    y_temp,
    test_size=0.1765,
    stratify=y_temp,
    random_state=split_seed,
)

# Report the number of files in each split.
for split_name, split_files in (("train", X_train), ("val", X_val), ("test", X_test)):
    print(len(split_files), split_name)


699 train
151 val
150 test


In [5]:
import librosa
import audioread  # force fallback

def split_audio(file, segment_duration=3, sr=22050):
    """Load an audio file and cut it into equal-length, non-overlapping segments.

    Args:
        file: Path of the audio file to load.
        segment_duration: Length of one segment in seconds. May be fractional;
            the segment length in samples is ``int(sr * segment_duration)``.
        sr: Sampling rate passed to ``librosa.load``.

    Returns:
        A list of consecutive slices of the loaded signal, each exactly one
        segment long. A trailing remainder shorter than a full segment is
        dropped. An empty list is returned when the file cannot be loaded.
    """
    try:
        signal, _ = librosa.load(file, sr=sr)
    except Exception as e:
        # Best effort: one unreadable file must not abort the whole dataset build.
        # The exception type is printed because str(e) can be empty, which would
        # leave the message without any hint of what went wrong.
        print("Erreur sur:", file, f"{type(e).__name__}: {e}")
        return []  # skip the problematic file

    # range() needs an integer step; a fractional duration would otherwise
    # produce a float here and raise TypeError.
    samples_per_segment = int(sr * segment_duration)

    candidates = (
        signal[start:start + samples_per_segment]
        for start in range(0, len(signal), samples_per_segment)
    )
    # Keep only full-length segments (drops the short tail at the end of the file).
    return [part for part in candidates if len(part) == samples_per_segment]


In [6]:
# Cut every training track into fixed-length segments; each segment inherits
# the label of the track it came from.
X_train_segments, y_train_segments = [], []

for track_path, track_label in zip(X_train, y_train):
    chunks = split_audio(track_path)
    X_train_segments += chunks
    y_train_segments += [track_label] * len(chunks)

print("Train segments:", len(X_train_segments))


Train segments: 6986


In [7]:
# Cut every test track into fixed-length segments; each segment inherits
# the label of the track it came from.
X_test_segments, y_test_segments = [], []

for track_path, track_label in zip(X_test, y_test):
    chunks = split_audio(track_path)
    X_test_segments += chunks
    y_test_segments += [track_label] * len(chunks)

print("Test segments:", len(X_test_segments))


  signal, _ = librosa.load(file, sr=sr)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Erreur sur: /content/drive/MyDrive/genres_original/jazz/jazz.00054.wav 
Test segments: 1489


In [8]:
# Cut every validation track into fixed-length segments; each segment inherits
# the label of the track it came from.
X_val_segments, y_val_segments = [], []

for track_path, track_label in zip(X_val, y_val):
    chunks = split_audio(track_path)
    X_val_segments += chunks
    y_val_segments += [track_label] * len(chunks)

print("Val segments:", len(X_val_segments))


Val segments: 1506


In [9]:
import numpy as np
import librosa

def segment_to_logmel(segment, sr=22050, n_mels=128, n_fft=1024, hop_length=512, n_frames=128):
    """Convert one audio segment into a min-max normalised log-mel spectrogram.

    Args:
        segment: Audio samples of the segment, passed as ``y`` to
            ``librosa.feature.melspectrogram``.
        sr: Sampling rate of ``segment``.
        n_mels: Number of mel bands.
        n_fft: FFT window size.
        hop_length: Hop between successive frames, in samples.
        n_frames: Length the result is padded or truncated to along axis 1.
            Defaults to 128, the width this function always used before.

    Returns:
        A 2-D array scaled to [0, 1] whose axis 1 has length ``n_frames``
        (axis 0 is presumably the ``n_mels`` mel bands -- confirm against
        librosa). A segment whose log-mel is constant (e.g. pure silence)
        yields all zeros instead of NaN.
    """
    # Mel spectrogram
    mel_spec = librosa.feature.melspectrogram(
        y=segment,
        sr=sr,
        n_fft=n_fft,
        hop_length=hop_length,
        n_mels=n_mels
    )

    # Log-mel, in dB relative to the segment's own maximum
    log_mel = librosa.power_to_db(mel_spec, ref=np.max)

    # Min-max normalisation. A constant spectrogram has zero range; dividing
    # would give 0/0 = NaN everywhere and poison training, so map it to zeros.
    lowest = log_mel.min()
    value_range = log_mel.max() - lowest
    if value_range > 0:
        log_mel_norm = (log_mel - lowest) / value_range
    else:
        log_mel_norm = np.zeros_like(log_mel)

    # Pad (with zeros) or truncate along axis 1 to a fixed width
    log_mel_norm = librosa.util.fix_length(log_mel_norm, size=n_frames, axis=1)

    return log_mel_norm


In [10]:
# Turn every audio segment of each split into its normalised log-mel spectrogram.
X_train_mel = list(map(segment_to_logmel, X_train_segments))
X_val_mel = list(map(segment_to_logmel, X_val_segments))
X_test_mel = list(map(segment_to_logmel, X_test_segments))


In [11]:
# Convert the per-split spectrograms into NumPy arrays.
X_train_mel, X_val_mel, X_test_mel = (
    np.array(mels) for mels in (X_train_mel, X_val_mel, X_test_mel)
)

# Same for the per-segment labels.
y_train_segments, y_val_segments, y_test_segments = (
    np.array(targets)
    for targets in (y_train_segments, y_val_segments, y_test_segments)
)


In [12]:
import torch
from torch.utils.data import Dataset, DataLoader

class GTZANDataset(Dataset):
    """In-memory dataset pairing feature arrays with integer class labels."""

    def __init__(self, X, y):
        """Copy the features and labels into tensors.

        Args:
            X: Array of log-mel features, one entry per sample along axis 0.
            y: Array of integer labels, one per sample.

        Raises:
            ValueError: If ``X`` and ``y`` do not contain the same number of samples.
        """
        # Fail early: a mismatch would otherwise surface as an IndexError deep
        # inside a DataLoader, or as silently unused labels.
        if len(X) != len(y):
            raise ValueError(
                f"X and y must have the same length, got {len(X)} and {len(y)}"
            )
        self.X = torch.tensor(X, dtype=torch.float32)  # features as float32
        self.y = torch.tensor(y, dtype=torch.long)     # labels as int64, as CrossEntropyLoss expects

    def __len__(self):
        """Return the number of samples."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair stored at position ``idx``."""
        return self.X[idx], self.y[idx]


In [13]:
batch_size = 32

# Wrap each split's features and labels in a dataset.
train_dataset, val_dataset, test_dataset = (
    GTZANDataset(features, targets)
    for features, targets in (
        (X_train_mel, y_train_segments),
        (X_val_mel, y_val_segments),
        (X_test_mel, y_test_segments),
    )
)

# Only the training loader shuffles; validation and test keep a fixed order.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader, test_loader = (
    DataLoader(split_dataset, batch_size=batch_size, shuffle=False)
    for split_dataset in (val_dataset, test_dataset)
)


In [14]:
!pip install wget
!pip install timm==0.4.5




In [15]:
import os

# Actual location of the pretrained weights on Drive
real_pretrained_dir = "/content/drive/MyDrive/ast/pretrained_models"

# Create a symbolic link at the path AST is expected to read its weights from.
# NOTE(review): presumably the AST code resolves "../../pretrained_models" relative to the
# working directory (/content on Colab) -- confirm against ast_models.py.
# "/content/../../" normalises to "/" when /content is a plain directory, so the mkdir
# is effectively a no-op and the link ends up at /pretrained_models.
!mkdir -p /content/../../  # make sure the parent directory exists
!ln -sfn "{real_pretrained_dir}" /content/../../pretrained_models


In [16]:
import sys

# Make the AST model definitions on Drive importable. The membership check
# keeps the cell idempotent: re-running it no longer appends duplicate entries.
ast_models_dir = "/content/drive/MyDrive/ast/src/models"
if ast_models_dir not in sys.path:
    sys.path.append(ast_models_dir)

from ast_models import ASTModel

num_classes = 10

# NOTE(review): the log-mel features built earlier are fixed to length 128 along
# axis 1, which looks like a (mel bins, frames) layout, while upstream AST documents
# its forward() input as (batch, time frames, frequency bins). Both dimensions are
# 128 here, so a swapped axis order would raise no shape error -- confirm whether
# the features need a transpose before being fed to the model.
model = ASTModel(
    label_dim=num_classes,
    input_fdim=128,
    input_tdim=128,
    imagenet_pretrain=True,
    audioset_pretrain=True
)

print(model)


  @autocast()


---------------AST Model Summary---------------
ImageNet pretraining: True, AudioSet pretraining: True
frequncey stride=10, time stride=10
number of patches=144
ASTModel(
  (v): DistilledVisionTransformer(
    (patch_embed): PatchEmbed(
      (proj): Conv2d(1, 768, kernel_size=(16, 16), stride=(10, 10))
    )
    (pos_drop): Dropout(p=0.0, inplace=False)
    (blocks): ModuleList(
      (0-11): 12 x Block(
        (norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        (attn): Attention(
          (qkv): Linear(in_features=768, out_features=2304, bias=True)
          (attn_drop): Dropout(p=0.0, inplace=False)
          (proj): Linear(in_features=768, out_features=768, bias=True)
          (proj_drop): Dropout(p=0.0, inplace=False)
        )
        (drop_path): Identity()
        (norm2): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        (mlp): Mlp(
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (act): GELU(approximate='none')

In [17]:
import torch
import torch.nn as nn

# Use the GPU when one is available, otherwise fall back to the CPU.
use_gpu = torch.cuda.is_available()
device = torch.device("cuda" if use_gpu else "cpu")
model = model.to(device)

# Cross-entropy loss for classification, optimised with Adam at a fixed learning rate.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
criterion = nn.CrossEntropyLoss()


In [18]:
num_epochs = 10

for epoch in range(num_epochs):
    model.train()
    total_loss = 0  # running sum of the per-batch mean losses for this epoch

    for features, targets in train_loader:
        features = features.to(device)
        targets = targets.to(device)

        # Standard step: clear old gradients, forward, backward, update.
        optimizer.zero_grad()
        batch_loss = criterion(model(features), targets)
        batch_loss.backward()
        optimizer.step()

        total_loss += batch_loss.item()

    # Reported value is the average of the batch losses over the epoch.
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/len(train_loader):.4f}")


Epoch 1/10, Loss: 1.0772
Epoch 2/10, Loss: 0.4409
Epoch 3/10, Loss: 0.2386
Epoch 4/10, Loss: 0.1630
Epoch 5/10, Loss: 0.0997
Epoch 6/10, Loss: 0.0750
Epoch 7/10, Loss: 0.0870
Epoch 8/10, Loss: 0.0543
Epoch 9/10, Loss: 0.0620
Epoch 10/10, Loss: 0.0381


In [19]:
from sklearn.metrics import accuracy_score, classification_report


def _collect_predictions(loader):
    """Run the model over ``loader`` and return (predicted classes, true labels) as lists."""
    predicted, actual = [], []
    with torch.no_grad():  # no gradient tracking during evaluation
        for features, targets in loader:
            features, targets = features.to(device), targets.to(device)
            logits = model(features)
            # Predicted class = index of the largest output along dim 1.
            predicted.extend(logits.max(dim=1).indices.cpu().numpy())
            actual.extend(targets.cpu().numpy())
    return predicted, actual


model.eval()  # evaluation mode

# Scored on the validation split; pass test_loader instead to score the test split.
all_preds, all_labels = _collect_predictions(val_loader)

acc = accuracy_score(all_labels, all_preds)
print(f"Validation Accuracy: {acc*100:.2f}%")






Validation Accuracy: 81.41%
