
# **Multi-upload → 3s MFCC (13) → Train/Test CSV (~75/25) + Valgfri ZIP-eksport af segmenter**

**Understøttede inputformater:** `.m4a`, `.flac`, `.wav` (via `ffmpeg`)  
**Output:**  
- `training.csv`, `test.csv` med kolonner: `id, category, mfcc1..mfcc13`  
- *(valgfrit)* Alle segmenter gemmes i **én ZIP**: `segments_export.zip` med filer `Category-ID.wav`


In [None]:

# Install the required libraries and codecs.
# The leading "!" runs each line as a shell command (IPython/Colab notebook syntax,
# not plain Python): ffmpeg is installed with apt-get, librosa and soundfile with pip.
# The notebook header lists ffmpeg as the backend for the supported input formats.
!apt-get -y install ffmpeg
!pip -q install librosa soundfile


In [None]:

# 1) Upload one or more audio files (.m4a/.flac/.wav) through a single upload widget.
#    Leaves two globals for the following cells:
#      uploaded    - whatever files.upload() returned (iterated here for its keys)
#      audio_files - list of the uploaded file names
from google.colab import files

print("Vælg en eller flere lydfiler (.m4a/.flac/.wav)")
uploaded = files.upload()

# Abort early: every later cell needs at least one file to work on.
if not uploaded:
    raise RuntimeError("Ingen filer uploadet.")

audio_files = [uploaded_name for uploaded_name in uploaded.keys()]

print("Indlæst filer:")
for f in audio_files:
    print(f" - {f}")


In [None]:

# 2) Indstillinger og import
import librosa
import numpy as np
import pandas as pd
import os, re, math, random, pathlib, soundfile as sf, zipfile, shutil

# Parameters
segment_sec = 3.0                 # segment length in seconds
n_mfcc = 13                       # number of MFCC coefficients per segment
use_partial_last_segment = False  # set True to keep a shorter trailing segment
seed = 42                         # for reproducibility
random.seed(seed)

# Matches a duplicate suffix such as " (1)" at the end of a name so it can be removed
dup_suffix = re.compile(r"\s*\(\d+\)$")

# Ask whether the segments should also be exported as WAV files in a single ZIP.
# Any answer starting with "j" or "y" ('ja', 'ja tak', 'yes', 'y', ...) enables it;
# an empty answer disables it.
choice = input("Gem segmenter som WAV i en ZIP-fil? (ja/nej): ").strip().lower()
save_segments = choice.startswith(("j", "y"))
print("ZIP-eksport af segmenter: " + ("AKTIVERET" if save_segments else "DEAKTIVERET"))

# Temporary folder that holds the WAV segments until they are zipped
tmp_dir = pathlib.Path("segments_tmp")
zip_name = "segments_export.zip"

# Start from an empty folder so leftovers from an earlier run are not exported
if save_segments and tmp_dir.exists():
    shutil.rmtree(tmp_dir)
if save_segments:
    tmp_dir.mkdir(parents=True, exist_ok=True)


In [None]:

# 3) 3 s segments + 13 MFCCs → train/test split (~75/25 per file) + optional ZIP export
#
# Reads the globals defined by the earlier cells: audio_files, segment_sec, n_mfcc,
# use_partial_last_segment, seed, dup_suffix, save_segments, tmp_dir and zip_name.
# Writes training.csv and test.csv (columns: id, category, mfcc1..mfccN) and, when
# save_segments is set, one WAV per segment packed into zip_name.
train_records = []
test_records = []

# Per-category counter used to generate sequence IDs
category_counters = {}
saved_segments_count = 0

# Dedicated RNG seeded in this cell: re-running the cell on its own reproduces the
# same split instead of continuing from wherever the global `random` state was left.
split_rng = random.Random(seed)

for audio_filename in audio_files:
    # Category = file name without extension and without a trailing " (N)" suffix
    base = os.path.basename(audio_filename)
    category = dup_suffix.sub("", os.path.splitext(base)[0])
    category_counters.setdefault(category, 0)

    # Load audio at its native sampling rate, down-mixed to mono
    y, sr = librosa.load(audio_filename, sr=None, mono=True)
    duration_sec = len(y) / sr
    print(f"Behandler {audio_filename} | sr={sr}, varighed={duration_sec:.2f}s | category={category}")

    samples_per_segment = int(round(segment_sec * sr))
    if samples_per_segment <= 0:
        # Would otherwise surface as an opaque ZeroDivisionError below
        raise ValueError(
            f"segment_sec={segment_sec} giver {samples_per_segment} samples pr. segment ved sr={sr}"
        )

    if use_partial_last_segment:
        num_segments = math.ceil(len(y) / samples_per_segment)
    else:
        num_segments = len(y) // samples_per_segment

    if num_segments == 0:
        # Report the skip rather than dropping the file silently
        print(f"  Springer over {audio_filename}: ingen segmenter (for kort lydfil)")
        continue

    # Shuffle the segment indices of this file and send the first ~75% to training;
    # every remaining segment goes to the test set.
    seg_indices = list(range(num_segments))
    split_rng.shuffle(seg_indices)
    split_point = int(round(0.75 * len(seg_indices)))
    train_idx = set(seg_indices[:split_point])

    for seg_idx in range(num_segments):
        # num_segments guarantees a non-empty slice, and a full-length one unless
        # partial segments are enabled; slicing clips at the end of the signal.
        start_sample = seg_idx * samples_per_segment
        segment = y[start_sample:start_sample + samples_per_segment]

        # Next sequence ID for this category (1-based, zero-padded: 00001, 00002, ...)
        category_counters[category] += 1
        seq_id = f"{category_counters[category]:05d}"

        # MFCCs for the segment, averaged over axis 1 to one value per coefficient
        mfcc = librosa.feature.mfcc(y=segment, sr=sr, n_mfcc=n_mfcc)
        mfcc_means = mfcc.mean(axis=1)

        # Store the segment as a WAV file in the temp folder (if requested)
        if save_segments:
            out_path = tmp_dir / f"{category}-{seq_id}.wav"
            sf.write(str(out_path), segment.astype(np.float32), sr)
            saved_segments_count += 1

        rec = {"id": seq_id, "category": category}
        for coeff_no in range(n_mfcc):
            rec[f"mfcc{coeff_no + 1}"] = float(mfcc_means[coeff_no])

        if seg_idx in train_idx:
            train_records.append(rec)
        else:
            test_records.append(rec)

# DataFrames with identical columns (id first, then category)
columns = ["id", "category"] + [f"mfcc{i+1}" for i in range(n_mfcc)]
train_df = pd.DataFrame.from_records(train_records, columns=columns)
test_df = pd.DataFrame.from_records(test_records, columns=columns)

# Save as CSV
train_csv = "training.csv"
test_csv = "test.csv"
train_df.to_csv(train_csv, index=False)
test_df.to_csv(test_csv, index=False)

# Build the ZIP from the temp folder
if save_segments and tmp_dir.exists():
    with zipfile.ZipFile(zip_name, "w", compression=zipfile.ZIP_DEFLATED) as zf:
        # Loop names deliberately avoid `files`, which would shadow the
        # google.colab `files` module imported by the upload cell.
        for root, _dirs, segment_names in os.walk(tmp_dir):
            for segment_name in segment_names:
                full_path = os.path.join(root, segment_name)
                arcname = os.path.relpath(full_path, tmp_dir)  # path relative to tmp_dir inside the ZIP
                zf.write(full_path, arcname)
    # Clean up the temp folder
    shutil.rmtree(tmp_dir)

print(f"training.csv rækker: {len(train_df)}")
print(f"test.csv rækker: {len(test_df)}")
if save_segments:
    print(f"Segmenter gemt i ZIP: {saved_segments_count}  → {zip_name}")


In [None]:

# 4) Download the CSV files and, if it exists, the ZIP archive
import os

from google.colab import files

files.download("training.csv")
files.download("test.csv")

# The ZIP is optional (only written when segment export was enabled). Check for it
# explicitly instead of swallowing every exception, so a real download failure is
# no longer hidden behind a silent `pass`.
if os.path.exists("segments_export.zip"):
    files.download("segments_export.zip")
else:
    print("Ingen segments_export.zip fundet – springer ZIP-download over.")
