In [1]:
# Kaggle mount point of the ASVspoof 2019 dataset. The folder slug is spelled
# 'asvpoof-2019-dataset' (without the second "s"); the path below uses that spelling.
import os
dataset_path = '/kaggle/input/asvpoof-2019-dataset'

# Print the top-level entries of the dataset folder to confirm the mount.
print(os.listdir(dataset_path))
 

['asvspoof2019_Interspeech2019_submission.pdf', 'LICENSE_text.txt', 'README.txt', 'PA', 'asvspoof2019_evaluation_plan.pdf', 'LA']


In [2]:
# Clone the PoIForensics-Audio repository, make it the working directory and
# install its requirements.txt into the kernel environment.
# NOTE(review): both the clone target and the %cd are relative to the current
# directory, so this cell is meant to be run once per fresh session.
!git clone https://github.com/grip-unina/PoIForensics-Audio

%cd PoIForensics-Audio
%pip install -q -r requirements.txt       # python-3.10 on Kaggle works fine

# Upgrade torch / torchvision / torchaudio from the CUDA 11.8 (cu118) wheel index.
# NOTE(review): this runs after requirements.txt has already been installed and
# takes whatever version that index currently serves — confirm it matches the
# CUDA build of the session and the versions the repository expects.
!pip install --upgrade --quiet torch torchvision torchaudio \
    --index-url https://download.pytorch.org/whl/cu118         # PyTorch 2.2 at the time of writing

# Install ffmpeg system-wide (the author notes pydub needs it to decode .wav / .flac).
!sudo apt-get update -y && sudo apt-get install -y ffmpeg

# Install requirements.txt a second time (same file as the %pip line above).
# NOTE(review): presumably meant to re-satisfy the repo's pins after the torch
# upgrade — confirm it does not undo that upgrade.
!pip install --quiet -r requirements.txt


Cloning into 'PoIForensics-Audio'...
remote: Enumerating objects: 36, done.[K
remote: Counting objects: 100% (17/17), done.[K
remote: Compressing objects: 100% (15/15), done.[K
remote: Total 36 (delta 5), reused 7 (delta 2), pack-reused 19 (from 1)[K
Receiving objects: 100% (36/36), 173.89 MiB | 41.75 MiB/s, done.
Resolving deltas: 100% (6/6), done.
Updating files: 100% (14/14), done.
/kaggle/working/PoIForensics-Audio
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m619.9/619.9 MB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.0/6.0 MB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.4/4.4 MB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m0:00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.1/317.1 MB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━

In [3]:
import os, pandas as pd

# ── locations of the LA evaluation audio and its CM protocol ─────────
base_dir  = '/kaggle/input/asvpoof-2019-dataset/LA/LA'
audio_dir = f'{base_dir}/ASVspoof2019_LA_eval/flac'
protocol  = f'{base_dir}/ASVspoof2019_LA_cm_protocols/ASVspoof2019.LA.cm.eval.trl.txt'

cols = ['videoname','filepath','poi','context','label','in_tst','in_ref']

# One manifest row per protocol line: the first field is the speaker, the
# second the utterance id (also the .flac stem), the last the bonafide/spoof tag.
rows = []
with open(protocol) as fp:
    for line in fp:
        parts = line.strip().split()
        speaker, utt, tag = parts[0], parts[1], parts[-1]
        label = 0 if tag == 'bonafide' else 1        # 0 = bonafide, 1 = spoof
        # every eval utterance is both a test item and a reference item here
        rows.append([utt, f'{audio_dir}/{utt}.flac', speaker, 'eval', label, 1, 1])

df = pd.DataFrame(rows, columns=cols)
df.to_csv('../ASVspoof2019_eval.csv', index=False)

# quick sanity-check: how many manifest paths are absent on disk
exists_mask = df['filepath'].apply(os.path.exists)
missing = len(df.loc[~exists_mask])
print(f"Wrote {len(df):,} rows → ../ASVspoof2019_eval.csv")
print("Missing files:", missing)


Wrote 71,237 rows → ../ASVspoof2019_eval.csv
Missing files: 0


In [4]:
import os, pandas as pd

# ----------  where Kaggle mounted the three splits  -----------------
BASE = '/kaggle/input/asvpoof-2019-dataset/LA/LA'
SPLITS = {
    'train': ('ASVspoof2019_LA_train', 'ASVspoof2019.LA.cm.train.trn.txt'),
    'dev'  : ('ASVspoof2019_LA_dev',   'ASVspoof2019.LA.cm.dev.trl.txt'),
    'eval' : ('ASVspoof2019_LA_eval',  'ASVspoof2019.LA.cm.eval.trl.txt'),
}

# Absolute output path. The feature-extraction, distance and evaluation cells
# all read the manifest from exactly this location, so the write must not
# depend on the kernel's current working directory (a relative '../' only
# resolves to /kaggle/working while the cwd is the cloned repository).
OUT_CSV = '/kaggle/working/ASVspoof2019_global.csv'

rows = []
for ctx, (folder, proto) in SPLITS.items():
    audio_dir  = f'{BASE}/{folder}/flac'
    proto_file = f'{BASE}/ASVspoof2019_LA_cm_protocols/{proto}'

    with open(proto_file) as f:
        for line in f:
            fields = line.split()
            if not fields:                                # tolerate blank lines
                continue
            # Exactly five fields are expected; the 2nd is the file-ID and the
            # 5th the bonafide/spoof tag. Any other layout raises ValueError
            # instead of silently producing a wrong row.
            _, utt, _, _, tag = fields
            rows.append([
                utt,                                      # videoname
                f'{audio_dir}/{utt}.flac',                # filepath
                'asvspoof',                               # ← ONE global POI
                ctx,                                      # context
                0 if tag == 'bonafide' else 1,            # label
                1 if ctx == 'eval' else 0,                # in_tst
                1 if tag == 'bonafide' else 0,            # in_ref (bonafide)
            ])

cols = ['videoname','filepath','poi','context','label','in_tst','in_ref']
df   = pd.DataFrame(rows, columns=cols)

# The evaluation cells join scores back to this manifest on 'videoname';
# a duplicated id would silently duplicate score rows and skew EER/AUC.
assert df['videoname'].is_unique, "duplicate utterance ids across splits"

df.to_csv(OUT_CSV, index=False)

print(df.context.value_counts())           # sanity-check
print("Missing files:", (~df.filepath.apply(os.path.exists)).sum())
print("Reference rows all bonafide:",
      df.query('in_ref==1')['label'].unique())


context
eval     71237
train    25380
dev      24844
Name: count, dtype: int64
Missing files: 0
Reference rows all bonafide: [0]


In [None]:
# Run the repository's extract_features.py over the global manifest, loading the
# model_no_augmentation.th checkpoint on GPU 0 with batch size 64 and 4 workers.
# NOTE(review): --seconds 4 presumably sets the analysed clip length — confirm
# in extract_features.py.
!python /kaggle/working/PoIForensics-Audio/extract_features.py \
    --dataset-csv /kaggle/working/ASVspoof2019_global.csv \
    --dataset-name ASVspoof2019_global \
    --weights /kaggle/working/PoIForensics-Audio/checkpoints/model_no_augmentation.th \
    --batch-size 64 --seconds 4 --gpu 0 --num-workers 4


Device is cuda:0
Downloading: "https://download.pytorch.org/models/resnet50-11ad3fa6.pth" to /root/.cache/torch/hub/checkpoints/resnet50-11ad3fa6.pth
100%|███████████████████████████████████████| 97.8M/97.8M [00:00<00:00, 275MB/s]
Checkpoint Loaded!
Extracting features:   5%|▋             | 5504/121461 [08:14<2:49:17, 11.42it/s]

In [None]:
# List the CSV files directly under /kaggle/working with human-readable sizes.
!ls -lh /kaggle/working/*.csv


In [None]:
 # Run the repository's compute_distances.py on the global manifest with the "ms" strategy.
 # NOTE(review): the evaluation cells read scores/ASVspoof2019_global_ms.csv,
 # presumably the file this step writes — confirm in compute_distances.py.
 !python /kaggle/working/PoIForensics-Audio/compute_distances.py \
    --dataset-csv /kaggle/working/ASVspoof2019_global.csv \
    --dataset-name ASVspoof2019_global \
    --strategy ms

In [None]:
import pandas as pd

# Score table (its first column becomes the index) and the manifest it was computed from
scores_full = pd.read_csv(
    '/kaggle/working/PoIForensics-Audio/scores/ASVspoof2019_global_ms.csv',
    index_col=0,
)
manifest_full = pd.read_csv('/kaggle/working/ASVspoof2019_global.csv')

# Attach the ground-truth label to every score row by joining on 'videoname',
# then put 'videoname' back as the index.
merged_full = (
    scores_full.reset_index()
    .merge(manifest_full[['videoname', 'label']], on='videoname', how='left')
    .set_index('videoname')
)

print("✅ Merged full scores shape:", merged_full.shape)
print("Label distribution:\n", merged_full['label'].value_counts())


In [None]:
# Score table, indexed by its first column
scores_full = pd.read_csv(
    '/kaggle/working/PoIForensics-Audio/scores/ASVspoof2019_global_ms.csv',
    index_col=0,
)

# Manifest: show which columns are available before narrowing it down
manifest_full = pd.read_csv('/kaggle/working/ASVspoof2019_global.csv')
print("Manifest columns:", manifest_full.columns.tolist())

# Keep only the join key plus the label and split columns
manifest_full = manifest_full[['videoname', 'label', 'context']]

# Left-join the manifest columns onto the scores and re-index by 'videoname'
merged_full = pd.merge(
    scores_full.reset_index(), manifest_full, on='videoname', how='left'
).set_index('videoname')

# Show the resulting column names
print("Merged columns:", merged_full.columns.tolist())


In [None]:
import numpy as np
from sklearn.metrics import roc_curve, roc_auc_score

# Step 1: scores, indexed by their first column
scores_full = pd.read_csv(
    '/kaggle/working/PoIForensics-Audio/scores/ASVspoof2019_global_ms.csv',
    index_col=0,
)

# Step 2: manifest, reduced to the join key, the label and the split
manifest_full = pd.read_csv('/kaggle/working/ASVspoof2019_global.csv')[['videoname', 'label', 'context']]

# Step 3: left-join on 'videoname' and re-index
merged_full = pd.merge(
    scores_full.reset_index(), manifest_full, on='videoname', how='left'
).set_index('videoname')

print("✅ Merged columns:", merged_full.columns.tolist())

# Step 4: keep the evaluation split. 'context_y' is the right-hand (manifest)
# copy of 'context' produced by the merge's default suffixes.
is_eval = merged_full['context_y'] == 'eval'
merged_eval = merged_full[is_eval]

# Step 5: relabel so that 1 means bonafide and 0 means spoof
y_true_bona = (merged_eval['label'] == 0).astype(int).values

# Step 6: negate the distance so that a higher score means "more bonafide"
y_score = -merged_eval['value'].values

# Step 7: ROC curve, then EER at the point where FPR and FNR are closest, and AUC
fpr, tpr, _ = roc_curve(y_true_bona, y_score)
fnr = 1 - tpr
eer = fpr[np.nanargmin(np.abs(fpr - fnr))]
auc = roc_auc_score(y_true_bona, y_score)

print(f"✅ EER  = {eer*100:.4f}%")
print(f"✅ AUC  = {auc*100:.4f}%")


In [None]:
# ---------- PATCH compute_distances.py  ---------------------------------
import pathlib, re, shutil

SRC = pathlib.Path('/kaggle/working/PoIForensics-Audio/compute_distances.py')
DST = SRC.parent / 'compute_distances_check.py'

# 1️⃣ always start from the untouched original so re-running the cell is safe
code = SRC.read_text()

# 2️⃣ patch two spots:
#   a) drop the (df['context'] != context) filter
#   b) allow references from *all* contexts when concatenating
#
# NOTE(review): the original statement for (a) was cut off after `!=\s*con`.
# The end of the pattern (`context\)`) and the replacement text are
# reconstructed from the description in (a) — verify both against the actual
# line in compute_distances.py.
REF_FILTER = r"\(df\['in_ref'\]\s*==\s*1\)\s*&\s*\(df\['context'\]\s*!=\s*context\)"
code, n_filter = re.subn(REF_FILTER, "(df['in_ref'] == 1)", code)

CONCAT_OLD = "[k for k in list_dict if k != context]"
n_concat = code.count(CONCAT_OLD)
code = code.replace(
    CONCAT_OLD,
    "[k for k in list_dict]"  # keep every context
)

DST.write_text(code)

# A patch that matches nothing would produce a script identical to the
# original, and the "new" scores would then be misread as an improvement.
print(f"Replacements applied: ref-filter={n_filter}, concat={n_concat}")
if n_filter == 0 or n_concat == 0:
    print("⚠️ At least one patch pattern was not found; "
          "the written script may behave like the original.")

print("✅ Patched script written to:", DST)


In [None]:
# Run the patched copy (compute_distances_check.py) with the same arguments as the unpatched run.
# NOTE(review): --dataset-name and --strategy are unchanged, so this presumably
# overwrites the earlier scores file — confirm before comparing old and new metrics.
!python /kaggle/working/PoIForensics-Audio/compute_distances_check.py \
    --dataset-csv /kaggle/working/ASVspoof2019_global.csv \
    --dataset-name ASVspoof2019_global \
    --strategy ms


In [None]:
import pandas as pd, numpy as np
from sklearn.metrics import roc_curve, roc_auc_score

# Step 1: score table (first column as index) and the three manifest columns needed
scores_full = pd.read_csv('/kaggle/working/PoIForensics-Audio/scores/ASVspoof2019_global_ms.csv', index_col=0)
manifest = pd.read_csv('/kaggle/working/ASVspoof2019_global.csv', usecols=['videoname', 'label', 'context'])

# Step 2: left-join on 'videoname'; the manifest's split column arrives as 'context_y'
merged = pd.merge(
    scores_full.reset_index(), manifest, on='videoname', how='left'
).set_index('videoname')

# Step 3: restrict to the evaluation split
eval_set = merged.loc[merged['context_y'] == 'eval']

# Step 4: labels with 1 = bonafide, and scores where larger = more bonafide
y_true  = eval_set['label'].eq(0).astype(int).values
y_score = -eval_set['value'].values

# Step 5: EER (ROC point where FPR and FNR are closest) and AUC
fpr, tpr, _ = roc_curve(y_true, y_score)
fnr = 1 - tpr
eer = fpr[np.argmin(np.abs(fpr - fnr))]
auc = roc_auc_score(y_true, y_score)

print(f"✅ NEW EER  = {eer*100:.4f}%")
print(f"✅ NEW AUC  = {auc*100:.4f}%")


# HuBERT 

In [None]:
# Install torchaudio pinned to 2.1.2 for the HuBERT pipeline.
# NOTE(review): the setup cell upgrades torch/torchaudio without a version pin;
# confirm the installed torch is the release this torchaudio build expects.
!pip install --quiet torchaudio==2.1.2

import torch, torchaudio, os, glob, tqdm, numpy as np, pandas as pd
# Use the first GPU when one is visible; otherwise fall back to the CPU so the
# cell still runs (slowly) on a CPU-only session instead of failing in .to().
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Load Facebook HuBERT Base (pre-trained on LibriSpeech 960 h)
bundle = torchaudio.pipelines.HUBERT_BASE
hubert = bundle.get_model().to(device).eval()   # eval(): inference behaviour
print("HuBERT ⇒", bundle._params)


# Extracting file-level HuBERT embeddings

In [None]:
# Manifest to embed and the folder that receives one .npy per audio file
CSV = '/kaggle/working/ASVspoof2019_global.csv'
OUT_DIR = '/kaggle/working/hubert_feats'

# Every audio path listed in the manifest, in manifest order
df = pd.read_csv(CSV)
wav_list = list(df['filepath'])

os.makedirs(OUT_DIR, exist_ok=True)

def embed_one(path):
    """Compute one file-level HuBERT embedding and save it under OUT_DIR.

    The waveform is loaded, mixed down to mono when it has several channels,
    resampled to ``bundle.sample_rate`` if its rate differs, and passed
    through ``hubert.extract_features``. The last returned layer is averaged
    over time and written to ``OUT_DIR/<basename(path)>.npy``.

    Args:
        path: Audio file readable by ``torchaudio.load``.

    Returns:
        None. The embedding is written to disk as a side effect.
    """
    wav, sr = torchaudio.load(path)                 # (channels, time)
    if wav.size(0) > 1:
        # Each channel would otherwise be treated as its own batch item, and
        # the squeeze/mean below would no longer yield a single vector.
        wav = wav.mean(dim=0, keepdim=True)
    if sr != bundle.sample_rate:
        wav = torchaudio.functional.resample(wav, sr, bundle.sample_rate)
    with torch.no_grad():
        feat, _ = hubert.extract_features(wav.to(device))   # list of layer outputs
        x = feat[-1].squeeze(0).mean(0).cpu().numpy()        # time-averaged last layer
    out_f = os.path.join(OUT_DIR, os.path.basename(path) + '.npy')
    # Save under a temporary name and rename afterwards: an interrupted run
    # then never leaves a truncated file that an "already exists" resume
    # check would skip.
    tmp_f = out_f + '.tmp.npy'
    np.save(tmp_f, x)
    os.replace(tmp_f, out_f)

# Embed every manifest file; a file whose .npy is already present is skipped,
# which lets an interrupted run pick up where it stopped.
for p in tqdm.tqdm(wav_list, desc="HuBERT embedding"):
    out_f = os.path.join(OUT_DIR, f"{os.path.basename(p)}.npy")
    if not os.path.isfile(out_f):
        embed_one(p)


# Building train / dev (fit) and eval (test) matrices

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle

def load_split(ctx):
    """Return (X, y) for the manifest rows whose context equals ``ctx``.

    X stacks the saved ``<OUT_DIR>/<basename(filepath)>.npy`` vectors, one row
    per manifest row; y holds the matching labels (0 = bonafide, 1 = spoof).
    """
    subset = df.loc[df['context'] == ctx]
    vectors = []
    for p in subset['filepath']:
        vectors.append(np.load(f"{OUT_DIR}/{os.path.basename(p)}.npy"))
    return np.stack(vectors), subset['label'].values

# Load the three splits in one pass
(X_train, y_train), (X_dev, y_dev), (X_eval, y_eval) = [
    load_split(name) for name in ('train', 'dev', 'eval')
]

# train + dev together form the fitting set
X_fit = np.concatenate((X_train, X_dev), axis=0)
y_fit = np.concatenate((y_train, y_dev), axis=0)

# Shuffle the fitting set, then standardize with statistics estimated on it
# alone; the same transform is applied to the evaluation matrix.
X_fit, y_fit = shuffle(X_fit, y_fit, random_state=42)
scaler = StandardScaler()
scaler.fit(X_fit)
X_fit, X_eval = scaler.transform(X_fit), scaler.transform(X_eval)


# Train an RBF-kernel SVM

In [None]:
import time
from sklearn.svm import SVC
from sklearn.metrics import roc_curve, roc_auc_score

start = time.time()

# RBF-kernel SVM with class re-weighting. probability=True fits an internal
# cross-validated calibration whose data shuffling is random, so random_state
# is fixed to make the reported probabilities (and metrics) repeatable.
svm = SVC(
    C=10,
    gamma='scale',
    probability=True,
    class_weight='balanced',
    random_state=42,
    verbose=True        # ← prints iteration progress
)
svm.fit(X_fit, y_fit)

print(f"\n✅ Training finished in {(time.time() - start)/60:.1f} minutes")

# ---------- Evaluation ----------
# Look up which predict_proba column belongs to label 0 (bonafide) rather than
# assuming it is the first one.
bona_col = list(svm.classes_).index(0)
bonafide_prob = svm.predict_proba(X_eval)[:, bona_col]
y_true  = (y_eval == 0).astype(int)               # 1 = bonafide
y_score = bonafide_prob

# EER is read off the ROC curve where FPR and FNR (= 1 - TPR) are closest
fpr, tpr, _ = roc_curve(y_true, y_score)
eer = fpr[np.argmin(np.abs(fpr - (1 - tpr)))]
auc = roc_auc_score(y_true, y_score)

print(f"EER  = {eer*100:.3f}%")
print(f"AUC  = {auc*100:.3f}%")


# LightGBM

In [None]:
# Install LightGBM quietly; no version is pinned, so the latest release is used.
!pip install --quiet lightgbm

# Loading HuBERT embeddings

In [None]:
import os
import numpy as np
import pandas as pd

# Folder holding the saved .npy embeddings, and the manifest describing them
FEAT_DIR = '/kaggle/working/hubert_feats'
CSV = '/kaggle/working/ASVspoof2019_global.csv'

# Read the manifest into memory
df = pd.read_csv(filepath_or_buffer=CSV)

def load_features(context):
    """Return (X, y) for the manifest rows of the given split.

    X stacks the ``<FEAT_DIR>/<basename(filepath)>.npy`` arrays row by row;
    y is the matching label column (0 = bonafide, 1 = spoof).
    """
    in_split = df['context'] == context
    subset = df[in_split]
    npy_paths = (
        os.path.join(FEAT_DIR, os.path.basename(p) + '.npy')
        for p in subset['filepath']
    )
    X = np.stack([np.load(npy) for npy in npy_paths])
    y = subset['label'].values
    return X, y

# train and dev are used for fitting; eval is held out for testing
(X_train, y_train), (X_dev, y_dev), (X_eval, y_eval) = [
    load_features(name) for name in ('train', 'dev', 'eval')
]

# Stack train and dev row-wise into a single fitting set
import numpy as np
X_fit = np.concatenate([X_train, X_dev], axis=0)
y_fit = np.concatenate((y_train, y_dev), axis=0)


In [None]:
import lightgbm as lgb
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle

# Shuffle the fitting set, then standardize using statistics from it alone
X_fit, y_fit = shuffle(X_fit, y_fit, random_state=42)
scaler = StandardScaler()
scaler.fit(X_fit)
X_fit_scaled = scaler.transform(X_fit)
X_eval_scaled = scaler.transform(X_eval)

# Wrap the scaled fitting data for LightGBM
train_data = lgb.Dataset(data=X_fit_scaled, label=y_fit)

# Binary GBDT with AUC as the tracked metric and class-imbalance handling on
params = dict(
    objective='binary',
    metric='auc',
    is_unbalance=True,
    learning_rate=0.05,
    num_leaves=31,
    max_depth=-1,
    verbosity=-1,
    boosting_type='gbdt',
)

# Fit 100 boosting rounds
bst = lgb.train(params=params, train_set=train_data, num_boost_round=100)


In [None]:
# The model is trained with label 1 = spoof, so its output rises with "spoofness"
y_prob = bst.predict(X_eval_scaled)

# Flip both label and score so that 1 / high means bonafide
y_true = (y_eval == 0).astype(int)
y_score = 1 - y_prob

# EER is taken at the ROC point where FPR and FNR are closest; AUC over the same scores
fpr, tpr, _ = roc_curve(y_true, y_score)
fnr = 1 - tpr
eer = fpr[np.argmin(np.abs(fpr - fnr))]
auc = roc_auc_score(y_true, y_score)

print(f"✅ LightGBM + HuBERT → EER = {eer*100:.3f}%, AUC = {auc*100:.3f}%")


In [25]:
# Delete a previously created archive; stdout and stderr are discarded, so a missing file is silent.
!rm /kaggle/working/working_directory.zip > /dev/null 2>&1


In [26]:
# Archive everything under /kaggle/working, recursively and quietly, into working_directory.zip.
!zip -rq /kaggle/working/working_directory.zip /kaggle/working/*
