# Imports and file paths

In [None]:
import os
import pandas as pd
import ast

# ---- Paths ----
# Input: the metadata table (edit this if the CSV lives somewhere else).
metadata_path = "metadata.csv"
# Output: a folder reserved for this notebook; created on first run.
out_dir = "data/filters_fake_only"
os.makedirs(out_dir, exist_ok=True)

# ---- Load CSV ----
# Read the whole metadata table into memory and report its (rows, columns) shape.
df = pd.read_csv(metadata_path)
print(f"✅ Loaded metadata.csv with shape: {df.shape}")


# Parse list-like columns

In [None]:
# Columns whose cells hold list literals and must be parsed after loading the CSV.
# (Name fixed from the misspelled `ist_columns`; the parsing loop reads `list_columns`.)
list_columns = ["fake_periods", "timestamps"]

def safe_literal_eval(x):
    """Parse a cell containing a Python literal (typically a list) into an object.

    Args:
        x: A raw cell value: a string such as "[[0.5, 1.5]]", a missing value
            (None / NaN), or an already-parsed list/tuple.

    Returns:
        The list/tuple unchanged if ``x`` already is one; ``[]`` for missing,
        blank, or placeholder text ("", "[]", "None", "nan") and for text that
        cannot be parsed; otherwise the result of ``ast.literal_eval``.
    """
    # Containers must be checked before pd.isna(): on a list, pd.isna() returns
    # an element-wise array whose truth value is ambiguous, so an already-parsed
    # value (e.g. when this cell is run twice) would raise instead of passing through.
    if isinstance(x, (list, tuple)):
        return x
    if pd.isna(x):
        return []
    text = str(x).strip()
    if text in ("", "[]", "None", "nan"):
        return []
    try:
        return ast.literal_eval(text)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # Best effort: these are the errors literal_eval documents for malformed
        # input; fall back to an empty list rather than aborting the whole load.
        return []

# Replace each list-like column's raw cell values with parsed Python objects,
# one column at a time.
for column_name in list_columns:
    raw_cells = df[column_name]
    df[column_name] = raw_cells.apply(safe_literal_eval)

print("✅ Parsed list-like columns:", list_columns)

# Ensure modify flags are boolean


In [None]:
# Cast BOTH modality flags to bool. Previously only modify_audio was cast,
# although modify_video is reported in the dtype summary below alongside it.
# NOTE(review): astype(bool) maps NaN and any non-empty string to True;
# confirm the CSV stores clean True/False (or 0/1) values in these columns.
df["modify_audio"] = df["modify_audio"].astype(bool)
df["modify_video"] = df["modify_video"].astype(bool)

# Ensure numeric fields are numeric
# n_fakes: unparseable values become 0; duration: unparseable values become NaN.
df["n_fakes"]  = pd.to_numeric(df["n_fakes"], errors="coerce").fillna(0).astype(int)
df["duration"] = pd.to_numeric(df["duration"], errors="coerce")

print("✅ Dtypes set. Summary:")
print(df.dtypes[["modify_audio","modify_video","n_fakes","duration"]])


# Binary modified label: 1 = fake (>=1 segment), 0 = real


In [None]:
# 1 when the row has at least one fake segment, otherwise 0.
df["modified"] = df["n_fakes"].ge(1).astype(int)


In [None]:
# A/V combination label e.g. A1_V0

In [None]:
# Build the A/V label column-wise instead of with a row-wise df.apply(axis=1):
# each flag is converted to 0/1 once per column and the pieces are concatenated
# as strings. The labels are the same (e.g. "A1_V0"), and the expression also
# works when the frame has no rows.
df["av_combo"] = (
    "A" + df["modify_audio"].astype(int).astype(str)
    + "_V" + df["modify_video"].astype(int).astype(str)
)

# Count of fake segments from fake_periods (empty / falsy cells count as 0)
df["fake_segment_count"] = df["fake_periods"].apply(lambda x: len(x) if x else 0)

# Total fake segment length (seconds)
def compute_total_fake_length(fake_periods):
    """Add up the lengths (end - start) of every (start, end) pair.

    Returns 0.0 when ``fake_periods`` is empty or otherwise falsy.
    """
    if fake_periods:
        running_total = 0
        for segment_start, segment_end in fake_periods:
            running_total = running_total + (segment_end - segment_start)
        return running_total
    return 0.0

# Per-row total duration covered by fake segments.
fake_period_cells = df["fake_periods"]
df["total_fake_length"] = fake_period_cells.apply(compute_total_fake_length)

print("✅ Engineered columns added: modified, av_combo, fake_segment_count, total_fake_length")

# Preview the file identifier next to the engineered columns; as the last
# expression of the cell, the notebook renders it.
preview_columns = [
    "file",
    "n_fakes",
    "modified",
    "av_combo",
    "fake_segment_count",
    "total_fake_length",
]
df[preview_columns].head()


In [None]:
# Focus only on rows that contain fake segments

In [None]:
# Boolean mask: True for rows with at least one fake segment.
is_fake_row = df["n_fakes"].ge(1)

In [None]:
# Modality masks

In [None]:
# Modality masks: exactly one flag set (audio-only / video-only) or both set.
audio_only = df["modify_audio"].eq(True) & df["modify_video"].eq(False)
video_only = df["modify_audio"].eq(False) & df["modify_video"].eq(True)
both_fake = df["modify_audio"].eq(True) & df["modify_video"].eq(True)

# Duration threshold and masks
# Rows whose duration is NaN compare False on both sides, so they fall into
# neither the short nor the long mask.
thr = 7.5  # seconds
short = df["duration"].lt(thr)
long_ = df["duration"].ge(thr)

print("✅ Masks prepared (fake-only, modality, duration).")

# Modality x duration masks, keyed by the output file stem.
subset_masks = {
    # < 7.5s
    "A_only_lt7p5": audio_only & short,
    "V_only_lt7p5": video_only & short,
    "AV_both_lt7p5": both_fake & short,
    # ≥ 7.5s
    "A_only_ge7p5": audio_only & long_,
    "V_only_ge7p5": video_only & long_,
    "AV_both_ge7p5": both_fake & long_,
}

# Restrict every mask to fake rows and take an independent copy of the selection.
subsets = {
    subset_name: df.loc[is_fake_row & subset_mask, :].copy()
    for subset_name, subset_mask in subset_masks.items()
}

# Write one CSV per subset (all columns kept), then record the row counts.
for subset_name, subset_df in subsets.items():
    subset_df.to_csv(os.path.join(out_dir, f"{subset_name}.csv"), index=False)
counts = {subset_name: len(subset_df) for subset_name, subset_df in subsets.items()}

print("✅ Saved 6 fake-only subsets to:", out_dir)
for subset_name, n_rows in counts.items():
    print(f"  {subset_name}: {n_rows} rows")



# Draw a reproducible holdout from every subset and save it next to the subset CSVs.
holdout_counts = {}
for subset_name, subset_df in subsets.items():
    n_available = len(subset_df)
    if n_available == 0:
        # Nothing to sample from; the (empty) frame is written as-is.
        holdout = subset_df
    elif n_available < 3:
        # tiny fallback to avoid empty holdouts
        holdout = subset_df.sample(n=1, random_state=42)
    else:
        holdout = subset_df.sample(frac=0.3, random_state=42)

    holdout.to_csv(
        os.path.join(out_dir, f"holdout_30pct_{subset_name}.csv"),
        index=False,
    )
    holdout_counts[subset_name] = len(holdout)

print("✅ Saved 30% holdouts per subset:")
for subset_name, n_held_out in holdout_counts.items():
    print(f"  holdout_30pct_{subset_name}: {n_held_out} rows")


