# Import filepaths

In [1]:
import os
import pandas as pd
import ast

# ---- Paths ----
# NOTE(review): both paths below are absolute and machine-specific; they must be
# edited before this notebook can run anywhere else.
# Input: the metadata CSV that every later cell in this section works from.
metadata_path = '/Users/abhishekgupte_macbookpro/PycharmProjects/project_combined_repo_clean/thesis_main_files/datasets/processed/csv_files/lav_df/metadata/metadata.csv'  # Adjust path as needed
# Output: folder that receives the subset and holdout CSVs written further down.
out_dir = "/Users/abhishekgupte_macbookpro/PycharmProjects/project_combined_repo_clean_preprocessing/files/csv_files/processed/video"       # separate folder for this notebook
# Create the output folder (and parents) if missing; no error when it already exists.
os.makedirs(out_dir, exist_ok=True)

# ---- Load CSV ----
# Read the whole metadata table into `df`; later cells add columns to this frame in place.
df = pd.read_csv(metadata_path)
print(f"‚úÖ Loaded metadata.csv with shape: {df.shape}")


‚úÖ Loaded metadata.csv with shape: (136304, 13)


# Parse list-like columns

In [2]:
# Columns whose CSV cells are passed through safe_literal_eval below to turn
# stringified Python literals back into Python objects.
list_columns = ["fake_periods", "timestamps"]

def safe_literal_eval(x):
    """Best-effort parse of a stringified Python literal read from a CSV cell.

    Args:
        x: Raw cell value: a string such as ``"[[0.5, 1.2]]"``, an already
            parsed list/tuple, or a missing value (``None`` / ``NaN``).

    Returns:
        ``x`` unchanged when it is already a list or tuple; ``[]`` for
        missing, blank, or unparseable values; otherwise whatever
        ``ast.literal_eval`` produces for the stripped string.
    """
    # Containers must be checked before pd.isna(): on a list, pd.isna() returns
    # an element-wise array whose truth value is ambiguous and raises
    # ValueError, so re-running the parsing cell on already-parsed data crashed.
    if isinstance(x, (list, tuple)):
        return x
    if pd.isna(x):
        return []
    text = str(x).strip()
    if text in ("", "[]", "None", "nan"):
        return []
    try:
        return ast.literal_eval(text)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # These are the errors ast.literal_eval is documented to raise on
        # malformed input; fall back to an empty list instead of crashing.
        return []

# Replace each stringified list column with its parsed values, in place.
for list_col in list_columns:
    parsed_values = df[list_col].apply(safe_literal_eval)
    df[list_col] = parsed_values

print("‚úÖ Parsed list-like columns:", list_columns)

‚úÖ Parsed list-like columns: ['fake_periods', 'timestamps']


# Ensure modify flags are boolean


In [3]:
# Cast BOTH modification flags to bool. Previously only modify_audio was cast,
# although modify_video is reported in the summary below and used alongside it.
# NOTE(review): astype(bool) maps NaN and any non-empty string (including
# "False") to True; this is only safe while the flags load as real
# booleans or 0/1 values. Confirm against the raw CSV.
df["modify_audio"] = df["modify_audio"].astype(bool)
df["modify_video"] = df["modify_video"].astype(bool)

# Ensure numeric fields are numeric
# n_fakes: unparseable values become NaN, are filled with 0, then cast to int.
df["n_fakes"] = pd.to_numeric(df["n_fakes"], errors="coerce").fillna(0).astype(int)
# duration: unparseable values become NaN and are left as NaN (no fill).
df["duration"] = pd.to_numeric(df["duration"], errors="coerce")

print("‚úÖ Dtypes set. Summary:")
print(df.dtypes[["modify_audio","modify_video","n_fakes","duration"]])


‚úÖ Dtypes set. Summary:
modify_audio       bool
modify_video       bool
n_fakes           int64
duration        float64
dtype: object


# Binary label: 1 = fake (>=1 segment), 0 = real


In [4]:
# 1 when the row reports at least one fake segment (n_fakes >= 1), otherwise 0.
df["label"] = (df["n_fakes"] >= 1).astype(int)


# A/V combination label e.g. A1_V0

In [5]:
# Build the "A<0|1>_V<0|1>" tag column-wise. The previous row-wise
# df.apply(..., axis=1) invoked a Python lambda once per row; casting each flag
# to int and then to str yields the same "0"/"1" digits for the whole column.
df["av_combo"] = (
    "A" + df["modify_audio"].astype(int).astype(str)
    + "_V" + df["modify_video"].astype(int).astype(str)
)

# Count of fake segments from fake_periods (0 for empty / falsy entries)
df["fake_segment_count"] = df["fake_periods"].apply(lambda x: len(x) if x else 0)

# Total fake segment length (seconds)
def compute_total_fake_length(fake_periods):
    """Add up the (end - start) span of every period in `fake_periods`.

    Each element must unpack into a (start, end) pair. Returns 0.0 when
    `fake_periods` is empty or otherwise falsy.
    """
    if fake_periods:
        total = 0
        for start, end in fake_periods:
            total = total + (end - start)
        return total
    return 0.0

# Per-row sum of the fake-period spans.
total_fake_lengths = df["fake_periods"].apply(compute_total_fake_length)
df["total_fake_length"] = total_fake_lengths

print("‚úÖ Engineered columns added: label, av_combo, fake_segment_count, total_fake_length")
# Preview the file name next to the engineered columns (last expression -> rich display).
preview_columns = ["file", "n_fakes", "label", "av_combo", "fake_segment_count", "total_fake_length"]
df[preview_columns].head()


‚úÖ Engineered columns added: label, av_combo, fake_segment_count, total_fake_length


Unnamed: 0,file,n_fakes,label,av_combo,fake_segment_count,total_fake_length
0,000001.mp4,0,0,A0_V0,0,0.0
1,000000.mp4,0,0,A0_V0,0,0.0
2,000002.mp4,1,1,A1_V1,1,0.724
3,000003.mp4,1,1,A0_V1,1,0.28
4,000004.mp4,1,1,A1_V0,1,0.704


# Focus only on rows that contain fake segments

In [6]:
# Boolean row mask: True where the row reports at least one fake segment.
is_fake_row = df["n_fakes"] >= 1

# Modality masks

In [7]:
# Modality masks: exactly one stream modified, or both.
audio_only = df["modify_audio"].eq(True) & df["modify_video"].eq(False)
video_only = df["modify_audio"].eq(False) & df["modify_video"].eq(True)
both_fake = df["modify_audio"].eq(True) & df["modify_video"].eq(True)

# Duration threshold and the two complementary duration masks.
# Rows with NaN duration are False in both masks.
thr = 7.5  # seconds
short = df["duration"].lt(thr)
long_ = df["duration"].ge(thr)

print("‚úÖ Masks prepared (fake-only, modality, duration).")

# Six fake-only subsets: (audio-only, video-only, both) x (< 7.5 s, >= 7.5 s).
# Iteration order reproduces the original key order: all "lt7p5" entries first,
# then all "ge7p5" entries, each in A_only / V_only / AV_both order.
subsets = {
    f"{modality_tag}_{duration_tag}": df.loc[is_fake_row & modality_mask & duration_mask, :].copy()
    for duration_tag, duration_mask in (("lt7p5", short), ("ge7p5", long_))
    for modality_tag, modality_mask in (
        ("A_only", audio_only),
        ("V_only", video_only),
        ("AV_both", both_fake),
    )
}

# Write every subset to "<out_dir>/<name>.csv" and remember its row count.
counts = {}
for subset_name, subset_df in subsets.items():
    subset_df.to_csv(os.path.join(out_dir, f"{subset_name}.csv"), index=False)  # keep all columns
    counts[subset_name] = len(subset_df)

print("‚úÖ Saved 6 fake-only subsets to:", out_dir)
for subset_name, n_rows in counts.items():
    print(f"  {subset_name}: {n_rows} rows")



# Draw a seeded holdout from each subset and write it next to the subset CSVs.
holdout_counts = {}
for subset_name, subset_df in subsets.items():
    n_rows = len(subset_df)
    if n_rows == 0:
        holdout = subset_df  # empty
    elif n_rows < 3:
        # tiny fallback to avoid empty holdouts
        holdout = subset_df.sample(n=1, random_state=42)
    else:
        holdout = subset_df.sample(frac=0.3, random_state=42)

    holdout.to_csv(os.path.join(out_dir, f"holdout_30pct_{subset_name}.csv"), index=False)
    holdout_counts[subset_name] = len(holdout)

print("‚úÖ Saved 30% holdouts per subset:")
for subset_name, n_holdout in holdout_counts.items():
    print(f"  holdout_30pct_{subset_name}: {n_holdout} rows")




‚úÖ Masks prepared (fake-only, modality, duration).
‚úÖ Saved 6 fake-only subsets to: /Users/abhishekgupte_macbookpro/PycharmProjects/project_combined_repo_clean_preprocessing/files/csv_files/processed/video
  A_only_lt7p5: 15578 rows
  V_only_lt7p5: 16699 rows
  AV_both_lt7p5: 15416 rows
  A_only_ge7p5: 17592 rows
  V_only_ge7p5: 16844 rows
  AV_both_ge7p5: 17744 rows
‚úÖ Saved 30% holdouts per subset:
  holdout_30pct_A_only_lt7p5: 4673 rows
  holdout_30pct_V_only_lt7p5: 5010 rows
  holdout_30pct_AV_both_lt7p5: 4625 rows
  holdout_30pct_A_only_ge7p5: 5278 rows
  holdout_30pct_V_only_ge7p5: 5053 rows
  holdout_30pct_AV_both_ge7p5: 5323 rows


# Extract 7,500 rows per subset for the final evaluation of the SSL trainer

## Import file paths

In [10]:
import os
import glob
import pandas as pd

# --- Parameters ---
# NOTE(review): this folder (".../video/fake_files") is not the out_dir the subset
# CSVs were written to earlier (".../video"); presumably the files were moved
# by hand. Confirm, or this cell's input will not exist on a fresh run.
input_root_dir = "/Users/abhishekgupte_macbookpro/PycharmProjects/project_combined_repo_clean_preprocessing/files/csv_files/processed/video/fake_files"  # CHANGE THIS to your folder with the CSVs
eval_subdir = "evaluate"  # name of the output subfolder created under input_root_dir
sample_size = 7500        # maximum rows sampled from each CSV
random_seed = 42          # random_state passed to DataFrame.sample

# --- Ensure evaluate subfolder exists ---
eval_dir = os.path.join(input_root_dir, eval_subdir)
os.makedirs(eval_dir, exist_ok=True)  # no error if the folder already exists

print(f"‚úì Input root: {input_root_dir}")
print(f"‚úì Evaluate out: {eval_dir}")


‚úì Input root: /Users/abhishekgupte_macbookpro/PycharmProjects/project_combined_repo_clean_preprocessing/files/csv_files/processed/video/fake_files
‚úì Evaluate out: /Users/abhishekgupte_macbookpro/PycharmProjects/project_combined_repo_clean_preprocessing/files/csv_files/processed/video/fake_files/evaluate


## Sample 7,500 rows from each FAKE CSV subset and save to evaluate/

In [11]:
# Collect the CSVs that sit directly in the input folder (the glob is not recursive).
csv_paths = sorted(glob.glob(os.path.join(input_root_dir, "*.csv")))
if csv_paths:
    print(f"Found {len(csv_paths)} CSV files.")
else:
    print("‚ö†Ô∏è No CSV files found in the input root directory.")

summary = []
for src_path in csv_paths:
    # Guard against re-sampling files that already live in evaluate/
    if os.path.dirname(src_path) == eval_dir:
        continue

    fname = os.path.basename(src_path)
    df_src = pd.read_csv(src_path)

    # Never request more rows than the source file holds.
    n = min(sample_size, len(df_src))
    if n == 0:
        print(f"‚Äî Skipping empty CSV: {fname}")
        summary.append((fname, 0))
        continue

    # Seeded sample, written under the same file name inside evaluate/.
    dst_path = os.path.join(eval_dir, fname)
    df_sampled = df_src.sample(n=n, random_state=random_seed)
    df_sampled.to_csv(dst_path, index=False)

    print(f"‚úì {fname}: sampled {n} ‚Üí {dst_path}")
    summary.append((fname, n))

print("\nSummary:")
for saved_name, saved_rows in summary:
    print(f"  {saved_name}: {saved_rows} rows saved to evaluate/")


Found 6 CSV files.
‚úì AV_both_ge7p5.csv: sampled 7500 ‚Üí /Users/abhishekgupte_macbookpro/PycharmProjects/project_combined_repo_clean_preprocessing/files/csv_files/processed/video/fake_files/evaluate/AV_both_ge7p5.csv
‚úì AV_both_lt7p5.csv: sampled 7500 ‚Üí /Users/abhishekgupte_macbookpro/PycharmProjects/project_combined_repo_clean_preprocessing/files/csv_files/processed/video/fake_files/evaluate/AV_both_lt7p5.csv
‚úì A_only_ge7p5.csv: sampled 7500 ‚Üí /Users/abhishekgupte_macbookpro/PycharmProjects/project_combined_repo_clean_preprocessing/files/csv_files/processed/video/fake_files/evaluate/A_only_ge7p5.csv
‚úì A_only_lt7p5.csv: sampled 7500 ‚Üí /Users/abhishekgupte_macbookpro/PycharmProjects/project_combined_repo_clean_preprocessing/files/csv_files/processed/video/fake_files/evaluate/A_only_lt7p5.csv
‚úì V_only_ge7p5.csv: sampled 7500 ‚Üí /Users/abhishekgupte_macbookpro/PycharmProjects/project_combined_repo_clean_preprocessing/files/csv_files/processed/video/fake_files/evaluate/V_on

## Sample up to 7,500 rows from the REAL hold-out CSV for each FAKE subset

### Import paths

In [12]:
import os
import glob
import pandas as pd

# --- Inputs ---
# Machine-specific absolute path; evaluate_dir and real_equiv_dir are derived from it.
input_root_dir = "/Users/abhishekgupte_macbookpro/PycharmProjects/project_combined_repo_clean_preprocessing/files/csv_files/processed/video/fake_files"  # folder that contains evaluate/
evaluate_dir   = os.path.join(input_root_dir, "evaluate")
real_equiv_dir = os.path.join(evaluate_dir, "real_file_equivalent")

# Real holdout CSV (absolute path)
# NOTE(review): no cell visible in this notebook writes this file; presumably it
# comes from a separate notebook. Confirm its provenance and that it only
# contains REAL rows.
real_holdout_path = "/Users/abhishekgupte_macbookpro/PycharmProjects/project_combined_repo_clean_preprocessing/files/csv_files/processed/video/holdout_30_percent_for_training.csv"

# Sampling settings
target_per_file = 7500  # upper bound on REAL rows sampled per evaluate file
random_seed     = 42
thr             = 7.5  # seconds

# --- Ensure output directory exists ---
os.makedirs(real_equiv_dir, exist_ok=True)

# --- Load REAL holdout once ---
df_real_holdout = pd.read_csv(real_holdout_path)
# Coerce duration to numeric; unparseable values become NaN.
df_real_holdout["duration"] = pd.to_numeric(df_real_holdout["duration"], errors="coerce")

print(f"‚úÖ Loaded REAL holdout: {df_real_holdout.shape} from:\n{real_holdout_path}")
print(f"üìÅ Evaluate directory: {evaluate_dir}")
print(f"üìÅ Real equivalents will be saved to: {real_equiv_dir}")


‚úÖ Loaded REAL holdout: (10929, 17) from:
/Users/abhishekgupte_macbookpro/PycharmProjects/project_combined_repo_clean_preprocessing/files/csv_files/processed/video/holdout_30_percent_for_training.csv
üìÅ Evaluate directory: /Users/abhishekgupte_macbookpro/PycharmProjects/project_combined_repo_clean_preprocessing/files/csv_files/processed/video/fake_files/evaluate
üìÅ Real equivalents will be saved to: /Users/abhishekgupte_macbookpro/PycharmProjects/project_combined_repo_clean_preprocessing/files/csv_files/processed/video/fake_files/evaluate/real_file_equivalent


### For each evaluate CSV, sample equivalent REAL rows (duration-matched)

In [13]:
def pick_duration_mask_from_filename(fname: str):
    """Read the duration bucket tag out of a file name (case-insensitive).

    Returns 'lt7p5' when only "_lt7p5" occurs in the name, 'ge7p5' when only
    "_ge7p5" occurs, and None when neither or both occur.
    """
    lowered = fname.lower()
    has_lt = "_lt7p5" in lowered
    has_ge = "_ge7p5" in lowered
    if has_lt == has_ge:
        # Neither tag present, or both present: the name is ambiguous.
        return None
    return "lt7p5" if has_lt else "ge7p5"

# Sorted list of the CSVs directly inside evaluate/, excluding anything whose
# parent folder is real_file_equivalent/.
eval_csvs = sorted(
    csv_path
    for csv_path in glob.glob(os.path.join(evaluate_dir, "*.csv"))
    if os.path.dirname(csv_path) != real_equiv_dir
)

if eval_csvs:
    print(f"Found {len(eval_csvs)} evaluate CSVs.")
else:
    print("‚ö†Ô∏è No evaluate CSVs found. Make sure you've created samples in the 'evaluate/' folder first.")

# For every FAKE evaluate CSV, write a duration-matched sample of REAL rows under
# the same file name in real_file_equivalent/. `summary` collects one tuple
# (file name, fake rows, real rows sampled, tag-or-status) per file.
summary = []
for eval_path in eval_csvs:
    fname = os.path.basename(eval_path)
    real_out_path = os.path.join(real_equiv_dir, fname)

    # Read current evaluate CSV (FAKE subset)
    df_eval = pd.read_csv(eval_path)
    if df_eval.empty:
        print(f"‚Äî Skipping empty evaluate file: {fname}")
        summary.append((fname, 0, 0, "empty_eval"))
        continue

    # Determine duration bucket (prefer filename tag; fallback to data-driven inference)
    tag = pick_duration_mask_from_filename(fname)
    if tag is None:
        # Fallback: infer by majority of durations in fake sample
        df_eval["duration"] = pd.to_numeric(df_eval["duration"], errors="coerce")
        share_lt = (df_eval["duration"] < thr).mean()
        tag = "lt7p5" if share_lt >= 0.5 else "ge7p5"
        print(f"‚ÑπÔ∏è  Inferred duration tag for {fname} by data: {tag} (lt-share={share_lt:.2f})")

    # Build duration-matched pool from REAL holdout (NaN durations match neither side)
    if tag == "lt7p5":
        df_real_pool = df_real_holdout[df_real_holdout["duration"] < thr]
    else:
        df_real_pool = df_real_holdout[df_real_holdout["duration"] >= thr]

    # Determine how many rows to sample: match evaluate count up to target_per_file,
    # but never more than the real pool holds
    n_eval = len(df_eval)
    n_target = min(target_per_file, n_eval, len(df_real_pool))

    if n_target == 0:
        print(f"‚ö†Ô∏è Not enough REAL rows to match for {fname} with tag {tag}. Skipping.")
        summary.append((fname, n_eval, 0, "no_real_available"))
        continue

    # Use the configured random_seed instead of a repeated literal 42, so changing
    # the seed in the settings cell actually changes this sample.
    # NOTE(review): files sharing a duration tag and row count draw from the same
    # pool with the same seed, so they receive identical REAL rows. Confirm
    # this is intended.
    df_real_sample = df_real_pool.sample(n=n_target, random_state=random_seed)
    df_real_sample.to_csv(real_out_path, index=False)

    print(f"‚úÖ {fname}: fake={n_eval}, real_sampled={n_target} ‚Üí {real_out_path}")
    summary.append((fname, n_eval, n_target, tag))

# One aligned line per evaluate file: fake row count, REAL rows sampled, tag/status.
print("\nüìä Matching summary (per evaluate file):")
for file_name, fake_rows, real_rows, bucket in summary:
    print(f"  {file_name:40s}  | fake_rows={fake_rows:6d} | real_rows={real_rows:6d} | tag={bucket}")


Found 6 evaluate CSVs.
‚úÖ AV_both_ge7p5.csv: fake=7500, real_sampled=3843 ‚Üí /Users/abhishekgupte_macbookpro/PycharmProjects/project_combined_repo_clean_preprocessing/files/csv_files/processed/video/fake_files/evaluate/real_file_equivalent/AV_both_ge7p5.csv
‚úÖ AV_both_lt7p5.csv: fake=7500, real_sampled=7086 ‚Üí /Users/abhishekgupte_macbookpro/PycharmProjects/project_combined_repo_clean_preprocessing/files/csv_files/processed/video/fake_files/evaluate/real_file_equivalent/AV_both_lt7p5.csv
‚úÖ A_only_ge7p5.csv: fake=7500, real_sampled=3843 ‚Üí /Users/abhishekgupte_macbookpro/PycharmProjects/project_combined_repo_clean_preprocessing/files/csv_files/processed/video/fake_files/evaluate/real_file_equivalent/A_only_ge7p5.csv
‚úÖ A_only_lt7p5.csv: fake=7500, real_sampled=7086 ‚Üí /Users/abhishekgupte_macbookpro/PycharmProjects/project_combined_repo_clean_preprocessing/files/csv_files/processed/video/fake_files/evaluate/real_file_equivalent/A_only_lt7p5.csv
‚úÖ V_only_ge7p5.csv: fake=7500, 