# Load & structure metadata

In [2]:
import pandas as pd
import ast
import os

# --- Read the metadata CSV into a DataFrame ---
# NOTE(review): this is an absolute, machine-specific path; edit it to match
# the local checkout before running.
metadata_path = '/Users/abhishekgupte_macbookpro/PycharmProjects/project_combined_repo_clean/thesis_main_files/datasets/processed/csv_files/lav_df/metadata/metadata.csv'  # Adjust path as needed
df = pd.read_csv(filepath_or_buffer=metadata_path)

# Report (rows, columns) as a quick check that the load succeeded.
print(f"✅ Loaded metadata.csv with shape: {df.shape}")


✅ Loaded metadata.csv with shape: (136304, 13)


# Parse list columns

In [3]:
# --- Convert stringified lists into actual lists ---
# Columns whose CSV cells hold Python-literal lists serialised as text.
list_columns = ['fake_periods', 'timestamps']


def _parse_list_cell(value):
    """Parse one cell of a list-like column.

    Values that are already a list or tuple are returned unchanged, so
    re-running this cell on an already-parsed DataFrame is a no-op instead of
    raising ``ValueError`` from ``ast.literal_eval``. Every other value
    (normally the raw CSV string) is parsed with ``ast.literal_eval``, which
    still raises on malformed input.
    """
    if isinstance(value, (list, tuple)):
        return value
    return ast.literal_eval(value)


for col in list_columns:
    df[col] = df[col].apply(_parse_list_cell)

print(f"✅ Parsed list-like columns: {list_columns}")


✅ Parsed list-like columns: ['fake_periods', 'timestamps']


# Ensure correct data types

In [4]:
# --- Cast the manipulation flags to real booleans ---
# NOTE(review): astype(bool) uses Python truthiness, so a non-empty string
# such as "False" or a NaN would become True. Presumably read_csv already
# parsed these columns as booleans - confirm against the raw CSV.
bool_columns = ['modify_video', 'modify_audio']
for bool_col in bool_columns:
    df[bool_col] = df[bool_col].astype(bool)

# --- Show the resulting dtypes for the two flag columns ---
print("✅ Column types after conversion:")
print(df.dtypes.loc[bool_columns])


✅ Column types after conversion:
modify_video    bool
modify_audio    bool
dtype: object


# Create new labels and columns

In [5]:
# Binary label for fake videos (1 = Fake, 0 = Real).
# Vectorised comparison instead of calling a Python lambda per element.
df['label'] = (df['n_fakes'] >= 1).astype(int)

# Audio/video manipulation combo, e.g. "A1_V0" = audio modified, video not.
# Built with column-wise string concatenation instead of a row-wise
# df.apply(axis=1), which calls into Python once per row.
df['av_combo'] = (
    'A' + df['modify_audio'].astype(int).astype(str)
    + '_V' + df['modify_video'].astype(int).astype(str)
)

# Number of fake segments per video; an empty or None entry counts as 0.
df['fake_segment_count'] = df['fake_periods'].apply(lambda x: len(x) if x else 0)
def compute_total_fake_length(fake_periods):
    """Return the summed duration of all fake segments.

    Parameters
    ----------
    fake_periods : sequence of (start, end) pairs, or a falsy value
        Each pair gives the start and end of one fake segment.

    Returns
    -------
    The sum of ``end - start`` over all pairs, or ``0.0`` when
    ``fake_periods`` is empty or otherwise falsy (e.g. ``None``).
    """
    if fake_periods:
        durations = (stop - begin for begin, stop in fake_periods)
        return sum(durations)
    return 0.0

# Total fake duration per video: sum of (end - start) over its fake_periods.
df['total_fake_length'] = df['fake_periods'].map(compute_total_fake_length)


# Quick verification

In [7]:
# Show the first rows of the source and derived columns side by side.
preview_columns = ['file', 'n_fakes', 'label', 'av_combo', 'fake_segment_count', 'total_fake_length']
print(df.loc[:, preview_columns].head())


         file  n_fakes  label av_combo  fake_segment_count  total_fake_length
0  000001.mp4        0      0    A0_V0                   0              0.000
1  000000.mp4        0      0    A0_V0                   0              0.000
2  000002.mp4        1      1    A1_V1                   1              0.724
3  000003.mp4        1      1    A0_V1                   1              0.280
4  000004.mp4        1      1    A1_V0                   1              0.704


# Sample 70% of the REAL videos and save them for SSL training

In [19]:
# --- Output location for the sampled subset ---
output_sampled_path = '/Users/abhishekgupte_macbookpro/PycharmProjects/project_combined_repo_clean_preprocessing/files/csv_files/processed/video/possible_training_sample.csv'  # Adjust as needed

# --- Keep only real videos (label == 0) ---
is_real = df['label'].eq(0)
df_real = df.loc[is_real]

# --- Draw a reproducible random 70% of the real videos ---
df_real_sampled = df_real.sample(frac=0.7, random_state=42)

# --- Write the sample, creating the target directory first if needed ---
output_dir = os.path.dirname(output_sampled_path)
os.makedirs(output_dir, exist_ok=True)
df_real_sampled.to_csv(output_sampled_path, index=False)

print(f"✅ Saved sampled real videos (70% of {len(df_real)}) to {output_sampled_path}")


✅ Saved sampled real videos (70% of 36431) to /Users/abhishekgupte_macbookpro/PycharmProjects/project_combined_repo_clean_preprocessing/files/csv_files/processed/video/possible_training_sample.csv


# Split the 70% REAL sample into two halves and save each

In [21]:
import pandas as pd
# --- Reload the sampled set written by the sampling step ---
imput_sampled_data = '/Users/abhishekgupte_macbookpro/PycharmProjects/project_combined_repo_clean_preprocessing/files/csv_files/processed/video/possible_training_sample.csv'  # Adjust as needed
df_real_sampled = pd.read_csv(imput_sampled_data)

# --- Destination files for the two halves ---
half_1_path = '/Users/abhishekgupte_macbookpro/PycharmProjects/project_combined_repo_clean_preprocessing/files/csv_files/processed/video/sample_real_70_percent_half1.csv'
half_2_path = '/Users/abhishekgupte_macbookpro/PycharmProjects/project_combined_repo_clean_preprocessing/files/csv_files/processed/video/sample_real_70_percent_half2.csv'

# --- Size of the first half (floor division: any odd row goes to half 2) ---
half_sample_size = len(df_real_sampled) // 2

# --- Shuffle with a fixed seed, then cut at the midpoint ---
df_real_sampled_shuffled = (
    df_real_sampled
    .sample(frac=1.0, random_state=123)
    .reset_index(drop=True)
)
df_half1, df_half2 = (
    df_real_sampled_shuffled.iloc[:half_sample_size],
    df_real_sampled_shuffled.iloc[half_sample_size:],
)

# --- Write both halves, then report their sizes ---
df_half1.to_csv(half_1_path, index=False)
df_half2.to_csv(half_2_path, index=False)

print(f"✅ Saved first 50% to: {half_1_path} ({len(df_half1)} rows)")
print(f"✅ Saved second 50% to: {half_2_path} ({len(df_half2)} rows)")

✅ Saved first 50% to: /Users/abhishekgupte_macbookpro/PycharmProjects/project_combined_repo_clean_preprocessing/files/csv_files/processed/video/sample_real_70_percent_half1.csv (12751 rows)
✅ Saved second 50% to: /Users/abhishekgupte_macbookpro/PycharmProjects/project_combined_repo_clean_preprocessing/files/csv_files/processed/video/sample_real_70_percent_half2.csv (12751 rows)


# Save 30% hold-out set

In [24]:
# --- Path for holdout set ---
# NOTE(review): holdout_path is defined here but nothing in this cell writes
# df_real_holdout to it - confirm the save happens in a later cell.
holdout_path = 'data/holdout_30_percent_for_training.csv'

# --- 'file' is used as the join key below, so it must be unique in df_real ---
assert df_real['file'].is_unique, "The 'file' column should be unique for this operation."

# --- Holdout = real videos whose 'file' is not in the sampled set ---
in_sampled_set = df_real['file'].isin(df_real_sampled['file'])
df_real_holdout = df_real.loc[~in_sampled_set]

# --- Sanity check: sample and holdout together must cover df_real exactly ---
assert len(df_real_holdout) + len(df_real_sampled) == len(df_real), "Mismatch in split sizes!"

