# Load & structure metadata

In [1]:
import pandas as pd
import ast
import os

# --- Read the dataset metadata table into memory ---
# NOTE: machine-specific absolute path; point it at your local copy of metadata.csv.
metadata_path = '/Users/abhishekgupte_macbookpro/PycharmProjects/project_combined_repo_clean_preprocessing/files/metadata/metadata.csv'
df = pd.read_csv(filepath_or_buffer=metadata_path)

# Report (rows, columns) so a wrong or truncated file is noticed right away.
print(f"✅ Loaded metadata.csv with shape: {df.shape}")


✅ Loaded metadata.csv with shape: (136304, 13)


# Parse list columns

In [2]:
# --- Convert stringified lists into actual lists ---
list_columns = ['fake_periods', 'timestamps']


def _parse_list_cell(value):
    """Decode one CSV cell holding a Python-literal list.

    Values that are already a list/tuple are returned unchanged so this cell
    can be re-executed safely; without that guard a second run would hand
    parsed lists to ``ast.literal_eval`` and fail with ``ValueError``.
    Anything else (strings, and unexpected types such as NaN) is passed to
    ``ast.literal_eval`` exactly as before, so malformed data still raises.
    """
    if isinstance(value, (list, tuple)):
        return value
    return ast.literal_eval(value)


for col in list_columns:
    df[col] = df[col].apply(_parse_list_cell)

print(f"✅ Parsed list-like columns: {list_columns}")


✅ Parsed list-like columns: ['fake_periods', 'timestamps']


# Ensure correct data types

In [3]:
# --- Coerce the manipulation flags to real booleans ---
bool_columns = ['modify_video', 'modify_audio']
for flag_col in bool_columns:
    df[flag_col] = df[flag_col].astype(bool)

# --- Show the resulting dtypes as a sanity check ---
print("✅ Column types after conversion:")
print(df.dtypes[bool_columns])


✅ Column types after conversion:
modify_video    bool
modify_audio    bool
dtype: object


# Create new labels and columns

In [5]:
# Binary label for fake videos (1 = Fake, 0 = Real).
# Vectorized comparison; missing n_fakes compares False and therefore maps to 0,
# matching the previous per-element lambda.
df['label'] = (df['n_fakes'] >= 1).astype('int64')

# Audio/video manipulation combo such as "A1_V0" (audio flag = 1, video flag = 0).
# Built with column-wise string concatenation instead of a row-wise
# df.apply(axis=1), which constructs a Series object for every row.
df['av_combo'] = (
    'A' + df['modify_audio'].astype(int).astype(str)
    + '_V' + df['modify_video'].astype(int).astype(str)
)

# Number of entries in fake_periods; empty/None values count as 0.
df['fake_segment_count'] = df['fake_periods'].apply(lambda x: len(x) if x else 0)
def compute_total_fake_length(fake_periods):
    """Sum the lengths (end - start) of all (start, end) pairs in ``fake_periods``.

    Args:
        fake_periods: iterable of two-element (start, end) pairs, or an
            empty/None value when there are no fake segments.

    Returns:
        The accumulated ``end - start`` over every pair, or ``0.0`` when
        ``fake_periods`` is empty or otherwise falsy.
    """
    if fake_periods:
        total = 0
        for start, end in fake_periods:
            total += end - start
        return total
    return 0.0

# Per-video sum of fake segment lengths, computed element-wise from fake_periods.
df['total_fake_length'] = df['fake_periods'].map(compute_total_fake_length)


# Quick verification

In [6]:
# Show the first rows of the identifying and newly derived columns.
preview_columns = [
    'file', 'n_fakes', 'modified',
    'av_combo', 'fake_segment_count', 'total_fake_length',
]
print(df[preview_columns].head(5))


         file  n_fakes  modified av_combo  fake_segment_count  \
0  000001.mp4        0         0    A0_V0                   0   
1  000000.mp4        0         0    A0_V0                   0   
2  000002.mp4        1         1    A1_V1                   1   
3  000003.mp4        1         1    A0_V1                   1   
4  000004.mp4        1         1    A1_V0                   1   

   total_fake_length  
0              0.000  
1              0.000  
2              0.724  
3              0.280  
4              0.704  


# Save a random 70% sample of REAL videos for SSL training

In [7]:
# --- Parameters ---
# NOTE: machine-specific absolute path; adjust as needed.
output_sampled_path = '/Users/abhishekgupte_macbookpro/PycharmProjects/project_combined_repo_clean_preprocessing/files/csv_files/processed/video/data/possible_training_sample.csv'

# --- Keep only rows flagged as unmodified (modified == 0) ---
real_mask = df['modified'] == 0
df_real = df[real_mask]

# --- Draw a reproducible random 70% subset (fixed seed) ---
df_real_sampled = df_real.sample(frac=0.7, random_state=42)

# --- Write the subset to disk, creating the target folder if it is missing ---
output_dir = os.path.dirname(output_sampled_path)
os.makedirs(output_dir, exist_ok=True)
df_real_sampled.to_csv(output_sampled_path, index=False)

print(f"✅ Saved sampled real videos (70% of {len(df_real)}) to {output_sampled_path}")


✅ Saved sampled real videos (70% of 36431) to /Users/abhishekgupte_macbookpro/PycharmProjects/project_combined_repo_clean_preprocessing/files/csv_files/processed/video/data/possible_training_sample.csv
