# In [1]:
# Directory that is walked recursively for video files.
root_folder = '1_video_input/'
# CSV file the collected per-video statistics are written to.
# The commented-out alternative below is presumably meant for a run over the
# original (unmanipulated) videos -- switch the two lines to change the target.
output_csv = 'video_statistics_manipulated.csv'
#output_csv = 'video_statistics_original.csv'



import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import cv2
import re

def extract_filename(file_path):
    """Return the name of a video file without directories or extension.

    The name is taken from the last path component. Both the forward slash
    and the backslash are treated as separators, so paths produced by
    ``os.path.join`` on Windows are handled as well. The extension must be
    one of the container formats collected by ``get_videos_info``:
    ``.mp4``, ``.mkv``, ``.avi`` or ``.mov`` (case-sensitive, like the
    extension filter there).

    Args:
        file_path: Path to a video file, e.g. ``"videos/002_006.mp4"``.

    Returns:
        The file name stem (e.g. ``"002_006"``), or ``None`` when the path
        does not end in one of the supported extensions.
    """
    match = re.search(r'([^/\\]+)\.(?:mp4|mkv|avi|mov)$', file_path)
    return match.group(1) if match else None

def check_sequence_type(s):
    """Classify a video name as manipulated, original or invalid.

    Args:
        s: Video name without extension, e.g. ``"002_006"`` or ``"002"``.
            Values that are not strings (for example ``None`` when no name
            could be extracted from a path) are classified as invalid
            instead of raising.

    Returns:
        ``"manipulated"`` for two three-digit groups joined by an underscore
        (``"002_006"``), ``"original"`` for a single three-digit group
        (``"002"``), and ``"invalid"`` for everything else.
    """
    if not isinstance(s, str):
        return "invalid"
    # fullmatch instead of match + '$': '$' also matches just before a
    # trailing newline, which would let "002\n" pass as a valid name.
    if re.fullmatch(r'\d{3}_\d{3}', s):
        return "manipulated"
    if re.fullmatch(r'\d{3}', s):
        return "original"
    return "invalid"
    
def get_video_info(file_path):
    """Read basic properties of one video file with OpenCV.

    Args:
        file_path: Path of the video file to inspect.

    Returns:
        A 6-tuple ``(filename, sequence_type, fps, duration, width, height)``.
        ``filename`` comes from ``extract_filename`` and ``sequence_type``
        from ``check_sequence_type``. ``duration`` is the frame count divided
        by the frame rate, or ``None`` when the reported frame rate is not
        positive. If the file cannot be opened or any step fails, the error
        is printed and ``(file_path, None, None, None, None, None)`` is
        returned instead, so one bad file does not abort a batch run.
    """
    cap = None
    try:
        cap = cv2.VideoCapture(file_path)
        if not cap.isOpened():
            raise Exception(f"Cannot open video file {file_path}")

        filename = extract_filename(file_path)
        sequence_type = check_sequence_type(filename)
        fps = cap.get(cv2.CAP_PROP_FPS)
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        duration = frame_count / fps if fps > 0 else None

        return filename, sequence_type, fps, duration, width, height
    except Exception as e:
        # Deliberate best-effort: report the failure and keep a placeholder row.
        print(f"Error processing {file_path}: {e}")
        return file_path, None, None, None, None, None
    finally:
        # Release the capture on every path, including the error paths above,
        # so failed files do not leave capture handles open.
        if cap is not None:
            cap.release()

def get_videos_info(root_folder):
    """Collect the info tuple of every video file below ``root_folder``.

    The folder is walked recursively; files whose names end in ``.mp4``,
    ``.mkv``, ``.avi`` or ``.mov`` are passed to ``get_video_info``.

    Args:
        root_folder: Directory to search.

    Returns:
        A list with one ``get_video_info`` result per matching file, in the
        order the files are encountered by ``os.walk``.
    """
    video_suffixes = ('.mp4', '.mkv', '.avi', '.mov')
    collected = []
    for current_dir, _subdirs, names in os.walk(root_folder):
        video_names = [name for name in names if name.endswith(video_suffixes)]
        collected.extend(
            get_video_info(os.path.join(current_dir, name))
            for name in video_names
        )
    return collected

def create_csv(video_info_list, output_csv):
    """Write the collected video info tuples to a CSV file.

    Args:
        video_info_list: Iterable of 6-item records in the order
            filename, type, FPS, duration, width, height.
        output_csv: Path of the CSV file to write; the row index is not
            written.
    """
    column_names = ['Filename', 'Type', 'FPS', 'Duration', 'Width', 'Height']
    table = pd.DataFrame(video_info_list, columns=column_names)
    table.to_csv(output_csv, index=False)

def plot_statistics(df):
    """Show box plots and print summary statistics for the video table.

    Args:
        df: DataFrame containing at least the columns ``Duration``,
            ``Width``, ``Height`` and ``FPS``. Rows with a missing value in
            any column are dropped before plotting.

    Side effects:
        Displays one figure with a 2x2 grid of box plots (one per column
        above) and prints a table with the mean, standard deviation, minimum
        and maximum of those columns.
    """
    df = df.dropna()

    # (column, subplot title) pairs in subplot order.
    panels = [
        ('Duration', 'Duration (seconds)'),
        ('Width', 'Width (pixels)'),
        ('Height', 'Height (pixels)'),
        ('FPS', 'FPS'),
    ]

    plt.figure(figsize=(10, 6))
    for position, (column, title) in enumerate(panels, start=1):
        plt.subplot(2, 2, position)
        plt.boxplot(df[column])
        plt.title(title)

    plt.tight_layout()
    plt.show()

    # Compute the statistics on the measured columns only. Label columns such
    # as 'Filename' and 'Type' hold strings, and DataFrame.mean()/std() raise
    # TypeError on string columns in pandas >= 2.0.
    numeric = df[[column for column, _title in panels]]
    stats_df = pd.DataFrame({
        'Mean': numeric.mean(),
        'Standard Deviation': numeric.std(),
        'Minimum': numeric.min(),
        'Maximum': numeric.max(),
    })
    print(stats_df)

# Scan the input folder, store the per-video statistics as CSV, then load the
# CSV back into a DataFrame for inspection.
video_info_list = get_videos_info(root_folder)
create_csv(video_info_list, output_csv)
# Read 'Filename' as text: a name such as "002" would otherwise be parsed as
# the integer 2 and lose its leading zeros.
df = pd.read_csv(output_csv, dtype={'Filename': str})
# Uncomment to display the box plots and print the summary statistics.
#plot_statistics(df)
