# In [None]:
import os
import cv2
import numpy as np
import pandas as pd
import gc  # Import garbage collector

def _count_frames(video_path):
    """Return how many frames of ``video_path`` can be grabbed sequentially.

    Raises:
        ValueError: If the video file cannot be opened.
    """
    video = cv2.VideoCapture(video_path)
    try:
        if not video.isOpened():
            raise ValueError(f"Could not open video file: {video_path}")
        count = 0
        # grab() advances to the next frame without handing the image to Python,
        # so no frame data is accumulated while counting.
        while video.grab():
            count += 1
        return count
    finally:
        video.release()


def _read_frames_at(video_path, wanted_indices):
    """Return ``{index: frame}`` for the sequential frame indices requested.

    Only the requested frames are retrieved; reading stops once the highest
    requested index has been passed. Indices whose frame could not be
    retrieved are simply absent from the result.
    """
    wanted = set(wanted_indices)
    last_wanted = max(wanted)
    picked = {}
    video = cv2.VideoCapture(video_path)
    try:
        index = 0
        while index <= last_wanted and video.grab():
            if index in wanted:
                success, frame = video.retrieve()
                if success:
                    picked[index] = frame
            index += 1
    finally:
        video.release()
    return picked


def create_high_quality_collage(video_path, collage_path, num_key_frames=80, resize_factor=1.0, final_resize_factor=1.0):
    """Build a grid collage of evenly spaced frames from a video and save it.

    The video is read in two passes: the first counts the frames, the second
    retrieves only the selected key frames. This keeps memory usage bounded by
    ``num_key_frames`` instead of the full length of the video.

    Args:
        video_path: Path of the video file to read.
        collage_path: Path the collage image is written to.
        num_key_frames: Number of frames placed in the collage. If the video
            has fewer frames, some frames appear more than once.
        resize_factor: Scale applied to every key frame before it is placed.
            ``None`` or a non-positive value leaves the frames unscaled.
        final_resize_factor: Scale applied to the assembled collage.
            ``None`` or a non-positive value leaves the collage unscaled.

    Raises:
        ValueError: If ``num_key_frames`` is smaller than 1, the video cannot
            be opened, it yields no frames, or a selected frame cannot be
            retrieved.
        OSError: If the collage image could not be written.
    """
    if num_key_frames < 1:
        raise ValueError("num_key_frames must be at least 1")

    try:
        # Number of frames in the video
        num_frames = _count_frames(video_path)
        print(f'Number of frames in the video: {num_frames}')
        if num_frames == 0:
            raise ValueError(f"No frames could be read from video: {video_path}")

        # Select key frames evenly spaced
        frame_indices = np.linspace(0, num_frames - 1, num_key_frames, dtype=int).tolist()
        decoded = _read_frames_at(video_path, frame_indices)
        missing = sorted(set(frame_indices) - set(decoded))
        if missing:
            raise ValueError(f"Could not decode frames {missing} from video: {video_path}")
        # Indices may repeat when the video is shorter than num_key_frames.
        key_frames = [decoded[i] for i in frame_indices]

        # Calculate the grid: roughly square, with enough columns for all frames
        rows = int(np.sqrt(num_key_frames))
        cols = (num_key_frames + rows - 1) // rows

        if resize_factor is not None and resize_factor > 0:
            # Resize frames if resize_factor is provided
            key_frames = [
                cv2.resize(frame, (0, 0), fx=resize_factor, fy=resize_factor, interpolation=cv2.INTER_CUBIC)
                for frame in key_frames
            ]

        # Get frame dimensions (all frames are assumed to share the first frame's size)
        frame_height, frame_width, _ = key_frames[0].shape

        # Create a blank (black) canvas for the collage
        collage = np.zeros((rows * frame_height, cols * frame_width, 3), dtype=np.uint8)

        # Place each frame into the collage, filling row by row
        for position, frame in enumerate(key_frames):
            top = (position // cols) * frame_height
            left = (position % cols) * frame_width
            collage[top:top + frame_height, left:left + frame_width, :] = frame

        # Resize the final collage, guarded the same way as the per-frame resize
        if final_resize_factor is not None and final_resize_factor > 0:
            collage = cv2.resize(collage, (0, 0), fx=final_resize_factor, fy=final_resize_factor, interpolation=cv2.INTER_CUBIC)

        # Save the collage image with maximum JPEG quality; imwrite reports
        # failure through its return value rather than by raising.
        if not cv2.imwrite(collage_path, collage, [int(cv2.IMWRITE_JPEG_QUALITY), 100]):
            raise OSError(f"Could not write collage image: {collage_path}")
    finally:
        # Locals are dropped when the function returns; the explicit collection
        # is kept so large frame buffers are reclaimed between videos.
        gc.collect()

def create_prompt(row):
    """Return the video-description prompt text for one row.

    Reads ``row['video_title']`` and ``row['transcriptions']`` and embeds both
    in a fixed instruction text. Continuation lines of the prompt carry a
    12-space indent and the text ends with a single trailing space.
    """
    indent = " " * 12
    segments = (
        "The image consists of frames from a TikTok video presented as a collage.",
        f"{indent}Here is the title of the video: '{row['video_title']}'",
        f"{indent}Here is the transcript of the video: '{row['transcriptions']}'",
        f"{indent}Create a description of what is shown in the video and what the video is about.",
        f"{indent}Write the description as a short summary with a maximum of 200 words, without using bullet points. ",
    )
    return "\n".join(segments)

# Batch driver: builds one prompt per video from the Excel sheet, creates a
# collage for every MP4 in the input directory, and writes the results
# (updated sheet plus collage metadata) back to Excel.

# Directories for input videos and output collages
path = ""
input_dir = path + "NEW_Hatebase_dataset_downloaded_videos/"
output_dir = path + "FINAL_Hatebase_dataset_collages/"
os.makedirs(output_dir, exist_ok=True)

# Collage settings, defined once so the collage call and the metadata record
# cannot drift apart
NUM_KEY_FRAMES = 80
RESIZE_FACTOR = 1.0
FINAL_RESIZE_FACTOR = 1.0

# Output files, named once so the final status message reports the real names
metadata_output_path = path + 'collage_metadata_Hatebase_dataset_NEW2.xlsx'
updated_df_output_path = path + "NEW_IMSyPP_EN_YouTube_comments_evaluation_context_643_PREPROCESSED_no_restricted_videos_5_additional_features_2kToken_w_prompt_ab644.xlsx"

# Read the Excel file
file_path = path + "NEW_IMSyPP_EN_YouTube_comments_evaluation_context_643_PREPROCESSED_no_restricted_videos_5_additional_features_2kToken.xlsx"
df = pd.read_excel(file_path)
# For a quick trial run, restrict the sheet here, e.g. df = df[:5]

# Ensure the necessary columns exist
required_columns = ('video_title', 'transcriptions', 'video_id')
if any(column not in df.columns for column in required_columns):
    raise ValueError("The Excel sheet must contain columns named 'video_title', 'transcriptions', and 'video_id'")

# Dictionary to store prompts by video ID
prompts = {}

# Generate prompts and update DataFrame; rows sharing a video ID reuse the
# prompt built from the first row seen for that ID
for index, row in df.iterrows():
    video_id = row['video_id']
    if video_id not in prompts:
        prompts[video_id] = create_prompt(row)
    df.at[index, 'prompt_video_desc'] = prompts[video_id]

# Initialize columns for video and collage filenames
df['video_file'] = ""
df['collage_file'] = ""

# List to store metadata
metadata = []

# Iterate over all MP4 files in the input directory (sorted, so runs are
# reproducible regardless of the order the file system returns)
for video_file in sorted(os.listdir(input_dir)):
    if not video_file.endswith('.mp4'):
        continue

    video_path = os.path.join(input_dir, video_file)
    video_base_name = os.path.splitext(video_file)[0]
    collage_name = f'collage_{video_base_name}.jpg'
    collage_path = os.path.join(output_dir, collage_name)

    # Create the collage. Only the collage creation is guarded, so a failure
    # for one video does not stop the batch and the message stays accurate.
    try:
        create_high_quality_collage(
            video_path,
            collage_path,
            num_key_frames=NUM_KEY_FRAMES,
            resize_factor=RESIZE_FACTOR,
            final_resize_factor=FINAL_RESIZE_FACTOR,
        )
    except Exception as e:
        print(f"Failed to create collage for {video_file}: {e}")
    else:
        print(f"Collage created for {video_file}")

        # Add metadata
        metadata.append({
            'video_file': video_file,
            'collage_file': collage_path,
            'num_key_frames': NUM_KEY_FRAMES,
            'resize_factor': RESIZE_FACTOR,
            'final_resize_factor': FINAL_RESIZE_FACTOR,
        })

        # Debugging output
        print(f"Updating DataFrame for {video_base_name}")

        # Add collage and video file names to every row whose video ID contains
        # the file's base name. regex=False makes this a literal substring
        # match, so characters such as '(' or '+' in a file name are not
        # interpreted as regular-expression syntax.
        matches = df['video_id'].str.contains(video_base_name, case=False, na=False, regex=False)
        df.loc[matches, 'collage_file'] = collage_name
        df.loc[matches, 'video_file'] = video_file

    # Save intermediate DataFrame and metadata (checkpoint after every video,
    # whether or not its collage succeeded)
    intermediate_df_path = path + f"intermediate_df_{video_base_name}.xlsx"
    df.to_excel(intermediate_df_path, index=False)
    intermediate_metadata_path = path + f"intermediate_metadata_{video_base_name}.xlsx"
    metadata_df = pd.DataFrame(metadata)
    metadata_df.to_excel(intermediate_metadata_path, index=False)

# Save final metadata to Excel
metadata_df = pd.DataFrame(metadata)
metadata_df.to_excel(metadata_output_path, index=False)

# Save the updated DataFrame to a new Excel file
df.to_excel(updated_df_output_path, index=False)

print(f"Collages have been created, metadata has been saved to '{metadata_output_path}', and the Excel file has been updated with the new columns.")
