In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

df_scenes= pd.read_csv("scenes_data.tsv", sep="\t")
df_max_peak= pd.read_csv("max_peak_data.tsv", sep="\t")

# Subtract 1 to avoid including the first frame of the next scene
df_scenes["offset_frame"] = df_scenes.groupby("episode")["onset_frame"].shift(-1) - 1

# Fill end_frame for the last scene (no next onset) with the maximum frame number
episode_max_frames = df_max_peak.groupby("episode")["episode_frame"].max()
df_scenes["offset_frame"] = df_scenes.apply(
    lambda row: episode_max_frames[row["episode"]] if pd.isna(row["offset_frame"]) else row["offset_frame"],
    axis=1
)



In [2]:
# Ensure consistent and clean episode strings
df_max_peak["episode"] = df_max_peak["episode"].astype(str)
df_scenes["episode"] = df_scenes["episode"].astype(str)

# make sure episode_frame and onset_frame are integers
df_max_peak["episode_frame"] = df_max_peak["episode_frame"].astype(int)
df_scenes["onset_frame"] = df_scenes["onset_frame"].astype(int)
df_scenes["offset_frame"] = df_scenes["offset_frame"].astype(int)


In [3]:
# Ensure episode_frame and onset_frame are sorted
df_max_peak = df_max_peak.sort_values(by=["episode", "episode_frame"])
df_scenes = df_scenes.sort_values(by=["episode", "onset_frame"])

# Initialize an empty list to store results
results = []

# Iterate over each episode to merge
for episode in df_max_peak["episode"].unique():
    # Filter data for the current episode
    peaks_ep = df_max_peak[df_max_peak["episode"] == episode].copy()
    scenes_ep = df_scenes[df_scenes["episode"] == episode][
        ["scene_number", "onset_frame", "offset_frame", "global_scene_number"]
    ].copy()
    
    # Merge using merge_asof
    merged = pd.merge_asof(
        peaks_ep.sort_values("episode_frame"),
        scenes_ep.sort_values("onset_frame"),
        left_on="episode_frame",
        right_on="onset_frame",
        direction="backward"
    )

    # Filter merged results to keep only rows where episode_frame <= end_frame
    merged = merged[merged["episode_frame"] <= merged["offset_frame"]]
    
    # Append merged result to the results list
    results.append(merged)

# Combine all episode results into one DataFrame
df_result = pd.concat(results, ignore_index=True)


In [4]:
df_result

Unnamed: 0,h,w,p,episode,frame,episode_frame,scene_number,onset_frame,offset_frame,global_scene_number
0,215,359,1,friends_s01e01a,1,1,1.0,1.0,539.0,1.0
1,215,359,1,friends_s01e01a,2,2,1.0,1.0,539.0,1.0
2,215,359,1,friends_s01e01a,3,3,1.0,1.0,539.0,1.0
3,215,359,1,friends_s01e01a,4,4,1.0,1.0,539.0,1.0
4,215,359,1,friends_s01e01a,5,5,1.0,1.0,539.0,1.0
...,...,...,...,...,...,...,...,...,...,...
5957068,215,359,1,friends_s06e24b,6086685,20558,158.0,20359.0,20562.0,47659.0
5957069,215,359,1,friends_s06e24b,6086686,20559,158.0,20359.0,20562.0,47659.0
5957070,215,359,1,friends_s06e24b,6086687,20560,158.0,20359.0,20562.0,47659.0
5957071,215,359,1,friends_s06e24b,6086688,20561,158.0,20359.0,20562.0,47659.0


In [24]:
# Filter out frames that are outside the scene range
df_filtered = df_result[
    (df_result["episode_frame"] >= df_result["onset_frame"]) & 
    (df_result["episode_frame"] <= df_result["offset_frame"])
]

# Check how many rows are left (should match the total number of frames if everything is correct)
print(f"Number of correctly matched frames: {len(df_filtered)}")
print(f"Number of frames initially: {len(df_result)}")

# If the number of rows doesn't match, there are frames that are not correctly matched


Number of correctly matched frames: 5957073
Number of frames initially: 5957073


In [5]:
df_result.to_csv("Peak_scenes_merged.tsv", sep="\t", index=False)