In [1]:
import os
import pandas as pd
import numpy as np

from managedata.videos_duplicates import (find_duplicates, compare_with_idvideos)

In [2]:
# parameters
# Field-name lists. They are not referenced by the cells shown here;
# presumably consumed elsewhere -- TODO confirm they are still needed.
main_fields = ['label', 'ID', 'valid_frames']
segments_fields = ['frame_start', 'frame_end', 'bad_properties']

# set the limit to decide that two videos are the same or different
# (passed as `limit_dissimilarity` to find_duplicates / compare_with_idvideos)
limit_dissimilarity = 5

# data folder
# The default is the original machine-specific location. Set the
# SENSORIUM_DATA_DIR environment variable to run the notebook on another
# machine without editing this cell.
folder_data = os.environ.get(
    'SENSORIUM_DATA_DIR',
    '/home/anaflo/MDMC/thesis/sensorium/data/',
)

# metadata folder (relative to the notebook's working directory)
folder_meta = os.path.join('..', 'metadata')

# results folder (relative to the notebook's working directory)
folder_results = os.path.join('..', 'intermediate_results')

# mice/recording folders
all_recording_folders = [
    'dynamic29156-11-10-Video-8744edeac3b4d1ce16b680916b5267ce',
    'dynamic29228-2-10-Video-8744edeac3b4d1ce16b680916b5267ce',
    'dynamic29234-6-9-Video-8744edeac3b4d1ce16b680916b5267ce',
    'dynamic29513-3-5-Video-8744edeac3b4d1ce16b680916b5267ce',
    'dynamic29514-2-9-Video-8744edeac3b4d1ce16b680916b5267ce',
]

# path to global videos metadata
path_to_globalmeta = os.path.join(folder_meta, "global_meta", "videos")
# exist_ok=True makes the cell safe to re-run and avoids the race between
# checking for the folder and creating it. Unlike the previous exists() guard,
# this raises if the path exists but is not a directory.
os.makedirs(path_to_globalmeta, exist_ok=True)

    

In [3]:
# Load the classification tables for all recordings
# Each table is read from
#   <folder_results>/<recording_folder>/videos_classification_table.csv
# and gets a leading "recording" column so that rows remain attributable to
# their recording once the tables are stacked.
recording_tables = []
for recording_folder in all_recording_folders:

    path_to_results = os.path.join(folder_results, recording_folder)

    df = pd.read_csv(os.path.join(path_to_results, "videos_classification_table.csv"), index_col=0)
    # a scalar value is broadcast to every row of the table
    df.insert(0, "recording", recording_folder)
    recording_tables.append(df)

# Concatenate once instead of growing the frame inside the loop (which copies
# all previously loaded rows on every iteration). The per-file index is kept
# as-is, so index labels may repeat across recordings; the code below selects
# rows with boolean masks rather than by index label.
# pd.concat raises ValueError if `all_recording_folders` is empty.
videos_df = pd.concat(recording_tables, axis=0)

# trials are compared as strings; ID is filled in later, one video at a time
videos_df['trial'] = videos_df['trial'].astype(str)
videos_df["ID"] = None

In [4]:

# For every recording and every label found in it:
#   1. group the label's trials into sets of duplicates (find_duplicates),
#   2. obtain one ID per group (compare_with_idvideos),
#   3. write that ID into videos_df for all trials of the group.
# After all labels of a recording are processed, its trial metadata is saved
# to <folder_meta>/<recording_folder>/meta-trials_<recording_folder>.csv.
# NOTE: this cell is slow (the captured run was interrupted during the
# "NaturalVideo" label); nothing is saved for a recording until all of its
# labels have been processed.
for recording_folder in all_recording_folders:

    # Short display name: everything before the "-Video-" suffix. A fixed
    # 18-character slice only fits names like "dynamic29156-11-10" and leaks
    # part of the suffix for shorter names (e.g. "dynamic29234-6-9-V").
    recording_name = recording_folder.split("-Video-")[0]

    print("\n==========================================")
    print(f"Computing for recording {recording_name}...")

    path_to_data = os.path.join(folder_data, recording_folder)
    path_to_meta = os.path.join(folder_meta, recording_folder)
    path_to_results_metavideos = os.path.join(folder_results, recording_folder, "meta_videos")

    # Row mask for this recording. It stays valid for the whole iteration
    # because only the "ID" column of videos_df is modified below.
    is_recording = videos_df['recording'] == recording_folder

    vdf_rec = videos_df[is_recording]
    # unique() keeps first-appearance order, so labels are processed in the
    # same order on every run (a set would not guarantee that).
    all_labels = vdf_rec['label'].unique().tolist()

    for thelabel in all_labels:

        print(f"\nLabel {thelabel}")
        print('-------------------')

        # find the trials belonging to the label
        is_label = is_recording & (videos_df['label'] == thelabel)
        vdf_rec_lab = videos_df[is_label]
        trials = vdf_rec_lab['trial'].to_list()

        # find duplicates
        # (used below as a sequence of trial groups, one group per distinct video)
        list_distint_videos = find_duplicates(trials,
                                              path_to_data, path_to_results_metavideos,
                                              limit_dissimilarity=limit_dissimilarity)

        # compare each of them with the videos already identified for other recordings
        # (used below as: new_ids[i] is the ID for list_distint_videos[i])
        new_ids = compare_with_idvideos(thelabel, list_distint_videos,
                                        path_to_data, path_to_results_metavideos, path_to_globalmeta,
                                        limit_dissimilarity=limit_dissimilarity)

        # add the info to the trials table
        for i, duplicate_trials in enumerate(list_distint_videos):
            mask = is_label & videos_df["trial"].isin(duplicate_trials)

            # every trial of the group must match exactly one row of the table
            n_found = int(mask.sum())
            if n_found != len(duplicate_trials):
                raise ValueError(
                    "Not all trials were found in the table "
                    f"(recording {recording_folder}, label {thelabel}: "
                    f"expected {len(duplicate_trials)} rows, found {n_found})"
                )
            videos_df.loc[mask, "ID"] = new_ids[i]

    # save the trials metadata
    folder_recording_meta = path_to_meta
    os.makedirs(folder_recording_meta, exist_ok=True)

    df_meta_trials_rec = videos_df[is_recording].copy()
    # a trial is valid when it has no bad segments
    df_meta_trials_rec['valid_trial'] = df_meta_trials_rec['segments_bad_n'] == 0
    df_meta_trials_rec = df_meta_trials_rec[['label', 'ID', 'trial', 'trial_type', 'valid_frames', 'valid_trial']]

    filename = os.path.join(folder_recording_meta, f"meta-trials_{recording_folder}.csv")
    df_meta_trials_rec.to_csv(filename, index=False)



Computing for recording dynamic29156-11-10...

Label PinkNoise
-------------------
Looking for duplicates...


  0%|          | 0/49 [00:00<?, ?it/s]

100%|██████████| 49/49 [00:06<00:00,  7.34it/s]


Comparing with existing ID videos...


100%|██████████| 5/5 [00:04<00:00,  1.23it/s]



Label GaussianDot
-------------------
Looking for duplicates...


100%|██████████| 59/59 [00:10<00:00,  5.38it/s]


Comparing with existing ID videos...


100%|██████████| 6/6 [00:05<00:00,  1.19it/s]



Label RandomDots
-------------------
Looking for duplicates...


100%|██████████| 79/79 [00:14<00:00,  5.60it/s]


Comparing with existing ID videos...


100%|██████████| 8/8 [00:06<00:00,  1.21it/s]



Label NaturalVideo
-------------------
Looking for duplicates...


  3%|▎         | 18/533 [00:25<12:22,  1.44s/it]


KeyboardInterrupt: 