# EmoCodes Validation
This notebook does the following quality checks and processing:
1. Checks for sensible onset/offset times and prints errors appropriately.
    * Offset must be later than onset (Datavyu also flags these)
    * Durations of 0 are not permitted
    * Offset cannot be after end of film clip
2. Converts the onset/offset format to a time series for each label.
3. Resamples each time series to 10 Hz.
4. Saves each resampled rating time series as a CSV.
5. Generates summary statistics and plots.

Soon, we will have a GUI for this process.

In [111]:
# import needed libraries
import pandas as pd
import numpy as np
import seaborn as sns
from datetime import datetime

# --- processing settings ---
sampling_rate = 10  # output resolution of the time series, in Hz
interpolate_gaps = True  # fill uncoded stretches when building the time series

# --- input files ---
video_file = '/Users/catcamacho/Box/LCBD/ATV/atv_ratings_orig/AHKJ_S1E2.mp4'
code_file = '/Users/catcamacho/Box/Video Coding Materials/Dori/AHKJS1E2_objective_codes_DB.csv'

# load the coded onsets/offsets; no CSV column is used as the row index
codes_df = pd.read_csv(code_file, index_col=None)

In [120]:
# get duration of video in milliseconds
def get_video_length(filename):
    """Return the duration of a video file in whole milliseconds.

    Parameters
    ----------
    filename : str
        Path to the video file to open.

    Returns
    -------
    int
        The clip duration reported by moviepy (seconds) multiplied by 1000
        and truncated to an integer.
    """
    from moviepy.editor import VideoFileClip

    # Open the file that was actually passed in (not a module-level path).
    clip = VideoFileClip(filename)
    try:
        return int(clip.duration * 1000)
    finally:
        # Release the reader resources held by the clip even if reading the
        # duration fails.
        clip.close()

# extract the unique code names (assumes Datavyu CSV export format)
def get_code_labels(codes_dataframe):
    """Return the unique code labels found in the dataframe's column names.

    Each column name is cut at its first '.', so 'valence.onset',
    'valence.offset' and 'valence.code01' all yield the label 'valence'.
    Labels containing 'Unnamed' (the placeholder pandas gives to header-less
    CSV columns) are dropped.

    Parameters
    ----------
    codes_dataframe : pandas.DataFrame
        Dataframe whose columns are named '<label>.<field>'.

    Returns
    -------
    list of str
        Unique labels in order of first appearance.
    """
    prefixes = (str(column).split('.')[0] for column in codes_dataframe.columns)
    # dict.fromkeys de-duplicates while keeping first-seen order
    unique_prefixes = dict.fromkeys(prefixes)
    # Filter into a new list rather than removing from a list that is being
    # iterated, which would skip the element following each removal.
    return [name for name in unique_prefixes if 'Unnamed' not in name]
    
# validate and convert onset/offset times to a timeseries
def validate_convert_timestamps(labels, codes_df, video_duration, sampling_rate,
                                interpolate_gaps=None):
    """Validate coded onset/offset times and expand them into a time series.

    For every label, rows of ``codes_df`` with a complete
    '<label>.onset' / '<label>.offset' / '<label>.code01' triple are checked
    and written into a dataframe indexed by time in milliseconds. The result
    is also saved to 'codes_timeseries_<YYYYmmdd-HHMMSS>.csv' in the current
    working directory.

    Parameters
    ----------
    labels : list of str
        Code names; each needs the three '<label>.*' columns in ``codes_df``.
    codes_df : pandas.DataFrame
        Onset/offset/code table (times in milliseconds).
    video_duration : int
        Length of the video in milliseconds.
    sampling_rate : int or float
        Output sampling rate in Hz; the index step is int(1000/sampling_rate) ms.
    interpolate_gaps : bool, optional
        Whether to fill gaps in numeric codes with nearest-neighbour
        interpolation. When None (the default) the module-level
        ``interpolate_gaps`` variable is used if it exists, otherwise True.

    Returns
    -------
    pandas.DataFrame
        One column per label, indexed by 'time' (ms).

    Raises
    ------
    ValueError
        If any code has an offset that is not later than its onset.
    """
    if interpolate_gaps is None:
        # Backward compatibility: earlier versions read a global flag.
        interpolate_gaps = globals().get('interpolate_gaps', True)

    # milliseconds covered by one output sample
    step_ms = int(1000 / sampling_rate)

    # set up dataframe object to store data
    timeseries_df = pd.DataFrame(columns=labels, index=range(0, video_duration, step_ms))
    timeseries_df.index.name = 'time'

    for label in labels:
        onset_col = label + '.onset'
        offset_col = label + '.offset'
        code_col = label + '.code01'
        label_df = codes_df[[onset_col, offset_col, code_col]].dropna(axis=0, how='any')

        # check if offsets precede (or equal) onsets
        durations = label_df[offset_col] - label_df[onset_col]
        if (durations <= 0).any():
            raise ValueError("ERROR: code '{0}' has an offset time that is before the corresponding onset time.".format(label))

        # check that the offset for the last code is not after the end of the
        # episode; a label with no complete rows has no last code to check
        if not label_df.empty:
            last_row = label_df.index[-1]
            if label_df.loc[last_row, offset_col] > video_duration:
                print("Warning: The last offset for code '{0}' is after the end of the video. Changing to match video length.".format(label))
                label_df.loc[last_row, offset_col] = video_duration

        # add codes to the timeseries dataframe (label slices are inclusive)
        for row in label_df.index:
            onset = int(label_df.loc[row, onset_col])
            offset = int(label_df.loc[row, offset_col])
            timeseries_df.loc[onset:offset, label] = label_df.loc[row, code_col]

        # convert to a numeric column when every code is numeric; columns with
        # non-numeric codes are intentionally left unchanged
        try:
            timeseries_df[label] = pd.to_numeric(timeseries_df[label])
        except (ValueError, TypeError):
            pass

        # check for gaps in the codes; each empty sample spans step_ms
        missing = int(timeseries_df[label].isna().sum()) * step_ms

        if missing > 0:
            if interpolate_gaps and pd.api.types.is_numeric_dtype(timeseries_df[label]):
                print("Warning: there are {0}ms of interpolated codes for '{1}'".format(missing, label))
            else:
                print("Warning: there are {0}ms of missing codes for '{1}'".format(missing, label))

    # interpolate gaps in codes if 'interpolate_gaps' is set to True; only
    # numeric columns can be interpolated, so others are left as they are
    if interpolate_gaps:
        numeric_cols = list(timeseries_df.select_dtypes(include='number').columns)
        if numeric_cols:
            timeseries_df[numeric_cols] = timeseries_df[numeric_cols].interpolate(method='nearest', axis=0)

    # get the date and time right now
    today = datetime.now()

    # save the series as a csv
    timeseries_df.to_csv('codes_timeseries_{0}.csv'.format(today.strftime('%Y%m%d-%H%M%S')), na_rep='NA')

    return timeseries_df
    

In [121]:
# length of the coded video, in milliseconds
video_duration = get_video_length(video_file)

# code names present in the loaded codes table
labels = get_code_labels(codes_df)

# validate the codes and build (and save) the per-label time series
t = validate_convert_timestamps(
    labels=labels,
    codes_df=codes_df,
    video_duration=video_duration,
    sampling_rate=sampling_rate,
)



In [1]:
# Placeholder cell for the planned GUI: opens a single empty PyQt5 label.
from PyQt5.QtWidgets import QApplication, QLabel
# the application object is created before any widget
app = QApplication([])
# a label widget with empty text
label = QLabel('')
label.show()
# NOTE(review): exec_() presumably blocks until the window is closed and
# returns the application's exit code (the cell output shows 0) - confirm.
app.exec_()

0