# EmoCodes Validation
This notebook does the following quality checks and processing:
1. Checks for sensible onset/offset times and prints errors appropriately.
    * Offset must be later than onset (Datavyu also flags these)
    * Durations of 0 are not permitted
    * Offset cannot be after end of film clip
2. Converts the onset/offseft format to a timeseries for each label.
3. Resample each time series to 10Hz.
4. Save each resampled rating time series as a CSV.
5. Generate summary statistics and plots.

Soon, we will have a GUI for this process.

In [1]:
# import needed libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
from os.path import abspath

code_file = '/Users/catcamacho/Box/Video Coding Materials/Dori/AHKJS1E2_objective_codes_DB.csv'
video_file = '/Users/catcamacho/Box/LCBD/ATV/atv_ratings_orig/AHKJ_S1E2.mp4'
codes_df = pd.read_csv(code_file, index_col=None)
sampling_rate = 10 #in Hz
interpolate_gaps = True
extract_luminance = True
outfile_type = 'csv'
outfile_name = 'codes_timeseries'

In [10]:
# get duration of video in milliseconds
def get_video_length(filename):
    from moviepy.editor import VideoFileClip
    
    clip = VideoFileClip(video_file)
    file_duration = int(clip.duration)*1000
    return(file_duration)

# extract the unique code names (assumes Datavyu CSV export format)
def get_code_labels(codes_df):
    labels = pd.Series(codes_df.columns).str.split(pat='.', expand=True)
    labels = list(labels[0].unique())
    for x in labels:
        if 'Unnamed' in x:
            labels.remove(x)
    return(labels)
    
# validate and convert onset/offset times to a timeseries
def validate_convert_timestamps(labels,codes_df, video_duration, sampling_rate):
    
    #set up dataframe object to store data
    timeseries_df = pd.DataFrame(columns=labels, index=range(int(1000/sampling_rate),video_duration+int(1000/sampling_rate),int(1000/sampling_rate)))
    timeseries_df.index.name='time'

    for label in labels:
        label_df = codes_df[[label+'.onset',label+'.offset',label+'.code01']].dropna(axis=0,how='any')

        # check if offsets precede onsets
        dur = label_df[label+'.offset'] - label_df[label+'.onset']
        for d in dur:
            if d<=0:
                raise ValueError("ERROR: code '{0}' has an offset time that is before the corresponding onset time.".format(label))

        # check that the offset for the last code is not after the end of the episode
        if label_df.loc[label_df.index[-1],label+'.offset'] > video_duration:
            print("Warning: The last offset for code '{0}' is after the end of the video. Changing to match video length.".format(label))
            label_df.loc[label_df.index[-1],label+'.offset'] = video_duration

        # add codes to the timeseries dataframe
        for x in label_df.index:
            onset = int(label_df.loc[x,label+'.onset'])
            offset = int(label_df.loc[x,label+'.offset'])
            timeseries_df.loc[onset:offset,label] = label_df.loc[x,label+'.code01']

        # check for gaps in the codes and interpolate if interp flag is set to True
        timeseries_df[label]=pd.to_numeric(timeseries_df[label],errors='ignore')
        nans = timeseries_df[label].isna()
        missing = sum(nans)*sampling_rate

        if missing > 0:
            if interpolate_gaps==True and (timeseries_df[label].dtype==float or timeseries_df[label].dtype==int):
                print("Warning: there are {0}ms of interpolated codes for '{1}'".format(missing,label))
            else:
                print("Warning: there are {0}ms of missing codes for '{1}'".format(missing,label))

    # interpolate gaps in codes if 'interpolate_gaps' is set to True
    if interpolate_gaps==True:
        timeseries_df=timeseries_df.interpolate(method='nearest',axis=0)
  
    return(timeseries_df)

def save_timeseries(timeseries_df,outfile_type,outfile_name):
     # get the date and time right now
    today = datetime.now()
    
    if outfile_type=='csv':
        # save as a csv
        timeseries_df.to_csv('{0}_{1}.csv'.format(outfile_name,today.strftime('%Y%m%d-%H%M%S')),na_rep='NA')
    elif outfile_type=='excel':
        # save as an excel file
        timeseries_df.to_excel('{0}_{1}.xlsx'.format(outfile_name,today.strftime('%Y%m%d-%H%M%S')),na_rep='NA')
    elif outfile_type=='tab':
        # save as a tab-delimited file
        timeseries_df.to_csv('{0}_{1}.txt'.format(outfile_name,today.strftime('%Y%m%d-%H%M%S')),sep='\t',na_rep='NA')
    elif outfile_type=='space':
        # save as a space-delimited file
        timeseries_df.to_csv('{0}_{1}.txt'.format(outfile_name,today.strftime('%Y%m%d-%H%M%S')),sep='  ',na_rep='NA')
    else:
        print('Warning: data note saved! Please indicate the file format from the following options: csv, excel, tab, space')
    
    filepath = abspath('{0}_{1}.txt'.format(outfile_name,today.strftime('%Y%m%d-%H%M%S')))
    print('Code timeseries saved at {0}'.format(filepath))
    
def compute_luminance(timeseries_df, video_file, sampling_rate, video_duration):
    
    
    return(timeseries_df)

In [11]:
video_duration = get_video_length(video_file)
labels = get_code_labels(codes_df)
timeseries_df = validate_convert_timestamps(labels, codes_df, video_duration, sampling_rate)
#save_timeseries(timeseries_df,outfile_type,outfile_name)



In [4]:
import cv2

video = cv2.VideoCapture(video_file)
frames_lum = []

end=False
while end==False:
    r,f = video.read()
    if r==1:
        t = f.mean(axis=0).mean(axis=0)
        lum = 0.299*t[0] + 0.587*t[1] + 0.114*t[2] # formula from https://www.w3.org/TR/AERT/#color-contrast
        frames_lum.append(lum)
    else:
        end=True

fps = (len(frames_lum)*1000)/video_duration
print(fps)

29.99610288386594


In [5]:
# convert frame number to time in ms
a = np.arange(1000/fps,(len(frames_lum)/fps)*1000,1000/fps)
b = pd.Series(frames_lum,index=pd.to_datetime(a,unit='ms'))
c = b.resample('{0}ms'.format(1000/sampling_rate)).mean()
print(len(b))
print(len(c))
print(c)

38485
12831
1970-01-01 00:00:00.000     85.407060
1970-01-01 00:00:00.100     90.147618
1970-01-01 00:00:00.200     95.225112
1970-01-01 00:00:00.300    102.543018
1970-01-01 00:00:00.400    107.673228
                              ...    
1970-01-01 00:21:22.600     48.840106
1970-01-01 00:21:22.700     49.577317
1970-01-01 00:21:22.800     49.673655
1970-01-01 00:21:22.900     50.015883
1970-01-01 00:21:23.000     50.443377
Freq: 100L, Length: 12831, dtype: float64


In [12]:
timeseries_df

Unnamed: 0_level_0,closeup,collective,has_body,has_faces,has_words,interacting_nonverbal,interacting_verbal,num_chars,phys_pain_object,phys_pain_other,scenenum,setting,time_of_day
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
100,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,"stage, near docks",1.0
200,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,"stage, near docks",1.0
300,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,"stage, near docks",1.0
400,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,"stage, near docks",1.0
500,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,"stage, near docks",1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1282600,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,18.0,airplane,1.0
1282700,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,18.0,airplane,1.0
1282800,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,18.0,airplane,1.0
1282900,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,18.0,airplane,1.0


In [None]:
# convert the index to seconds for ease of graphing
#timeseries_df.index = timeseries_df.index/1000

# determine which variables are of numeric typing for summary
for label in labels:
    if timeseries_df[label].dtype == object:
        labels.remove(label)

# make line plots for each numeric code
#height = len(labels)*1.5
#plt.figure()
#timeseries_df.plot.line(figsize=(12,height),title=labels,
#                        subplots=True, legend=False,xlim=(0,timeseries_df.index[-1]))
#plt.tight_layout()
#plt.show()

t = timeseries_df[labels]
plt.figure()
t.plot.pie(subplots=True, figsize=(15,15))
plt.show()