In [1]:
import os
import glob
import re
import datetime
import glob
from datetime import date, time, timedelta
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn.metrics
from itertools import chain
from cv2 import VideoCapture, CAP_PROP_FRAME_COUNT, CAP_PROP_FPS, CAP_PROP_POS_FRAMES
import ffmpeg
from imutils.video import FileVideoStream
import time
from bokeh.plotting import figure, show
from bokeh.io import output_notebook
from bokeh.models import Span, DatetimeTicker, DatetimeTickFormatter
from scipy import stats
from utils import load_diffs, load_slide_changes, load_intervals
from utils import filter_video, plot_slide_diffs, get_signals, evaluate
from utils import sanitize_signals

sns.set()
output_notebook()

# Paths

In [2]:
meeting_id = 160320
#meeting_id = 220120
#meeting_id = 170127
#meeting_id = 83512718053

In [3]:
masked = True

In [4]:
video_path = glob.glob(f'zoom_data/{meeting_id}/*.mp4')[0]
print(video_path)
if masked:
    diff_path = f'diff_data/diffs_{meeting_id}_masked_cossim.csv'
else:
    diff_path = f'diff_data/diffs_{meeting_id}_cossim.csv'
sc_labels = f'slide_change_labels/{meeting_id}.csv'
interval_path = 'interval_data/intervals.csv'

zoom_data/160320/GMT20210614-160320_Recording_2020x1380.mp4


# Load FPS, Diffs, and Slide Change Labels

In [21]:
vidcap = VideoCapture(video_path)
fps = vidcap.get(CAP_PROP_FPS)
ddiffs = load_diffs(diff_path, fps)
sldf = load_slide_changes(sc_labels)
idf = load_intervals(interval_path, meeting_id=meeting_id)

# Visualize

In [22]:
q = None
ddiffs = filter_video(ddiffs, sldf, idf)
signals, threshold_q = get_signals(ddiffs, threshold_q=q)
print(f"Threshold is set at {threshold_q} percentile")

Threshold is set at 99.169921875 percentile


In [23]:
plot_slide_diffs(ddiffs, sldf=sldf, signals=signals)

# Debug

In [7]:
ddiffs

Unnamed: 0,meeting_id,elapsed_dt,cos_sim_diff
0,160320,1970-01-01 00:00:00,
1,160320,1970-01-01 00:00:01,0.000000e+00
2,160320,1970-01-01 00:00:02,3.792219e-05
3,160320,1970-01-01 00:00:03,0.000000e+00
4,160320,1970-01-01 00:00:04,0.000000e+00
...,...,...,...
2043,160320,1970-01-01 00:34:03,7.380545e-07
2044,160320,1970-01-01 00:34:04,3.186563e-05
2045,160320,1970-01-01 00:34:05,4.945095e-09
2046,160320,1970-01-01 00:34:06,2.570800e-01


In [8]:
signals

Unnamed: 0,elapsed_dt,signal
0,1970-01-01 00:00:00,False
1,1970-01-01 00:00:01,False
2,1970-01-01 00:00:02,True
3,1970-01-01 00:00:03,False
4,1970-01-01 00:00:04,False
...,...,...
2043,1970-01-01 00:34:03,True
2044,1970-01-01 00:34:04,True
2045,1970-01-01 00:34:05,False
2046,1970-01-01 00:34:06,True


In [54]:
results = ddiffs[['elapsed_dt', 'cos_sim_diff']] \
                    .merge(sldf[['change_time_dt']],
                           how='left',
                           left_on='elapsed_dt',
                           right_on='change_time_dt') \
                    .merge(signals, 
                           how='left', 
                           on='elapsed_dt')
results['plus'] = results.change_time_dt.shift()
results['minus'] = results.change_time_dt.shift(-1)
results['signal_to_change_time_dt'] = np.where(results.change_time_dt.notna(),
                                               results.change_time_dt,
                                               results[['plus', 'minus']].max(axis=1))
results

Unnamed: 0,elapsed_dt,cos_sim_diff,change_time_dt,signal,plus,minus,signal_to_change_time_dt
0,1970-01-01 00:00:00,,NaT,False,NaT,NaT,NaT
1,1970-01-01 00:00:01,0.000000e+00,NaT,False,NaT,NaT,NaT
2,1970-01-01 00:00:02,3.792219e-05,NaT,True,NaT,NaT,NaT
3,1970-01-01 00:00:03,0.000000e+00,NaT,False,NaT,NaT,NaT
4,1970-01-01 00:00:04,0.000000e+00,NaT,False,NaT,NaT,NaT
...,...,...,...,...,...,...,...
2043,1970-01-01 00:34:03,7.380545e-07,NaT,True,NaT,NaT,NaT
2044,1970-01-01 00:34:04,3.186563e-05,NaT,True,NaT,NaT,NaT
2045,1970-01-01 00:34:05,4.945095e-09,NaT,False,NaT,NaT,NaT
2046,1970-01-01 00:34:06,2.570800e-01,NaT,True,NaT,1970-01-01 00:34:07,1970-01-01 00:34:07


In [55]:
maxes = results.groupby(['signal', 'signal_to_change_time_dt'], dropna=True, as_index=False) \
                .cos_sim_diff \
                .max() \
                .rename(columns={'cos_sim_diff': 'max_diff'})
results = results.merge(maxes, how='left', on=['signal','signal_to_change_time_dt'])
results.max_diff = np.where(results.max_diff.isna(), results.cos_sim_diff, results.max_diff)
results['keep_signal'] = results.max_diff == results.cos_sim_diff
results = results.query('keep_signal').reset_index(drop=True)

In [56]:
results

Unnamed: 0,elapsed_dt,cos_sim_diff,change_time_dt,signal,plus,minus,signal_to_change_time_dt,max_diff,keep_signal
0,1970-01-01 00:00:01,0.000000e+00,NaT,False,NaT,NaT,NaT,0.000000e+00,True
1,1970-01-01 00:00:02,3.792219e-05,NaT,True,NaT,NaT,NaT,3.792219e-05,True
2,1970-01-01 00:00:03,0.000000e+00,NaT,False,NaT,NaT,NaT,0.000000e+00,True
3,1970-01-01 00:00:04,0.000000e+00,NaT,False,NaT,NaT,NaT,0.000000e+00,True
4,1970-01-01 00:00:05,1.190978e-05,NaT,True,NaT,NaT,NaT,1.190978e-05,True
...,...,...,...,...,...,...,...,...,...
2038,1970-01-01 00:34:03,7.380545e-07,NaT,True,NaT,NaT,NaT,7.380545e-07,True
2039,1970-01-01 00:34:04,3.186563e-05,NaT,True,NaT,NaT,NaT,3.186563e-05,True
2040,1970-01-01 00:34:05,4.945095e-09,NaT,False,NaT,NaT,NaT,4.945095e-09,True
2041,1970-01-01 00:34:06,2.570800e-01,NaT,True,NaT,1970-01-01 00:34:07,1970-01-01 00:34:07,2.570800e-01,True


In [66]:
results.iloc[1937:1945, :].reset_index(drop=True)

Unnamed: 0,elapsed_dt,cos_sim_diff,change_time_dt,signal,plus,minus,signal_to_change_time_dt,max_diff,keep_signal
0,1970-01-01 00:32:22,0.0,NaT,False,NaT,NaT,NaT,0.0,True
1,1970-01-01 00:32:23,1.114587e-05,NaT,True,NaT,NaT,NaT,1.114587e-05,True
2,1970-01-01 00:32:24,0.0,NaT,False,NaT,NaT,NaT,0.0,True
3,1970-01-01 00:32:25,0.003283643,NaT,True,NaT,1970-01-01 00:32:26,1970-01-01 00:32:26,0.003283643,True
4,1970-01-01 00:32:26,0.0,1970-01-01 00:32:26,False,NaT,NaT,1970-01-01 00:32:26,0.0,True
5,1970-01-01 00:32:27,0.0,NaT,False,1970-01-01 00:32:26,NaT,1970-01-01 00:32:26,0.0,True
6,1970-01-01 00:32:28,5.88416e-07,NaT,True,NaT,NaT,NaT,5.88416e-07,True
7,1970-01-01 00:32:29,0.0,NaT,False,NaT,NaT,NaT,0.0,True


In [67]:
results = results.iloc[1937:1945, :].reset_index(drop=True)

In [68]:
results

Unnamed: 0,elapsed_dt,cos_sim_diff,change_time_dt,signal,plus,minus,signal_to_change_time_dt,max_diff,keep_signal
0,1970-01-01 00:32:22,0.0,NaT,False,NaT,NaT,NaT,0.0,True
1,1970-01-01 00:32:23,1.114587e-05,NaT,True,NaT,NaT,NaT,1.114587e-05,True
2,1970-01-01 00:32:24,0.0,NaT,False,NaT,NaT,NaT,0.0,True
3,1970-01-01 00:32:25,0.003283643,NaT,True,NaT,1970-01-01 00:32:26,1970-01-01 00:32:26,0.003283643,True
4,1970-01-01 00:32:26,0.0,1970-01-01 00:32:26,False,NaT,NaT,1970-01-01 00:32:26,0.0,True
5,1970-01-01 00:32:27,0.0,NaT,False,1970-01-01 00:32:26,NaT,1970-01-01 00:32:26,0.0,True
6,1970-01-01 00:32:28,5.88416e-07,NaT,True,NaT,NaT,NaT,5.88416e-07,True
7,1970-01-01 00:32:29,0.0,NaT,False,NaT,NaT,NaT,0.0,True


In [69]:
results[['elapsed_dt', 'change_time_dt']] \
            .merge(results[['elapsed_dt', 'signal_to_change_time_dt', 'signal']], 
                   how='left', left_on=['elapsed_dt', 'change_time_dt'], right_on=['elapsed_dt', 'signal_to_change_time_dt'])

Unnamed: 0,elapsed_dt,change_time_dt,signal_to_change_time_dt,signal
0,1970-01-01 00:32:22,NaT,NaT,False
1,1970-01-01 00:32:23,NaT,NaT,True
2,1970-01-01 00:32:24,NaT,NaT,False
3,1970-01-01 00:32:25,NaT,NaT,
4,1970-01-01 00:32:26,1970-01-01 00:32:26,1970-01-01 00:32:26,False
5,1970-01-01 00:32:27,NaT,NaT,
6,1970-01-01 00:32:28,NaT,NaT,True
7,1970-01-01 00:32:29,NaT,NaT,False


In [40]:
labels = results[['elapsed_dt', 'change_time_dt']] \
            .merge(results.loc[results.signal, ['signal_to_change_time_dt', 'signal']], 
                   how='left', left_on='elapsed_dt', right_on='signal_to_change_time_dt')
labels

Unnamed: 0,elapsed_dt,change_time_dt,signal_to_change_time_dt,signal
0,1970-01-01 00:00:01,NaT,NaT,
1,1970-01-01 00:00:02,NaT,NaT,
2,1970-01-01 00:00:03,NaT,NaT,
3,1970-01-01 00:00:04,NaT,NaT,
4,1970-01-01 00:00:05,NaT,NaT,
...,...,...,...,...
2038,1970-01-01 00:34:03,NaT,NaT,
2039,1970-01-01 00:34:04,NaT,NaT,
2040,1970-01-01 00:34:05,NaT,NaT,
2041,1970-01-01 00:34:06,NaT,NaT,


In [22]:
labels['y'] = labels.change_time_dt.notna().astype(int)
labels['yhat'] = (labels.signal_to_change_time_dt.notna() & labels.signal).astype(int)

In [23]:
scores = {}
scores['accuracy'] = sklearn.metrics.accuracy_score(labels.y, labels.yhat)
scores['precision'] = sklearn.metrics.precision_score(labels.y, labels.yhat)
scores['recall'] = sklearn.metrics.recall_score(labels.y, labels.yhat)
scores['f1'] = sklearn.metrics.f1_score(labels.y, labels.yhat)

In [24]:
scores

{'accuracy': 0.9999980007357292,
 'precision': 1.0,
 'recall': 0.8888888888888888,
 'f1': 0.9411764705882353}

In [25]:
labels.yhat.sum()

16

In [18]:
signals.query('signal')

Unnamed: 0,elapsed_dt,signal
2,1970-01-01 00:00:02,True
5,1970-01-01 00:00:05,True
8,1970-01-01 00:00:08,True
10,1970-01-01 00:00:10,True
16,1970-01-01 00:00:16,True
...,...,...
2036,1970-01-01 00:33:56,True
2039,1970-01-01 00:33:59,True
2043,1970-01-01 00:34:03,True
2044,1970-01-01 00:34:04,True


In [24]:
signals.shape

(2048, 2)

In [25]:
signals.signal.sum()

16

In [26]:
(~signals.signal).sum()

2032

In [27]:
sanitized_signals = sanitize_signals(ddiffs, sldf, signals)

In [28]:
sanitized_signals.shape

(2048, 9)

In [29]:
sanitized_signals.signal.sum()

16

In [30]:
(~sanitized_signals.signal).sum()

2032

In [31]:
sanitized_signals

Unnamed: 0,elapsed_dt,cos_sim_diff,change_time_dt,signal,plus,minus,signal_to_change_time_dt,max_diff,keep_signal
0,1970-01-01 00:00:00,,NaT,False,NaT,NaT,NaT,,True
1,1970-01-01 00:00:01,0.000000e+00,NaT,False,NaT,NaT,NaT,0.000000e+00,True
2,1970-01-01 00:00:02,3.792219e-05,NaT,False,NaT,NaT,NaT,3.792219e-05,True
3,1970-01-01 00:00:03,0.000000e+00,NaT,False,NaT,NaT,NaT,0.000000e+00,True
4,1970-01-01 00:00:04,0.000000e+00,NaT,False,NaT,NaT,NaT,0.000000e+00,True
...,...,...,...,...,...,...,...,...,...
2043,1970-01-01 00:34:03,7.380545e-07,NaT,False,NaT,NaT,NaT,7.380545e-07,True
2044,1970-01-01 00:34:04,3.186563e-05,NaT,False,NaT,NaT,NaT,3.186563e-05,True
2045,1970-01-01 00:34:05,4.945095e-09,NaT,False,NaT,NaT,NaT,4.945095e-09,True
2046,1970-01-01 00:34:06,2.570800e-01,NaT,True,NaT,1970-01-01 00:34:07,1970-01-01 00:34:07,2.570800e-01,True


In [33]:
sanitized_signals.query('signal_to_change_time_dt.notna()')

Unnamed: 0,elapsed_dt,cos_sim_diff,change_time_dt,signal,plus,minus,signal_to_change_time_dt,max_diff,keep_signal
81,1970-01-01 00:01:21,0.05583079,NaT,True,NaT,1970-01-01 00:01:22,1970-01-01 00:01:22,0.05583079,True
82,1970-01-01 00:01:22,0.0,1970-01-01 00:01:22,False,NaT,NaT,1970-01-01 00:01:22,0.0,True
83,1970-01-01 00:01:23,0.0,NaT,False,1970-01-01 00:01:22,NaT,1970-01-01 00:01:22,0.0,True
115,1970-01-01 00:01:55,0.05720391,NaT,True,NaT,1970-01-01 00:01:56,1970-01-01 00:01:56,0.05720391,True
116,1970-01-01 00:01:56,0.0,1970-01-01 00:01:56,False,NaT,NaT,1970-01-01 00:01:56,0.0,True
117,1970-01-01 00:01:57,0.0,NaT,False,1970-01-01 00:01:56,NaT,1970-01-01 00:01:56,0.0,True
120,1970-01-01 00:02:00,0.02884515,NaT,True,NaT,1970-01-01 00:02:01,1970-01-01 00:02:01,0.02884515,True
121,1970-01-01 00:02:01,0.0,1970-01-01 00:02:01,False,NaT,NaT,1970-01-01 00:02:01,0.0,True
122,1970-01-01 00:02:02,0.0,NaT,False,1970-01-01 00:02:01,NaT,1970-01-01 00:02:01,0.0,True
213,1970-01-01 00:03:33,0.0,NaT,False,NaT,1970-01-01 00:03:34,1970-01-01 00:03:34,0.0,True


In [39]:
sanitized_signals['true_pos'] = sanitized_signals.signal & sanitized_signals[['change_time_dt', 'plus', 'minus']].notna().sum(axis=1) > 0
sanitized_signals

Unnamed: 0,elapsed_dt,cos_sim_diff,change_time_dt,signal,plus,minus,signal_to_change_time_dt,max_diff,keep_signal,true_pos
0,1970-01-01 00:00:00,,NaT,False,NaT,NaT,NaT,,True,False
1,1970-01-01 00:00:01,0.000000e+00,NaT,False,NaT,NaT,NaT,0.000000e+00,True,False
2,1970-01-01 00:00:02,3.792219e-05,NaT,False,NaT,NaT,NaT,3.792219e-05,True,False
3,1970-01-01 00:00:03,0.000000e+00,NaT,False,NaT,NaT,NaT,0.000000e+00,True,False
4,1970-01-01 00:00:04,0.000000e+00,NaT,False,NaT,NaT,NaT,0.000000e+00,True,False
...,...,...,...,...,...,...,...,...,...,...
2043,1970-01-01 00:34:03,7.380545e-07,NaT,False,NaT,NaT,NaT,7.380545e-07,True,False
2044,1970-01-01 00:34:04,3.186563e-05,NaT,False,NaT,NaT,NaT,3.186563e-05,True,False
2045,1970-01-01 00:34:05,4.945095e-09,NaT,False,NaT,NaT,NaT,4.945095e-09,True,False
2046,1970-01-01 00:34:06,2.570800e-01,NaT,True,NaT,1970-01-01 00:34:07,1970-01-01 00:34:07,2.570800e-01,True,True


In [47]:
# TP + FN
num_slide_changes = sanitized_signals.change_time_dt.notna().sum()
num_slide_changes

18

In [52]:
# TN + FP
num_non_slide_changes = sanitized_signals.shape[0] - num_slide_changes
num_non_slide_changes

2030

In [48]:
# TP + FP
num_signals = sanitized_signals.signal.sum()
num_signals

16

In [49]:
# TP
tp = sanitized_signals.true_pos.sum()
tp

15

In [50]:
# FP
fp = num_signals - tp
fp

1

In [51]:
# FN
fn = num_slide_changes - tp
fn

3

In [53]:
# TN
tn = num_non_slide_changes - fp
tn

2029

In [54]:
accuracy = (tp + tn) / sanitized_signals.shape[0]
accuracy

0.998046875

In [42]:
precision = tp / num_signals
precision

0.9375

In [43]:
recall = tp / num_slide_changes
recall

0.8333333333333334

In [44]:
f1 = 2 * (precision * recall) / (precision + recall)
f1

0.8823529411764706

In [18]:
mask = sanitized_signals.signal
sanitized_signals.loc[mask, ['signal_to_change_time_dt']]

Unnamed: 0,signal_to_change_time_dt
2,NaT
5,NaT
8,NaT
10,NaT
16,NaT
...,...
2036,NaT
2039,NaT
2043,NaT
2044,NaT


In [19]:
sanitized_signals[['elapsed_dt', 'change_time_dt']] \
                .merge(sanitized_signals.loc[mask, ['signal_to_change_time_dt']],
                       how='left', left_on='elapsed_dt', right_on='signal_to_change_time_dt')

Unnamed: 0,elapsed_dt,change_time_dt,signal_to_change_time_dt
0,1970-01-01 00:00:01,NaT,NaT
1,1970-01-01 00:00:02,NaT,NaT
2,1970-01-01 00:00:03,NaT,NaT
3,1970-01-01 00:00:04,NaT,NaT
4,1970-01-01 00:00:05,NaT,NaT
...,...,...,...
2038,1970-01-01 00:34:03,NaT,NaT
2039,1970-01-01 00:34:04,NaT,NaT
2040,1970-01-01 00:34:05,NaT,NaT
2041,1970-01-01 00:34:06,NaT,NaT


In [23]:
mask = sanitized_signals.signal
labels = sanitized_signals[['elapsed_dt', 'signal', 'change_time_dt']] \
                .merge(sanitized_signals.loc[mask, ['signal_to_change_time_dt']],
                       how='left', left_on='elapsed_dt', right_on='signal_to_change_time_dt')
labels

Unnamed: 0,elapsed_dt,signal,change_time_dt,signal_to_change_time_dt
0,1970-01-01 00:00:01,False,NaT,NaT
1,1970-01-01 00:00:02,True,NaT,NaT
2,1970-01-01 00:00:03,False,NaT,NaT
3,1970-01-01 00:00:04,False,NaT,NaT
4,1970-01-01 00:00:05,True,NaT,NaT
...,...,...,...,...
2038,1970-01-01 00:34:03,True,NaT,NaT
2039,1970-01-01 00:34:04,True,NaT,NaT
2040,1970-01-01 00:34:05,False,NaT,NaT
2041,1970-01-01 00:34:06,True,NaT,NaT


In [27]:
num_slide_changes = labels.change_time_dt.notna().sum()
num_slide_changes

18

In [28]:
num_signals = labels.signal.sum()
num_signals

510

In [29]:
true_pos = labels.signal_to_change_time_dt.notna().sum()
true_pos

16

In [55]:
sanitized_signals.query('signal')

Unnamed: 0,elapsed_dt,change_time_dt,signal,signal_to_change_time_dt
81,1970-01-01 00:01:21,NaT,True,1970-01-01 00:01:22
115,1970-01-01 00:01:55,NaT,True,1970-01-01 00:01:56
120,1970-01-01 00:02:00,NaT,True,1970-01-01 00:02:01
214,1970-01-01 00:03:34,1970-01-01 00:03:34,True,1970-01-01 00:03:34
398,1970-01-01 00:06:38,NaT,True,1970-01-01 00:06:39
443,1970-01-01 00:07:23,NaT,True,1970-01-01 00:07:24
542,1970-01-01 00:09:02,1970-01-01 00:09:02,True,1970-01-01 00:09:02
610,1970-01-01 00:10:10,NaT,True,1970-01-01 00:10:11
766,1970-01-01 00:12:46,NaT,True,1970-01-01 00:12:47
917,1970-01-01 00:15:17,1970-01-01 00:15:17,True,1970-01-01 00:15:17
