## Export clips

In [113]:
import pandas as pd
from datetime import datetime
import time
import os

# strptime pattern of the `start_time` values stored in the recording CSV.
date_format = '%Y-%m-%d %H:%M:%S.%f'

# Identifier of the recording; it is both the folder name and the stem of the
# CSV / MP4 files inside that folder.
FILE_NAME = '757f7150-ba06-11eb-b0dd-42010a8a001e'

# Create the output folder for the exported clips. Only "already exists" is
# tolerated (so re-running the cell is a no-op); a missing recording folder or
# a permission problem surfaces here instead of being silently ignored.
try:
    os.mkdir(f'{FILE_NAME}/clips')
except FileExistsError:
    pass


# Per-recording table; each row is later turned into one clip.
data = pd.read_csv(f'{FILE_NAME}/{FILE_NAME}.csv')

In [114]:
# Length of every exported clip, in seconds.
CLIP_SECONDS = 10

# Every clip has the same length, so the duration string is formatted once.
duration = time.strftime('%H:%M:%S', time.gmtime(CLIP_SECONDS)) + '.0'

# Each entry is [start offset, duration], both as 'HH:MM:SS.0' strings.
clips = []
if not data.empty:
    # The first row's start time is the origin that all offsets are measured from;
    # parse it once instead of on every iteration.
    recording_start = datetime.strptime(data.loc[0, 'start_time'], date_format)

    for raw_start in data['start_time']:
        offset_seconds = (datetime.strptime(raw_start, date_format) - recording_start).total_seconds()
        # gmtime() drops the fractional part of the offset and wraps around at 24 hours;
        # the '.0' suffix is a fixed literal, not the real sub-second remainder.
        start_time = time.strftime('%H:%M:%S', time.gmtime(offset_seconds)) + '.0'
        clips.append([start_time, duration])

In [115]:
# Display the [start offset, duration] string pairs built by the previous cell.
clips

[['00:00:00.0', '00:00:10.0'],
 ['00:00:10.0', '00:00:10.0'],
 ['00:00:20.0', '00:00:10.0'],
 ['00:00:30.0', '00:00:10.0'],
 ['00:00:40.0', '00:00:10.0'],
 ['00:00:50.0', '00:00:10.0'],
 ['00:01:00.0', '00:00:10.0'],
 ['00:01:10.0', '00:00:10.0'],
 ['00:01:20.0', '00:00:10.0'],
 ['00:01:30.0', '00:00:10.0'],
 ['00:01:40.0', '00:00:10.0'],
 ['00:01:50.0', '00:00:10.0'],
 ['00:02:00.0', '00:00:10.0'],
 ['00:02:10.0', '00:00:10.0'],
 ['00:02:20.0', '00:00:10.0'],
 ['00:02:30.0', '00:00:10.0'],
 ['00:02:40.0', '00:00:10.0'],
 ['00:02:50.0', '00:00:10.0'],
 ['00:03:00.0', '00:00:10.0'],
 ['00:03:10.0', '00:00:10.0'],
 ['00:03:20.0', '00:00:10.0'],
 ['00:03:30.0', '00:00:10.0'],
 ['00:03:40.0', '00:00:10.0'],
 ['00:03:50.0', '00:00:10.0'],
 ['00:04:00.0', '00:00:10.0'],
 ['00:04:10.0', '00:00:10.0'],
 ['00:04:20.0', '00:00:10.0'],
 ['00:04:30.0', '00:00:10.0'],
 ['00:04:40.0', '00:00:10.0'],
 ['00:04:50.0', '00:00:10.0'],
 ['00:05:00.0', '00:00:10.0'],
 ['00:05:10.0', '00:00:10.0'],
 ['00:05

In [116]:
import subprocess

# Source video of the recording, stored next to its CSV.
VIDEO_PATH = f'{FILE_NAME}/{FILE_NAME}.mp4'

# Cut one clip per entry of `clips` with ffmpeg (stream copy, no re-encoding).
# The command is passed as an argument list (no shell), each ffmpeg run is
# awaited before the next one starts, and non-zero exit codes are reported
# instead of being dropped.
failed_clips = []
for i, (clip_start, clip_duration) in enumerate(clips):
    cmd = [
        'ffmpeg.exe',
        '-nostdin',  # never wait for interactive input (e.g. an overwrite prompt)
        '-ss', clip_start,
        '-i', VIDEO_PATH,
        '-c', 'copy',
        '-t', clip_duration,
        f'{FILE_NAME}/clips/{i}.mp4',
    ]
    # run() drains stderr for us, so a chatty ffmpeg cannot block on a full pipe.
    result = subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE)
    if result.returncode != 0:
        failed_clips.append(i)
        # Only the tail of ffmpeg's log is shown to keep the cell output readable.
        print(f'clip {i} failed (exit code {result.returncode}):',
              result.stderr.decode(errors='replace')[-500:])

## Import labels and train

In [135]:
import json
import numpy as np

# Recordings whose annotation folders and feature CSVs make up the training set.
RECORDING_IDS = [
    'd3cb0836-b96c-11eb-b7da-42010a8a001e',
    'fd4241fe-b965-11eb-a55e-42010a8a001e',
    'f17db828-b9fe-11eb-8d36-42010a8a001e',
    '757f7150-ba06-11eb-b0dd-42010a8a001e',
]

# Collect one labelled frame per recording and concatenate once at the end.
# A dedicated loop variable is used so the module-level FILE_NAME is not overwritten.
labelled_frames = []
for recording_id in RECORDING_IDS:
    ann_dir = f'{recording_id}/ann'
    ann_files = os.listdir(ann_dir)

    # One label slot per annotation file; a slot stays 0 unless the clip is a highlight.
    labels = np.zeros(len(ann_files), dtype=int)
    for ann_name in ann_files:
        with open(f'{ann_dir}/{ann_name}', 'r') as f:
            annotation = json.load(f)

        # The part of the file name before the first '.' is used as the label position.
        clip_index = int(ann_name.split('.')[0])
        tags = annotation['tags']
        # NOTE(review): only the first tag is inspected, so a 'highlight' tag in any
        # other position is labelled 0 - confirm this matches the annotation tool.
        if tags and tags[0]['name'] == 'highlight':
            labels[clip_index] = 1

    recording_frame = pd.read_csv(f'{recording_id}/{recording_id}.csv')
    # Assigning the array raises if the number of annotation files differs from
    # the number of CSV rows.
    recording_frame['label'] = labels
    labelled_frames.append(recording_frame)

data = pd.concat(labelled_frames).reset_index(drop = True)

In [136]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score, roc_auc_score

# Classifier that is cross-validated and then refitted in the cells below.
model = LogisticRegression(
    max_iter=1000,
    solver='lbfgs',
)

# Shuffled 5-fold splitter; the fixed seed keeps the fold assignment repeatable.
kf = KFold(
    n_splits=5,
    shuffle=True,
    random_state=444,
)

In [137]:
from sklearn.preprocessing import StandardScaler
# Feature scaler; it is fitted on the feature matrix and saved with joblib in later cells.
sc = StandardScaler()

In [138]:
# Features are every column except the clip timestamps and the target.
X_raw = data.drop(columns = ['start_time', 'end_time', 'label'])
Y = data['label']

# Fit the scaler on all rows and build a NEW float frame from the result.
# Writing the scaled floats back in place (X.iloc[:] = ...) would depend on pandas
# silently upcasting any integer columns, which is deprecated behaviour.
X = pd.DataFrame(sc.fit_transform(X_raw), columns = X_raw.columns, index = X_raw.index)

In [139]:
from joblib import dump, load
# Save the scaler object to 'standard_scaler.joblib' in the working directory.
# NOTE(review): presumably so inference can apply the same scaling - confirm.
dump(sc, 'standard_scaler.joblib')

['standard_scaler.joblib']

In [140]:
# Probability cut-off that turns predicted probabilities into class labels for F1.
F1_THRESHOLD = 0.2

f1_scores = []
roc_auc_scores = []
# NOTE(review): X was scaled with a scaler fitted on all rows before this split,
# so each validation fold contributed to the scaling statistics.
for tr_idx, val_idx in kf.split(X, Y):
    X_tr, X_val = X.iloc[tr_idx], X.iloc[val_idx]
    Y_tr, Y_val = Y.iloc[tr_idx], Y.iloc[val_idx]

    model.fit(X_tr, Y_tr)
    # Column 1 of predict_proba is the probability of label 1.
    Y_pred = model.predict_proba(X_val)[:, 1]

    # Compute each metric once and reuse it for both the printout and the running lists.
    fold_f1 = f1_score(Y_val, Y_pred > F1_THRESHOLD)
    fold_roc_auc = roc_auc_score(Y_val, Y_pred)
    print('F1', fold_f1, 'ROC', fold_roc_auc)
    f1_scores.append(fold_f1)
    roc_auc_scores.append(fold_roc_auc)
print('MEAN F1', np.mean(f1_scores))
print('MEAN ROC', np.mean(roc_auc_scores))

F1 0.3673469387755102 ROC 0.8155885897950984
F1 0.4489795918367347 ROC 0.7888461538461539
F1 0.3934426229508197 ROC 0.7730823863636365
F1 0.49056603773584906 ROC 0.8344494047619047
F1 0.5098039215686274 ROC 0.8965116279069767
MEAN F1 0.4420278225735082
MEAN ROC 0.8216956325347541


In [132]:
# Refit the classifier on all rows of X / Y (the cross-validation loop left it fitted on a single fold's training split).
model.fit(X, Y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [133]:
# One row per feature with its logistic-regression coefficient. Building the frame
# from a dict keeps `weight` as a float column (transposing a list of lists would
# make both columns object dtype).
feature_importance = pd.DataFrame({
    'feature': X.columns.tolist(),
    'weight': model.coef_.reshape(-1),
})
# Largest positive coefficients first; X was standardised before fitting.
feature_importance.sort_values(by = 'weight', ascending = False)

Unnamed: 0,feature,weight
4,sound_loudness,0.757314
13,Machine gun,0.486599
18,Laughter,0.475522
10,"Child speech, kid speaking",0.39897
11,Fusillade,0.376248
3,movement_amount,0.3076
17,Boom,0.240096
16,Artillery fire,0.198991
2,negative_message_count,0.1987
15,Thunderstorm,0.185229


In [134]:
from joblib import dump, load
# Save the classifier object to 'model.joblib' in the working directory.
dump(model, 'model.joblib')

['model.joblib']