In [None]:
import os
import glob

# Directory under which each video has its own sub-directory of extracted frames.
# NOTE(review): absolute, machine-specific path; HOME_PATH is rebound to the
# dataset root in a later cell, so cell order matters.
HOME_PATH = "C:/Users/Bhagyashree/Desktop/project/kidsguard-dataset/video_splits/"

In [None]:
# One frame directory per entry of the videos folder: the part of the entry
# name before its first '.' is used as the sub-directory name under HOME_PATH.
frame_directories = [
    os.path.join(HOME_PATH, entry.split('.')[0])
    for entry in os.listdir("C:/Users/Bhagyashree/Desktop/project/kidsguard-dataset/videos/")
]

In [None]:
from PIL import Image
import numpy as np

def read_image(img_path):
    """Load the image stored at ``img_path`` as a NumPy array.

    Returns ``np.asarray`` of the opened PIL image when ``img_path`` is an
    existing file; otherwise returns a one-element array of zeros.
    """
    if not os.path.isfile(img_path):
        # Placeholder for a missing frame.
        return np.zeros((1,))
    return np.asarray(Image.open(img_path))

In [None]:
def chunks(l, n):
    """Lazily split ``l`` into consecutive slices holding at most ``n`` items each."""
    starts = range(0, len(l), n)
    for begin in starts:
        end = begin + n
        yield l[begin:end]

In [None]:
import re

def natural_sort(l):
    """Return the strings of ``l`` sorted case-insensitively, comparing runs of
    ASCII digits by numeric value (so 'frame2' sorts before 'frame10')."""
    digit_runs = re.compile('([0-9]+)')

    def _sort_key(item):
        # Splitting on a capturing group keeps the digit runs, so the pieces
        # alternate text / digits; digit pieces are compared as integers.
        return [int(piece) if piece.isdigit() else piece.lower()
                for piece in digit_runs.split(item)]

    return sorted(l, key=_sort_key)

In [None]:
# Load the pretrained VGG-19 (weights are downloaded on first use) and build the feature extractor
import torch
import torchvision.models as models
import torch.nn as nn

use_cuda = torch.cuda.is_available()

# Keep only the `features` stack of the pretrained VGG-19 and append an
# adaptive max-pool with output size 1, so every channel is reduced to one value.
vgg19 = models.vgg19(pretrained=True)
layers = [*vgg19.features.children(), nn.AdaptiveMaxPool2d(1)]
modified_vgg19 = nn.Sequential(*layers)

# The network is used purely for inference: no gradients, evaluation mode.
for param in modified_vgg19.parameters():
    param.requires_grad = False
modified_vgg19.eval()

if use_cuda:
    modified_vgg19.cuda()
print(modified_vgg19)

In [None]:
# features are extracted from every frame
from torch.autograd import Variable
import torchvision.transforms as transforms

# Per-channel mean/std used to standardize a frame before it is fed to the network.
# NOTE(review): these look like the usual ImageNet statistics — confirm.
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
# Full preprocessing: convert the frame to a tensor, then apply `normalize`.
normalizer = transforms.Compose([transforms.ToTensor(), normalize])

def get_vgg_features_from_frame(frame_paths):
    """Run the frames at ``frame_paths`` through ``modified_vgg19`` as one batch.

    Each frame is loaded with ``read_image`` and preprocessed with
    ``normalizer``. Returns a NumPy array with one feature row per frame; when
    fewer than six frames are given, all-zero rows are appended so that the
    result has six rows.
    """
    batch = [normalizer(read_image(path)).unsqueeze(0) for path in frame_paths]
    frame_tensors = Variable(torch.cat(batch, 0))
    if use_cuda:
        frame_tensors = frame_tensors.cuda()

    frame_features = modified_vgg19(frame_tensors)
    # Flatten to (frames, channels); the remaining dimensions are the 1x1
    # output of the final pooling layer.
    n_frames, n_channels = frame_features.shape[0], frame_features.shape[1]
    np_frame_features = frame_features.view(n_frames, n_channels).cpu().data.numpy()

    # Zero-pad short batches up to six rows.
    for row in range(np_frame_features.shape[0], 6):
        np_frame_features = np.insert(np_frame_features, row, 0, axis=0)
    return np_frame_features

In [None]:
# Extracted features are appended to the frames_features.hdf5 file
import h5py

def save_checkpoint(frame_data, video_ids, path='C:/Users/Bhagyashree/Desktop/project/kidsguard-dataset/processed/aggregate_1_sec/frames_features.hdf5'):
    """Append a batch of frame features and their video ids to an HDF5 file.

    The file holds two resizable datasets, 'frames' (rows of shape (6, 512))
    and 'vids' (variable-length strings); both are created on first use and
    grown along axis 0 on every call.

    Args:
        frame_data: sequence of feature blocks, each matching the (6, 512)
            row shape of the 'frames' dataset.
        video_ids: sequence of video id strings.
        path: HDF5 file to append to; its parent directory is created if needed.

    An input that is empty is skipped, and a call where both inputs are empty
    returns without touching the file.
    """
    frame_data = np.array(frame_data)
    video_ids = np.array(video_ids)
    if frame_data.shape[0] == 0 and video_ids.shape[0] == 0:
        return

    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)

    with h5py.File(path, 'a', libver='latest') as f:
        try:
            frame_dset = f['frames']
            vids_dset = f['vids']
        except KeyError:
            frame_dset = f.create_dataset('frames', shape=(0, 6, 512), maxshape=(None, 6, 512), compression = 'gzip')
            vids_dset = f.create_dataset('vids', shape=(0, ), maxshape=(None, ), compression = 'gzip', dtype=h5py.special_dtype(vlen=str))
            f.swmr_mode = True

        for dset, rows in ((frame_dset, frame_data), (vids_dset, video_ids)):
            n_new = rows.shape[0]
            if n_new == 0:
                # A slice written as `dset[-0:]` would address the WHOLE
                # dataset, so empty inputs must never reach the assignment.
                continue
            n_old = dset.shape[0]
            dset.resize(n_old + n_new, axis=0)
            # Explicit start offset instead of a negative slice.
            dset[n_old:] = rows
        print(frame_dset.shape)

In [None]:
# For every video directory: collect its frame files in natural order, group
# them into chunks of 6 (the notebook treats one chunk as one second of video),
# run each chunk through VGG-19 and append the resulting features to the
# checkpoint file.
for directory in frame_directories:
    features = []
    vids = []
    frame_list = []
    for root, _subdirs, files in os.walk(directory, topdown=False):
        frame_list.extend(os.path.join(root, name) for name in files)

    # basename/normpath yields the directory name whichever path separator was
    # used; splitting on os.sep returns the whole path for '/'-style paths on Windows.
    video_id = os.path.basename(os.path.normpath(directory))

    for frames_per_second in chunks(natural_sort(frame_list), 6):
        # Chunks holding a single frame are skipped.
        if len(frames_per_second) > 1:
            features.append(get_vgg_features_from_frame(frames_per_second))
            vids.append(video_id)

    # Save once per video. Saving the cumulative list after every chunk would
    # write the earlier chunks again on each iteration.
    if features:
        save_checkpoint(features, vids)

In [None]:
# From here on HOME_PATH is the dataset root (it previously pointed at the
# video_splits sub-directory); all *_PATH constants below are relative to it.
HOME_PATH = 'C:/Users/Bhagyashree/Desktop/project/kidsguard-dataset/'

# Template for a per-video annotation file; {0} is filled with the video id.
ANNOTATION_PATH_SUFFIX = 'annotations/{0}.txt'
# File that receives features together with their annotation labels.
DATASET_PATH = 'processed/annotated_data.hdf5'

# Textual annotation label -> integer class id.
ANNOTATION_LABELS = {
    'none': 0,
    'violent' : 1,
    'sexual': 2,
    'both': 3
}

In [None]:
def read_hdf5(name, path=HOME_PATH + 'processed/aggregate_1_sec/frames_features.hdf5'):
    """Open ``path`` read-only, print the file object and return dataset ``name``.

    The file is not closed here; the returned object is the h5py dataset
    looked up in that open file, not an in-memory copy.
    """
    handle = h5py.File(path, 'r')
    print(handle)
    return handle[name]

In [None]:
# Dataset handles (not in-memory arrays) for the stored video ids and features.
vids = read_hdf5('vids')
frames = read_hdf5('frames')

In [None]:
# Collapse the video-id column into one record per run of identical ids:
#   {'start_index': first row of the run, 'vid': the id, 'length': rows in the run}
vid_details = []

all_vids = vids[:]  # read the ids once instead of one dataset lookup per row
run_start = 0
for i in range(1, len(all_vids) + 1):
    # Close the current run when the id changes or the data ends, so the last
    # video is recorded too and every length counts only that video's rows.
    if i == len(all_vids) or all_vids[i] != all_vids[run_start]:
        vid_details.append({
            'start_index': run_start,
            'vid': all_vids[run_start],
            'length': i - run_start,
        })
        run_start = i

In [None]:
# Features, their annotation labels and video ids are appended to the annotated_data.hdf5 file
def save_checkpoint1(frame_data,annotations, video_ids):
    """Append features, labels and video ids to the annotated dataset file.

    The target file is ``HOME_PATH + DATASET_PATH``. Its resizable datasets
    'frames' (rows of shape (6, 512)), 'annotations' and 'vids'
    (variable-length strings) are created on first use and grown along axis 0.

    Args:
        frame_data: feature blocks, each matching the (6, 512) row shape.
        annotations: numeric labels.
        video_ids: video id strings.

    An input that is empty is skipped, and a call where all three inputs are
    empty returns without touching the file.
    """
    frame_data = np.array(frame_data)
    annotations = np.array(annotations)
    video_ids = np.array(video_ids)
    if not (frame_data.shape[0] or annotations.shape[0] or video_ids.shape[0]):
        return

    path=HOME_PATH+DATASET_PATH
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)

    with h5py.File(path, 'a', libver='latest') as f:
        try:
            frame_dset = f['frames']
            annotation_dset = f['annotations']
            vids_dset = f['vids']
        except KeyError:
            frame_dset = f.create_dataset('frames', shape=(0, 6, 512), maxshape=(None, 6, 512), compression = 'gzip')
            annotation_dset = f.create_dataset('annotations', shape=(0, ), maxshape=(None,), compression = 'gzip')
            vids_dset = f.create_dataset('vids', shape=(0, ), maxshape=(None, ), compression = 'gzip', dtype=h5py.special_dtype(vlen=str))
            f.swmr_mode = True

        batches = (
            (frame_dset, frame_data),
            (annotation_dset, annotations),
            (vids_dset, video_ids),
        )
        for dset, rows in batches:
            n_new = rows.shape[0]
            if n_new == 0:
                # `dset[-0:]` would address the whole dataset, not "nothing".
                continue
            n_old = dset.shape[0]
            dset.resize(n_old + n_new, axis=0)
            dset[n_old:] = rows

In [None]:
# Pair the feature blocks of every video with the labels from that video's
# annotation file and append them to the annotated dataset.
# NOTE(review): assumes one annotation file per video whose name (up to the
# first '.') equals the video id, with one line per feature block, and that the
# last ':'/space-separated token of a line is a key of ANNOTATION_LABELS —
# confirm against the real annotation files.
path1="C:/Users/Bhagyashree/Desktop/project/kidsguard-dataset/videos/Annotations/"
annotation_files = {name.split('.')[0]: name for name in os.listdir(path1)}

for detail in vid_details:
    vid = detail['vid']
    # Ids read back from HDF5 may be bytes, and may be a full path rather than
    # a bare directory name; reduce both to the plain video name.
    vid_name = vid.decode('utf-8') if isinstance(vid, bytes) else str(vid)
    vid_name = os.path.basename(os.path.normpath(vid_name))

    file = annotation_files.get(vid_name)
    if file is None:
        print("no annotation file for", vid_name)
        continue
    file_path=path1+file
    print(file_path)

    bad_annotation_ctr = 0
    annotated_features = []
    annotations = []
    annotation_vids = []
    with open(file_path) as f:
        content = [x.strip() for x in f.readlines()]

    # Never read past this video's own rows in `frames`.
    for frame_index, annotation in enumerate(content[:detail['length']]):
        m = re.search('[^: ]+$', annotation)
        try:
            label = ANNOTATION_LABELS[m.group(0)]
        except (AttributeError, KeyError):
            # AttributeError: no token on the line (m is None);
            # KeyError: token is not a known label.
            bad_annotation_ctr += 1
            continue
        annotations.append(label)
        annotated_features.append(frames[detail['start_index'] + frame_index])
        annotation_vids.append(vid)

    assert len(annotated_features) + bad_annotation_ctr <= detail['length']
    if annotated_features:
        save_checkpoint1(annotated_features, annotations, annotation_vids)


In [None]:
# Input of the aggregation step (relative to HOME_PATH).
READ_DATASET = 'processed/aggregate_1_sec/frames_features.hdf5'

# Number of consecutive rows of the input that are merged into one clip.
SECONDS_PER_CLIP = 3
NUM_CLASSES = 4

# Output of the aggregation step (relative to HOME_PATH).
WRITE_DATASET = 'processed/aggregate_{0}_sec/frames_features.hdf5'.format(SECONDS_PER_CLIP)

In [None]:
path=HOME_PATH+READ_DATASET
def read_hdf5(name, path):
    """Read dataset ``name`` from the HDF5 file at ``path`` fully into memory.

    The file is closed before returning; the result is an in-memory copy.

    NOTE: this definition replaces the earlier ``read_hdf5`` of this notebook,
    which had a default ``path`` and returned an open dataset handle.
    """
    with h5py.File(path, 'r') as f:
        return f[name][()]

In [None]:
# Load the stored features from HOME_PATH + READ_DATASET (read_hdf5 returns f[name][()]).
frames = read_hdf5('frames',path)

In [None]:
# Merge every SECONDS_PER_CLIP consecutive rows into one clip by concatenating
# them along axis 1. Trailing rows that do not fill a whole clip are dropped;
# without the truncation np.reshape raises whenever the row count is not a
# multiple of SECONDS_PER_CLIP.
num_clips = frames.shape[0] // SECONDS_PER_CLIP
reshaped_frame = np.reshape(
    frames[:num_clips * SECONDS_PER_CLIP],
    (num_clips, frames.shape[1] * SECONDS_PER_CLIP, frames.shape[2]),
)

In [None]:
def save_data(frames, path=HOME_PATH+WRITE_DATASET):
    """Write ``frames`` as the gzip-compressed dataset 'frames' of a new HDF5 file.

    Args:
        frames: array to store.
        path: destination file; opened in 'w' mode, so an existing file is
            overwritten. The parent directory of ``path`` (not of the default
            location) is created when missing.
    """
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)
    with h5py.File(path, 'w') as f:
        f.create_dataset('frames', data=frames, compression='gzip')

In [None]:
# Write the clip-level features to the default location, HOME_PATH + WRITE_DATASET.
save_data(reshaped_frame)

In [None]:
# Input of the annotated aggregation step (relative to HOME_PATH).
READ_DATASET1 = 'processed/annotated_data.hdf5'

# Same values as in the earlier aggregation cell; re-stated here.
SECONDS_PER_CLIP = 3
NUM_CLASSES = 4

# Output: clip-level features with one label per clip (relative to HOME_PATH).
WRITE_DATASET1 = 'processed/aggregate_{0}_sec/unbalanced_data.hdf5'.format(SECONDS_PER_CLIP)

In [None]:
def read_hdf5_1(name, path=HOME_PATH + READ_DATASET1):
    """Read dataset ``name`` from the HDF5 file at ``path`` fully into memory.

    Prints the file object, then returns an in-memory copy of the dataset; the
    file is closed before returning.
    """
    with h5py.File(path, 'r') as f:
        print(f)
        return f[name][()]

In [None]:
# Load the annotated features and their labels from HOME_PATH + READ_DATASET1.
frames = read_hdf5_1('frames')
annotations = read_hdf5_1('annotations')

In [None]:
# Merge every SECONDS_PER_CLIP consecutive rows into one clip by concatenating
# them along axis 1. Trailing rows that do not fill a whole clip are dropped;
# without the truncation np.reshape raises whenever the row count is not a
# multiple of SECONDS_PER_CLIP.
num_frame_clips = frames.shape[0] // SECONDS_PER_CLIP
reshaped_frames = np.reshape(
    frames[:num_frame_clips * SECONDS_PER_CLIP],
    (num_frame_clips, frames.shape[1] * SECONDS_PER_CLIP, frames.shape[2]),
)

In [None]:
# One label per clip: the most frequent label among the clip's SECONDS_PER_CLIP
# entries (argmax over np.bincount picks the smallest label on a tie).
# Only complete groups are reduced, so a trailing partial group does not
# produce a label that has no matching clip of features.
num_label_clips = annotations.shape[0] // SECONDS_PER_CLIP
reshaped_annotations = [
    np.bincount(annotations[start:start + SECONDS_PER_CLIP].astype(int)).argmax()
    for start in range(0, num_label_clips * SECONDS_PER_CLIP, SECONDS_PER_CLIP)
]

In [None]:
# Convert the list of per-clip labels into a NumPy array.
reshaped_annotations = np.array(reshaped_annotations)

In [None]:
# Count how many clips carry each label; list index == label value.
label_check_ctr = [0] * NUM_CLASSES
for clip_label in reshaped_annotations:
    label_check_ctr[int(clip_label)] += 1

In [None]:
#unbalanced_data.hdf5 file is created which has all the annotations 
def save_data1(frames, annotations, path=HOME_PATH+WRITE_DATASET1):
    """Write ``frames`` and ``annotations`` as gzip-compressed datasets of a new HDF5 file.

    Args:
        frames: array stored as dataset 'frames'.
        annotations: array stored as dataset 'annotations'.
        path: destination file; opened in 'w' mode, so an existing file is
            overwritten. The parent directory of ``path`` (not of the default
            location) is created when missing.
    """
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)
    with h5py.File(path, 'w') as f:
        f.create_dataset('frames', data=frames, compression='gzip')
        f.create_dataset('annotations', data=annotations, compression='gzip')

In [None]:
# Write clip-level features and labels to the default location, HOME_PATH + WRITE_DATASET1.
save_data1(reshaped_frames, reshaped_annotations)

In [None]:
# Input and the two outputs of the safe/explicit split (relative to HOME_PATH).
READ_DATASET_PATH = 'processed/aggregate_{0}_sec/unbalanced_data.hdf5'.format(SECONDS_PER_CLIP)
WRITE_SAFE_DATASET_PATH = 'processed/aggregate_{0}_sec/safe_data.hdf5'.format(SECONDS_PER_CLIP)
WRITE_EXPLICIT_DATASET_PATH = 'processed/aggregate_{0}_sec/explicit_data.hdf5'.format(SECONDS_PER_CLIP)

# Per-label counts, indexed by label value, consulted by the sampling loop below.
# NOTE(review): hard-coded numbers — presumably the class totals of the
# author's dataset; confirm before reusing with other data.
label_ctr = [12313, 6795, 2268, 3244]

In [None]:
def read_hdf5_2(name, path=HOME_PATH + READ_DATASET_PATH):
    """Read dataset ``name`` from the HDF5 file at ``path`` fully into memory.

    Prints the file object, then returns an in-memory copy of the dataset; the
    file is closed before returning.
    """
    with h5py.File(path, 'r') as f:
        print(f)
        return f[name][()]

In [None]:
# Load clip-level features and labels from HOME_PATH + READ_DATASET_PATH.
frames = read_hdf5_2('frames')
annotations = read_hdf5_2('annotations')

In [None]:
def save_checkpoint2(frame_data, annotations, path):
    """Append clip features and their labels to the HDF5 file at ``path``.

    The resizable datasets 'frames' (rows of shape (6*SECONDS_PER_CLIP, 512))
    and 'annotations' are created on first use and grown along axis 0.

    Args:
        frame_data: clip feature blocks matching the 'frames' row shape.
        annotations: numeric labels.
        path: HDF5 file to append to; its parent directory is created if needed.

    An input that is empty is skipped, and a call where both inputs are empty
    returns without touching the file.
    """
    frame_data = np.array(frame_data)
    annotations = np.array(annotations)
    if frame_data.shape[0] == 0 and annotations.shape[0] == 0:
        return

    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)

    with h5py.File(path, 'a', libver='latest') as f:
        try:
            frame_dset = f['frames']
            annotation_dset = f['annotations']
        except KeyError:
            frame_dset = f.create_dataset('frames', shape=(0, 6*SECONDS_PER_CLIP, 512), maxshape=(None, 6*SECONDS_PER_CLIP, 512), compression = 'gzip')
            annotation_dset = f.create_dataset('annotations', shape=(0, ), maxshape=(None,), compression = 'gzip')
            f.swmr_mode = True

        for dset, rows in ((frame_dset, frame_data), (annotation_dset, annotations)):
            n_new = rows.shape[0]
            if n_new == 0:
                # `dset[-0:]` would address the whole dataset, not "nothing".
                continue
            n_old = dset.shape[0]
            dset.resize(n_old + n_new, axis=0)
            dset[n_old:] = rows
        print(frame_dset.shape)

In [None]:
# Build the two-level datasets used by the models:
#   safe_data.hdf5     - every sampled clip, labelled 0 (label 0) or 1 (any other label)
#   explicit_data.hdf5 - only clips with a non-zero label, labelled `label - 1`
# NOTE(review): both files are opened in append mode, so running this cell
# again adds the samples a second time; delete the files before a re-run.
def _flush_batch(batch_frames, batch_labels, dataset_path):
    """Append a batch through save_checkpoint2; an empty batch is skipped."""
    if batch_frames:
        save_checkpoint2(batch_frames, batch_labels, dataset_path)


lvl_0_annotated_frames = []
lvl_0_annotation_labels = []

lvl_1_annotated_frames = []
lvl_1_annotation_labels = []

# Work on a copy of the quotas so label_ctr itself stays untouched.
remaining = list(label_ctr)
print(np.sum(remaining))
save_every = 1000
ctr = 0

rand_indices = np.random.permutation(frames.shape[0])

for rand_idx in rand_indices:
    if not np.any(remaining):
        break
    label = int(annotations[rand_idx])
    if remaining[label] > 0:
        # Consume one unit of this label's quota; without the decrement the
        # quota check above could never become false.
        remaining[label] -= 1
        ctr += 1
        if label == 0:
            lvl_0_annotated_frames.append(frames[rand_idx])
            lvl_0_annotation_labels.append(annotations[rand_idx])
        else:
            lvl_0_annotated_frames.append(frames[rand_idx])
            lvl_0_annotation_labels.append(1)

            lvl_1_annotated_frames.append(frames[rand_idx])
            lvl_1_annotation_labels.append(annotations[rand_idx]-1)

        if ctr % save_every == 0:
            _flush_batch(lvl_0_annotated_frames, lvl_0_annotation_labels, HOME_PATH+WRITE_SAFE_DATASET_PATH)
            _flush_batch(lvl_1_annotated_frames, lvl_1_annotation_labels, HOME_PATH+WRITE_EXPLICIT_DATASET_PATH)
            print(remaining)
            lvl_0_annotated_frames = []
            lvl_0_annotation_labels = []

            lvl_1_annotated_frames = []
            lvl_1_annotation_labels = []

# Write whatever is left after the last full batch.
_flush_batch(lvl_0_annotated_frames, lvl_0_annotation_labels, HOME_PATH+WRITE_SAFE_DATASET_PATH)
_flush_batch(lvl_1_annotated_frames, lvl_1_annotation_labels, HOME_PATH+WRITE_EXPLICIT_DATASET_PATH)

In [None]:
# Rebuild the four-class tally from the two written files.
lvl0_annotations = read_hdf5('annotations', HOME_PATH+WRITE_SAFE_DATASET_PATH)
lvl1_annotations = read_hdf5('annotations', HOME_PATH+WRITE_EXPLICIT_DATASET_PATH)

label_check_ctr = [0 for _ in range(4)]
lvl1_ctr = 0
for raw_label in lvl0_annotations:
    if int(raw_label) == 0:
        label_check_ctr[0] += 1
        continue
    # Every non-zero entry of the safe file corresponds to the next entry of
    # the explicit file, which stores `label - 1`.
    label_check_ctr[int(lvl1_annotations[lvl1_ctr]) + 1] += 1
    lvl1_ctr += 1
print(label_check_ctr)

In [None]:
# Input and output of the balancing step (relative to HOME_PATH).
READ_DATASET_PATH1 = 'processed/aggregate_{0}_sec/unbalanced_data.hdf5'.format(SECONDS_PER_CLIP)
WRITE_DATASET_PATH1 = 'processed/aggregate_{0}_sec/balanced_data.hdf5'.format(SECONDS_PER_CLIP)
# Per-label counts, indexed by label value, consulted by the sampling loop below.
# NOTE(review): hard-coded numbers, identical to label_ctr above — confirm
# they are the intended quotas.
label_ctr1 = [12313, 6795, 2268, 3244]

In [None]:
def read_hdf5_3(name, path=HOME_PATH + READ_DATASET_PATH1):
    """Read dataset ``name`` from the HDF5 file at ``path`` fully into memory.

    The file is opened read-only (nothing is written here) and closed before
    returning; the result is an in-memory copy of the dataset. The default
    path uses READ_DATASET_PATH1, the constant declared for this step.
    """
    with h5py.File(path, 'r') as f:
        return f[name][()]

In [None]:
# Load clip-level features and labels through read_hdf5_3's default path.
frames = read_hdf5_3('frames')
annotations = read_hdf5_3('annotations')

In [None]:
# Count how many clips carry each of the four labels; list index == label value.
label_check_ctr1 = [0 for _ in range(4)]
for raw_label in annotations:
    label_check_ctr1[int(raw_label)] += 1

In [None]:
#balanced_data.hdf5 file is created which has all the annotations in the proper format
def save_checkpoint3(frame_data, annotations):
    """Append clip features and their labels to the balanced dataset file.

    The target file is ``HOME_PATH + WRITE_DATASET_PATH1``. Its resizable
    datasets 'frames' (rows of shape (6*SECONDS_PER_CLIP, 512)) and
    'annotations' are created on first use and grown along axis 0.

    Args:
        frame_data: clip feature blocks matching the 'frames' row shape.
        annotations: numeric labels.

    An input that is empty is skipped, and a call where both inputs are empty
    returns without touching the file.
    """
    frame_data = np.array(frame_data)
    annotations = np.array(annotations)
    if frame_data.shape[0] == 0 and annotations.shape[0] == 0:
        return

    # WRITE_DATASET_PATH1 is the constant this notebook defines for the
    # balanced output; no WRITE_DATASET_PATH exists.
    path=HOME_PATH+WRITE_DATASET_PATH1
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)

    with h5py.File(path, 'a', libver='latest') as f:
        try:
            frame_dset = f['frames']
            annotation_dset = f['annotations']
        except KeyError:
            frame_dset = f.create_dataset('frames', shape=(0, 6*SECONDS_PER_CLIP, 512), maxshape=(None, 6*SECONDS_PER_CLIP, 512), compression = 'gzip')
            annotation_dset = f.create_dataset('annotations', shape=(0, ), maxshape=(None,), compression = 'gzip')
            f.swmr_mode = True

        for dset, rows in ((frame_dset, frame_data), (annotation_dset, annotations)):
            n_new = rows.shape[0]
            if n_new == 0:
                # `dset[-0:]` would address the whole dataset, not "nothing".
                continue
            n_old = dset.shape[0]
            dset.resize(n_old + n_new, axis=0)
            dset[n_old:] = rows
        print(frame_dset.shape)

In [None]:
# Sample clips in random order, taking at most label_ctr1[label] clips per
# label, and append them to the balanced dataset in batches.
annotated_frames = []
annotation_labels = []

# Work on a copy so the quota that is tested is also the one that is
# decremented, and label_ctr1 itself stays untouched.
remaining1 = list(label_ctr1)
print(np.sum(remaining1))
save_every = 2000
ctr = 0
rand_indices = np.random.permutation(frames.shape[0])
for rand_idx in rand_indices:
    if not np.any(remaining1):
        break
    label = int(annotations[rand_idx])
    if remaining1[label] > 0:
        ctr += 1
        annotated_frames.append(frames[rand_idx])
        annotation_labels.append(annotations[rand_idx])
        remaining1[label] -= 1
        if ctr % save_every == 0:
            save_checkpoint3(annotated_frames, annotation_labels)
            print(remaining1)
            annotated_frames = []
            annotation_labels = []

# Write the final partial batch; otherwise up to save_every - 1 clips are lost.
if annotated_frames:
    save_checkpoint3(annotated_frames, annotation_labels)


In [None]:
# Configuration of the train/test split; NUM_CLASSES is 3 from here on.
SECONDS_PER_CLIP = 3
NUM_CLASSES = 3
PROMINENT_TEST_CLASS = 2
# The split is made from explicit_data.hdf5 (paths are relative to HOME_PATH).
READ_DATASET_FILE2 = 'explicit_data'
READ_DATASET_PATH2 = 'processed/aggregate_{0}_sec/{1}.hdf5'.format(SECONDS_PER_CLIP, READ_DATASET_FILE2)
# The templates below have no {1} placeholder, so the second format argument is ignored.
WRITE_TRAIN_DATASET_PATH2 = 'processed/aggregate_{0}_sec/train_balanced_data.hdf5'.format(SECONDS_PER_CLIP, READ_DATASET_FILE2)
WRITE_TEST_DATASET_PATH2 = 'processed/aggregate_{0}_sec/test_balanced_data.hdf5'.format(SECONDS_PER_CLIP, READ_DATASET_FILE2)

In [None]:
def read_hdf5_4(name, path=HOME_PATH+READ_DATASET_PATH2):
    """Read dataset ``name`` from the HDF5 file at ``path`` fully into memory.

    The file is closed before returning; the result is an in-memory copy of
    the dataset.
    """
    with h5py.File(path, 'r') as f:
        return f[name][()]

In [None]:
# Load features and labels from HOME_PATH + READ_DATASET_PATH2.
frames = read_hdf5_4('frames')
annotations = read_hdf5_4('annotations')

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
def save_splits(frames, annotations, path):
    """Store ``frames`` and ``annotations`` as gzip-compressed datasets in a new HDF5 file.

    ``path`` is opened in 'w' mode, so an existing file is overwritten.
    """
    payload = (('frames', frames), ('annotations', annotations))
    with h5py.File(path, 'w') as out_file:
        for dataset_name, values in payload:
            out_file.create_dataset(dataset_name, data=values, compression='gzip')

In [None]:
# Stratified, shuffled 80/20 split with a fixed random_state.
# NOTE(review): `annnotations_train` (three n's) is the spelling the later cells rely on.
frames_train, frames_test, annnotations_train, annotations_test = train_test_split(frames, annotations, test_size=0.2, random_state=42, shuffle=True, stratify=annotations)

In [None]:
# Samples per class in the full dataset; list index == class id.
label = [0] * NUM_CLASSES
for class_idx in annotations.astype(int):
    label[class_idx] += 1
label

In [None]:
# Samples per class in the training split; list index == class id.
label_train = [0] * NUM_CLASSES
for class_idx in annnotations_train.astype(int):
    label_train[class_idx] += 1
label_train

In [None]:
# Samples per class in the test split; list index == class id.
label_test = [0] * NUM_CLASSES
for class_idx in annotations_test.astype(int):
    label_test[class_idx] += 1
label_test

In [None]:
# Write the two splits to train_balanced_data.hdf5 and test_balanced_data.hdf5.
# The data being split was loaded from READ_DATASET_PATH2 (explicit_data.hdf5).
save_splits(frames_train, annnotations_train, HOME_PATH+WRITE_TRAIN_DATASET_PATH2)
save_splits(frames_test, annotations_test, HOME_PATH+WRITE_TEST_DATASET_PATH2)