# Data Preparation

In [1]:
import cv2     
import matplotlib.pyplot as plt   
%matplotlib inline
import pandas as pd
import numpy as np   
from sklearn.model_selection import train_test_split
import os
import pickle
import splitfolders as splitfolders
from tqdm import tqdm
import math
import random

### Split the videos into training and testing set

Split the videos first, before extracting any frames, so that frames from the same video never end up in both the training and the testing set; otherwise the test accuracy would be unrealistically high.

In [2]:
# Split the video files into a training set (80%) and a testing set (20%).
# The middle entry of `ratio` is the validation share, which is set to 0 here.

input_folder = 'Anomaly_Dataset/Anomaly_Videos/'
output_folder = 'Anomaly_Dataset'
splitfolders.ratio(
    input_folder,
    output=output_folder,
    seed=1337,          # fixed seed so the split is reproducible
    ratio=(.8, .0, .2),
    group_prefix=None,
)

Copying files: 550 files [08:47,  1.04 files/s]


In [3]:
# get the names of all the files in the test, val and train folders

def getSetFiles(path):
    """Return a ``<parent folder>/<file name>`` entry for every file under ``path``.

    The directory tree is walked recursively. For each file only the name of
    its immediate parent folder is kept, joined to the file name with
    ``os.path.join`` (so the separator is the one of the current OS).

    Directories and files are visited in sorted order so that the listing is
    identical on every file system; ``os.walk`` by itself gives no ordering
    guarantee.

    Args:
        path: Root directory to scan.

    Returns:
        list of str: one entry per file, or an empty list when no file is found.
    """
    listOfFiles = []
    for dirpath, dirnames, filenames in os.walk(path):
        # Sorting in place controls the order in which os.walk descends.
        dirnames.sort()
        for file in sorted(filenames):
            fullpath = os.path.join(dirpath, file)
            foldername = os.path.basename(os.path.dirname(fullpath))
            listOfFiles.append(os.path.join(foldername, file))
    return listOfFiles

### Label the videos and save them in a text file

In [4]:
# create a text file containing all the file names

def generateTextFile(folder,fileName):
    """Write the listing returned by ``getSetFiles(folder)`` to ``fileName``.

    Each entry is written on its own line, terminated by a newline. An
    existing file with the same name is overwritten.

    Args:
        folder: Directory passed to ``getSetFiles``.
        fileName: Path of the text file to create.
    """
    # Scan the folder once only; a second scan would just repeat the directory walk.
    entries = getSetFiles(folder)
    with open(fileName, "w") as output:
        output.writelines("%s\n" % elem for elem in entries)

In [5]:
# Write one listing file per split folder.
for split_dir, listing_path in (
    ("Anomaly_Dataset/test", "Anomaly_Dataset/test.txt"),
    ("Anomaly_Dataset/train", "Anomaly_Dataset/train.txt"),
):
    generateTextFile(split_dir, listing_path)

In [6]:
# open the text file with file names

def openFile(file):
    """Read a text file and return its content split on newline characters.

    The file is opened in a ``with`` block so the handle is always closed,
    even if reading fails.

    Args:
        file: Path of the text file to read.

    Returns:
        list of str: the pieces between newline characters. A file that ends
        with a newline therefore yields a trailing empty string, and an empty
        file yields a list holding one empty string.
    """
    with open(file, "r") as handle:
        content = handle.read()
    return content.split('\n')  # split each item

In [7]:
# create tag for the testing videos
# Each line of test.txt is "<category><path separator><video file>". The entries are
# built with os.path.join, so the separator is "\" on Windows and "/" elsewhere.
# Accept both; a regex separator requires pandas' python parsing engine.
df_test = pd.read_csv(
    "Anomaly_Dataset/test.txt",
    sep=r'[\\/]',
    engine='python',
    header=None,
)
df_test.columns=['category','video_name']
df_test

Unnamed: 0,category,video_name
0,Crime,Abuse007_x264.mp4
1,Crime,Abuse009_x264.mp4
2,Crime,Abuse010_x264.mp4
3,Crime,Abuse023_x264.mp4
4,Crime,Abuse024_x264.mp4
...,...,...
105,Normal,Normal_Videos_923_x264.mp4
106,Normal,Normal_Videos_926_x264.mp4
107,Normal,Normal_Videos_930_x264.mp4
108,Normal,Normal_Videos_940_x264.mp4


In [8]:
# create tag for the training videos
# Each line of train.txt is "<category><path separator><video file>". The entries are
# built with os.path.join, so the separator is "\" on Windows and "/" elsewhere.
# Accept both; a regex separator requires pandas' python parsing engine.

df_train = pd.read_csv(
    "Anomaly_Dataset/train.txt",
    sep=r'[\\/]',
    engine='python',
    header=None,
)
df_train.columns=['category','video_name']
df_train

Unnamed: 0,category,video_name
0,Crime,Abuse001_x264.mp4
1,Crime,Abuse002_x264.mp4
2,Crime,Abuse003_x264.mp4
3,Crime,Abuse004_x264.mp4
4,Crime,Abuse005_x264.mp4
...,...,...
435,Normal,Normal_Videos_937_x264.mp4
436,Normal,Normal_Videos_938_x264.mp4
437,Normal,Normal_Videos_939_x264.mp4
438,Normal,Normal_Videos_941_x264.mp4


### Extract frames from the videos and label them

In [9]:
## extract frames from training videos

# About one frame per second of video is written to
# Anomaly_Dataset/train_img/<category>/<video name>_frame<N>.jpg

for i in tqdm(range(df_train.shape[0])):
    count = 0
    folderName = df_train['category'][i]
    videoFile = df_train['video_name'][i]

    # cv2.imwrite does not create missing folders, so make sure the target exists.
    outputDir = "Anomaly_Dataset/train_img/" + folderName
    os.makedirs(outputDir, exist_ok=True)

    cap = cv2.VideoCapture('Anomaly_Dataset/train/'+folderName+'/'+videoFile)  # capturing the video from the given path
    try:
        frameRate = cap.get(cv2.CAP_PROP_FPS)  # frame rate (property id 5)
        # Keep every frameStep-th frame. Clamp to 1 so that a reported rate
        # below 1 fps (including 0) cannot cause a modulo by zero.
        frameStep = max(1, math.floor(frameRate))

        while(cap.isOpened()):
            frameId = cap.get(cv2.CAP_PROP_POS_FRAMES)  # current frame number (property id 1)
            ret, frame = cap.read()
            if (ret != True):
                break
            if (frameId % frameStep == 0):
                # storing the frames in a new folder named train_img
                filename = outputDir + '/' + videoFile.split('.',1)[0] + "_frame%d.jpg" % count
                count += 1
                cv2.imwrite(filename, frame)
    finally:
        # Release the capture even if reading or writing a frame raised.
        cap.release()

100%|████████████████████████████████████████████████████████████████████████████████| 440/440 [12:00<00:00,  1.64s/it]


In [10]:
# label the training images extracted from the videos

# os.path.join keeps the path valid on every OS (a hard-coded "\\" only works on Windows).
DATADIR = os.path.join("Anomaly_Dataset", "train_img")
CATEGORIES = ["Crime","Normal"]

train_img=[]
skipped_train = 0  # files that cv2.imread could not decode

# The label of an image is the position of its category in CATEGORIES.
for class_num, category in enumerate(tqdm(CATEGORIES)):
    path = os.path.join(DATADIR,category)

    # os.listdir returns entries in arbitrary order; sort for a reproducible data set.
    for img in sorted(os.listdir(path)):
        img_array = cv2.imread(os.path.join(path,img))

        # cv2.imread returns None instead of raising when a file cannot be read;
        # such an entry would break the later conversion to a NumPy array.
        if img_array is None:
            skipped_train += 1
            continue

        #add the images to training set
        train_img.append([img_array,class_num])

if skipped_train:
    print("Skipped %d unreadable file(s) in %s" % (skipped_train, DATADIR))

100%|███████████████████████████████████████████████████████████████████████████████████| 2/2 [23:11<00:00, 695.90s/it]


In [11]:
# extract frames from testing videos

# About one frame per second of video is written to
# Anomaly_Dataset/test_img/<category>/<video name>_frame<N>.jpg

for i in tqdm(range(df_test.shape[0])):
    count = 0
    folderName = df_test['category'][i]
    videoFile = df_test['video_name'][i]

    # cv2.imwrite does not create missing folders, so make sure the target exists.
    outputDir = "Anomaly_Dataset/test_img/" + folderName
    os.makedirs(outputDir, exist_ok=True)

    cap = cv2.VideoCapture('Anomaly_Dataset/test/'+folderName+'/'+videoFile)  # capturing the video from the given path
    try:
        frameRate = cap.get(cv2.CAP_PROP_FPS)  # frame rate (property id 5)
        # Keep every frameStep-th frame. Clamp to 1 so that a reported rate
        # below 1 fps (including 0) cannot cause a modulo by zero.
        frameStep = max(1, math.floor(frameRate))

        while(cap.isOpened()):
            frameId = cap.get(cv2.CAP_PROP_POS_FRAMES)  # current frame number (property id 1)
            ret, frame = cap.read()
            if (ret != True):
                break
            if (frameId % frameStep == 0):
                # storing the frames in a new folder named test_img
                filename = outputDir + '/' + videoFile.split('.',1)[0] + "_frame%d.jpg" % count
                count += 1
                cv2.imwrite(filename, frame)
    finally:
        # Release the capture even if reading or writing a frame raised.
        cap.release()
    cap.release()

100%|████████████████████████████████████████████████████████████████████████████████| 110/110 [03:36<00:00,  1.97s/it]


In [12]:
# label the testing images extracted from the videos

# os.path.join keeps the path valid on every OS (a hard-coded "\\" only works on Windows).
DATADIR = os.path.join("Anomaly_Dataset", "test_img")
CATEGORIES = ["Crime","Normal"]

test_img=[]
skipped_test = 0  # files that cv2.imread could not decode

# The label of an image is the position of its category in CATEGORIES.
for class_num, category in enumerate(tqdm(CATEGORIES)):
    path = os.path.join(DATADIR,category)

    # os.listdir returns entries in arbitrary order; sort for a reproducible data set.
    for img in sorted(os.listdir(path)):
        img_array = cv2.imread(os.path.join(path,img))

        # cv2.imread returns None instead of raising when a file cannot be read;
        # such an entry would break the later conversion to a NumPy array.
        if img_array is None:
            skipped_test += 1
            continue

        #add the images to testing set
        test_img.append([img_array,class_num])

if skipped_test:
    print("Skipped %d unreadable file(s) in %s" % (skipped_test, DATADIR))

100%|███████████████████████████████████████████████████████████████████████████████████| 2/2 [06:53<00:00, 206.94s/it]


### Convert image files to numpy array

In [23]:
# Separate the images from their labels and convert both to NumPy arrays.
# Every element of train_img is an [image, label] pair.
X_train = np.array([frames for frames, _ in train_img])
y_train = np.array([label for _, label in train_img])

print("Shape of array X: ",X_train.shape)
print("Shape of array Y: ",y_train.shape)

Shape of array X:  (66033, 240, 320, 3)
Shape of array Y:  (66033,)


In [24]:
# Separate the images from their labels and convert both to NumPy arrays.
# Every element of test_img is an [image, label] pair.
X_test = np.array([features for features, _ in test_img])
y_test = np.array([label for _, label in test_img])

print("Shape of array X_test: ",X_test.shape)
print("Shape of array Y_test: ",y_test.shape)

Shape of array X_test:  (18369, 240, 320, 3)
Shape of array Y_test:  (18369,)


### Export to NumPy (.npy) files

In [25]:

# Save every NumPy array to its own .npy file.
from numpy import asarray
from numpy import save

for npy_name, array in (
    ('x_train.npy', X_train),
    ('y_train.npy', y_train),
    ('x_test.npy', X_test),
    ('y_test.npy', y_test),
):
    save(npy_name, array)

