## Exploratory Data Analysis (EDA)

### Import Required Libraries.

In [None]:
import os
import tensorflow as tf
from tensorflow.keras.preprocessing import image
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.image as mpimg
from basic_image_eda import BasicImageEDA
import csv
import pandas as pd


DATA_PATH = "\\Users\\bchau\\Projects\\Thesis\\dataset"

In [None]:
pip install opencv-python

In [None]:
# Collect every image (video frame) file found under a directory tree.
SUPPORTED_EXTENSIONS = ['.jpg', '.JPG', '.jpeg', '.JPEG', '.png', '.PNG']


def get_file_list(root_dir):
    """Return the paths of all supported image files below ``root_dir``.

    The tree is walked recursively with ``os.walk``. A file is kept only when
    its name *ends* with one of ``SUPPORTED_EXTENSIONS``; a plain substring
    test would also accept names such as ``frame.jpg.bak``.

    Args:
        root_dir: Directory to search.

    Returns:
        A list of ``os.path.join(root, filename)`` paths in ``os.walk`` order.
        The list is empty when nothing matches.
    """
    # str.endswith accepts a tuple of suffixes, so build it once up front.
    suffixes = tuple(SUPPORTED_EXTENSIONS)
    return [
        os.path.join(root, filename)
        for root, _directories, filenames in os.walk(root_dir)
        for filename in filenames
        if filename.endswith(suffixes)
    ]

In [None]:
# Count the total number of frames in the dataset.
filenames = sorted(get_file_list(DATA_PATH))
print('Total number of video frames in CDNET DATASET -', len(filenames))

#### Plot images

In [None]:
# Helper function to build a short "<parent folder>/<file name>" label.
def classname_filename(str):
    """Return ``'<parent folder>/<file name>'`` for a file path.

    Both ``\\`` and ``/`` are treated as path separators, so the result is
    the same for Windows-style and POSIX-style paths.

    Args:
        str: Path of the file. The parameter name shadows the builtin but is
            kept so existing callers are not affected.

    Returns:
        The last two path components joined with ``/``. When the path has a
        single component, that component alone is returned.
    """
    # Normalise the separators first; splitting on only one of them returned
    # the whole path as the "file name" for backslash-separated paths.
    parts = [part for part in str.replace('\\', '/').split('/') if part]
    return '/'.join(parts[-2:])
        

In [None]:
# Let's have a look at a few video frames from the dataset.
def plot_images(filenames):
    """Display each image in ``filenames`` in its own 5x5 inch figure.

    Every figure is titled with ``classname_filename(filename)``.

    Args:
        filenames: Image file paths readable by ``mpimg.imread``.
    """
    for filename in filenames:
        # Create the figure and its axes together. Calling plt.figure() and
        # plt.subplots() separately left one empty 5x5 figure per file and
        # drew the images on default-sized figures instead.
        _fig, ax = plt.subplots(figsize=(5, 5))
        ax.set_title(classname_filename(filename))
        ax.imshow(mpimg.imread(filename))

In [None]:
plot_images(filenames[20:30])

In [None]:
# Basic EDA for a single category of the CDNET dataset.
def image_EDA(category):
    """Assemble the settings for a basic image EDA of one dataset category.

    Only the category path and the exploration settings are built here. The
    ``BasicImageEDA.explore`` call is commented out, so calling this function
    produces no output and returns ``None``.

    Args:
        category: Name of the category folder, appended to ``DATA_PATH``.
    """
    data_path = '/'.join((DATA_PATH, category))
    explore_settings = {
        'extensions': ['png', 'jpg', 'jpeg'],
        'threads': 0,
        'dimension_plot': True,
        'channel_hist': True,
        'nonzero': False,
        'hw_division_factor': 1.0,
    }
    # Disabled call, kept for reference (the dict order is the positional order):
    # BasicImageEDA.explore(data_path, *explore_settings.values())

In [None]:
# Extract frame counts from the dataset and write them to a CSV file for the EDA.
root_dir = DATA_PATH 
def extract_eda_stats(data_root=None, filename="cdnet.csv"):
    """Write per-sub-category frame counts of the dataset to a CSV file.

    The dataset is read as ``<data_root>/<category>/<sub_category>/`` where
    each sub-category holds one folder of input frames and one folder of
    ground-truth frames. One row ``category, sub_category, input,
    ground_truth`` is written per sub-category; the last two columns are the
    number of files in those two folders. Rows are written in sorted order so
    the output does not depend on the order returned by ``os.listdir``.

    Args:
        data_root: Dataset root directory. Defaults to the module-level
            ``root_dir`` when omitted.
        filename: Path of the CSV file to create or overwrite.
    """
    if data_root is None:
        data_root = root_dir

    header = ['category', 'sub_category', 'input', 'ground_truth']
    # newline='' is what the csv module asks for; without it every row is
    # followed by an empty line on Windows.
    with open(filename, 'w', encoding='UTF8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(header)
        for category in _list_subdirs(data_root):
            category_path = os.path.join(data_root, category)
            for sub_category in _list_subdirs(category_path):
                video_path = os.path.join(category_path, sub_category)
                input_path, groundtruth_path = _locate_frame_dirs(video_path)
                writer.writerow([
                    category,
                    sub_category,
                    _count_files(input_path),
                    _count_files(groundtruth_path),
                ])


def _list_subdirs(path):
    """Return the sorted names of the directories directly inside ``path``."""
    return sorted(
        entry for entry in os.listdir(path)
        if os.path.isdir(os.path.join(path, entry))
    )


def _locate_frame_dirs(video_path):
    """Return ``(input_dir, groundtruth_dir)`` paths inside ``video_path``.

    Folders named ``input`` and ``groundtruth``/``ground_truth`` (any case)
    are preferred, because picking entries by listing position can swap the
    two or select a stray file. A folder that cannot be found by name is
    taken from the remaining folders in sorted order; if none is left the
    corresponding path is ``None``.
    """
    subdirs = _list_subdirs(video_path)
    by_name = {name.lower(): name for name in subdirs}
    input_name = by_name.get('input')
    groundtruth_name = by_name.get('groundtruth', by_name.get('ground_truth'))

    remaining = [
        name for name in subdirs
        if name not in (input_name, groundtruth_name)
    ]
    if input_name is None and remaining:
        input_name = remaining.pop(0)
    if groundtruth_name is None and remaining:
        groundtruth_name = remaining.pop(0)

    return tuple(
        os.path.join(video_path, name) if name is not None else None
        for name in (input_name, groundtruth_name)
    )


def _count_files(path):
    """Return the number of files directly in ``path`` (0 if ``path`` is None)."""
    if path is None:
        return 0
    return sum(
        1 for entry in os.listdir(path)
        if entry != '.DS_Store' and os.path.isfile(os.path.join(path, entry))
    )

In [None]:
# Build the CSV summary, load it and count the categories in the dataset.
file_name = 'cdnet.csv'
extract_eda_stats()
data_path = os.path.join(os.curdir, file_name)
print(data_path)
data = pd.read_csv(data_path)
category = data['category'].unique()
print('Total number of category', len(category))

#### The CDNET dataset has 11 categories.

Every category has input frames and their corresponding ground truth. So, let's explore how many input frames and ground-truth frames each category contains.

### Let's explore the data by using some plots

In [None]:
# Bar chart of the total frame count per category.
def plot(input_groundtruth):
    """Plot the per-category sum of one count column of ``data``.

    The column is summed per ``category``, sorted in descending order and
    drawn as a seaborn bar plot on a new 15x5 inch figure.

    Args:
        input_groundtruth: Name of the column of ``data`` to sum.
    """
    totals = data.groupby('category')[input_groundtruth].sum()
    totals = totals.sort_values(ascending=False).to_frame().reset_index()

    plt.figure(num=None, figsize=(15, 5))
    axes = sns.barplot(x='category', y=input_groundtruth, data=totals)
    axes.set(xlabel='Categories', ylabel='Number of frames')
    # Category names are long, so turn the tick labels vertical.
    axes.set_xticklabels(axes.get_xticklabels(), rotation=90)
    axes.set_title('CDNET DATASET')

In [None]:
# Total number of input image frames in each category.
plot('input')

In [None]:
# Total number of groundtruth image frames in each category.
plot('ground_truth')

The CDNET dataset has 11 categories, and each category contains several sub-categories. So let's plot all sub-categories together.

In [None]:
# DataFrame.plot opens its own figure sized by `figsize` when no `ax` is
# given, so a separate plt.figure() call would only leave an empty figure.
data.plot(x='sub_category',
        kind='bar',
        stacked=False,
        title='CDNET data set all sub category frames distribution',
        figsize=(15, 7),xlabel='all sub categories', ylabel='Number of frames')

In [None]:
# Bar chart of all sub-categories that belong to one category.
def plat_category_data(category):
    """Plot the count columns of every sub-category of ``category``.

    The rows of ``data`` whose ``category`` column equals ``category`` are
    drawn as a bar chart with one group of bars per sub-category.

    Args:
        category: Category value to select; also used as the plot title.
    """
    in_category = data['category'] == category
    plot_options = {
        'x': 'sub_category',
        'kind': 'bar',
        'stacked': False,
        'title': category,
        'figsize': (10, 5),
        'xlabel': 'sub categories',
        'ylabel': 'Number of frames',
    }
    data[in_category].plot(**plot_options)

In [None]:
# category wise input and groundtruth frames 
categories = data['category'].unique()
category_1 = categories[0]
plat_category_data(category_1)

In [None]:
print('Basic EDA on category : ' + category_1)
image_EDA(category_1)

In [None]:
category_2 = categories[1]
plat_category_data(category_2)

In [None]:
print('Basic EDA on category : ' + category_2)
image_EDA(category_2)

In [None]:
category_3 = categories[2]
plat_category_data(category_3)

In [None]:
print('Basic EDA on category : ' + category_3)
image_EDA(category_3)

In [None]:
category_4 = categories[3]
plat_category_data(category_4)

In [None]:
print('Basic EDA on category : ' + category_4)
image_EDA(category_4)

In [None]:
category_5 = categories[4]
plat_category_data(category_5)

In [None]:
print('Basic EDA on category : ' + category_5)
image_EDA(category_5)

In [None]:
category_6 = categories[5]
plat_category_data(category_6)

In [None]:
print('Basic EDA on category : ' + category_6)
image_EDA(category_6)

In [None]:
category_7 = categories[6]
plat_category_data(category_7)

In [None]:
print('Basic EDA on category : ' + category_7)
image_EDA(category_7)

In [None]:
category_8 = categories[7]
plat_category_data(category_8)

In [None]:
print('Basic EDA on category : ' + category_8)
image_EDA(category_8)

In [None]:
category_9 = categories[8]
plat_category_data(category_9)

In [None]:
print('Basic EDA on category : ' + category_9)
image_EDA(category_9)

In [None]:
category_10 = categories[9]
plat_category_data(category_10)

In [None]:
print('Basic EDA on category : ' + category_10)
image_EDA(category_10)

In [None]:
category_11 = categories[10]
plat_category_data(category_11)

In [None]:
print('Basic EDA on category : ' + category_11)
image_EDA(category_11)