## Required Modules

In [21]:
import os
from zipfile import ZipFile
import pickle
import bz2
import json

import numpy as np
import pandas as pd
import keras
from keras.models import Sequential
from keras.layers import Dense, Flatten, Activation
from keras.layers import Dropout
from keras import regularizers
from keras import optimizers
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
from sklearn.decomposition import PCA
from keras.utils import to_categorical

ModuleNotFoundError: No module named 'keras'

## Set Global Filepaths and Methods

In [17]:
# Input paths that must already exist: the compressed artifacts under DATA_ROOT_DIRECTORY.
# All paths here are relative (built from 'data'), not absolute.
DATA_ROOT_DIRECTORY: str = 'data'
COMPRESSED_DATA_DIRECTORY_PATH: str = os.path.join(DATA_ROOT_DIRECTORY, 'compressed')
COMPRESSED_MANIFEST_FILE_PATH: str = os.path.join(COMPRESSED_DATA_DIRECTORY_PATH, 'shipnet_manifest.pbz2')
COMPRESSED_IMAGES_FILE_PATH: str = os.path.join(COMPRESSED_DATA_DIRECTORY_PATH, 'shipnet_images.zip')

# Output paths that the decompression step creates under DATA_ROOT_DIRECTORY.
UNCOMPRESSED_DATA_DIRECTORY_PATH: str = os.path.join(DATA_ROOT_DIRECTORY, 'uncompressed')
UNCOMPRESSED_MANIFEST_FILE_PATH: str = os.path.join(UNCOMPRESSED_DATA_DIRECTORY_PATH, 'shipnet.json')
UNCOMPRESSED_IMAGES_DIRECTORY_PATH: str = os.path.join(UNCOMPRESSED_DATA_DIRECTORY_PATH, 'shipnet')
    
def get_full_path(local_path: str) -> str:
    """
    Builds a full path by joining the current working directory with the given path.

    :param local_path: a path string, normally relative to the directory this Notebook runs in
    :returns: the result of joining the current working directory and local_path
    """
    working_directory = os.getcwd()
    return os.path.join(working_directory, local_path)

def raise_file_not_found_error_for_path(expected_object: str, local_path: str):
    """
    Always raises a FileNotFoundError whose message names the missing object and its full path.

    :param expected_object: a string fragment describing the content of the expected file
    :param local_path: the path that was not found; it is expanded with get_full_path for the message
    :return: never returns normally.
    :raises FileNotFoundError: unconditionally
    """
    full_path = get_full_path(local_path)
    message = f'Could not find {expected_object} at the specified path: {full_path}'
    raise FileNotFoundError(message)

## Unzipping Data 

The data is stored in compressed form so that it fits in GitHub, so we need to decompress it before use.

We will use the following modules:
 - __zipfile__ to decompress the archive of the training images
 - __pickle__ and __bz2__ to decompress the JSON object containing the manifest of images and their labels
 - __os__ to handle filepath and directory manipulation
 - __json__ to read / write JSON manifest
 

In [19]:
def deflate_manifest():
    """
    Decompresses the bz2-compressed, pickled manifest at COMPRESSED_MANIFEST_FILE_PATH and writes it
    out as JSON to UNCOMPRESSED_MANIFEST_FILE_PATH. Note this will create a ~350 MB file in the
    UNCOMPRESSED_DATA_DIRECTORY_PATH.

    :return: None.
    """
    # NOTE(review): pickle.load can execute arbitrary code; only run this on the manifest shipped
    # with the project, never on a file from an untrusted source.
    with bz2.BZ2File(COMPRESSED_MANIFEST_FILE_PATH, 'rb') as compressed_manifest, \
            open(UNCOMPRESSED_MANIFEST_FILE_PATH, 'w') as json_manifest:
        manifest = pickle.load(compressed_manifest)
        json.dump(manifest, json_manifest)


def deflate_images():
    """
    Extracts every member of the zip archive at COMPRESSED_IMAGES_FILE_PATH into
    UNCOMPRESSED_IMAGES_DIRECTORY_PATH. Note this will create 4000 images in that directory.

    :return: None.
    """
    # ZipFile opens in read mode by default
    with ZipFile(COMPRESSED_IMAGES_FILE_PATH) as images_archive:
        images_archive.extractall(UNCOMPRESSED_IMAGES_DIRECTORY_PATH)


def check_and_deflate_all():
    """
    Checks that the compressed inputs exist and that UNCOMPRESSED_DATA_DIRECTORY_PATH does not,
    then creates that directory and decompresses the manifest and the images into it.

    :raises FileNotFoundError if the compressed data directory, manifest file or images file is not found
    :raises FileExistsError if UNCOMPRESSED_DATA_DIRECTORY_PATH exists
    :returns: None.
    """

    # raise_file_not_found_error_for_path raises on its own, so the call sites need no `raise`
    # (the former `raise raise_file_not_found_error_for_path(...)` could never reach the outer raise).
    if not os.path.isdir(COMPRESSED_DATA_DIRECTORY_PATH):
        raise_file_not_found_error_for_path('the compressed data directory', COMPRESSED_DATA_DIRECTORY_PATH)

    if not os.path.isfile(COMPRESSED_MANIFEST_FILE_PATH):
        raise_file_not_found_error_for_path('the compressed manifest file', COMPRESSED_MANIFEST_FILE_PATH)

    if not os.path.isfile(COMPRESSED_IMAGES_FILE_PATH):
        raise_file_not_found_error_for_path('the compressed images file', COMPRESSED_IMAGES_FILE_PATH)

    # never overwrite an existing output directory; the user has to remove it explicitly
    if os.path.isdir(UNCOMPRESSED_DATA_DIRECTORY_PATH):
        raise FileExistsError(('The path to the output directory already exists: '
                               f'{get_full_path(UNCOMPRESSED_DATA_DIRECTORY_PATH)}. If you are sure you want to run this step '
                               'please remove this directory manually - we will not overwrite it.'))

    # all checks passed: create the output directory and decompress into it
    os.mkdir(UNCOMPRESSED_DATA_DIRECTORY_PATH)
    deflate_manifest()
    deflate_images()

    return None

In [20]:
# One-time step: raises FileExistsError if UNCOMPRESSED_DATA_DIRECTORY_PATH already exists.
check_and_deflate_all()

FileExistsError: The path to the output directory already exists: C:\Users\Kevin\Desktop\EM213-Final_Project\data\uncompressed. If you are sure you want to run this step please remove this directory manually - we will not overwrite it.

## Load & Visually Inspect Data 
   - Note the previous step only needs to be run once to generate __/data/uncompressed__. 
   - In this step we will read in the data and check it against the manifest.
 