# dataset.py

import os.path

import cv2
import h5py
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm import tqdm


def load_train_dataset(df_train, img_size):
    """Load the resized training images, using an HDF5 cache on disk.

    If ``data/x_train.h5`` exists and its ``x_train`` dataset holds images of
    the requested size, that dataset is returned. Otherwise every image
    ``data/train/<id>.jpg`` is read with OpenCV, resized to
    ``img_size`` x ``img_size``, written to the cache file and returned.
    A cache holding images of a different size is ignored and rebuilt.

    Args:
        df_train: DataFrame whose rows are two-element ``(image id, breed)``
            records; only the image id is used here.
        img_size: Edge length, in pixels, of the square resized images.

    Returns:
        numpy.ndarray with one resized image per row of ``df_train`` (an
        array on both the cached and the freshly-built path).

    Raises:
        OSError: If an image listed in ``df_train`` cannot be read.
    """
    file_path = 'data/x_train.h5'

    if os.path.exists(file_path):
        with h5py.File(file_path, 'r') as hf:
            x_train = hf['x_train'][:]
        # A cache built with another img_size would silently hand back
        # wrongly-sized images, so only trust a cache that matches.
        if x_train.shape[1:3] == (img_size, img_size):
            return x_train

    images = []
    for f, _breed in tqdm(df_train.values):
        img_path = 'data/train/{}.jpg'.format(f)
        img = cv2.imread(img_path)
        # cv2.imread reports failure by returning None rather than raising.
        if img is None:
            raise OSError('could not read image: {}'.format(img_path))
        images.append(cv2.resize(img, (img_size, img_size)))

    # Convert once so callers get the same type as on the cached path.
    x_train = np.asarray(images)
    with h5py.File(file_path, 'w') as hf:
        hf.create_dataset("x_train", data=x_train)

    return x_train


def load_test_dataset(df_test, img_size):
    """Load the resized test images, using an HDF5 cache on disk.

    If ``data/x_test.h5`` exists and its ``x_test`` dataset holds images of
    the requested size, that dataset is returned. Otherwise every image
    ``data/test/<id>.jpg`` is read with OpenCV, resized to
    ``img_size`` x ``img_size``, written to the cache file and returned.
    A cache holding images of a different size is ignored and rebuilt.

    Args:
        df_test: DataFrame with an ``id`` column naming the test images.
        img_size: Edge length, in pixels, of the square resized images.

    Returns:
        numpy.ndarray with one resized image per entry of ``df_test['id']``
        (an array on both the cached and the freshly-built path).

    Raises:
        OSError: If an image listed in ``df_test`` cannot be read.
    """
    file_path = 'data/x_test.h5'

    if os.path.exists(file_path):
        with h5py.File(file_path, 'r') as hf:
            x_test = hf['x_test'][:]
        # A cache built with another img_size would silently hand back
        # wrongly-sized images, so only trust a cache that matches.
        if x_test.shape[1:3] == (img_size, img_size):
            return x_test

    images = []
    for f in tqdm(df_test['id'].values):
        img_path = 'data/test/{}.jpg'.format(f)
        img = cv2.imread(img_path)
        # cv2.imread reports failure by returning None rather than raising.
        if img is None:
            raise OSError('could not read image: {}'.format(img_path))
        images.append(cv2.resize(img, (img_size, img_size)))

    # Convert once so callers get the same type as on the cached path.
    x_test = np.asarray(images)
    with h5py.File(file_path, 'w') as hf:
        hf.create_dataset("x_test", data=x_test)

    return x_test


def load_train_labels(df_train, one_hot_labels):
    """Load the training labels, using an HDF5 cache on disk.

    If ``data/y_train.h5`` exists, its ``y_train`` dataset is returned.
    Otherwise the first ``len(df_train.values)`` entries of
    ``one_hot_labels`` are collected, written to the cache file and returned.

    Args:
        df_train: Training DataFrame; only its number of rows is used.
        one_hot_labels: Integer-indexable collection holding one label entry
            per training row.

    Returns:
        numpy.ndarray of labels (an array on both the cached and the
        freshly-built path).
    """
    file_path = 'data/y_train.h5'

    if os.path.exists(file_path):
        with h5py.File(file_path, 'r') as hf:
            return hf['y_train'][:]

    row_count = len(df_train.values)
    # Convert once so callers get the same type as on the cached path.
    y_train = np.asarray(
        [one_hot_labels[i] for i in tqdm(range(row_count))]
    )

    with h5py.File(file_path, 'w') as hf:
        hf.create_dataset("y_train", data=y_train)

    return y_train


# NOTE: earlier, uncached versions of the three loaders, kept for reference only.
# def load_train_dataset(df_train, img_size):
#     x_train = []
#     for f, breed in tqdm(df_train.values):
#         img = cv2.imread('data/train/{}.jpg'.format(f))
#         x_train.append(cv2.resize(img, (img_size, img_size)))
#
#     return x_train
#
#
# def load_test_dataset(df_test, img_size):
#     x_test = []
#     for f in tqdm(df_test['id'].values):
#         img = cv2.imread('data/test/{}.jpg'.format(f))
#         x_test.append(cv2.resize(img, (img_size, img_size)))
#
#     return x_test
#
#
# def load_train_labels(df_train, one_hot_labels):
#     y_train = []
#     for i in tqdm(range(len(df_train.values))):
#         label = one_hot_labels[i]
#         y_train.append(label)
#
#     return y_train


def plot_loss_accuracy(history):
    """Plot training and validation accuracy and loss per epoch.

    Draws two stacked panels on figure 1 (accuracy on top, loss below) and
    shows the figure.

    Args:
        history: Object whose ``history`` attribute maps metric names to
            per-epoch value sequences. It must contain ``loss`` and
            ``val_loss`` plus either ``acc``/``val_acc`` or
            ``accuracy``/``val_accuracy``.

    Raises:
        KeyError: If a required metric is missing from ``history.history``.
    """
    metrics = history.history
    # The accuracy metric is recorded as 'acc' or 'accuracy' depending on the
    # Keras version / the name given at compile time; accept either.
    acc_key = 'acc' if 'acc' in metrics else 'accuracy'

    plt.figure(1)
    # (subplot position, metric key, label used for title and y axis)
    panels = (
        (211, acc_key, 'accuracy'),
        (212, 'loss', 'loss'),
    )
    for position, key, label in panels:
        plt.subplot(position)
        plt.plot(metrics[key])
        plt.plot(metrics['val_' + key])
        plt.title('model ' + label)
        plt.ylabel(label)
        plt.xlabel('epoch')
        plt.legend(['train', 'test'], loc='upper left')
    plt.show()


def output_submission(preds, one_hot, df_test):
    """Write the predictions to ``submissions/submission.csv``.

    The CSV has an ``id`` column followed by one column per column of
    ``one_hot``, in the same order. The ``submissions`` directory is created
    if it does not exist yet.

    Args:
        preds: 2-D predictions, one row per test sample and one column per
            column of ``one_hot``.
        one_hot: DataFrame whose column names label the prediction columns.
        df_test: DataFrame with an ``id`` column, in the same row order as
            ``preds``.

    Raises:
        ValueError: If the number of ids or of column names does not match
            the shape of ``preds``.
    """
    submission = pd.DataFrame(preds)
    # Set column names to those generated by the one-hot encoding earlier
    submission.columns = one_hot.columns.values
    # Insert the ids positionally: passing the Series itself would align on
    # its index and yield NaN/misordered ids when df_test is not 0..n-1 indexed.
    submission.insert(0, 'id', df_test['id'].values)

    # to_csv does not create missing parent directories.
    os.makedirs('submissions', exist_ok=True)
    submission.to_csv('submissions/submission.csv', encoding='utf-8', index=False)
    print('submission outputted')