# Checklist

- Import the data to workspace
- Output must be in encoded pixels so we'll need to create the encoding fuction and keep in mind the .25-factor reduction per side of the 1400 x 2100 original image size.
- Choose the models we want to train
- Train the models

# Data Processing

## Download the data from Kaagle and store it properly

In [1]:
!bash ../src/data/import_data.sh

../src/data/import_data.sh: line 4: .env: No such file or directory
File exists.


## Get a snapshot view of the data

In [2]:
from pandas import read_csv

In [3]:
df_train = read_csv('../data/raw/train.csv')
df_train.head()

Unnamed: 0,Image_Label,EncodedPixels
0,0011165.jpg_Fish,264918 937 266318 937 267718 937 269118 937 27...
1,0011165.jpg_Flower,1355565 1002 1356965 1002 1358365 1002 1359765...
2,0011165.jpg_Gravel,
3,0011165.jpg_Sugar,
4,002be4f.jpg_Fish,233813 878 235213 878 236613 878 238010 881 23...


In [4]:
df_train.dtypes

Image_Label      object
EncodedPixels    object
dtype: object

In [17]:
df_train.describe()

Unnamed: 0,EncodedPixels,Image,Label
count,11836,22184,22184
unique,11836,5546,4
top,264918 937 266318 937 267718 937 269118 937 27...,0011165.jpg,Fish
freq,1,4,5546


In [18]:
len(df_train)

22184

Notice that the label needs to be parsed from the Image_Label column and that the same column is an object data type, while it makes sense for it to be a string type.

In [13]:
from pandas import DataFrame

def extract_labels_from_training_data(df_train_input: DataFrame):
    # Since the column Image_label encodes the filename and label separated by a single underscore
    df_train_input['Image'] = df_train_input.Image_Label.map(lambda v: v[:v.index('_')])
    df_train_input['Label'] = df_train_input.Image_Label.map(lambda v: v[v.index('_')+1:])
    df_train_input = df_train_input.drop(columns=['Image_Label'])
    # df_train_input['label_index'] = df_train_input.Label.map(dp_params.label_mapping)
    return df_train_input

def correct_dtypes(df_train_input: DataFrame):
    df_train_input = df_train_input.convert_dtypes()
    return df_train_input

# Apply these functions
df_train = extract_labels_from_training_data(df_train)
df_train = correct_dtypes(df_train)

In [14]:
df_train.head()

Unnamed: 0,EncodedPixels,Image,Label
0,264918 937 266318 937 267718 937 269118 937 27...,0011165.jpg,Fish
1,1355565 1002 1356965 1002 1358365 1002 1359765...,0011165.jpg,Flower
2,,0011165.jpg,Gravel
3,,0011165.jpg,Sugar
4,233813 878 235213 878 236613 878 238010 881 23...,002be4f.jpg,Fish


In [15]:
df_train.dtypes

EncodedPixels    string[python]
Image            string[python]
Label            string[python]
dtype: object

## Import config file and environmental variables

In [23]:
from dotenv import load_dotenv
from os import getenv

import hydra
from omegaconf import DictConfig

load_dotenv()

config_path_env = getenv('CONFIG_PATH')
config_name_env = getenv('CONFIG_NAME')

@hydra.main(config_path=config_path_env, config_name=config_name_env, version_base=None)
def get_data_processing_config_dict(config: DictConfig):
    return config.data_processing

print(config_name_env, config_path_env)
# dp_conf = get_data_processing_config_dict()
# print(dp_conf)

conf ..


usage: ipykernel_launcher.py [--help] [--hydra-help] [--version]
                             [--cfg {job,hydra,all}] [--resolve]
                             [--package PACKAGE] [--run] [--multirun]
                             [--shell-completion] [--config-path CONFIG_PATH]
                             [--config-name CONFIG_NAME]
                             [--config-dir CONFIG_DIR]
                             [--experimental-rerun EXPERIMENTAL_RERUN]
                             [--info [{all,config,defaults,defaults-tree,plugins,searchpath}]]
                             [overrides ...]
ipykernel_launcher.py: error: unrecognized arguments: -f


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


## Create training dataset

In [25]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [27]:
image_size = (1400, 2100)
batch_size = len(df_train) / 50
validation_split = 0.3
subset = "both"
seed = 42
image_directory = "../data/raw/train_images/"

train_ds, val_ds = tf.keras.utils.image_dataset_from_directory(
    image_directory,
    validation_split=validation_split,
    subset=subset,
    seed=seed,
    image_size=image_size,
    batch_size=batch_size,
)

Found 0 files belonging to 0 classes.
Using 0 files for training.
Using 0 files for validation.


ValueError: No training images found in directory ../data/raw/train_images/. Allowed formats: ('.bmp', '.gif', '.jpeg', '.jpg', '.png')