In [9]:
import os
import subprocess

from six import string_types

# Make sure you have all of these packages installed, e.g. via pip
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import scipy
from skimage import io
from scipy import ndimage
from IPython.display import display
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

In [10]:
from glob import glob

In [11]:
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, BatchNormalization, Flatten, Lambda
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
import numpy as np
import math
from keras.optimizers import SGD,Adam
from keras.layers.convolutional import ZeroPadding2D, Convolution2D, Conv2D
from keras.layers.pooling import MaxPooling2D
from keras.preprocessing import image, sequence
import numpy as np
import argparse

In [12]:
# Dataset location: the root directory is read from the DATA environment
# variable (None when it is unset); DATA_DIR is the sub-directory below it.
DATA_ROOT = os.getenv('DATA')
DATA_DIR = 'planet'
# Image format to work with: 'tif' selects the TIFF folder, anything else the JPEG one.
IMG_TYPE = 'tif'

In [13]:
# Derived locations: dataset root, the label CSV and the two image folders.
ROOT = '{}/{}'.format(DATA_ROOT, DATA_DIR)
LABEL_CSV = os.path.join(ROOT, 'train.csv')
JPG_DIR, TIF_DIR = (os.path.join(ROOT, folder) for folder in ('train-jpg', 'train-tif'))

In [14]:
# Choose the band count and image folder for the configured image type:
# 'tif' images have 4 bands, every other setting falls back to 3-band JPEGs.
BANDS, IMG_DIR = (4, TIF_DIR) if IMG_TYPE == 'tif' else (3, JPG_DIR)

In [15]:
print(IMG_TYPE,BANDS,IMG_DIR)

tif 4 /home/brook/data/planet/train-tif


## SAMPLE RUN:
_Instead of copying sample images into separate directories, we create sample CSVs that reference images in the full image directory._

In [16]:
# # USE THESE SETTINGS FOR DEVELOPMENT
# TRAIN_SIZE=20
# VALID_SIZE=5
# BATCH_SIZE=2

In [17]:
# USE THESE SETTINGS FOR A REALISTIC SAMPLE
# (number of training rows, number of validation rows, images per batch)
TRAIN_SIZE, VALID_SIZE, BATCH_SIZE = 200, 50, 32

In [18]:
# Load the full label table (one row per image: image_name, tags) and preview it.
labels_df = pd.read_csv(LABEL_CSV)
labels_df.head(5)

Unnamed: 0,image_name,tags
0,train_0,haze primary
1,train_1,agriculture clear primary water
2,train_2,clear primary
3,train_3,clear primary
4,train_4,agriculture clear habitation primary road


In [19]:
# Split every row's space-separated tag string, then collect the distinct tags.
# The resulting order comes from set iteration and is therefore arbitrary.
tags_list = [row_tags.split(' ') for row_tags in labels_df.tags.values]
label_list = list({tag for row in tags_list for tag in row})
print(label_list)

['artisinal_mine', 'haze', 'water', 'conventional_mine', 'agriculture', 'cultivation', 'bare_ground', 'primary', 'habitation', 'partly_cloudy', 'blooming', 'blow_down', 'slash_burn', 'road', 'clear', 'selective_logging', 'cloudy']


In [20]:
# HARD CODE LABELS LIST TO ENSURE ORDER
# (positions in this list define the layout of every label vector)
LABELS_LIST = [
    'blooming', 'agriculture', 'primary', 'selective_logging', 'road',
    'conventional_mine', 'blow_down', 'artisinal_mine', 'partly_cloudy',
    'cloudy', 'habitation', 'haze', 'bare_ground', 'cultivation',
    'slash_burn', 'water', 'clear',
]

In [21]:
# NEW DF WITH TAG-VECTOR INSTEAD OF TAGS
def tags_to_vec(tags):
    """Convert a space-separated tag string into a 0/1 list aligned with LABELS_LIST."""
    present = tags.split(' ')
    return [1 if label in present else 0 for label in LABELS_LIST]

# Add the label-vector column, then remove the raw tag strings in place.
# Note: this mutates labels_df, so the cell cannot be re-run without reloading it.
labels_df['vec'] = labels_df['tags'].apply(tags_to_vec)
labels_df.drop('tags', axis=1, inplace=True)
labels_df.head()

Unnamed: 0,image_name,vec
0,train_0,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ..."
1,train_1,"[0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,train_2,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,train_3,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,train_4,"[0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ..."


In [22]:
# Draw the training sample with a fixed seed so that a fresh kernel run
# selects the same rows (and therefore writes the same sample CSV).
train_df=labels_df.sample(TRAIN_SIZE, random_state=42)

In [23]:
print(train_df.shape)
train_df.head()

(200, 2)


Unnamed: 0,image_name,vec
32316,train_32316,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
24965,train_24965,"[0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
22283,train_22283,"[0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ..."
20657,train_20657,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
16162,train_16162,"[0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ..."


In [24]:
tmpdf=labels_df.drop(train_df.index)

In [25]:
print(labels_df.shape,tmpdf.shape)

(40479, 2) (40279, 2)


In [26]:
# Draw the validation sample from the rows left after removing the training
# rows; the fixed seed makes the selection reproducible across kernel restarts.
valid_df=tmpdf.sample(VALID_SIZE, random_state=42)
print(valid_df.shape)
valid_df.head()

(50, 2)


Unnamed: 0,image_name,vec
35173,train_35173,"[0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."
29666,train_29666,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
30224,train_30224,"[0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
13093,train_13093,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
35800,train_35800,"[0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, ..."


In [27]:
# save train/valid sample csvs

In [28]:
# File names (relative to the working directory) of the sampled train/valid CSVs.
TRAIN_CSV, VALID_CSV = 'train-sample.csv', 'valid-sample.csv'

In [29]:
# NOTE: these writes are currently ACTIVE and overwrite any existing sample CSVs.
# They are needed on the first run of a new sample size; comment them out
# afterwards so the saved train/valid samples are not accidentally replaced.
# The files are space-separated and written without the index column.
train_df.to_csv(TRAIN_CSV,index=False,sep=' ')
valid_df.to_csv(VALID_CSV,index=False,sep=' ')

## GENERATOR

In [30]:
class CSVGen():
    """Batch source that reads image paths and label vectors from a sample CSV.

    The CSV must be space-separated and contain an ``image_name`` column and a
    ``vec`` column holding each label vector as a Python list literal.
    Image paths are built from the module-level ``IMG_DIR`` and ``IMG_TYPE``.

    Args:
        file: path of the CSV file to read.
        batch_size: number of samples per batch.
    """

    def __init__(self,file,batch_size=32):
        self.file=file
        self.batch_size=batch_size
        self._set_data()

    def list_data(self):
        """Return the first batch as an ``(images, labels)`` tuple of numpy arrays.

        The batch holds at most ``batch_size`` samples (fewer when the CSV has
        fewer rows). The stored order is left untouched.
        """
        return self._batch(0)

    def data(self):
        """Yield ``(images, labels)`` batches forever.

        Batches are taken in order from the (already shuffled) data. When the
        remaining samples no longer fill a complete batch, the data is
        reshuffled and iteration restarts at the first batch, so every batch is
        a full one unless the whole dataset is smaller than ``batch_size``.
        """
        start=0
        while True:
            if start+self.batch_size>self.size:
                # Not enough samples left for a full batch: start a new pass.
                # `start` is reset together with the shuffle so the first
                # batch of the new pass is not skipped.
                self.labels, self.paths = shuffle(self.labels,self.paths)
                start=0
            yield self._batch(start)
            start+=self.batch_size

    def _batch(self,start):
        """Load the batch beginning at sample index ``start``."""
        stop=start+self.batch_size
        batch_imgs=[self._imdata(path) for path in self.paths[start:stop]]
        return np.array(batch_imgs),np.array(self.labels[start:stop])

    def _imdata(self,path):
        """Read one image file and return whatever ``io.imread`` produces for it."""
        return io.imread(path)

    def _set_data(self):
        """Read the CSV, build image paths and label vectors, and shuffle them together."""
        # Local import keeps this cell runnable without touching the import cell.
        import ast
        df=pd.read_csv(self.file,sep=' ')
        paths=[f'{IMG_DIR}/{name}.{IMG_TYPE}' for name in df.image_name]
        # literal_eval only accepts Python literals, unlike eval, which would
        # execute arbitrary code found in the CSV.
        labels=[list(ast.literal_eval(vec)) for vec in df.vec]
        self.size=len(labels)
        self.labels, self.paths = shuffle(labels,paths)

Here is the label vector of the first image in a validation batch:

In [31]:
vg=CSVGen(VALID_CSV)

In [32]:
vg.list_data()[1][0]

array([0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0])

## VGG INSPIRED DUMMY MODEL

In [41]:
# Defaults for the model below: learning rate, dropout rate, and the batch
# input shape (any batch size, 256x256 pixels, BANDS channels last).
DEFAULT_LR = 0.001
DEFAULT_DR = 0.5
BATCH_INPUT_SHAPE = (None, 256, 256, BANDS)

def ConvBlock(model,layers,filters):
    """Append a block of padded 3x3 convolutions followed by 2x2 max-pooling.

    Args:
        model: the Sequential model to extend (modified in place).
        layers: number of (zero-padding, convolution) pairs to add.
        filters: number of filters in each convolution.

    Returns:
        The same ``model`` object, to allow ``model = ConvBlock(model, ...)``.
    """
    for _ in range(layers):
        # One pixel of padding per side keeps the spatial size through the 3x3 conv.
        model.add(ZeroPadding2D((1, 1)))
        # The kernel size is given as a tuple (Keras 2 form). The old
        # `Convolution2D(filters, 3, 3)` form relies on a legacy conversion and
        # is otherwise read as kernel_size=3, strides=3.
        model.add(Conv2D(filters, (3, 3), activation='relu'))
    model.add(MaxPooling2D((2, 2), strides=(2, 2)))
    return model


def FCBlock(model,dr=DEFAULT_DR,output_dim=200):
    """Append a ReLU dense layer with ``output_dim`` units and a dropout layer with rate ``dr``.

    The model is extended in place and also returned.
    """
    hidden = Dense(output_dim, activation='relu')
    model.add(hidden)
    dropout = Dropout(dr)
    model.add(dropout)
    return model


# Assemble the network: batch-norm on the input -> one conv block with two
# 32-filter convolutions -> flatten -> one fully connected block -> 17 sigmoid
# outputs (LABELS_LIST has 17 entries).
model=Sequential()
model.add(BatchNormalization(batch_input_shape=BATCH_INPUT_SHAPE))
model=ConvBlock(model,2,32)
model.add(Flatten())
model=FCBlock(model)
model.add(Dense(17, activation='sigmoid'))

#
#  Using 'binary_crossentropy' because it was used in a Kaggle kernel. A comment there said:
#        - We NEED binary here, since categorical_crossentropy L1-normalizes the output before calculating the loss.
#  TODO: confirm that this is right and understand why.
#
model.compile(loss='binary_crossentropy', 
              optimizer='adam',
              metrics=['accuracy'])

In [42]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
batch_normalization_2 (Batch (None, 256, 256, 4)       16        
_________________________________________________________________
zero_padding2d_3 (ZeroPaddin (None, 258, 258, 4)       0         
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 256, 256, 32)      1184      
_________________________________________________________________
zero_padding2d_4 (ZeroPaddin (None, 258, 258, 32)      0         
_________________________________________________________________
conv2d_4 (Conv2D)            (None, 256, 256, 32)      9248      
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 128, 128, 32)      0         
_________________________________________________________________
flatten_2 (Flatten)          (None, 524288)            0         
__________

A minimal example showing that `CSVGen.list_data` works:

In [43]:
imgs, labels = CSVGen(TRAIN_CSV,batch_size=BATCH_SIZE).list_data()
# list_data() returns an (images, labels) tuple, so each array is sliced on
# its own; slicing the tuple itself (d[:10]) would leave the batch unchanged.
d = (imgs[:10], labels[:10])
model.fit(d[0],d[1],
          batch_size=1,
          epochs=1,
          verbose=1)

Epoch 1/1


<keras.callbacks.History at 0x7fa61c10d5c0>

## GO!

In [44]:
def gen_params(nb_epochs, batch_size=None):
    """Compute the epoch and step counts for ``fit_generator``.

    Args:
        nb_epochs: number of epochs to train; returned unchanged.
        batch_size: samples per batch; defaults to the module-level BATCH_SIZE.

    Returns:
        Tuple ``(nb_epochs, steps_per_epoch, validation_steps)``.
    """
    if batch_size is None:
        batch_size = BATCH_SIZE
    # steps_per_epoch: Total number of steps (batches of samples) to yield from generator before declaring one epoch finished
    # - It should typically be equal to the number of unique samples of your dataset divided by the batch size.
    # At least one step is always requested, even when the dataset is smaller than one batch.
    s = max(1, TRAIN_SIZE // batch_size)
    # validation_steps: Number of steps to yield from validation generator at the end of every epoch.
    # - It should typically be equal to the number of unique samples of your validation dataset divided by the batch size.
    vs = max(1, VALID_SIZE // batch_size)
    return nb_epochs,s,vs


In [45]:
# # WITHOUT VALIDATION
# nb_epochs,steps,validation_steps=gen_params(1)
# model.fit_generator(
#         nb_epoch=nb_epochs, 
#         generator=CSVGen(TRAIN_CSV,batch_size=BATCH_SIZE).data(), 
#         steps_per_epoch=steps,
#         verbose=1)

In [46]:
# WITH VALIDATION
nb_epochs,steps,validation_steps=gen_params(6)
model.fit_generator(
        nb_epoch=nb_epochs, 
        generator=CSVGen(TRAIN_CSV,batch_size=BATCH_SIZE).data(), 
        validation_data=CSVGen(VALID_CSV,batch_size=BATCH_SIZE).data(),
        steps_per_epoch=steps,
        validation_steps=validation_steps,
        verbose=1)

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


<keras.callbacks.History at 0x7fa5f87ab400>