## Warnings:
> Make sure that your VM has the right permissions (it needs read/write access to GCS and access to AutoML).

In [1]:
# !pip install google-cloud
# !pip install google-cloud-storage
# !pip install google-cloud-automl
# !pip install opencv-python
# !pip install PyMuPDF

In [76]:
import pandas as pd
import numpy as np
# from google.cloud import automl_v1beta1 as automl
# from google.cloud import storage

import os
import argparse
import shutil

import fitz # pip install PyMuPDF
import cv2 # pip install opencv-python
from PIL import Image
import csv
import matplotlib.pyplot as plt

import scipy.misc
from math import ceil
from imgaug import augmenters as iaa
import imgaug as ia

### 1) Preprocessing pdf

In [3]:
def pdf_to_png(pdf_filename):
    """Render the first page of a PDF to ``./data/<name>.png``.

    Files whose extension is not exactly ``.pdf`` are ignored.

    Parameters
    ----------
    pdf_filename : str
        Path of the PDF file to convert.

    Returns
    -------
    None
    """
    pdf_root, pdf_ext = os.path.splitext(os.path.basename(pdf_filename))
    if pdf_ext != '.pdf':
        return None

    img_filename = './data/{}.png'.format(pdf_root)
    pdf_document = fitz.open(pdf_filename)
    try:
        # Page 0 only, rendered without an alpha channel.
        page_pixmap = pdf_document.getPagePixmap(0, alpha=False)
        page_pixmap.writePNG(img_filename)
    finally:
        # Release the document even when rendering or writing fails.
        pdf_document.close()
    return None
    

def crop_img(img_filename, box=(23, 65, 780, 430)):
    """Crop an image and save it as ``./data/<name>_cropped<ext>``.

    Parameters
    ----------
    img_filename : str
        Path of the image to crop.
    box : tuple of int, optional
        Crop rectangle passed to ``Image.crop``: the first two numbers are
        the top-left (x, y) corner of the outtake and the last two are its
        bottom-right corner. Defaults to the rectangle previously
        hard-coded in this function.

    Returns
    -------
    None
    """
    img_root, img_ext = os.path.splitext(os.path.basename(img_filename))
    out_path = "./data/{}_cropped{}".format(img_root, img_ext)
    # The context manager closes the source file once the crop is saved.
    with Image.open(img_filename) as img:
        img.crop(box).save(out_path)
    return None

In [4]:
def slice_image(img_path, overlap=0.5, filter_size=(100,100,3)):
    """Cut an image into fixed-size patches and save them under ``./data/<name>/``.

    The image is loaded as RGB, centred on a zero-filled canvas that is
    large enough to hold a whole number of patches, and then tiled row by
    row. Patch ``k`` is written to ``./data/<name>/<name>_slice<k><ext>``.

    Overlap is defined as the coverage between one patch and the next,
    regardless of the axis of the next patch (horizontal or vertical).

    Parameters
    ----------
    img_path : str
        Path of the image to slice.
    overlap : float
        Fraction of a patch shared with its neighbour, ``0 <= overlap < 1``.
    filter_size : tuple
        Patch shape ``(rows, cols, ...)``; only the first two entries are
        used, the channel count always comes from the RGB image.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If ``overlap`` is outside ``[0, 1)`` or yields a zero stride.
    """
    image = np.array(Image.open(img_path).convert('RGB'))
    img_root, img_ext = os.path.splitext(os.path.basename(img_path))

    m, n, depth = image.shape
    m_patch, n_patch = filter_size[0], filter_size[1]

    if not 0 <= overlap < 1:
        raise ValueError("overlap must be in [0, 1), got {}".format(overlap))
    # One stride per axis so that non-square patches tile correctly.
    stride_m = int(round(m_patch * (1 - overlap)))
    stride_n = int(round(n_patch * (1 - overlap)))
    if stride_m <= 0 or stride_n <= 0:
        raise ValueError("overlap {} leaves no stride for patch size {}x{}"
                         .format(overlap, m_patch, n_patch))

    # Number of patches along each axis (ceiling division).
    num_m = -(-m // stride_m)
    num_n = -(-n // stride_n)

    # Canvas size that fits num_m x num_n patches; the image is centred in it.
    padded_m = stride_m * num_m + (m_patch - stride_m)
    padded_n = stride_n * num_n + (n_patch - stride_n)
    pad_top = (padded_m - m) // 2
    pad_left = (padded_n - n) // 2

    # Keep the source dtype so saved patches hold the original pixel values.
    canvas = np.zeros((padded_m, padded_n, depth), dtype=image.dtype)
    canvas[pad_top:pad_top + m, pad_left:pad_left + n, :] = image

    out_dir = './data/{}'.format(img_root)
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    counter = 0
    for i in range(num_m):
        for j in range(num_n):
            patch = canvas[i * stride_m:i * stride_m + m_patch,
                           j * stride_n:j * stride_n + n_patch, :]
            # PIL replaces scipy.misc.imsave, which SciPy 1.2 removed.
            Image.fromarray(np.ascontiguousarray(patch)).save(
                "./data/{}/{}_slice{}{}".format(img_root, img_root,
                                                counter, img_ext))
            counter += 1
    return


* Execute preprocessing

In [5]:

# # Earth images
# for i in range(1,11):
#     pdf_to_png('./data/image{}.pdf'.format(i))
#     crop_img('./data/image{}.png'.format(i))
#     slice_image('./data/image{}_cropped.png'.format(i),
#                 filter_size=(200,200,3),overlap=0)
    
# # Mars images
# for i in range(1,7):
#     #pdf_to_png('./data/mars_image{}.pdf'.format(i))
#     try:
#         crop_img('./data/mars_image{}.png'.format(i))
#         slice_image('./data/mars_image{}_cropped.png'.format(i),filter_size=(200,200,3),overlap=0)
#     except:
#         crop_img('./data/mars_image{}.jpg'.format(i))
#         slice_image('./data/mars_image{}_cropped.jpg'.format(i),filter_size=(200,200,3),overlap=0)
        

`imsave` is deprecated in SciPy 1.0.0, and will be removed in 1.2.0.
Use ``imageio.imwrite`` instead.


### 2) Create dataset

In [80]:

# --- Google Cloud project and storage settings ---
bucket_name = 'aketari-sandbox-vision-vcm'
compute_region = 'us-central1'
project_id = 'aketari-sandbox-vision'
dataset_gcs_path = 'gs://'

# --- Dataset / model naming ---
dataset_name = 'WhaleTail'
model_name_prefix = 'WhaleTail_'
csv_VM_path = './whale_data/' + dataset_name + '.csv'

# NOTE(review): the client setup below is commented out, but later cells
# reference automl_client, storage_client, prediction_client and
# project_location -- re-enable it (and the matching imports) before
# running those cells.
# automl_client = automl.AutoMlClient()
# storage_client = storage.Client()
# prediction_client = automl.PredictionServiceClient()

# A resource that represents Google Cloud Platform location.
#project_location = automl_client.location_path(project_id, compute_region)

* Create, Append and Export csv

In [69]:
def org_img_folders(label):
    """Move every image of ``label`` into ``./whale_data/<label>/``.

    The image names are read from the module-level ``df`` (rows whose
    ``Id`` equals ``label``, column ``Image``). All images are expected to
    sit directly inside ``./whale_data``.

    The function can be re-run safely: an existing label folder is reused
    and images that were already moved are skipped.

    Parameters
    ----------
    label : str
        Whale Id whose images should be grouped.

    Returns
    -------
    None
    """
    label_dir = './whale_data/{}'.format(label)
    if not os.path.isdir(label_dir):
        os.mkdir(label_dir)

    lst_img_label = list(df[df['Id'] == '{}'.format(label)].Image.values)
    for img in lst_img_label:
        src = './whale_data/{}'.format(img)
        dst = './whale_data/{}/{}'.format(label, img)
        # Already moved by a previous run: nothing left to do for this file.
        if os.path.exists(dst) and not os.path.exists(src):
            continue
        shutil.move(src, dst)
    return
    

In [3]:
# Label table with one row per image (columns used below: Image, Id).
df = pd.read_csv('./WhaleTail.csv')
# Distinct Ids, ordered from the most to the least frequent.
labels_list = df['Id'].value_counts().index.tolist()

In [5]:
# NOTE(review): identical to the labels_list assignment in the cell above (redundant).
labels_list = list(df.Id.value_counts().keys())

In [71]:
# Organize folder per label
# Side effect on disk: every image listed in df is moved into a
# sub-folder of ./whale_data named after its label.
for label in labels_list:
    org_img_folders(label)

In [16]:
# Number of images per whale Id, largest classes first.
df_agg = (
    df.groupby('Id')[['Image']]
      .count()
      .reset_index()
      .sort_values(by='Image', ascending=False)
)



In [53]:
# Total number of labelled images (sum of the per-Id counts in df_agg).
print ('There are {} images of whale tails'.format(df_agg.Image.sum()))

There are 25361 images of whale tails


In [45]:
def nbr_class_wt_n_images(n):
    """Sum the image counts of all Ids that have exactly ``n`` images.

    Reads the module-level ``df_agg`` (one row per Id, count in ``Image``).
    """
    has_exactly_n = df_agg['Image'] == n
    return df_agg.loc[has_exactly_n, 'Image'].sum()

## Organize your model datasets
* Hypothesis:

`new_whale` images are wrongly left unclassified.

* Verify hypothesis:

predict new_whale images on models.

In [72]:
def compute_nbr_labels(label):
    """Return the names of the entries in ``./whale_data/<label>`` as a list."""
    label_dir = './whale_data/{}'.format(label)
    return [entry for entry in os.listdir(label_dir)]

In [None]:
def augment_dataset_1m(label, min_images=100):
    """Augment the images of ``label`` until the class holds ``min_images`` files.

    Every existing image in ``./whale_data/<label>`` gets the same number
    of augmented copies, saved next to it as ``aug<k><original name>``.
    Augmentation uses the module-level imgaug pipeline ``seq``.

    Parameters
    ----------
    label : str
        Whale Id (folder name under ``./whale_data``).
    min_images : int, optional
        Minimum number of images wanted per class (default 100).

    Returns
    -------
    None
    """
    nbr_files = compute_nbr_labels(label)
    missing = min_images - len(nbr_files)
    # Nothing to augment from, or the class is already large enough.
    if not nbr_files or missing <= 0:
        return None

    # Integer ceiling division: gives the same result on Python 2 and 3.
    copies_per_image = -(-missing // len(nbr_files))

    for file_name in nbr_files:
        src_path = './whale_data/{}/{}'.format(label, file_name)
        try:
            with Image.open(src_path) as src:
                image = np.array(src)
        except Exception as err:
            # Best effort: report unreadable files and keep going.
            print("Skipping {}: {}".format(src_path, err))
            continue

        # The augmenters require 3D arrays: replicate the single grayscale
        # channel (np.resize would recycle the flattened pixels instead).
        if image.ndim == 2:
            image = np.repeat(image[:, :, np.newaxis], 3, axis=2)

        for augment_idx in range(copies_per_image):
            try:
                result = seq.augment_image(image)
                # PIL replaces scipy.misc.imsave, which SciPy 1.2 removed.
                Image.fromarray(result).save(
                    "./whale_data/{}/aug{}{}".format(label, augment_idx,
                                                     file_name))
            except Exception as err:
                print("Augmentation {} of {} failed: {}".format(
                    augment_idx, src_path, err))
    return None


In [None]:
# Split the Ids by rank in df_agg (sorted by image count, descending):
# ranks 1-5 go to model 1, ranks 6 and beyond to model 2; rank 0 is left out.
model_1_dataset_labels = df_agg.iloc[1:6]['Id'].tolist()
model_2_dataset_labels = df_agg.iloc[6:]['Id'].tolist()

In [86]:
# Step 2: create CSV
def export_automl_csv(curated_label_list, dataset_name):
    """Write ``./automl_WhaleTail_<dataset_name>.csv`` for an AutoML import.

    One row is written per file found in ``./whale_data/<label>`` for each
    label of ``curated_label_list``: the first column is the image URI
    ``gs://<bucket_name>/WhaleTail/<file>`` (``bucket_name`` is the
    module-level setting), the second column is the label.
    """
    out_path = './automl_WhaleTail_{}.csv'.format(dataset_name)
    with open(out_path, 'w') as out_file:
        writer = csv.writer(out_file)
        for label in curated_label_list:
            for file_name in compute_nbr_labels(label):
                gcs_uri = 'gs://{}/WhaleTail/{}'.format(bucket_name, file_name)
                writer.writerow([gcs_uri, label])



In [87]:
# Export the CSV for the second label subset (model_2_dataset_labels).
export_automl_csv(model_2_dataset_labels,'hypothese1Dataset2')

In [27]:
# Sometimes(0.5, ...) applies the given augmenter in 50% of all cases,
# e.g. Sometimes(0.5, GaussianBlur(0.3)) would blur roughly every second image.
sometimes = lambda aug: iaa.Sometimes(0.5, aug)

# Define our sequence of augmentation steps that will be applied to every image
# All augmenters with per_channel=0.5 will sample one value _per image_
# in 50% of all cases. In all other cases they will sample new values
# _per channel_.
# `seq` is read as a module-level name by augment_dataset_1m, so this cell
# has to run before that function is called.
seq = iaa.Sequential(
    [
        # apply the following augmenters to most images
        iaa.Fliplr(0.5), # horizontally flip 50% of all images
        iaa.Flipud(0.2), # vertically flip 20% of all images
        # crop images by -5% to 10% of their height/width
        sometimes(iaa.CropAndPad(
            percent=(-0.05, 0.1),
            pad_mode=ia.ALL,
            pad_cval=(0, 255)
        )),
        sometimes(iaa.Affine(
            scale={"x": (0.8, 1.2), "y": (0.8, 1.2)}, # scale images to 80-120% of their size, individually per axis
            translate_percent={"x": (-0.2, 0.2), "y": (-0.2, 0.2)}, # translate by -20 to +20 percent (per axis)
            rotate=(-45, 45), # rotate by -45 to +45 degrees
            shear=(-16, 16), # shear by -16 to +16 degrees
            order=[0, 1], # use nearest neighbour or bilinear interpolation (fast)
            cval=(0, 255), # if mode is constant, use a cval between 0 and 255
            mode=ia.ALL # use any of scikit-image's warping modes (see 2nd image from the top for examples)
        )),
        # execute 0 to 5 of the following (less important) augmenters per image
        # don't execute all of them, as that would often be way too strong
        iaa.SomeOf((0, 5),
            [
                sometimes(iaa.Superpixels(p_replace=(0, 1.0), n_segments=(20, 200))), # convert images into their superpixel representation
                iaa.OneOf([
                    iaa.GaussianBlur((0, 3.0)), # blur images with a sigma between 0 and 3.0
                    iaa.AverageBlur(k=(2, 7)), # blur image using local means with kernel sizes between 2 and 7
                    iaa.MedianBlur(k=(3, 11)), # blur image using local medians with kernel sizes between 3 and 11
                ]),
                iaa.Sharpen(alpha=(0, 1.0), lightness=(0.75, 1.5)), # sharpen images
                iaa.Emboss(alpha=(0, 1.0), strength=(0, 2.0)), # emboss images
                # search either for all edges or for directed edges,
                # blend the result with the original image using a blobby mask
                iaa.SimplexNoiseAlpha(iaa.OneOf([
                    iaa.EdgeDetect(alpha=(0.5, 1.0)),
                    iaa.DirectedEdgeDetect(alpha=(0.5, 1.0), direction=(0.0, 1.0)),
                ])),
                iaa.AdditiveGaussianNoise(loc=0, scale=(0.0, 0.05*255), per_channel=0.5), # add gaussian noise to images
                iaa.OneOf([
                    iaa.Dropout((0.01, 0.1), per_channel=0.5), # randomly remove up to 10% of the pixels
                    iaa.CoarseDropout((0.03, 0.15), size_percent=(0.02, 0.05), per_channel=0.2),
                ]),
                iaa.Invert(0.05, per_channel=True), # invert color channels
                iaa.Add((-10, 10), per_channel=0.5), # change brightness of images (by -10 to 10 of original value)
                iaa.AddToHueAndSaturation((-20, 20)), # change hue and saturation
                # either change the brightness of the whole image (sometimes
                # per channel) or change the brightness of subareas
                iaa.OneOf([
                    iaa.Multiply((0.5, 1.5), per_channel=0.5),
                    iaa.FrequencyNoiseAlpha(
                        exponent=(-4, 0),
                        first=iaa.Multiply((0.5, 1.5), per_channel=True),
                        second=iaa.ContrastNormalization((0.5, 2.0))
                    )
                ]),
                iaa.ContrastNormalization((0.5, 2.0), per_channel=0.5), # improve or worsen the contrast
                iaa.Grayscale(alpha=(0.0, 1.0)),
                sometimes(iaa.ElasticTransformation(alpha=(0.5, 3.5), sigma=0.25)), # move pixels locally around (with random strengths)
                sometimes(iaa.PiecewiseAffine(scale=(0.01, 0.05))), # sometimes move parts of the image around
                sometimes(iaa.PerspectiveTransform(scale=(0.01, 0.1)))
            ],
            random_order=True
        )
    ],
    random_order=True
)

In [6]:
# for VM_path, subdirList, fileList in os.walk('./whale_data'):
#     print len(subdirList)

In [None]:
# Recommend running this cell on the VM directly
# augmented_labels = 0
# for label in labels_list:
#     augment_dataset_1m(label)
#     augmented_labels +=1
#     print (augmented_labels)

In [None]:
# Preview only the first rows instead of dumping the whole table.
df.head()

In [6]:
def upload_blob(local_path, bucket_name,dataset_name):
    """Upload a local CSV file to ``gs://<bucket_name>/csv/<dataset_name>.csv``.

    Parameters
    ----------
    local_path : str or None
        Path of the CSV file to upload. When empty or ``None``, falls back
        to ``./data/<dataset_name>.csv``.
    bucket_name : str
        Name of the destination GCS bucket.
    dataset_name : str
        Used to build the destination object name.

    Uses the module-level ``storage_client``.
    """
    # Honour the caller's path; the fallback is the location this function
    # used before it read the local_path argument.
    if not local_path:
        local_path = './data/{}.csv'.format(dataset_name)

    bucket = storage_client.get_bucket(bucket_name)
    output_gcs_path = 'csv/{}.csv'.format(dataset_name)
    blob = bucket.blob(output_gcs_path)
    blob.upload_from_filename(local_path)
    return


In [6]:
# Upload the dataset CSV to the bucket as csv/<dataset_name>.csv.
upload_blob(csv_VM_path,bucket_name,dataset_name)

1) Statistic on the class imbalance

2) separation of models in a logical manner (1,000,000 images per dataset)

In [None]:
"""
Take the 5000 classes with the least amount of images
"""

* Create automl dataset

In [7]:
def create_automl_dataset(project_id,compute_region,dataset_name):
    """Return the AutoML dataset named ``dataset_name``, creating it if absent.

    The datasets at the module-level ``project_location`` are scanned
    first; when one already carries this display name, its full details
    are fetched and returned. Otherwise an empty MULTICLASS image
    classification dataset is created. Uses the module-level
    ``automl_client``.
    """
    # Reuse an existing dataset when the display name is already taken.
    for existing in automl_client.list_datasets(project_location):
        if existing.display_name != dataset_name:
            continue
        existing_id = existing.name.split("/")[-1]
        print ("dataset name already used by dataset id: {}".format(existing_id))
        existing_path = automl_client.dataset_path(project_id,
                                                   compute_region,
                                                   existing_id)
        return automl_client.get_dataset(existing_path)

    # No dataset with that name: create the placeholder.
    new_dataset = {
        "display_name": dataset_name,
        "image_classification_dataset_metadata": {
            "classification_type": "MULTICLASS",
        },
    }
    return automl_client.create_dataset(project_location, new_dataset)
        

In [44]:
# Fetch the dataset with this display name, or create it if none exists yet.
dataset_info = create_automl_dataset(project_id,
                                     compute_region,
                                     dataset_name)

dataset name already used by dataset id: ICN6639741849712435570


In [8]:
def import_dataset(project_id,compute_region, 
                   dataset_info, dataset_name):
    """Fill in the dataset placeholder from the CSV stored in GCS.

    The CSV is read from ``gs://<bucket_name>/csv/<dataset_name>.csv``,
    the object name that ``upload_blob`` writes to; ``bucket_name`` is the
    module-level setting. Uses the module-level ``automl_client``.

    Parameters
    ----------
    project_id, compute_region : str
        Used to build the full dataset path.
    dataset_info :
        Dataset object whose ``name`` ends with the dataset id.
    dataset_name : str
        Name of the CSV (without extension) under ``csv/`` in the bucket.
    """
    dataset_id = dataset_info.name.split("/")[-1]

    gcs_csv_path = 'gs://{}/csv/{}.csv'.format(bucket_name, dataset_name)
    # Get the full path of the dataset.
    dataset_full_id = automl_client.dataset_path(
        project_id, compute_region, dataset_id
    )

    # Get the multiple Google Cloud Storage URIs.
    input_uris = gcs_csv_path.split(",")
    input_config = {"gcs_source": {"input_uris": input_uris}}

    # Import data from the input URI.
    response = automl_client.import_data(dataset_full_id, input_config)

    print("Processing import...")
    # synchronous check of operation status.
    print("Data imported. {}".format(response.result()))
    # Adjacent literals instead of a backslash continuation, which embedded
    # the next line's indentation in the printed message.
    print("Well, not exactly. You still have to wait about 15 min, "
          "before training any model on this data")
    

In [46]:
# Import the uploaded CSV into the dataset obtained above.
import_dataset(project_id, compute_region, 
               dataset_info,dataset_name)

Processing import...
Data imported. 


* Train Automl model

In [9]:
def train_model(dataset_info, version, train_budget=1):
    """Start training an AutoML image classification model.

    The model is named ``model_name_prefix + str(version)``; ``version``
    is increased until the name is not used by any model listed at
    ``project_location``. Uses the module-level ``automl_client``.

    Parameters
    ----------
    dataset_info :
        Dataset object whose ``name`` ends with the dataset id.
    version : int
        First version number to try.
    train_budget : int, optional
        Passed as ``train_budget`` in the model metadata; a falsy value
        sends empty metadata.

    Returns
    -------
    The object returned by ``automl_client.create_model``.
    """
    dataset_id = dataset_info.name.split("/")[-1]

    # Collect every used name first so a collision is found whatever order
    # the service lists the models in.
    existing_names = set(
        listed.display_name
        for listed in automl_client.list_models(project_location)
    )
    model_name = model_name_prefix + str(version)
    while model_name in existing_names:
        version += 1
        model_name = model_name_prefix + str(version)

    # Set model name and model metadata for the image dataset.
    model_metadata = {"train_budget": train_budget} if train_budget else {}
    my_model = {
        "display_name": model_name,
        "dataset_id": dataset_id,
        "image_classification_model_metadata": model_metadata,
    }

    # Create a model with the model metadata in the region.
    model = automl_client.create_model(project_location, my_model)
    print ("Training operation name: {}".format(model.operation.name))
    return model
    

* Execute training

In [None]:
# Start training from version 1 with a train_budget of 1.
model = train_model(dataset_info,1,1)

* Select pre-trained models

In [14]:
# Choose a deployed model:
# The loop variable is not called `model`, so the global `model` is left
# untouched by this listing.
models_list = automl_client.list_models(project_location)
for listed_model in models_list:
    print ('Model called: {}'.format(listed_model.display_name))
    print ('Model id: {}'.format(listed_model.name.split('/')[-1]))
    print ('==================')

Model called: Mars_model_1
Model id: ICN4656393435248390578
Model called: WaterOnMars_v20181209024327
Model id: ICN1070916309218507773


In [16]:
# NOTE(review): hard-coded id; it is the one printed for 'Mars_model_1'
# in the listing output above.
model_id = 'ICN4656393435248390578'
model_full_id = automl_client.model_path(project_id, 
                                         compute_region, 
                                         model_id)

# Get complete detail of the model.
model = automl_client.get_model(model_full_id)

* Prediction

In [22]:
def predict(model, bucket_name, gcs_path,
             local_path=None, score_threshold='0.5'):
    """Classify one image with an AutoML model and print the results.

    Parameters
    ----------
    model :
        Model object whose ``name`` ends with the model id.
    bucket_name : str
        Bucket holding the image; used only when ``local_path`` is None.
    gcs_path : str
        Object path inside the bucket, e.g. ``folder_1/folder_n/image_title``.
    local_path : str, optional
        When given, the image is read from this local file instead of GCS.
    score_threshold : str, optional
        Sent as the ``score_threshold`` request parameter; a falsy value
        sends no parameters.

    Returns
    -------
    The response of ``prediction_client.predict``.

    Uses the module-level ``automl_client``, ``storage_client``,
    ``prediction_client``, ``project_id`` and ``compute_region``.
    """
    model_id = model.name.split("/")[-1]
    # Get the full path of the model.
    model_full_id = automl_client.model_path(
        project_id, compute_region, model_id)

    # Read the image bytes, from GCS unless a local file was given.
    if local_path is None:
        bucket = storage_client.get_bucket(bucket_name)
        content = bucket.blob(gcs_path).download_as_string()
    else:
        with open(local_path, "rb") as image_file:
            content = image_file.read()

    payload = {"image": {"image_bytes": content}}

    # params is additional domain-specific parameters.
    # score_threshold is used to filter the result
    params = {"score_threshold": score_threshold} if score_threshold else {}

    pred_response = prediction_client.predict(model_full_id, payload, params)
    print("Prediction results:")
    for result in pred_response.payload:
        print("Predicted class name: {}".format(result.display_name))
        print("Predicted class score: {}".format(result.classification.score))
    return pred_response


In [None]:
def export_pred_csv(img_title,pred):
    """Append one ``(image title, predicted class)`` row to ``./data/preds.csv``.

    Parameters
    ----------
    img_title : str
        Name of the image that was classified.
    pred :
        Prediction result exposing a ``display_name`` attribute.

    Returns
    -------
    None
    """
    predicted_class = pred.display_name
    pred_dir = './data'
    pred_path = './data/preds.csv'

    # Only the parent folder is created; preds.csv itself is a file.
    if not os.path.isdir(pred_dir):
        os.makedirs(pred_dir)

    # Mode 'a' creates the file on first use and appends afterwards.
    with open(pred_path, 'a') as pred_file:
        csv.writer(pred_file).writerow([img_title, predicted_class])
    return

In [23]:
# No local_path is given, so predict downloads the image from the bucket.
predict(model,bucket_name,
              gcs_path='WaterOnMars/images/aILz6npGtaM.jpg')


Prediction results:
Predicted class name: NoWater
Predicted class score: 0.999988555908


payload {
  classification {
    score: 0.999988555908
  }
  display_name: "NoWater"
}

u'ICN3937997390383836716'