Data is located in gs://${GCS_BUCKET}/datasets.      
There should be bird, dog_breed, food, fungus, leaf folder.

The original zipped file is there.
The processed folder is also there

In [5]:
GCS_BUCKET = "np-training-public"


In [6]:
!gsutil ls gs://{GCS_BUCKET}/datasets

gs://np-training-public/datasets/bird/
gs://np-training-public/datasets/dog_breed/
gs://np-training-public/datasets/food/
gs://np-training-public/datasets/fungus/
gs://np-training-public/datasets/leaf/


In [3]:
#!mkdir -p data

# Download

In [8]:
import os

In [7]:
!gsutil -m cp -r  gs://{GCS_BUCKET}/datasets/food/* data/

Copying gs://np-training-public/datasets/food/food-101.tar.gz...
\ [1/1 files][  4.7 GiB/  4.7 GiB] 100% Done  94.7 MiB/s ETA 00:00:00           
Operation completed over 1 objects/4.7 GiB.                                      


**Important: Choose a dataset you would like to work with**

In [9]:
# sorted by size of data
# DATASET_NAME = "dog_breed"
#DATASET_NAME = "leaf"
DATASET_NAME = "food"
#DATASET_NAME = "bird"
#DATASET_NAME = "fungus"


In [10]:
DATASET_PATH= os.path.expanduser(f"~/data/{DATASET_NAME}")

In [11]:
ls {DATASET_PATH}

[0m[01;34mapple_pie[0m/           [01;34meggs_benedict[0m/            [01;34monion_rings[0m/
[01;34mbaby_back_ribs[0m/      [01;34mescargots[0m/                [01;34moysters[0m/
[01;34mbaklava[0m/             [01;34mfalafel[0m/                  [01;34mpad_thai[0m/
[01;34mbeef_carpaccio[0m/      [01;34mfilet_mignon[0m/             [01;34mpaella[0m/
[01;34mbeef_tartare[0m/        [01;34mfish_and_chips[0m/           [01;34mpancakes[0m/
[01;34mbeet_salad[0m/          [01;34mfoie_gras[0m/                [01;34mpanna_cotta[0m/
[01;34mbeignets[0m/            [01;34mfrench_fries[0m/             [01;34mpeking_duck[0m/
[01;34mbibimbap[0m/            [01;34mfrench_onion_soup[0m/        [01;34mpho[0m/
[01;34mbread_pudding[0m/       [01;34mfrench_toast[0m/             [01;34mpizza[0m/
[01;34mbreakfast_burrito[0m/   [01;34mfried_calamari[0m/           [01;34mpork_chop[0m/
[01;34mbruschetta[0m/          [01;34mfried_rice[0m/ 

In [12]:
import sys
sys.path.append("/home/ubuntu/fastai/")

In [13]:
import fastai
from fastai.imports import *
from fastai.transforms import *
from fastai.conv_learner import *
from fastai.model import *
from fastai.dataset import *
from fastai.sgdr import *
from fastai.plots import *
import helper


In [14]:
import pandas as pd
from tqdm import tqdm
import hashlib
from glob import glob


In [27]:
%matplotlib inline
%load_ext autoreload

%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Generate labels.csv

In [16]:
!pwd

/home/ubuntu/deep_learning_workshop/training


In [17]:
!rm -rf models
!mkdir models

In [18]:
BASE_PATH= DATASET_PATH

# iterate over all the images, parse "class name/folder" and "image name"
records = []
for image in glob(f"{BASE_PATH}/**/*.jpg"):
    dir_ = image.split('/')
    file_, species = dir_[-1], dir_[-2]

    records.append({
        "file": image,
        "class": species.replace(" ", "_")
    })

labels_df = pd.DataFrame(records)
os.makedirs(f'data/{DATASET_NAME}',exist_ok=True)
labels_df.to_csv(f'data/{DATASET_NAME}/labels.csv', index=False)

In [21]:
labels_df

Unnamed: 0,class,file
0,filet_mignon,/home/ubuntu/data/food/filet_mignon/2150839.jpg
1,filet_mignon,/home/ubuntu/data/food/filet_mignon/2611025.jpg
2,filet_mignon,/home/ubuntu/data/food/filet_mignon/3360702.jpg
3,filet_mignon,/home/ubuntu/data/food/filet_mignon/1779620.jpg
4,filet_mignon,/home/ubuntu/data/food/filet_mignon/2100813.jpg
5,filet_mignon,/home/ubuntu/data/food/filet_mignon/2204089.jpg
6,filet_mignon,/home/ubuntu/data/food/filet_mignon/2584789.jpg
7,filet_mignon,/home/ubuntu/data/food/filet_mignon/2967960.jpg
8,filet_mignon,/home/ubuntu/data/food/filet_mignon/936538.jpg
9,filet_mignon,/home/ubuntu/data/food/filet_mignon/3674517.jpg


In [19]:
!ls

converted_model.tflite	     graph.lite		   model_benchmark
data			     helper.py		   models
dobgreed_mobilenet_tf.ipynb  keras.ipynb	   prep.ipynb
dobgreed_resnet.ipynb	     keras_parallel.ipynb  __pycache__
dog_breed.h5		     keras_parallel.py	   sector_labelling.ipynb
dog_breed.mlmodel	     labels.csv		   tensorflow_export.ipynb
dog_breed_tf.h5		     labels.txt		   tensorflow_tf_keras.ipynb
downloads		     MobileNet.ipynb	   tmp
fastai.ipynb		     model


In [22]:
labels_df.head()

Unnamed: 0,class,file
0,filet_mignon,/home/ubuntu/data/food/filet_mignon/2150839.jpg
1,filet_mignon,/home/ubuntu/data/food/filet_mignon/2611025.jpg
2,filet_mignon,/home/ubuntu/data/food/filet_mignon/3360702.jpg
3,filet_mignon,/home/ubuntu/data/food/filet_mignon/1779620.jpg
4,filet_mignon,/home/ubuntu/data/food/filet_mignon/2100813.jpg


**Number of unqiue categories **

In [23]:
num_classes = len(labels_df['class'].unique())
num_classes

101

In [24]:
labels_df['class'].value_counts()[0:10]

fried_calamari         1000
huevos_rancheros       1000
bruschetta             1000
spring_rolls           1000
hot_and_sour_soup      1000
tacos                  1000
waffles                1000
red_velvet_cake        1000
spaghetti_bolognese    1000
tuna_tartare           1000
Name: class, dtype: int64

# View Sample Images

In [28]:
helper.display_images_from_class(labels_df,label='eskimo_dog',num_images=10 )

<Figure size 3600x2160 with 0 Axes>

In [26]:
helper.display_images_from_class(labels_df,label='french_bulldog',num_images=10 )

<Figure size 3600x2160 with 0 Axes>

In [29]:
PATH = DATASET_PATH


**Save only first n classes **

In [39]:
n = 10
topn_classes = labels_df['class'].value_counts()[0:n].index.tolist()

In [40]:
labels_df=labels_df[labels_df['class'].isin(topn_classes)]

In [41]:
labels_df.to_csv(f'data/{DATASET_NAME}/labels.csv')

** Validation data **

In [42]:
label_csv = f'data/{DATASET_NAME}/labels.csv'
n = len(list(open(label_csv)))-1
val_idxs = get_cv_idxs(n)


# Save Subset

In [43]:
def save_images(list_images,dest='train'):
    for img_path in tqdm(list_images):
        paths = img_path.split("/")
        class_name, img_file = paths[-2],paths[-1]
        
        os.makedirs(f"{dest}/{class_name}",exist_ok=True)
        shutil.copy(img_path,f"{dest}/{class_name}/{img_file}")
        

In [44]:
shutil.rmtree(f'data/{DATASET_NAME}/train',ignore_errors=True)
shutil.rmtree(f'data/{DATASET_NAME}/valid',ignore_errors=True)

In [45]:
train_set = labels_df[~labels_df.index.isin(val_idxs)]
save_images(train_set['file'], dest=f'data/{DATASET_NAME}/train')

save_images(labels_df.iloc[val_idxs]['file'], dest=f'data/{DATASET_NAME}/valid')


100%|██████████| 9377/9377 [00:01<00:00, 5884.37it/s]
100%|██████████| 2000/2000 [00:00<00:00, 6078.77it/s]


In [46]:
train_set['file']

1000     /home/ubuntu/data/food/red_velvet_cake/3026588...
1001     /home/ubuntu/data/food/red_velvet_cake/1297812...
1003     /home/ubuntu/data/food/red_velvet_cake/249556.jpg
1004      /home/ubuntu/data/food/red_velvet_cake/25552.jpg
1005     /home/ubuntu/data/food/red_velvet_cake/2229825...
1006     /home/ubuntu/data/food/red_velvet_cake/1225708...
1007     /home/ubuntu/data/food/red_velvet_cake/1222541...
1008     /home/ubuntu/data/food/red_velvet_cake/2133877...
1011     /home/ubuntu/data/food/red_velvet_cake/528156.jpg
1012     /home/ubuntu/data/food/red_velvet_cake/2078400...
1013     /home/ubuntu/data/food/red_velvet_cake/1721568...
1014     /home/ubuntu/data/food/red_velvet_cake/2359267...
1015     /home/ubuntu/data/food/red_velvet_cake/2577022...
1016     /home/ubuntu/data/food/red_velvet_cake/372523.jpg
1017     /home/ubuntu/data/food/red_velvet_cake/235671.jpg
1019     /home/ubuntu/data/food/red_velvet_cake/368686.jpg
1021     /home/ubuntu/data/food/red_velvet_cake/1423857.