# Equipo 2
## Miembros:
### Jorge Arturo Torres Cruz - A01176590
### Juan Manuel Pérez Font - A00819815
### Sergio López Madriz - A01064725

## Librerías

In [2]:
# Utilizaremos urllib para descargar las imagenes utilizando URLs obtenidos de ImageNet
import urllib.request
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
import uuid
from itertools import repeat
from pathlib import Path
import socket
import cv2
import numpy as np
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, Conv2D, MaxPooling2D, Dropout, Flatten
socket.setdefaulttimeout(10)

Using TensorFlow backend.


## Recolección de datos

In [3]:
# For use in Google Colab: mount Google Drive at /content/drive so the cells
# below can read and write the project's data folder.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
# Folder (on the mounted Drive) that holds the URL list files and the images.
data_directory = './drive/My Drive/food_classification/data'
data_path = Path(data_directory)
# Every regular file directly inside the data folder is treated as a URL list.
url_files = list(filter(Path.is_file, data_path.iterdir()))
# Class names used throughout the notebook.
categories = [
    'wines',
    'bubble_gums',
    'dumplings',
    'pizza',
    'sandwich',
]

In [5]:
# Walk every URL list file and collect its image URLs, keyed by the file stem
# (the file name without extension is the category name).
print("Reading files containing images urls")
urls = {}
for file_path in url_files:
    with file_path.open() as handle:
        stripped_lines = [line.strip() for line in handle]
    print("Reading {} image urls ({})".format(len(stripped_lines), file_path))
    urls[file_path.stem] = stripped_lines

Reading files containing images urls
Reading 1272 image urls (drive/My Drive/food_classification/data/dumplings.txt)
Reading 1229 image urls (drive/My Drive/food_classification/data/sandwich.txt)
Reading 1247 image urls (drive/My Drive/food_classification/data/wines.txt)
Reading 1209 image urls (drive/My Drive/food_classification/data/bubble_gums.txt)
Reading 1215 image urls (drive/My Drive/food_classification/data/pizza.txt)


In [5]:
# Per-category lists of the URLs that were downloaded successfully; each list
# starts empty and is filled in by download_from_url.
working_urls = {
    name: []
    for name in ('bubble_gums', 'dumplings', 'pizza', 'sandwich', 'wines')
}

def download_from_url(category_url):
    """Download one image into its category folder.

    Args:
        category_url: A ``(category, url)`` pair.

    Returns:
        The URL when the download succeeded, otherwise the string
        ``'Error: <exception text>'``.
    """
    category, url = category_url
    print(f'Downloading {url} for category {category}')
    # A random file name keeps concurrent downloads from colliding.
    target = f'{data_path}/{category}/{uuid.uuid4()}.jpg'
    try:
        urllib.request.urlretrieve(url, target)
    except Exception as e:
        # This function runs in worker threads, so the output is interleaved:
        # include the URL to make the failure attributable.
        print(f'Error downloading {url}: {e}')
        # An interrupted transfer can leave a truncated file behind; remove it
        # so it is not later picked up as an image.
        try:
            os.remove(target)
        except OSError:
            pass
        return f'Error: {e}'
    # setdefault: a category missing from working_urls must not turn a
    # successful download into an error.
    working_urls.setdefault(category, []).append(url)
    return url

def download_category_from_url(category, urls):
    """Download every URL of one category into ``data_path/<category>/``.

    The category folder is created when missing. Downloads run concurrently
    in a thread pool, and a success/failure summary is printed at the end.

    Args:
        category: Category name; also the name of the destination folder.
        urls: Iterable of image URLs belonging to that category.
    """
    print(f'Creating directory to store {category} images')
    category_dir_path = data_path / category
    if category_dir_path.is_dir():
        print(f'{category_dir_path} directory exists, continuing...')
    else:
        try:
            category_dir_path.mkdir(parents=True, exist_ok=True)
        except OSError as e:
            # Best effort: report the problem and let the downloads fail
            # individually rather than aborting the whole run.
            print(e)
        else:
            print(f'Succesfully created {category}/ directory')

    print(f'Downloading images for category {category}')
    with ThreadPoolExecutor() as executor:
        # No timeout is passed to map(): it would bound the wait for the whole
        # batch, not a single request. Individual requests are limited by the
        # socket default timeout set next to the imports.
        results = list(executor.map(download_from_url, zip(repeat(category), urls)))
    failed = sum(1 for outcome in results if str(outcome).startswith('Error: '))
    print(f'All images downloaded for category {category}')
    print(f'{len(results) - failed} succeeded, {failed} failed for category {category}')

# Make sure the root data folder exists, then download one category at a time.
try:
    data_path.mkdir()
except FileExistsError:
    print('data directory exists, continuing...')

for category in urls:
    download_category_from_url(category, urls[category])

print('All images downloaded into data folder')

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Error
Downloading http://farm4.static.flickr.com/3039/3116548450_772f1386c9.jpg for category sandwich
Error
Downloading http://farm3.static.flickr.com/2344/2486252099_293df84b01.jpg for category sandwich
Downloading http://farm4.static.flickr.com/3139/3138331350_fd5db2604b.jpg for category sandwich
Error
Downloading http://farm3.static.flickr.com/2331/2081301065_fc9d4b73e4.jpg for category sandwich
Downloading http://www.crossandcompany.com/nicosdeli/nicosfood3.jpg for category sandwich
Downloading http://farm1.static.flickr.com/33/100953250_dc99551a28.jpg for category sandwich
Downloading http://farm4.static.flickr.com/3044/2930432569_0d74e126e2.jpg for category sandwich
Downloading http://farm4.static.flickr.com/3358/3502012576_12b205d08a.jpg for category sandwich
Downloading http://www.istockphoto.com/file_thumbview_approve/6381389/2/istockphoto_6381389-club-sandwich.jpg for category sandwich
Error
Downloading http://w

In [0]:
# Report how many URLs of each category were downloaded successfully.
# The loop variable is deliberately not named `urls`: that name holds the
# category -> URL-list dict that the download cell iterates over, and
# rebinding it here would break a re-run of that cell.
for category, downloaded_urls in working_urls.items():
    print(f'{category}: {len(downloaded_urls)}')
    # Optional: persist the working URLs of each category to a text file.
    # with open(f'{category}.txt', 'w') as f:
    #     f.writelines(url + '\n' for url in downloaded_urls)

## Generación de datos

In [0]:
# Pin imgaug to 0.4.0: the default version of imgaug doesn't correctly support
# loading a 1d numpy array to augmenters.
!pip install imgaug==0.4.0

In [0]:
import numpy as np
import imgaug.augmenters as iaa
import cv2
import glob

In [0]:
def augment_data():
  """
  Generates new data based on downloaded images by applying left-to-right flip
  and Gaussian Blur.

  For every category, each readable image in ``<data_directory>/<category>/``
  is loaded, augmented, and the augmented copy is written back into the same
  folder under a random file name. Unreadable files are reported and skipped.
  """
  print('Augmenting data by flipping and Gaussian Blur...')
  seq = iaa.Sequential([
    iaa.Fliplr(0.5),
    iaa.GaussianBlur(sigma=(0, 3.0))
  ])

  for category in categories:
    batch = []
    for filename in glob.iglob(f'{data_directory}/{category}/*'):
      print(f'Reading {filename}')
      try:
        im = cv2.imread(filename)
      except Exception as e:
        print(f'Error on image: {filename} ({e}), continuing...')
        continue
      # cv2.imread signals an unreadable file by returning None, not by raising.
      if im is None:
        print(f'Error on image: {filename}, continuing...')
        continue
      # Images stay in OpenCV's BGR channel order, which is what cv2.imwrite
      # expects when they are written back below.
      batch.append(im)

    if not batch:
      print(f'=== 0 new images added to category {category} ===')
      continue

    # Pass the images as a list: they have different sizes, so they cannot be
    # stacked into one regular NumPy array.
    images_aug = seq(images=batch)
    for image in images_aug:
      cv2.imwrite(f'{data_directory}/{category}/{uuid.uuid4()}.jpg', image)
    print(f'Artificial data saved for category {category}')
    print(f'=== {len(batch)} new images added to category {category} ===')

augment_data()

## Separar train, test y validate
### Separamos 80% train, 10% validate y 10% test para cada clase

In [0]:
import random
import shutil

In [0]:
# Make sure the three split folders exist under the data directory.
for split_name in ('train', 'test', 'valid'):
  try:
    os.mkdir(f'{data_directory}/{split_name}')
  except FileExistsError:
    print(f'{split_name} directory already exists, continuing...')

In [0]:
# Move the images of every category into train / valid / test folders.
# NOTE(review): augment_data() writes its augmented copies into the same
# category folder before this split runs, so an image and its augmented copy
# can end up in different splits - confirm whether that is intended.
for category in categories:
  # Create each folder independently (exist_ok) so that a partially created
  # layout, e.g. train/<category> present but valid/<category> missing, is
  # completed instead of being abandoned at the first existing folder.
  for split in ('train', 'valid', 'test'):
    os.makedirs(f'{data_directory}/{split}/{category}', exist_ok=True)

  # Shuffle once and slice: 80% train, half of the remainder valid, the rest
  # test.
  images = glob.glob(f'{data_directory}/{category}/*')
  random.shuffle(images)
  n_train = int(len(images) * 0.8)
  n_valid = int((len(images) - n_train) * 0.5)
  split_files = {
    'train': images[:n_train],
    'valid': images[n_train:n_train + n_valid],
    'test': images[n_train + n_valid:],
  }
  for split, files in split_files.items():
    for image_path in files:
      shutil.move(image_path, f'{data_directory}/{split}/{category}/')

  # ====== Causes problems when used with Drive =========
  # os.remove(f'{data_directory}/{category}')

# Aprendizaje

In [0]:
# Build the CNN: four Conv2D + MaxPooling2D stages (32, 64, 128, 256 filters),
# then Flatten, Dropout and a 5-unit softmax output layer.
model = Sequential([
  Conv2D(32, kernel_size=3, activation='relu', input_shape=(150, 150, 3)),
  MaxPooling2D(2, 2),
  Conv2D(64, kernel_size=3, activation='relu'),
  MaxPooling2D(2, 2),
  Conv2D(128, kernel_size=3, activation='relu'),
  MaxPooling2D(2, 2),
  Conv2D(256, kernel_size=3, activation='relu'),
  MaxPooling2D(2, 2),
  Flatten(),
  Dropout(0.5),
  Dense(5, activation='softmax'),
])

In [0]:
from keras.models import load_model
from keras.callbacks import ModelCheckpoint

In [0]:
# Location used both as the ModelCheckpoint filepath and as the load_model source.
save_path = './drive/My Drive/food_classification/data/model/checkpoint'

In [0]:
def load_saved_model():
  """Try to load the model stored at ``save_path``.

  Returns:
    The loaded model, or ``None`` when loading fails for any reason (the
    error is printed instead of being raised).
  """
  loaded_model = None
  try:
    loaded_model = load_model(save_path)
  except Exception as e:
    print("An error ocurred while loading the model:")
    print(e)
  return loaded_model

# ======= Run only when a partially or fully trained model already exists.
# Only replace `model` when loading actually succeeded; otherwise a failed load
# would overwrite the freshly built network with None.
restored_model = load_saved_model()
if restored_model is not None:
  model = restored_model
else:
  print("No saved model loaded; keeping the current model.")

In [0]:
# image width (first element of the size tuple passed to cv2.resize)
rows = 150
# image height (second element of the size tuple passed to cv2.resize)
cols = 150 
# Number of color channels; not referenced elsewhere in the visible cells.
channels = 3

# Training and validation images / integer labels, filled in place by
# create_data_sets().
X_train = []
y_train = []
X_val = []
y_val = []

def get_category_class(category):
  """Return the integer class label (0-4) assigned to a category name.

  Raises:
    KeyError: if ``category`` is not one of the five known category names.
  """
  labels = dict(wines=0, bubble_gums=1, dumplings=2, pizza=3, sandwich=4)
  return labels[category]

# Reads every image of one split/category folder, resizes it and labels it.
def read_imgs_and_set_class(dir_name, category):
  """Load the images of ``<data_directory>/<dir_name>/<category>/``.

  Each readable image is resized with ``(rows, cols)`` as the target size and
  paired with the integer label of ``category``. Files that cannot be read or
  resized are reported and skipped, so X and y always have the same length.

  Args:
    dir_name: Split folder name, e.g. 'train', 'valid' or 'test'.
    category: Category name; must be known to get_category_class.

  Returns:
    Tuple ``(X, y)``: list of resized images and list of integer labels.

  Raises:
    KeyError: if ``category`` has no class number.
  """
  path = '{}/{}/{}/'.format(data_directory, dir_name, category)
  img_filenames = ['{}{}'.format(path, name) for name in os.listdir(path)]
  print("Fetched {} image filenames for category {}".format(len(img_filenames), category))
  # Resolve the label once, outside the per-image error handling, so an
  # unknown category fails loudly instead of leaving X and y misaligned.
  label = get_category_class(category)
  X = []
  y = []
  skipped = 0
  for processed, image in enumerate(img_filenames, start=1):
    try:
      pixels = cv2.imread(image, cv2.IMREAD_COLOR)
      # cv2.imread returns None (it does not raise) for unreadable files.
      if pixels is None:
        raise ValueError('file could not be read as an image')
      resized = cv2.resize(pixels, (rows, cols), interpolation=cv2.INTER_CUBIC)
    except Exception as e:
      skipped += 1
      print("Skipping {}: {}".format(image, e))
    else:
      X.append(resized)
      y.append(label)
    # Progress report every 150 files.
    if processed % 150 == 0:
      print("X is {} size and y is {} size".format(len(X), len(y)))
  print("Loaded X ({} data points) and y ({} data points) [{}]".format(len(X), len(y), dir_name))
  if skipped:
    print("Skipped {} unreadable files for category {} [{}]".format(skipped, category, dir_name))
  return (X, y)

# Fills X_train / y_train and X_val / y_val
def create_data_sets():
  """Load the 'train' and 'valid' splits of every category.

  Results are appended to the module-level lists X_train, y_train, X_val and
  y_val; the final training-set sizes are printed.
  """
  destinations = (
    ('train', X_train, y_train),
    ('valid', X_val, y_val),
  )
  for split_name, images_out, labels_out in destinations:
    for category in categories:
      images, labels = read_imgs_and_set_class(split_name, category)
      images_out.extend(images)
      labels_out.extend(labels)

  print("Final X_train size: {}".format(len(X_train)))
  print("Final y_train size: {}".format(len(y_train)))

create_data_sets()

In [11]:
# Check which classes are present in the training and validation labels.
print(set(y_train))
print(set(y_val))

{0, 1, 2, 3, 4}
{0, 1, 2, 3, 4}


In [0]:
# Keep copies of the data in case a later transformation should not have been
# applied. list(...) makes a new list object; a plain assignment would only
# create a second name for the same list.
X_copy = list(X_train)
y_copy = list(y_train)
X_val_copy = list(X_val)
y_val_copy = list(y_val)

In [0]:
# Convert the image and label lists into NumPy arrays.
X_train, y_train = np.array(X_train), np.array(y_train)
X_val, y_val = np.array(X_val), np.array(y_val)

In [0]:
# One-hot encode the integer labels (instead of keeping 0,1,2,3,4).
# num_classes is pinned to the number of categories so both arrays always get
# one column per class, even if the highest class is missing from a split.
y_train = to_categorical(y_train, num_classes=len(categories))
y_val = to_categorical(y_val, num_classes=len(categories))

In [15]:
# summary() prints the table itself and returns None, so wrapping it in
# print() would add a stray "None" line.
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_1 (Conv2D)            (None, 148, 148, 32)      896       
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 74, 74, 32)        0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 72, 72, 64)        18496     
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 36, 36, 64)        0         
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 34, 34, 128)       73856     
_________________________________________________________________
max_pooling2d_3 (MaxPooling2 (None, 17, 17, 128)       0         
_________________________________________________________________
conv2d_4 (Conv2D)            (None, 15, 15, 256)      

## Entrenamiento

In [0]:
# Define the policy for saving the model: write the full model (not only the
# weights) to save_path.
# NOTE(review): with save_best_only=False, monitor/mode presumably do not
# affect which checkpoints are written - confirm against the Keras docs. Also
# confirm the logged metric name ('val_acc' vs 'val_accuracy') for the
# installed Keras version before switching to save_best_only=True.
model_checkpoint_callback = ModelCheckpoint(
    filepath=save_path,
    save_weights_only=False,
    monitor='val_acc',
    mode='max',
    save_best_only=False)

In [0]:
# Train the model: Adam optimizer, categorical cross-entropy loss, accuracy as
# the reported metric; the checkpoint callback is passed to fit().
# NOTE(review): if `model` was restored by load_saved_model(), compile() here
# presumably replaces the restored optimizer state - confirm.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=61, callbacks=[model_checkpoint_callback])

# Probar el modelo

In [12]:
# Test images / integer labels, filled in place by create_test_data().
X_test = []
y_test = []

def create_test_data():
  """Load the 'test' split of every category into X_test / y_test."""
  for category in categories:
    X, y = read_imgs_and_set_class('test', category)
    X_test.extend(X)
    y_test.extend(y)

create_test_data()

X_test = np.array(X_test)
y_test = np.array(y_test)
# Pin num_classes so the label matrix always has one column per category,
# matching the model's output layer even if a class is missing from the
# test split.
y_test = to_categorical(y_test, num_classes=len(categories))

loss, acc = model.evaluate(X_test, y_test, verbose=2)
print('Restored model, accuracy: {:5.2f}%'.format(100*acc))

Fetched 145 image filenames for category wines
Loaded X (142 data points) and y (142 data points) [test]
Fetched 158 image filenames for category bubble_gums
X is 146 size and y is 146 size
Loaded X (154 data points) and y (154 data points) [test]
Fetched 183 image filenames for category dumplings
X is 147 size and y is 147 size
Loaded X (180 data points) and y (180 data points) [test]
Fetched 189 image filenames for category pizza
X is 148 size and y is 148 size
Loaded X (187 data points) and y (187 data points) [test]
Fetched 156 image filenames for category sandwich
X is 146 size and y is 146 size
Loaded X (152 data points) and y (152 data points) [test]
Restored model, accuracy: 78.40%
