# Aplicação de CNNs para a classificação multi-label de peças de roupa

#### Descrição

Projeto final da disciplina [EEL7513-09202|EEL7514-08235|EEL510417-41000056ME/DO (20192) - Introdução ao Aprendizado de Máquina](https://moodle.ufsc.br/course/view.php?id=110125).

#### Equipe

- Kauê Cano
- Ruan Cardoso Comelli

## Inicialização

In [0]:
# Account of the person running the notebook; a later cell uses this value to
# pick the Google Drive folder that holds the project.
user = 'ruan.comelli@lepten.ufsc.br'

In [2]:
from pathlib import Path

from google.colab import drive

# Mount point for Google Drive; later cells build every project path from it.
drive_path = Path('/content/drive')

# Mount Google Drive at drive_path (drive.mount expects a string, not a Path).
drive.mount(str(drive_path))

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
# Resolve the project folder inside Google Drive for the current user.
if user in {'ruan.comelli@lepten.ufsc.br', 'ruancomelli@gmail.com', 'rugortal@gmail.com'}:
    project_path = (
        drive_path
        / 'My Drive'
        / 'Studies'
        / '2019.3'
        / 'EEL510417 - Tópicos Especiais em Processamento de Sinais - Introdução ao Aprendizado de Máquina'
        / 'Final Project'
        / 'Codes'
    )
else:
    # Fail here with a clear message instead of a NameError on project_path
    # a few lines below.
    raise ValueError(
        f'No project path is configured for user {user!r}; '
        'add a branch that defines project_path for this account.'
    )

# Dataset locations (nothing is created here).
datasets_path = project_path / 'datasets'
deepfashion_path = datasets_path / 'DeepFashion'
fashion550k_path = datasets_path / 'Fashion550k'

# Output folders; created on first use, left untouched when they already exist.
models_path = project_path / 'models'
models_path.mkdir(parents=True, exist_ok=True)
log_path = project_path / 'log'
log_path.mkdir(parents=True, exist_ok=True)

In [0]:
import sys
# Make modules stored in the project folder importable from this notebook.
# Guarded so that re-running the cell does not add the same entry again.
if str(project_path) not in sys.path:
    sys.path.append(str(project_path))

## Definições

### Impressão e formatação

In [0]:
def print_header(
    s: str,
    level: int = 0,
    levels=('=', '-', '~', '*')
):
    """Standardized method for printing a section header.

    Prints an empty line, then the argument s, then an underline made of the
    chosen level symbol repeated once per character of s.

    Parameters
    ----------
    s      : string to be printed
    level  : index of level symbol to be used
    levels : sequence of level symbols to choose from (the default is an
             immutable tuple so it cannot be modified between calls)

    Raises
    ------
    IndexError : if level is not a valid index of levels
    """
    print()
    print(s)
    print(levels[level] * len(s))

### Caminhos

In [0]:
def relative_path(origin, destination):
    """Return the path that leads from `origin` to `destination`.

    Thin wrapper around os.path.relpath with the argument order swapped so
    that the starting point comes first.
    """
    import os.path
    return os.path.relpath(destination, origin)

### Containers

In [0]:
import pandas as pd
import numpy as np
from pathlib import Path

def missing_elements(int_list): # source: adapted from <https://stackoverflow.com/questions/16974047/efficient-way-to-find-missing-elements-in-an-integer-sequence>
    """Return the integers missing between the smallest and largest element.

    Parameters
    ----------
    int_list : iterable of integers, in any order; duplicates are allowed

    Returns
    -------
    Sorted list of the integers in [min(int_list), max(int_list)] that do not
    appear in int_list. An empty input yields an empty list, so the return
    type is a list in every case.
    """
    values = sorted(int_list)
    if not values:
        return []
    present = set(values)
    return [n for n in range(values[0], values[-1] + 1) if n not in present]
    
def merge_dicts(*dict_args):
    """Merge any number of dicts into a brand-new dict (shallow copy).

    The inputs are applied from left to right, so when a key appears in
    several of them the value from the right-most one is kept. None of the
    inputs is modified.
    """
    merged = {}
    for mapping in dict_args:
        merged.update(mapping)
    return merged

def extract_value(dicts, key, default_behaviour='value', default=None):
    """Look `key` up in one dict or in a sequence of dicts.

    Parameters
    ----------
    dicts             : a single dict, or an iterable of dicts searched in order
    key               : key to look for
    default_behaviour : what to do when no dict contains key (case-insensitive):
                        'value' returns `default`, 'raise' raises ValueError
    default           : value returned when key is missing and
                        default_behaviour is 'value'

    Returns
    -------
    The value stored under key in the first dict that contains it, or
    `default` as described above.

    Raises
    ------
    ValueError : if key is missing and default_behaviour is 'raise', or if key
                 is missing and default_behaviour is neither 'value' nor 'raise'
    """
    candidates = [dicts] if isinstance(dicts, dict) else dicts

    for candidate in candidates:
        if key in candidate:
            return candidate[key]

    # Only reached when the key was not found anywhere.
    default_behaviour = default_behaviour.lower()

    if default_behaviour == 'raise':
        raise ValueError(f'key {key} was not found in the dictionaries.')
    if default_behaviour != 'value':
        raise ValueError(f'default_behaviour must be either \'value\' or \'raise\'. Got \'{default_behaviour}\'.')
    return default

def extract_values(dicts, keys, default_behaviour='value', default=None):
    """Look several keys up in one dict or in a sequence of dicts.

    Parameters
    ----------
    dicts             : a single dict, or an iterable of dicts searched in order
    keys              : iterable of keys to look for
    default_behaviour : forwarded to extract_value for every key: 'value'
                        returns `default` for a missing key, 'raise' raises
                        ValueError
    default           : value used for missing keys when default_behaviour is
                        'value'

    Returns
    -------
    dict mapping each key in keys to the result of extract_value for that key.
    """
    if isinstance(dicts, dict):
        dicts = [dicts]

    # Forward the caller's default_behaviour; a hard-coded 'value' here would
    # silently ignore a requested 'raise'.
    return {
        key: extract_value(dicts, key, default_behaviour=default_behaviour, default=default)
        for key in keys
    }

## Configuração

In [0]:
# Image tensor layout and size.
DATA_FORMAT = 'channels_last'
IMG_SIZE = (224, 224)
# Three colour channels, placed last or first according to DATA_FORMAT.
if DATA_FORMAT == 'channels_last':
    IMG_SHAPE = IMG_SIZE + (3,)
else:
    IMG_SHAPE = (3,) + IMG_SIZE

# Settings handed to the Keras image generator / iterator further below.
BATCH_SIZE = 128
RESCALE = 1./255
FILL_MODE = 'nearest'
validate_filenames = True

# augment_previously: read (and, if needed, generate) the pre-processed image
#                     set before training.
# fill_aug_imgs:      when True, a cached list of pre-processed images is only
#                     considered complete if it has one row per annotated image.
augment_previously = True
fill_aug_imgs = True

# DeepFashion

## Preparação

### Importar dados

In [9]:
from sklearn.preprocessing import MultiLabelBinarizer

# Folders of the DeepFashion "Category and Attribute Prediction Benchmark".
# Each path is made absolute and resolved so later cells can use it as is.
category_attribute_prediction_path = (deepfashion_path / 'Category and Attribute Prediction Benchmark').absolute().resolve()
images_path = (category_attribute_prediction_path / 'Img').absolute().resolve()
augmented_imgs_path = (category_attribute_prediction_path / 'Img' / 'aug_img').absolute().resolve()
annotations_path = (category_attribute_prediction_path / 'Anno').absolute().resolve()
evaluation_path = (category_attribute_prediction_path / 'Eval').absolute().resolve()

# dataset.csv lives next to the annotations; the dataset image folder is
# created here if it does not exist yet.
dataset_anno_path = annotations_path / 'dataset.csv'
dataset_path = (category_attribute_prediction_path / 'Img' / 'dataset').absolute().resolve()
dataset_path.mkdir(parents=True, exist_ok=True)

# n_categories = int(pd.read_csv(annotations_path / 'list_category_cloth.txt', nrows=1, header=None)[0][0])
# categories = pd.read_csv(annotations_path / 'list_category_cloth.txt', skiprows=1, delimiter=r"\s\s+", engine='python')
# categories = categories.astype({
#     'category_name': str,
#     'category_type': int
# })
# category_types = {
#     1: 'upper-body clothes',
#     2: 'lower-body clothes',
#     3: 'full-body clothes'
# }

# n_category_imgs = int(pd.read_csv(annotations_path / 'list_category_img.txt', nrows=1, header=None)[0][0])
# category_imgs = pd.read_csv(annotations_path / 'list_category_img.txt', skiprows=1, delim_whitespace=True)
# category_imgs = category_imgs.astype({
#     'image_name': str,
#     'category_label': int
# })

# list_attr_cloth.txt: the first line is parsed as the number of attributes,
# the second line as the column header. Fields are split on runs of two or
# more whitespace characters (presumably because attribute names themselves
# contain single spaces, e.g. 'abstract chevron print').
n_attributes = int(pd.read_csv(annotations_path / 'list_attr_cloth.txt', nrows=1, header=None)[0][0])
attributes = pd.read_csv(annotations_path / 'list_attr_cloth.txt', skiprows=1, delimiter=r"\s\s+", engine='python')
attributes = attributes.astype({
    'attribute_name': str,
    'attribute_type': int
})
# Human-readable description of each attribute_type code.
attribute_types = {
    1: 'texture-related attributes',
    2: 'fabric-related attributes',
    3: 'shape-related attributes',
    4: 'part-related attributes',
    5: 'style-related attributes'
}

# Fit the binarizer on a single "sample" that contains every attribute name,
# so that classes_ ends up holding each distinct name once.
attr_binarizer = MultiLabelBinarizer()
attr_binarizer.fit([attributes['attribute_name']])
n_labels = attr_binarizer.classes_.size
print(n_labels, 'classes found:', attr_binarizer.classes_)

1000 classes found: ['a-line' 'abstract' 'abstract chevron' 'abstract chevron print'
 'abstract diamond' 'abstract floral' 'abstract floral print'
 'abstract geo' 'abstract geo print' 'abstract paisley' 'abstract pattern'
 'abstract print' 'abstract printed' 'abstract stripe' 'acid' 'acid wash'
 'americana' 'angeles' 'animal' 'animal print' 'ankle' 'applique'
 'arrow collar' 'art' 'asymmetric' 'asymmetrical' 'asymmetrical hem'
 'athletic' 'audrey' 'babe' 'babydoll' 'back bow' 'back cutout'
 'back knit' 'back lace' 'back striped' 'backless' 'baja' 'bandage'
 'bandana' 'bandana print' 'barbie' 'baroque' 'baroque print' 'baseball'
 'basic' 'basquiat' 'batwing' 'beach' 'bead' 'beaded' 'beaded chiffon'
 'beaded collar' 'beaded sheer' 'beaded shift' 'beatles' 'bed' 'bejeweled'
 'bell' 'bell-sleeve' 'bella' 'belted' 'belted chiffon' 'belted floral'
 'belted floral print' 'belted lace' 'belted maxi' 'belted plaid'
 'bermuda' 'bib' 'big' 'bike' 'biker' 'bird' 'bird print' 'blah' 'bleach'
 'blea

### Data augmentation

In [0]:
from ast import literal_eval
from tensorflow.keras.preprocessing.image import save_img
from sklearn.preprocessing import MultiLabelBinarizer
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# --- Header of the image/attribute annotation file -------------------------
# Line 1 of list_attr_img.txt is parsed as the number of annotated images and
# line 2 as the two column names; data rows start on line 3.
n_attribute_imgs = int(pd.read_csv(annotations_path / 'list_attr_img.txt', nrows=1, header=None)[0][0])
column_names = pd.read_csv(annotations_path / 'list_attr_img.txt', skiprows=1, nrows=1, delim_whitespace=True, header=None).values[0]

# --- Cached list of images that were already pre-processed -----------------
previously_augmented = False
if augment_previously:
    attribute_aug_imgs_path = annotations_path / 'attribute_aug_imgs.txt'
    try:
        print('Trying to read augmented image attributes', end='')
        attribute_aug_imgs = pd.read_csv(attribute_aug_imgs_path)
        # Label lists are stored in the CSV as their Python repr; parse them back.
        attribute_aug_imgs[column_names[1]] = attribute_aug_imgs[column_names[1]].apply(lambda x: list(literal_eval(x)))

        attribute_aug_imgs = attribute_aug_imgs.astype({
            column_names[0]: str,
            column_names[1]: object
        })

        print(' - Done')
    except (FileNotFoundError, pd.errors.EmptyDataError):
        print(' - File not found... augmenting images later')
        # Start from an empty table and create the file so later runs can resume.
        attribute_aug_imgs = pd.DataFrame(columns=column_names)
        attribute_aug_imgs.to_csv(attribute_aug_imgs_path, index=False)

    n_attribute_aug_imgs = len(attribute_aug_imgs.index)
    if fill_aug_imgs:
        # The cache only counts as complete when it has one row per annotated image.
        previously_augmented = (n_attribute_aug_imgs == n_attribute_imgs)
    else:
        previously_augmented = True

# --- Original image list (only needed when there is still work to do) ------
if not augment_previously or not previously_augmented:
    attribute_imgs_path = annotations_path / 'attribute_imgs.txt'
    try:
        print('Trying to read image attributes', end='')
        attribute_imgs = pd.read_csv(attribute_imgs_path)
        attribute_imgs[column_names[1]] = attribute_imgs[column_names[1]].apply(lambda x: list(literal_eval(x)))
        print(' - Done')
    except (FileNotFoundError, pd.errors.EmptyDataError):
        print(' - File not found... reading in chunks from bigger file')
        # One data row is read up front only to learn how many columns there are.
        first_row = pd.read_csv(annotations_path / 'list_attr_img.txt', skiprows=2, nrows=1, delim_whitespace=True, header=None).values[0]
        # Chunks are collected in a list and concatenated once at the end;
        # growing a DataFrame with .append() copies it on every call and the
        # method no longer exists in pandas 2.
        chunks = []
        for i, chunk in enumerate(pd.read_csv(
                annotations_path / 'list_attr_img.txt',
                skiprows=2,
                delim_whitespace=True,
                header=None,
                chunksize=50000,
                converters={
                    # Only an explicit '1' marks an attribute as present. Any
                    # other token must not be kept as a (truthy) string, or it
                    # would be counted as a positive label below.
                    # NOTE(review): the dataset reportedly uses -1 for negative
                    # and 0 for unknown - confirm against its README.
                    col: (lambda x: x == '1')
                    for col in range(1, len(first_row))
                }
        )):
            print(f'Reading chunk #{i}', end='')

            rows = list(chunk.itertuples())
            # For each row, the positions of the attributes flagged as present
            # (row[0] is the index, row[1] the image name).
            y_values = [
                [idx for idx, value in enumerate(row[2:]) if value]
                for row in rows
            ]
            chunks.append(pd.DataFrame({
                column_names[0]: [images_path / row[1] for row in rows],
                # Images without any attribute get the placeholder label 'none'.
                column_names[1]: [[attributes['attribute_name'][y_] for y_ in y_value] if y_value else ['none'] for y_value in y_values]
            }, index=[row[0] for row in rows]))

            print(' - Done')

        attribute_imgs = (
            pd.concat(chunks, verify_integrity=True, sort=False)
            if chunks
            else pd.DataFrame(columns=column_names)
        )
        try:
            print('Trying to write attribute images', end='')
            attribute_imgs.to_csv(attribute_imgs_path, index=False)
            print(' - Done')
        except PermissionError:
            print(' - Permission denied')

    attribute_imgs = attribute_imgs.astype({
        column_names[0]: str,
        column_names[1]: object
    })

# --- Label binarizer fitted on whichever image list is in use --------------
expanded_attr_binarizer = MultiLabelBinarizer()
if not augment_previously or not previously_augmented:
    expanded_attr_binarizer.fit(attribute_imgs[column_names[1]])
else:
    expanded_attr_binarizer.fit(attribute_aug_imgs[column_names[1]])

# --- Generate the missing pre-processed images -----------------------------
if augment_previously and not previously_augmented:
    print('Augmenting images')

    image_data_generator = ImageDataGenerator(
        rescale=RESCALE,
        data_format=DATA_FORMAT,
        fill_mode=FILL_MODE,
    )
    augmented_imgs_path.mkdir(parents=True, exist_ok=True)

    # Resume after the rows that are already listed in the cached file.
    dataframe_iterator = image_data_generator.flow_from_dataframe(
        attribute_imgs[n_attribute_aug_imgs:],
        x_col=column_names[0],
        y_col=column_names[1],
        color_mode='rgb',
        classes=list(expanded_attr_binarizer.classes_),
        class_mode='categorical',
        validate_filenames=validate_filenames,
        shuffle=False,
        target_size=IMG_SIZE,
        batch_size=BATCH_SIZE
    )
    counter = n_attribute_aug_imgs                   # rows in the output list so far
    position = 0                                     # samples consumed from the iterator
    n_pending = len(dataframe_iterator.filenames)    # samples the iterator can provide
    # Keras data iterators keep cycling over their data, so the loop is bounded
    # explicitly: stop once every pending sample was handled or the output
    # list reached the expected size.
    while position < n_pending and counter < n_attribute_imgs:
        batch = next(dataframe_iterator)
        print_header(f'Batch {dataframe_iterator.batch_index}')
        batch_records = []
        for idx, (X, y) in enumerate(zip(*batch)):
            y = np.expand_dims(y, axis=0)
            inv_y = expanded_attr_binarizer.inverse_transform(y)

            # filenames lists every sample of the iterator, so it has to be
            # indexed with the running sample position (valid because
            # shuffle=False); idx restarts at 0 on every batch.
            origin = Path(dataframe_iterator.filenames[position])
            file_path = augmented_imgs_path / origin.parts[-2] / origin.parts[-1]

            print(f'Index = {counter}/{n_attribute_imgs} [#{idx} from batch {dataframe_iterator.batch_index}]')
            print(f'Origin = {origin}')
            print(f'y = {inv_y} [path = {file_path}]')

            file_path.parent.mkdir(parents=True, exist_ok=True)
            if not file_path.exists():
                save_img(file_path, X)
            batch_records.append({
                column_names[0]: str(file_path),
                column_names[1]: inv_y[0]
            })

            position += 1
            counter += 1
            if position >= n_pending or counter >= n_attribute_imgs:
                break

        # Add the whole batch at once, then checkpoint to disk so an
        # interrupted run can resume from the last completed batch.
        new_rows = pd.DataFrame(batch_records, columns=list(column_names))
        attribute_aug_imgs = (
            new_rows
            if attribute_aug_imgs.empty
            else pd.concat([attribute_aug_imgs, new_rows], ignore_index=True, sort=False)
        )
        attribute_aug_imgs.to_csv(attribute_aug_imgs_path, index=False)

    previously_augmented = True
    print('Done')

Trying to read augmented image attributes - Done
Trying to read image attributes - Done
Augmenting images


### Informação

In [0]:
# print_header('Categories')
# print(f'n_categories = {n_categories}')
# categories.info()
# categories.head()
# print('Category types:', category_types)

# print_header('Category images')
# print(f'n_category_imgs = {n_category_imgs}')
# category_imgs.info()
# category_imgs.head()

# Summary of the attribute list. head() is wrapped in print() because a bare
# expression is only displayed when it is the last top-level statement of a cell.
print_header('Attributes')
print(f'n_attributes = {n_attributes}')
attributes.info()
print(attributes.head())
print('Attribute types:', attribute_types)

# The original image list only exists when it had to be loaded.
if not augment_previously or not previously_augmented:
    print_header('Attribute images')
    print(f'n_attribute_imgs = {n_attribute_imgs}')
    attribute_imgs.info()
    print(attribute_imgs.head())

# Pre-processed image list; the count is taken from the table itself so it
# reflects rows added during this session.
if previously_augmented:
    print_header('Augmented Attribute images')
    print(f'n_attribute_aug_imgs = {len(attribute_aug_imgs.index)}')
    attribute_aug_imgs.info()
    print(attribute_aug_imgs.head())