# Deep Convolutional Neural Network for Art Classification with PyTorch
# Part 3: pre-processing

## Imports

In [178]:
import os
import torch
import torchvision
import tarfile
from torchvision.datasets.utils import download_url
from torch.utils.data import random_split
import PIL
from PIL import Image
import cv2
import pathlib
import glob
from pathlib import Path
import numpy as np
import shutil
import random
import pandas as pd

In [154]:
# Project identifier; none of the visible cells read it.
project_name='CNN_classifier'

In [155]:
# Absolute root of the project tree; every dataset and CSV path in this
# notebook is built from it.
# NOTE(review): machine-specific location — adjust before running elsewhere.
path_str = '/Users/alexandreberkovic/Desktop/Year_4/Masters'

In [156]:
# pathlib view of the same root; in the visible cells it is only used for
# the directory listing that follows.
path = Path(path_str)

In [157]:
# Show what the project root contains.
os.listdir(path)

['Research',
 '.DS_Store',
 'pulkit_singh_spring_2019.pdf',
 'classifier for art.pdf',
 'Dataset',
 'Master projects.xlsx',
 'Interim',
 'Repo']

In [158]:
# Directory holding the image folders, one per movement
img_folders = Path(path_str+'/'+'Dataset/wikiart')

In [159]:
# Names of the movement folders, without the macOS '.DS_Store' entry.
# Filtering (instead of list.remove) keeps the listing order and does not
# raise ValueError when the directory has no '.DS_Store' file.
folders = [entry for entry in os.listdir(img_folders) if entry != '.DS_Store']

## Update the CSV files after modifying the dataset

In [193]:
# Load the train / validation label tables for style, artist and genre.
# names= supplies the two column labels: the image path and the label.
style_train = pd.read_csv(os.path.join(path_str,'Dataset/wikiart_csv/style_train.csv'), names = ['Path','Style'])
style_val = pd.read_csv(os.path.join(path_str,'Dataset/wikiart_csv/style_val.csv'), names = ['Path','Style'])

# Artist labels
artist_train = pd.read_csv(os.path.join(path_str,'Dataset/wikiart_csv/artist_train.csv'), names = ['Path','Artist'])
artist_val = pd.read_csv(os.path.join(path_str,'Dataset/wikiart_csv/artist_val.csv'), names = ['Path','Artist'])
        
# Genre labels
genre_train = pd.read_csv(os.path.join(path_str,'Dataset/wikiart_csv/genre_train.csv'), names = ['Path','Genre'])
genre_val = pd.read_csv(os.path.join(path_str,'Dataset/wikiart_csv/genre_val.csv'), names = ['Path','Genre'])

In [194]:
# All six label tables in one list, so the same fix-up can be applied to each.
datasets = [style_train,style_val,artist_train,artist_val,genre_train,genre_val]

In [218]:
def replace_path(df):
    '''
    Renames, in place, the folder part of df['Path'] for the merged movements

    Every occurrence of a merged movement's folder name inside a path is
    replaced by the name of the folder it was merged into. The 'Path' column
    is overwritten; nothing is returned.

    df: DataFrame with a 'Path' column of image paths
    '''
    merged_folders = {
        'Analytical_Cubism': 'Cubism',
        'Synthetic_Cubism': 'Cubism',
        'Action_painting': 'Abstract_Expressionism',
        # The target has to match the folder name on disk exactly
        # ('Contemporary_Realism', capital R); a lower-case 'r' yields paths
        # that do not exist on a case-sensitive file system
        'New_Realism': 'Contemporary_Realism',
        'Color_Field_Painting': 'Minimalism',
    }
    # regex=True turns this into a substring replacement inside each path
    # rather than a whole-cell match
    df['Path'] = df['Path'].replace(merged_folders, regex=True)

In [219]:
# Apply the folder-name fix-up to every label table.
for table in datasets:
    replace_path(table)

In [222]:
def replace_num(df):
    '''
    Remaps, in place, the numeric ids in df['Style'] of the merged styles

    Each id of a style that was merged away is replaced by the id of the
    style it was merged into. The 'Style' column is overwritten; nothing is
    returned. No target id is also a source id, so applying the function
    twice gives the same result as applying it once.

    df: DataFrame with a numeric 'Style' column
    '''
    # NOTE(review): ids are assumed to follow the WikiArt style index
    # (1=Action_painting, 2=Analytical_Cubism, 5=Color_Field_Painting,
    # 16=New_Realism, 25=Synthetic_Cubism; targets 0, 6, 7, 14) — confirm
    # against the dataset's style class list.
    merged_ids = {
        2: 7,
        # Was 5 -> 7, which turned id 5 into 7 before the intended 5 -> 14
        # could apply and left the second Cubism variant unmapped
        25: 7,
        1: 0,
        16: 6,
        5: 14,
    }
    # One pass with the whole mapping: every id is looked up exactly once,
    # so an earlier replacement can never feed a later one
    df['Style'] = df['Style'].replace(merged_ids)

In [223]:
# Only the two style tables have a 'Style' column, so only they are remapped.
replace_num(style_train)
replace_num(style_val)

In [227]:
# Write the corrected tables back in the layout they were read in: no header
# row and no index column. to_csv's defaults would add both, and the files
# could then no longer be re-read with names=['Path', ...].
style_train.to_csv(os.path.join(path_str,'Dataset/wikiart_csv/style_train.csv'), index=False, header=False)
style_val.to_csv(os.path.join(path_str,'Dataset/wikiart_csv/style_val.csv'), index=False, header=False)

artist_train.to_csv(os.path.join(path_str,'Dataset/wikiart_csv/artist_train.csv'), index=False, header=False)
artist_val.to_csv(os.path.join(path_str,'Dataset/wikiart_csv/artist_val.csv'), index=False, header=False)

genre_train.to_csv(os.path.join(path_str,'Dataset/wikiart_csv/genre_train.csv'), index=False, header=False)
genre_val.to_csv(os.path.join(path_str,'Dataset/wikiart_csv/genre_val.csv'), index=False, header=False)

## Create a subset of the dataset to experiment with while developing the CNN

In [160]:
def subset(path):
    '''
    Copies a random sample of every movement folder into Dataset/Dataset_subset

    For each name in the module-level `folders` list, path/<name> is listed
    and the sample size is chosen from the number of entries:
        fewer than 2500  -> 250 (or every entry, if the folder holds fewer)
        more than 12500  -> 1250
        otherwise        -> 10% of the entries
    The sample is copied to path_str/Dataset/Dataset_subset/<name>; that
    folder is created if missing and emptied first if it already exists.
    One line per movement is printed with the sample size.

    path: directory containing one image folder per movement
    '''
    for movement in folders:
        dirpath = os.path.join(path, movement)
        # List once and reuse: the same listing is counted and sampled
        entries = os.listdir(dirpath)
        directory_length = len(entries)
        if directory_length < 2500:
            # Capped so that random.sample does not raise ValueError on a
            # folder holding fewer than 250 entries
            subset_length = min(250, directory_length)
        elif directory_length > 12500:
            subset_length = 1250
        else:
            subset_length = int(0.1*directory_length)
        filenames = random.sample(entries, subset_length)
        print('{} has {} images'.format(movement, subset_length))

        destDirectory = os.path.join(path_str,'Dataset/Dataset_subset/',movement)
        if not os.path.exists(destDirectory):
            os.makedirs(destDirectory)
        else:
            # Start from an empty folder so earlier samples do not accumulate
            for f in os.listdir(destDirectory):
                os.remove(os.path.join(destDirectory, f))

        for fname in filenames:
            shutil.copy(os.path.join(dirpath, fname), destDirectory)

In [163]:
# Build the subset from the image folders; one line per movement is printed
# with the number of sampled images.
subset(img_folders)

Early_Renaissance has 250 images
Mannerism_Late_Renaissance has 250 images
Expressionism has 673 images
Contemporary_Realism has 250 images
Fauvism has 250 images
Northern_Renaissance has 255 images
Rococo has 250 images
Ukiyo_e has 250 images
Pop_Art has 250 images
High_Renaissance has 250 images
Minimalism has 291 images
Art_Nouveau_Modern has 433 images
Symbolism has 452 images
Realism has 1073 images
Romanticism has 701 images
Cubism has 256 images
Impressionism has 1250 images
Baroque has 424 images
Post_Impressionism has 645 images
Abstract_Expressionism has 287 images
Pointillism has 250 images
Naive_Art_Primitivism has 250 images


## Data exploration

### Image resizing functions

In [164]:
# Directory of the sampled image folders, one per movement
subset_folders = os.path.join(path_str, 'Dataset/Dataset_subset')
# Alternative pathlib form: Path(path_str+'/'+'Dataset/Dataset_subset')

In [165]:
# List the movement folders present in the subset.
os.listdir(subset_folders)

['Early_Renaissance',
 'Mannerism_Late_Renaissance',
 'Expressionism',
 'Contemporary_Realism',
 'Fauvism',
 'Northern_Renaissance',
 'Rococo',
 'Ukiyo_e',
 'Pop_Art',
 'High_Renaissance',
 'Minimalism',
 'Art_Nouveau_Modern',
 'Symbolism',
 'Realism',
 'Romanticism',
 'Cubism',
 'Impressionism',
 'Baroque',
 'Post_Impressionism',
 'Abstract_Expressionism',
 'Pointillism',
 'Naive_Art_Primitivism']

In [173]:
def resize_upper(path,cnn_size):
    '''
    Resizes the images so that the shorter side is cnn_size and the other is larger
    Crops the centre so that the output is cnn_size x cnn_size

    For every name in the module-level `folders` list, the jpeg/png/jpg files
    of path/<name> are processed and saved under the same file name in
    path_str/Dataset/Resized_cropped/<name>. That folder is created if
    missing and emptied first if it already exists.
    Images whose shorter side is not strictly larger than cnn_size are skipped.

    path: directory containing one image folder per movement
    cnn_size: side length, in pixels, of the square output
    '''
    for name in folders:
        dirpath = os.path.join(path, name)

        images = [file for file in os.listdir(dirpath) if file.endswith(('jpeg', 'png', 'jpg'))]
        saving_dir = os.path.join(path_str,'Dataset/Resized_cropped',name)

        if not os.path.exists(saving_dir):
            os.makedirs(saving_dir)
        else:
            for f in os.listdir(saving_dir):
                os.remove(os.path.join(saving_dir, f))

        for image in images:
            # Open from the folder that was listed (the `path` argument), and
            # let the context manager close the file once the pixels are read
            with Image.open(os.path.join(dirpath, image)) as img:
                width, height = img.size
                if min(width, height) <= cnn_size:
                    continue

                # Scale so that the shorter side becomes exactly cnn_size.
                # max() guards the longer side against float truncation:
                # e.g. 392 * (256 / 392.0) evaluates just below 256, which
                # int() would turn into 255.
                if width >= height:
                    scale = cnn_size / float(height)
                    new_size = (max(cnn_size, int(width * scale)), cnn_size)
                else:
                    scale = cnn_size / float(width)
                    new_size = (cnn_size, max(cnn_size, int(height * scale)))
                resized = img.resize(new_size, PIL.Image.NEAREST)

            # Centre crop computed here so that the box follows cnn_size
            left = (new_size[0] - cnn_size) // 2
            top = (new_size[1] - cnn_size) // 2
            cropped = resized.crop((left, top, left + cnn_size, top + cnn_size))
            cropped.save(os.path.join(saving_dir,image), optimize=True, quality=100)



In [174]:
def resize_lower(path,cnn_size):
    '''
    Resizes the images so that the longer side is cnn_size and the other is smaller
    Fills blank space with 0s so that the output is cnn_size x cnn_size

    For every name in the module-level `folders` list, the jpeg/png/jpg files
    of path/<name> are processed and saved under the same file name in
    path_str/Dataset/Resized_blank/<name>. That folder is created if missing
    and emptied first if it already exists.
    Images whose shorter side is not strictly larger than cnn_size are skipped.

    path: directory containing one image folder per movement
    cnn_size: side length, in pixels, of the square output
    '''
    for name in folders:
        dirpath = os.path.join(path, name)

        images = [file for file in os.listdir(dirpath) if file.endswith(('jpeg', 'png', 'jpg'))]
        saving_dir = os.path.join(path_str,'Dataset/Resized_blank',name)

        if not os.path.exists(saving_dir):
            os.makedirs(saving_dir)
        else:
            for f in os.listdir(saving_dir):
                os.remove(os.path.join(saving_dir, f))

        for image in images:
            # Open from the folder that was listed (the `path` argument), and
            # let the context manager close the file once the pixels are read
            with Image.open(os.path.join(dirpath, image)) as img:
                width, height = img.size
                if min(width, height) <= cnn_size:
                    continue

                # Scale so that the longer side becomes exactly cnn_size
                if height >= width:
                    scale = cnn_size / float(height)
                    new_size = (int(width * scale), cnn_size)
                else:
                    scale = cnn_size / float(width)
                    new_size = (cnn_size, int(height * scale))
                resized = img.resize(new_size, PIL.Image.NEAREST)

            # NOTE(review): the fill colour is an RGB triple, which Pillow
            # rejects for single-band modes such as 'L' — confirm the dataset
            # is RGB-only or convert before filling.
            filled = fill(resized,(0, 0, 0))
            filled.save(os.path.join(saving_dir,image), optimize=True, quality=100)



In [175]:
def resize_compress(path,cnn_size):
    '''
    Resizes the images by compressing them, without keeping the aspect ratio
    Output is cnn_size x cnn_size

    For every name in the module-level `folders` list, the jpeg/png/jpg files
    of path/<name> are processed and saved under the same file name in
    path_str/Dataset/Resized_compressed/<name>. That folder is created if
    missing and emptied first if it already exists.
    Images with a side not strictly larger than cnn_size are skipped.

    path: directory containing one image folder per movement
    cnn_size: side length, in pixels, of the square output
    '''
    for name in folders:
        dirpath = os.path.join(path, name)

        images = [file for file in os.listdir(dirpath) if file.endswith(('jpeg', 'png', 'jpg'))]
        saving_dir = os.path.join(path_str,'Dataset/Resized_compressed',name)

        if not os.path.exists(saving_dir):
            os.makedirs(saving_dir)
        else:
            for f in os.listdir(saving_dir):
                os.remove(os.path.join(saving_dir, f))

        for image in images:
            # Open from the folder that was listed (the `path` argument), and
            # let the context manager close the file once the pixels are read
            with Image.open(os.path.join(dirpath, image)) as img:
                if img.size[0] <= cnn_size or img.size[1] <= cnn_size:
                    continue
                # Target size follows cnn_size instead of a fixed 256
                resized_image = img.resize((cnn_size, cnn_size))
            resized_image.save(os.path.join(saving_dir,image), optimize=True, quality=100)

### Helper functions used after resizing to uniform image dimensions

In [169]:
def crop(im, size=256):
    '''
    Crops the image when one side is size and the other is bigger
    Outputs a size x size centred image

    im: PIL image (only its .size and .crop are used)
    size: side length, in pixels, of the square output (256 by default)
    Returns the cropped copy; im itself is not changed
    '''
    width, height = im.size
    # The same centring formula serves both axes: an axis already equal to
    # size gets offset 0, so no per-side branch is needed and the box is
    # always defined, also when neither side is exactly size
    left = (width - size) // 2
    top = (height - size) // 2
    return im.crop((left, top, left + size, top + size))

In [170]:
def fill(pil_img, background_color):
    '''
    Pads a non-square image to a square whose side is the longer dimension
    The picture is centred along the padded axis; a square image is
    returned as it is

    pil_img: PIL image to pad
    background_color: colour used for the added area
    '''
    width, height = pil_img.size
    if width == height:
        return pil_img

    side = max(width, height)
    canvas = Image.new(pil_img.mode, (side, side), background_color)
    if width > height:
        # Landscape: full width, centred vertically
        offset = (0, (width - height) // 2)
    else:
        # Portrait: full height, centred horizontally
        offset = ((height - width) // 2, 0)
    canvas.paste(pil_img, offset)
    return canvas

### Creating the usable dataset

In [172]:
# resize_upper(subset_folders,256)

In [176]:
# resize_lower(subset_folders,256)

In [177]:
# resize_compress(subset_folders,256)