# Deep Convolutional Neural Network for Art Classification with PyTorch
# Part 2: Dataset (ETL)

## Imports

In [1]:
import os
import shutil
import torch
import torchvision
import tarfile
from torchvision.datasets.utils import download_url
from torch.utils.data import random_split
from torchvision.datasets import ImageFolder
from torchvision.transforms import ToTensor
import PIL
from PIL import Image
import cv2
import pathlib
import glob
from pathlib import Path
import numpy as np
import shutil
import random
import pandas as pd
import argparse
import matplotlib
import matplotlib.pyplot as plt
from tqdm import tqdm

In [2]:
# import PyTorch and its related packages
import torch as T
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

import torchvision
import torchvision.transforms as transforms

# choose the device name: 'cuda' when CUDA is available, otherwise 'cpu'
device = 'cuda' if T.cuda.is_available() else 'cpu'
# bare expression so the notebook echoes the chosen device
device

'cuda'

In [4]:
# project label; none of the cells shown here read it
project_name='CNN_classifier'

In [9]:
# dataset root directory; the commented-out line is an alternative location
# path_str = '/Users/alexandreberkovic/Desktop/Year_4/Masters'
path_str = '/home/ec2-user/SageMaker/wikiart_binary'

In [10]:
# pathlib form of the dataset root held in path_str
path = Path(path_str)

In [11]:
# directory of image folders per movement
# img_folders = Path(path_str+'/'+'Dataset/wikiart')
img_folders = path

In [12]:
# list the entries of the dataset directory
folders = list(os.listdir(img_folders))
# removal of the '.DS_Store' entry is currently disabled
# folders.remove('.DS_Store')

In [13]:
# echo the directory listing in the notebook output
folders

['.ipynb_checkpoints', 'test', 'train']

## Training and validation sets

In [26]:
def get_files_from_folder(path):
    """Return the names of all entries in ``path`` as a NumPy array.

    The names come straight from ``os.listdir``, so they are relative to
    ``path`` and appear in whatever order the operating system reports.
    """
    return np.asarray(os.listdir(path))

In [27]:
def split_data(path_to_data, path_to_test_data, train_ratio):
    """Move a fraction of every class folder's files into a test directory.

    Each immediate sub-directory of ``path_to_data`` is treated as one class.
    For every class, ``round(n_entries * (1 - train_ratio))`` entries are
    moved to ``path_to_test_data/<class>`` (created when missing); the
    remaining entries stay where they are. Entries are taken in
    ``os.listdir`` order, i.e. the selection is not shuffled.

    Args:
        path_to_data: Directory containing one sub-directory per class.
        path_to_test_data: Directory that receives the held-out entries.
        train_ratio: Fraction in ``[0, 1]`` of each class to leave in place.

    Raises:
        ValueError: If ``train_ratio`` is outside ``[0, 1]``.
    """
    # Validate before touching the file system: an out-of-range ratio used to
    # either do nothing silently or fail half-way through the moves.
    if not 0 <= train_ratio <= 1:
        raise ValueError(
            "train_ratio must be between 0 and 1, got {!r}".format(train_ratio)
        )

    # only the immediate sub-directories are class folders
    _, class_dirs, _ = next(os.walk(path_to_data))

    for class_dir in class_dirs:
        path_to_original = os.path.join(path_to_data, class_dir)
        path_to_save = os.path.join(path_to_test_data, class_dir)
        os.makedirs(path_to_save, exist_ok=True)

        files = os.listdir(path_to_original)
        # round-half-to-even, same rounding rule the count always used
        n_test = int(round(len(files) * (1 - train_ratio)))

        for file_name in files[:n_test]:
            shutil.move(
                os.path.join(path_to_original, file_name),
                os.path.join(path_to_save, file_name),
            )

In [28]:
def parse_args():
    """Parse the command-line options of the dataset divider.

    Reads ``sys.argv``. All three options are required and are returned as
    strings.

    Returns:
        argparse.Namespace with attributes ``data_path``,
        ``test_data_path_to_save`` and ``train_ratio``.
    """
    parser = argparse.ArgumentParser(description="Dataset divider")
    parser.add_argument("--data_path", required=True, help="Path to data")
    parser.add_argument(
        "--test_data_path_to_save",
        required=True,
        help="Path to test data where to save",
    )
    # argparse %-formats help strings, so a literal percent sign has to be
    # written as '%%'; a bare '%' makes --help raise ValueError.
    parser.add_argument(
        "--train_ratio",
        required=True,
        help="Train ratio - 0.7 means splitting data in 70 %% train and 30 %% test",
    )
    return parser.parse_args()

In [29]:
def clc_function(data_path=None, test_data_path_to_save=None, train_ratio=None):
    """Command-line entry point: parse ``sys.argv`` and split the dataset.

    The three parameters are kept only so existing calls keep working; they
    are ignored. The values actually used always come from the command line
    via ``parse_args``. They default to ``None`` so the entry point can be
    called without arguments.
    """
    args = parse_args()
    # command-line values arrive as strings; split_data needs a number
    split_data(args.data_path, args.test_data_path_to_save, float(args.train_ratio))

In [32]:
# data_path = os.path.join(path_str,'Resized_cropped_224')
# test_data_path_to_save = os.path.join(path_str,'Resized_cropped_224_data')
# inputs for split_data (the call itself is commented out in the next cell)
data_path = os.path.join(path_str)  # single-argument join: same string as path_str
test_data_path_to_save = os.path.join('Resized_cropped_224_test')  # relative path
# split_data moves round(n * (1 - train_ratio)) entries per class, so 0.33
# leaves about a third in place and moves about two thirds to the test folder
train_ratio = 0.33

In [33]:
# split_data(data_path,test_data_path_to_save,train_ratio)



## Data Exploration

In [34]:
# starting with blank resized images
data_dir = os.path.join('/home/ec2-user/SageMaker/Resized_cropped_224')

# show the top-level entries, then the class folder names found under train/
print(os.listdir(data_dir))
classes = os.listdir(data_dir + "/train")
print(classes)

['test', 'train']
['Pointillism', 'Contemporary_Realism', 'High_Renaissance', 'Cubism', 'Post_Impressionism', 'Impressionism', 'Fauvism', 'Minimalism', 'Pop_Art', 'Expressionism', 'Baroque', 'Abstract_Expressionism', 'Realism', 'Naive_Art_Primitivism']


# 7-class folder

In [30]:
# define the two groups of movements for the binary cnn; the cells below use
# class1 with 'pre_1910s' paths and class2 with 'post_1910s' paths
class1 = ['Pointillism', 'High_Renaissance', 'Post_Impressionism', 'Impressionism', 'Baroque', 'Realism']
class2 = ['Contemporary_Realism', 'Cubism', 'Fauvism', 'Minimalism', 'Pop_Art', 'Expressionism', 'Abstract_Expressionism', 'Naive_Art_Primitivism']

In [4]:
def new_folder(folders, path_in, path_out, name):
    """Re-save the images of selected class folders into a new dataset tree.

    For each split in ``('test', 'train')`` and each entry of ``folders``,
    the images found in ``path_in/<split>/<folder>`` are opened with PIL and
    saved under ``path_out/<name>/<split>/<folder>`` with the same file name.
    A destination folder that already exists is emptied first; a missing one
    is created.

    Args:
        folders: Class folder names; each must exist under both splits of
            ``path_in``.
        path_in: Root of the source dataset (contains ``test`` and ``train``).
        path_out: Root under which the new dataset is written.
        name: Sub-directory of ``path_out`` to populate.
    """
    image_suffixes = ('jpeg', 'png', 'jpg')

    for split in ('test', 'train'):
        for folder in folders:
            dirpath = os.path.join(path_in, split, folder)
            images = [entry for entry in os.listdir(dirpath) if entry.endswith(image_suffixes)]
            saving_dir = os.path.join(path_out, name, split, folder)

            if os.path.exists(saving_dir):
                # start from an empty destination so stale files never mix in
                for stale in os.listdir(saving_dir):
                    os.remove(os.path.join(saving_dir, stale))
            else:
                os.makedirs(saving_dir)

            for image in images:
                # the context manager closes the source file once it is saved,
                # instead of leaving the handle open until garbage collection
                with Image.open(os.path.join(dirpath, image)) as img:
                    img.save(os.path.join(saving_dir, image), optimize=True, quality=100)

In [10]:
# re-save the class2 folders of wikiart_cropped/{test,train} under wikiart_binary/post_1910s
new_folder(class2, 'wikiart_cropped', 'wikiart_binary','post_1910s')

In [3]:
def delete_dir(path):
    """Recursively delete the directory tree rooted at ``path``."""
    shutil.rmtree(path)

In [28]:
def move(path_in, path_out, delete):
    """Move every image file from ``path_in`` into ``path_out``.

    Only entries whose name ends with ``jpeg``, ``png`` or ``jpg`` are moved.

    Args:
        path_in: Directory to take the images from.
        path_out: Directory that receives them.
        delete: When equal to the string ``'yes'``, ``path_in`` is removed
            with ``delete_dir`` after the move; any other value leaves it.
    """
    image_suffixes = ('jpeg', 'png', 'jpg')
    for entry in os.listdir(path_in):
        if not entry.endswith(image_suffixes):
            continue
        source = os.path.join(path_in, entry)
        target = os.path.join(path_out, entry)
        shutil.move(source, target)

    if delete == 'yes':
        delete_dir(path_in)

In [4]:
# delete_dir('checkpoints')

# Binary classifier

In [26]:
# one class folder inside the pre_1910s train set, and the train folder that contains it
path1 = 'wikiart_cropped/pre_1910s/train/Post_Impressionism'
path2 = 'wikiart_cropped/pre_1910s/train'

In [31]:
# for every class1 movement, point path1 at its train folder and path2 at the
# parent train folder; the move() call is commented out, so nothing is moved
for i in range(len(class1)):
    path1 = 'wikiart_cropped/pre_1910s/train/'+str(class1[i])
    path2 = 'wikiart_cropped/pre_1910s/train'
#     move(path1, path2, 'yes')

In [37]:
def copy_files(path_in, path_out):
    """Copy every image file from ``path_in`` into ``path_out``.

    Only entries whose name ends with ``jpeg``, ``png`` or ``jpg`` are
    copied; ``shutil.copy2`` is used, and the originals stay in place.
    """
    image_suffixes = ('jpeg', 'png', 'jpg')
    for entry in os.listdir(path_in):
        if not entry.endswith(image_suffixes):
            continue
        source = os.path.join(path_in, entry)
        target = os.path.join(path_out, entry)
        shutil.copy2(source, target)


In [44]:
# copy the images of every class2 test folder into the single folder
# wikiart_cropped/post_1910s/test
# NOTE(review): the source root is wikiart_binary but the destination root is
# wikiart_cropped - confirm this is intended
for i in range(len(class2)):
    path1 = 'wikiart_binary/post_1910s/test/'+str(class2[i])
    path2 = 'wikiart_cropped/post_1910s/test'
    copy_files(path1, path2)

In [45]:
# delete_dir('Cropped_models')