In [91]:
import os
import numpy as np
import pandas as pd

random_state = 42

# Create csv file with all paintings manually

In [92]:
data_folder_name = 'wikiart'

styles = os.listdir(data_folder_name)
style_folder_paths = [os.path.join(data_folder_name, style_name) for style_name in styles]
painting_paths = [{
        'file_path': os.path.join(folder_path, painting_file), 
        'style': folder_path.replace('wikiart/','').replace('_', ' ')
    } 
    for folder_path in style_folder_paths for painting_file in os.listdir(folder_path)]

df = pd.DataFrame(painting_paths)
df['style'] = df['style'].astype('category')
df['style_id'] = df['style'].cat.codes + 1
df

Unnamed: 0,file_path,style,style_id
0,wikiart/Early_Renaissance/filippo-lippi_two-sa...,Early Renaissance,9
1,wikiart/Early_Renaissance/filippo-lippi_madonn...,Early Renaissance,9
2,wikiart/Early_Renaissance/piero-della-francesc...,Early Renaissance,9
3,wikiart/Early_Renaissance/pietro-perugino_chri...,Early Renaissance,9
4,wikiart/Early_Renaissance/antonello-da-messina...,Early Renaissance,9
...,...,...,...
81441,wikiart/Naive_Art_Primitivism/david-burliuk_a-...,Naive Art Primitivism,16
81442,wikiart/Naive_Art_Primitivism/marc-chagall_rus...,Naive Art Primitivism,16
81443,wikiart/Naive_Art_Primitivism/niko-pirosmani_t...,Naive Art Primitivism,16
81444,wikiart/Naive_Art_Primitivism/niko-pirosmani_w...,Naive Art Primitivism,16


In [93]:
if not os.path.exists('data'):
    os.mkdir('data')
df.to_csv('data/full_data.csv', index=False)

# Splitting Data

In [94]:
df = pd.read_csv('data/full_data.csv')
df

Unnamed: 0,file_path,style,style_id
0,wikiart/Early_Renaissance/filippo-lippi_two-sa...,Early Renaissance,9
1,wikiart/Early_Renaissance/filippo-lippi_madonn...,Early Renaissance,9
2,wikiart/Early_Renaissance/piero-della-francesc...,Early Renaissance,9
3,wikiart/Early_Renaissance/pietro-perugino_chri...,Early Renaissance,9
4,wikiart/Early_Renaissance/antonello-da-messina...,Early Renaissance,9
...,...,...,...
81441,wikiart/Naive_Art_Primitivism/david-burliuk_a-...,Naive Art Primitivism,16
81442,wikiart/Naive_Art_Primitivism/marc-chagall_rus...,Naive Art Primitivism,16
81443,wikiart/Naive_Art_Primitivism/niko-pirosmani_t...,Naive Art Primitivism,16
81444,wikiart/Naive_Art_Primitivism/niko-pirosmani_w...,Naive Art Primitivism,16


### Sample n from Each Style

In [95]:
n = 50

sample_df = df.groupby('style_id').sample(n, random_state=random_state)
sample_df.to_csv('data/sampled_data.csv', index=False)
sample_df

Unnamed: 0,file_path,style,style_id
77334,wikiart/Abstract_Expressionism/bradley-walker-...,Abstract Expressionism,1
77694,wikiart/Abstract_Expressionism/henri-matisse_b...,Abstract Expressionism,1
76986,wikiart/Abstract_Expressionism/bui-xuan-phai_a...,Abstract Expressionism,1
77010,wikiart/Abstract_Expressionism/cy-twombly_unti...,Abstract Expressionism,1
77901,wikiart/Abstract_Expressionism/conrad-marca-re...,Abstract Expressionism,1
...,...,...,...
16534,wikiart/Ukiyo_e/katsushika-hokusai_street-scen...,Ukiyo e,27
16503,wikiart/Ukiyo_e/hiroshige_komokata-hall-and-az...,Ukiyo e,27
16017,wikiart/Ukiyo_e/hiroshige_kameyama.jpg,Ukiyo e,27
16228,wikiart/Ukiyo_e/utagawa-kuniyoshi_women-2.jpg,Ukiyo e,27


# Split into train, val, test

In [96]:
train_size = int(len(sample_df.index)*0.6)
val_size = int(len(sample_df.index)*0.2)
test_size = int(len(sample_df.index)*0.2)

In [97]:
split_df = sample_df.sample(frac=1, random_state=random_state)

train = split_df.head(train_size)
split_df = split_df.tail(-train_size)

val = split_df.head(val_size)
split_df = split_df.tail(-val_size)

test = split_df.head(test_size)
split_df = split_df.tail(-test_size)

split_df

Unnamed: 0,file_path,style,style_id


In [98]:
train

Unnamed: 0,file_path,style,style_id
26514,wikiart/Color_Field_Painting/mario-cesariny_se...,Color Field Painting,6
72776,wikiart/Post_Impressionism/vincent-van-gogh_th...,Post Impressionism,21
10184,wikiart/Fauvism/august-macke_turkish-cafe-i.jpg,Fauvism,11
9673,wikiart/Contemporary_Realism/john-miller_calan...,Contemporary Realism,7
33304,wikiart/Realism/guntis-strupulis_a-k-avi-a-por...,Realism,22
...,...,...,...
18503,wikiart/High_Renaissance/luca-signorelli_ovid-...,High Renaissance,12
78887,wikiart/Synthetic_Cubism/juan-gris_the-coffee-...,Synthetic Cubism,26
15370,wikiart/Rococo/thomas-gainsborough_an-unknown-...,Rococo,23
16664,wikiart/Ukiyo_e/utagawa-kuniyoshi_scrbbling-on...,Ukiyo e,27


In [99]:
train.to_csv('data/train.csv', index=False)
val.to_csv('data/val.csv', index=False)
test.to_csv('data/test.csv', index=False)