# Dataset Preparation



*   load data from the existing WikiArt dataset
*   use the first 1000 paintings as data in the database
*   create data augmentation pipeline and generate dataset for painting identification task





In [None]:
!pip install datasets

In [2]:
from datasets import load_dataset

# Hugging Face Hub identifier of the WikiArt dataset used as the painting source.
WIKIART_REPO_ID = "Artificio/WikiArt"

# Downloads (or reuses the local cache of) the dataset; the paintings are read
# from its 'train' split in the cells below.
dataset = load_dataset(WIKIART_REPO_ID)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/663 [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.30k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/426M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/428M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/429M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/429M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/103250 [00:00<?, ? examples/s]

In [3]:
# Build one metadata record per painting for the first 1000 rows of the train
# split. This is only used to inspect the generated names; no image is written.
image_data = []
for idx in range(1000):
    record = dataset['train'][idx]  # Adjust dataset split if necessary

    # Spaces and slashes in the title and artist become underscores; the date
    # is appended exactly as stored in the dataset.
    safe_title, safe_artist = (
        record[field].replace(' ', '_').replace('/', '_')
        for field in ('title', 'artist')
    )
    slug = f"{safe_title}_{safe_artist}_{record['date']}"

    # The filename is the slug prefixed with the row index, so it stays unique
    # even when two paintings share the same title/artist/date.
    image_data.append({'Index': idx, 'Filename': f"{idx}_{slug}.jpg", 'Title': slug})

In [4]:
from collections import Counter

# Count how often each sanitized title occurs and report the ones that are not
# unique, i.e. titles that would collide if used as a directory name or label.
all_titles = [record['Title'] for record in image_data]
freq = Counter(all_titles)
for title_key, occurrences in freq.items():
    if occurrences > 1:
        print(title_key, occurrences)

Don_Quixote_Gustave_Dore_None 3
None_han_van_meegeren_None 3
Untitled_Zdislav_Beksinski_None 5
Illustration_to_"A_Week_of_Kindness"_Max_Ernst_1934 3


In [5]:
from torchvision import transforms

# Augmentation pipeline used to simulate different views of the same painting.
# The steps are applied in the order listed.
augmentation_steps = [
    # Mirror the image left/right half of the time.
    transforms.RandomHorizontalFlip(p=0.5),
    # Small random tilt of up to 5 degrees in either direction.
    transforms.RandomRotation(degrees=5),
    # Random lighting / colour changes.
    transforms.ColorJitter(brightness=0.6, contrast=0.5, saturation=0.2, hue=0.1),
    # Crop 95-100% of the image area and resize the crop to 224x224.
    transforms.RandomResizedCrop(size=224, scale=(0.95, 1.0)),
    # Blur with an 11x11 kernel and a randomly drawn sigma in [0.01, 1].
    transforms.GaussianBlur(11, sigma=(0.01, 1)),
    # Convert the PIL image to a tensor.
    transforms.ToTensor(),
]
sim_transform = transforms.Compose(augmentation_steps)

In [6]:
import matplotlib.pyplot as plt
from PIL import Image
import io
import pandas as pd
import os

In [7]:
import torch  # torchvision's random transforms draw from torch's global RNG

# Root folder that receives one sub-directory per painting.
images_dir = '/content/data'
os.makedirs(images_dir, exist_ok=True)

NUM_PAINTINGS = 1000               # how many rows of the train split to scan
AUGMENTATIONS_PER_PAINTING = 20    # augmented copies written per painting
AUGMENTATION_SEED = 0              # fixed seed so a re-run yields the same images

# Seed once, right before generation, so the random augmentations are
# reproducible on Restart & Run All.
torch.manual_seed(AUGMENTATION_SEED)

train_split = dataset['train']  # Adjust dataset split if necessary
to_pil = transforms.ToPILImage()  # built once instead of once per augmentation

processed_titles = set()  # sanitized titles already written to disk
data = []                 # one {'Filename', 'Title'} record per saved image

# Clamp to the split size so a smaller split does not raise IndexError.
for i in range(min(NUM_PAINTINGS, len(train_split))):
    entry = train_split[i]
    title = f"{entry['title']}_{entry['artist']}_{entry['date']}".replace(' ', '_').replace('/', '_')

    # Rows whose title/artist/date combination was already seen are skipped,
    # so only the first painting with a given sanitized title is kept.
    if title in processed_titles:
        continue
    processed_titles.add(title)

    painting_dir = os.path.join(images_dir, title)
    os.makedirs(painting_dir, exist_ok=True)

    image_filename = f"{title}_original.jpg"
    image_path = os.path.join(painting_dir, image_filename)
    # Normalise to RGB before saving: Pillow cannot encode modes such as RGBA
    # or P as JPEG, and this keeps every stored original 3-channel.
    entry['image'].convert("RGB").save(image_path)  # Save image

    # Append data for CSV
    data.append({'Filename': image_filename, 'Title': title})

    # Augment from the file that was just written, so the augmented copies are
    # derived from exactly the image stored as the original.
    with Image.open(image_path) as saved_image:
        img = saved_image.convert("RGB")

    for j in range(AUGMENTATIONS_PER_PAINTING):
        augmented_filename = f"{title}_augmented_{j+1}.jpg"
        augmented_image_path = os.path.join(painting_dir, augmented_filename)
        # sim_transform returns a tensor; convert back to PIL to save as JPEG.
        transformed_img = sim_transform(img)
        augmented_img = to_pil(transformed_img)
        augmented_img.save(augmented_image_path)
        data.append({'Filename': augmented_filename, 'Title': title})

In [10]:
from google.colab import drive
drive.mount('/content/drive')

import shutil

# Specify the path to the directory you want to zip
directory_to_zip = '/content/data'

# Specify the output zip file path (including the path within Google Drive)
zip_output_path = '/content/drive/My Drive/painting_identification_data.zip'

# shutil.make_archive appends the format's extension to the base name itself.
# Passing a name that already ends in ".zip" would create "...data.zip.zip",
# so strip the extension first and let make_archive add it back.
archive_base_name, _ = os.path.splitext(zip_output_path)

# Creating a zip archive of the directory; the created file's path is returned.
created_archive_path = shutil.make_archive(archive_base_name, 'zip', directory_to_zip)

if os.path.exists(zip_output_path):
    print('Zip file created successfully!')
else:
    print('Failed to create zip file.')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Zip file created successfully!


In [13]:
# Map every unique painting title to an integer class label.
# processed_titles is a set, and the iteration order of a set of strings can
# differ between Python sessions (string hashing is randomized), so labels are
# assigned over the sorted titles to stay identical across re-runs.
# Using enumerate also avoids a manual counter named `id`, which would shadow
# the built-in id() for the rest of the notebook.
title_to_idx = {
    title: label for label, title in enumerate(sorted(processed_titles))
}

In [None]:
import pandas as pd

# One row per saved image (original or augmented), built from the list of
# record dicts, with the integer label looked up from the painting title.
df = (
    pd.DataFrame(data)
    .assign(Label=lambda frame: frame['Title'].map(title_to_idx))
)

In [20]:
# Full path of each image: <images_dir>/<Title>/<Filename>, matching the
# per-painting directory layout used when the images were written.
df['Filepath'] = [
    f"{images_dir}/{painting_title}/{file_name}"
    for painting_title, file_name in zip(df['Title'], df['Filename'])
]

In [33]:
csv_file_path = '/content/drive/My Drive/painting_identification_data.csv'
current_path = '/content/painting_identification_data.csv'

# Always write the index CSV to the local Colab disk.
df.to_csv(current_path, index=False)

# Also write it to the Drive destination, which was previously defined but
# never used, so the CSV survives the end of the Colab session. Skip this
# (with a message) when the Drive folder is not available.
drive_folder = os.path.dirname(csv_file_path)
if os.path.isdir(drive_folder):
    df.to_csv(csv_file_path, index=False)
else:
    print(f'Drive folder not found ({drive_folder}); CSV saved only to {current_path}')