In [1]:
import shutil
import pandas as pd
from pathlib import Path

# Prepare final dataset

In [2]:
# All paths are resolved relative to the directory the notebook is run from.
CUR_DIR: Path = Path(".").absolute()
DATA_DIR: Path = CUR_DIR / "data"
FINAL_DATASET_DIR: Path = DATA_DIR / "dataset"

# Rebuild the output directory from scratch on every run. The tree is removed
# only when it is already there, so a first run (or a fresh checkout) does not
# fail with FileNotFoundError; any other removal problem still surfaces.
if FINAL_DATASET_DIR.exists():
    shutil.rmtree(FINAL_DATASET_DIR)
FINAL_DATASET_DIR.mkdir(parents=True)

In [3]:
# Fraction of a source dataset that is assigned to the training split.
train_frac: float = 0.85

# Image folders of the final dataset, one per split.
final_train_dir: Path = FINAL_DATASET_DIR.joinpath("train")
final_val_dir: Path = FINAL_DATASET_DIR.joinpath("val")
final_train_dir.mkdir(parents=True, exist_ok=True)
final_val_dir.mkdir(parents=True, exist_ok=True)

# Label files, stored next to the image folders.
final_train_df_path: Path = FINAL_DATASET_DIR.joinpath("train.csv")
final_val_df_path: Path = FINAL_DATASET_DIR.joinpath("val.csv")

# Label tables start out empty and are extended as source datasets are parsed.
final_train_df: pd.DataFrame = pd.DataFrame()
final_val_df: pd.DataFrame = pd.DataFrame()

# Parse orig dataset

In [4]:
def _parse_price(raw: str) -> int:
    """Turn a raw price string into an integer.

    Leading/trailing spaces and the characters ``U``, ``S``, ``D`` are
    stripped, then every dot is removed before the integer conversion
    (the dots are presumably thousands separators - confirm against the CSV).
    """
    return int(raw.strip(" USD").replace(".", ""))


ORIG_DATASET_DIR: Path = DATA_DIR / "orig"
orig_df_path: Path = ORIG_DATASET_DIR / "artDataset.csv"
orig_img_dir: Path = ORIG_DATASET_DIR / "artDataset"

orig_df: pd.DataFrame = pd.read_csv(orig_df_path)
# The first CSV column is discarded (presumably a saved row index - confirm).
orig_df = orig_df.drop(columns=orig_df.columns[0])
orig_df["price"] = orig_df["price"].map(_parse_price)
# Image file names are generated from the 1-based row position; this assumes
# the CSV rows are in the same order as the image files - TODO confirm.
orig_df["image"] = [f"image_{n}.png" for n in range(1, len(orig_df) + 1)]

print(orig_df.shape)
orig_df.head(n=5)

(754, 9)


Unnamed: 0,price,artist,title,yearCreation,signed,condition,period,movement,image
0,28500,Tommaso Ottieri,Bayreuth Opera,2021,Signed on verso,This work is in excellent condition.,Contemporary,Baroque,image_1.png
1,3000,Pavel Tchelitchew,Drawings of the Opera,First Half 20th Century,Signed and titled,Not examined out of frame.No obvious signs of ...,Post-War,Surrealism,image_2.png
2,5000,Leo Gabin,Two on Sidewalk,2016,"Signed, titled and dated on verso",This work is in excellent condition.,Contemporary,Abstract,image_3.png
3,5000,Matthias Dornfeld,Blumenszene,2010,"Signed, titled and dated on the reverse with t...",This work is in excellent condition.There is m...,Contemporary,Abstract,image_4.png
4,2500,Alexis Marguerite Teplin,Feverish Embarkation,2001,Signed on verso,This work is in excellent condition.,Contemporary,Abstract,image_5.png


In [5]:
# Split sizes for this source: the train share is truncated to a whole number
# of rows and the validation split takes whatever is left.
orig_size: int = len(orig_df)
orig_train_size: int = int(train_frac * orig_size)
orig_val_size: int = orig_size - orig_train_size
print(orig_size, orig_train_size, orig_val_size)

754 640 114


In [6]:
train_rows: list[pd.Series] = []
val_rows: list[pd.Series] = []

# New image numbers continue after the rows already present in the final
# frames; both counts stay constant while this loop runs.
train_start: int = final_train_df.shape[0]
val_start: int = final_val_df.shape[0]

# NOTE(review): the split is purely positional (the first `orig_train_size`
# rows go to train, the rest to val) and nothing is shuffled here - confirm
# that the CSV row order is not correlated with the target.
for pos, (_, src_row) in enumerate(orig_df.iterrows()):
    if pos < orig_train_size:
        split_dir, split_rows = final_train_dir, train_rows
        split_idx = train_start + pos
    else:
        split_dir, split_rows = final_val_dir, val_rows
        split_idx = val_start + (pos - orig_train_size)

    # Copy the image into the split folder under its new sequential name.
    image_name = f"{split_idx}.png"
    shutil.copy(orig_img_dir / src_row["image"], split_dir / image_name)

    # Keep the row's metadata but point it at the renamed image.
    relabelled = src_row.copy()
    relabelled["image"] = image_name
    split_rows.append(relabelled)


orig_train_df: pd.DataFrame = pd.DataFrame(data=train_rows).reset_index(drop=True)
orig_val_df: pd.DataFrame = pd.DataFrame(data=val_rows).reset_index(drop=True)
print(orig_train_df.shape, orig_val_df.shape)

(640, 9) (114, 9)


In [7]:
# Preview the first rows of the training split built from the original dataset.
orig_train_df.head(n=5)

Unnamed: 0,price,artist,title,yearCreation,signed,condition,period,movement,image
0,28500,Tommaso Ottieri,Bayreuth Opera,2021,Signed on verso,This work is in excellent condition.,Contemporary,Baroque,0.png
1,3000,Pavel Tchelitchew,Drawings of the Opera,First Half 20th Century,Signed and titled,Not examined out of frame.No obvious signs of ...,Post-War,Surrealism,1.png
2,5000,Leo Gabin,Two on Sidewalk,2016,"Signed, titled and dated on verso",This work is in excellent condition.,Contemporary,Abstract,2.png
3,5000,Matthias Dornfeld,Blumenszene,2010,"Signed, titled and dated on the reverse with t...",This work is in excellent condition.There is m...,Contemporary,Abstract,3.png
4,2500,Alexis Marguerite Teplin,Feverish Embarkation,2001,Signed on verso,This work is in excellent condition.,Contemporary,Abstract,4.png


In [8]:
# Preview the first rows of the validation split built from the original dataset.
orig_val_df.head(n=5)

Unnamed: 0,price,artist,title,yearCreation,signed,condition,period,movement,image
0,1275,Denise Green,Both And,1979,Signed Denise Green on bottom right in pencil,This work is in good condition.Not examined ou...,Contemporary,Abstract,0.png
1,1275,Orly Genger,Untitled,2004,[nan],Not examined out of frame.No obvious signs of ...,Contemporary,Abstract,1.png
2,800,Joe Tilson,Signatures,1988,Tilson 1988 in pencil on bottom right recto.,This work is in very good condition.Artwork no...,Post-War,Conceptual,2.png
3,680,Tony Jannetti,The Third Emergence,1971,Signed in pencil Tony Jannetti '71 lower right...,This work is in very good condition.Not examin...,Modern,Abstract,3.png
4,1275,Michael Kenny,Working Drawing 1,1987,Michael Kenny 87 Working Drawing on top left i...,This work is in good condition.Not examined ou...,Contemporary,Abstract,4.png


In [9]:
# add data
# Append this source's splits to the accumulated label tables. `ignore_index`
# renumbers the rows so labels stay unique when several sources are appended.
merged_train_df: pd.DataFrame = pd.concat([final_train_df, orig_train_df], ignore_index=True)
merged_val_df: pd.DataFrame = pd.concat([final_val_df, orig_val_df], ignore_index=True)

# This cell extends the final frames with themselves as input, so running it
# twice would silently duplicate every label row. Image names are unique within
# a split, so a repeated name means the same rows were appended again; fail
# before the final frames are overwritten.
for split_name, merged_df in (("train", merged_train_df), ("val", merged_val_df)):
    if "image" in merged_df.columns and not merged_df["image"].is_unique:
        raise ValueError(
            f"duplicate image names in the {split_name} labels - "
            "was this cell run more than once?"
        )

final_train_df = merged_train_df
final_val_df = merged_val_df
print(final_train_df.shape, final_val_df.shape)

(640, 9) (114, 9)


In [10]:
# Preview the first rows of the accumulated training labels.
final_train_df.head(n=5)

Unnamed: 0,price,artist,title,yearCreation,signed,condition,period,movement,image
0,28500,Tommaso Ottieri,Bayreuth Opera,2021,Signed on verso,This work is in excellent condition.,Contemporary,Baroque,0.png
1,3000,Pavel Tchelitchew,Drawings of the Opera,First Half 20th Century,Signed and titled,Not examined out of frame.No obvious signs of ...,Post-War,Surrealism,1.png
2,5000,Leo Gabin,Two on Sidewalk,2016,"Signed, titled and dated on verso",This work is in excellent condition.,Contemporary,Abstract,2.png
3,5000,Matthias Dornfeld,Blumenszene,2010,"Signed, titled and dated on the reverse with t...",This work is in excellent condition.There is m...,Contemporary,Abstract,3.png
4,2500,Alexis Marguerite Teplin,Feverish Embarkation,2001,Signed on verso,This work is in excellent condition.,Contemporary,Abstract,4.png


# Save labels

In [11]:
# Write one UTF-8 label file per split, without the DataFrame index column.
for labels_df, labels_path in (
    (final_train_df, final_train_df_path),
    (final_val_df, final_val_df_path),
):
    labels_df.to_csv(labels_path, index=False, encoding="utf-8")