In [3]:
pip install swifter

Note: you may need to restart the kernel to use updated packages.


In [12]:
import pandas as pd
import ast
import re
import xml.etree.ElementTree as ET
import swifter
import numpy as np
from collections import Counter
import os
import shutil

In [13]:
columns = ['id', 'anumber', 'sort', 'inventorynumber', 'artists', 'publishers',
           'sheetsize', 'platesize', 'imagesize', 'technique', 'notes',
           'description', 'catalogs', 'condition', 'source', 'instime', 'modtime']

with open("crpd_port.txt") as f:
    lines = f.readlines()
    # Clean and convert each line to a tuple
    data_rows = [ast.literal_eval(line.strip().rstrip(',').rstrip(";")) for line in lines[1:] if line.strip()]

# Create DataFrame
df = pd.DataFrame(data_rows, columns=columns)

df.head()

Unnamed: 0,id,anumber,sort,inventorynumber,artists,publishers,sheetsize,platesize,imagesize,technique,notes,description,catalogs,condition,source,instime,modtime
0,1,A 891,891,I 625,"<artists>\n<artist>\n <name>Delpech, Franço...",,272 x 178,,ca. 80 x 85,Lithographie: I[mprimerie] de [François-Séraph...,Faksimile (Namenszug),<description>\n<p>Brustb. etw. nach l. vor ang...,,,,799564606,1134120142
1,2,A 40,40,I 34,,,268 x 160,,ca. 114 x 116,Lithographie: ohne Adresse.,,<description>\n<p>Darstellung ähnlich wie A 39...,Drugulin 39?\nDrugulin Th.17?\nVgl. Flatz 11.,,,794161891,1078156531
2,3,A 850,850,II 205,"<artists>\n<artist>\n <name>H., v.</name>\n...",,380 x 273,,193 x 165,Lithographie: <u.r.> v.H.,,"<description>\n<p>Rechteck. Brustb. nach hl., ...",,mit breitem Rand,,799146581,1188900562
3,4,A 2255,2255,I 1558.1,"<artists>\n<artist>\n <name>Laufer, C.</nam...",,95 x 69,,ca. 69 x 54,Holzstich: <u.l.> C. Lauffer sc.,Holzstich\nFaksimile (Namenszug),"<description>\n<p>Brustb. nach r., in der Link...",,ausgeschn.,,815225834,1143549630
4,5,A 1738,1738,II 448,"<artists>\n<artist>\n <name>Oldenburg, Hein...","<publishers>\n<publisher>\n<name>Wehrt, August...",362 x 278,,ca. 180 x 165,Lithographie: <im Bild u.r.> H.O. 1830. 2 – <u...,,<description>\n<p>Brustb. nach hr. in unten an...,,,,805205419,1188234052


In [14]:
def extract_artist_fields(xml):
    name = re.search(r'<name>(.*?)</name>', xml)
    role = re.search(r'<role>(.*?)</role>', xml)
    lifetime = re.search(r'<lifetime>(.*?)</lifetime>', xml)
    return pd.Series([
        name.group(1) if name else "Others",
        role.group(1) if role else None,
        lifetime.group(1) if lifetime else None
    ])

df[['artist_name', 'role', 'lifetime']] = df['artists'].swifter.apply(extract_artist_fields)

Pandas Apply: 100%|██████████| 28633/28633 [00:01<00:00, 14532.96it/s]


In [15]:
x = df['artist_name'].unique()
art_count = Counter(df['artist_name'])

In [16]:
art_count_df = pd.DataFrame.from_dict(art_count,orient = "index",columns=['count'])
art_count_df.reset_index(inplace=True)
art_count_df.rename(columns={"index":"artist_name"}, inplace=True)

In [20]:
adlft = df[['artist_name','lifetime']].drop_duplicates().dropna().sort_values(by='lifetime')
final_df = pd.merge(art_count_df,adlft,on="artist_name")
final_df.sort_values(by="count",ascending=False).to_csv("artist_count.csv", index=False)

In [82]:
df.head()

Unnamed: 0,id,anumber,sort,inventorynumber,artists,publishers,sheetsize,platesize,imagesize,technique,notes,description,catalogs,condition,source,instime,modtime,artist_name,role,lifetime
0,1,A 891,891,I 625,"<artists>\n<artist>\n <name>Delpech, Franço...",,272 x 178,,ca. 80 x 85,Lithographie: I[mprimerie] de [François-Séraph...,Faksimile (Namenszug),<description>\n<p>Brustb. etw. nach l. vor ang...,,,,799564606,1134120142,"Delpech, François-Séraphin",Stecher,1778–1825
1,2,A 40,40,I 34,,,268 x 160,,ca. 114 x 116,Lithographie: ohne Adresse.,,<description>\n<p>Darstellung ähnlich wie A 39...,Drugulin 39?\nDrugulin Th.17?\nVgl. Flatz 11.,,,794161891,1078156531,Others,,
2,3,A 850,850,II 205,"<artists>\n<artist>\n <name>H., v.</name>\n...",,380 x 273,,193 x 165,Lithographie: <u.r.> v.H.,,"<description>\n<p>Rechteck. Brustb. nach hl., ...",,mit breitem Rand,,799146581,1188900562,"H., v.",Stecher,
3,4,A 2255,2255,I 1558.1,"<artists>\n<artist>\n <name>Laufer, C.</nam...",,95 x 69,,ca. 69 x 54,Holzstich: <u.l.> C. Lauffer sc.,Holzstich\nFaksimile (Namenszug),"<description>\n<p>Brustb. nach r., in der Link...",,ausgeschn.,,815225834,1143549630,"Laufer, C.",Stecher,
4,5,A 1738,1738,II 448,"<artists>\n<artist>\n <name>Oldenburg, Hein...","<publishers>\n<publisher>\n<name>Wehrt, August...",362 x 278,,ca. 180 x 165,Lithographie: <im Bild u.r.> H.O. 1830. 2 – <u...,,<description>\n<p>Brustb. nach hr. in unten an...,,,,805205419,1188234052,"Oldenburg, Heinrich",Stecher,


In [41]:
##now shortlist the original dataset by the selected authors names
"""filtered_df = df[df["artist_name"].isin(["Stimmer, Tobias","Kilian, Lucas", "Aubry, Peter II",
                       "Kilian, Wolfgang Philipp","Fennitzer, Georg",
                       "Romstet, Christian","Bernigeroth, Martin",
                       "Sysang, J. Christoph", "Bock, Christoph Wilhelm",
                       "Bernigeroth, J. Martin", "Mentzel, J. Georg",
                       "Riepenhausen, E. Ludwig","Graff, Anton"])]
"""
##New dataset where the artists have been binned into temporal bins
filtered_df_16th_century = df[df["artist_name"].isin([
    # 16th century
    "Stimmer, Tobias", ## ok
    "Cranach, Lucas d.Ä.", ## ok
    "Hondius, Hendrick d.Ä.", ## ok
    "Passe, Crispijn d.Ä. van de", ## ok
])]

filtered_df_17th_century = df[df["artist_name"].isin([
    # 17th century
    "Kilian, Wolfgang Philipp",
    "Kilian, Wolfgang",
    "Furck, Sebastian",
    "Troschel, Peter",
    "Aubry, Peter II",
    "Somer, Matthias van",
    "Bernigeroth, Martin",
    "Kilian, Lucas",
    "Kupezky, Johann",
    "Heyden, Jacob van der",
    "Rigaud, Hyacinthe",
    "Kneller, Godfrey",
    "Strauch, Georg",
    "Custos, Dominicus",
    "Meyer, Conrad",
    "Weigel, Christoph",
    "Moncornet, Baltazar",
    "Dyck, Anthonis van"])]

filtered_df_18th_century = df[df["artist_name"].isin([    
    # 18th century
    "Bodenehr, Moritz",
    "Bernigeroth, Johann Martin",
    "Haid, Johann Jacob",
    "Krügner, Johann Gottfried d.Ä.",
    "Mentzel, Johann Georg",
    "Leonart, Johann Friedrich",
    "Sysang, Johann Christoph",
    "Brühl, Johann Benjamin",
    "Böner, Johann Alexander",
    "Sandrart, Jacob von",
    "Bollinger, Friedrich Wilhelm",
    "Riepenhausen, Ernst Ludwig",
    "Hausmann, Elias Gottlob",
    "Preissler, Daniel",
    "Roth, Christoph Melchior",
    "Romstet, Christian",
    "Fritzsch, Christian",
    "Graff, Anton"])]

In [42]:
more_filter_16th = filtered_df_16th_century[['anumber',"artist_name","imagesize",]]
more_filter_16th['anumber'] = more_filter_16th['anumber'].str.replace(' ','_')

more_filter_17th = filtered_df_17th_century[['anumber',"artist_name","imagesize",]]
more_filter_17th['anumber'] = more_filter_17th['anumber'].str.replace(' ','_')

more_filter_18th = filtered_df_18th_century[['anumber',"artist_name","imagesize",]]
more_filter_18th['anumber'] = more_filter_18th['anumber'].str.replace(' ','_')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  more_filter_16th['anumber'] = more_filter_16th['anumber'].str.replace(' ','_')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  more_filter_17th['anumber'] = more_filter_17th['anumber'].str.replace(' ','_')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  more_filter_18th['anumber'] = more_filter_18th[

In [44]:
uniq_art_name = more_filter_16th['artist_name'].unique()
for artname in uniq_art_name:
    try:
        directory_name = f"/data/vifapi/ameed_ahmed_thesis/HildesheimThesis/CSWin_Transformer_main/PortraitDataset/16th_century_ogdataset"
        os.mkdir(directory_name)
    except FileExistsError:
        print(f"Directory {directory_name} already exists.")
    filename_shortlisted = more_filter_16th[more_filter_16th['artist_name']== str(artname)]["anumber"].values
    all_filename = os.listdir("/data/vifapi/ameed_ahmed_thesis/HildesheimThesis/CSWin_Transformer_main/HAB_dataset/cropped_2024")
    print(f"Shortlisted {len(filename_shortlisted)} images for {artname}")
    for filename in filename_shortlisted:
        if f"{filename}.jpg" in all_filename:
            print(f"{filename} available")
            shutil.copy2(f"/data/vifapi/ameed_ahmed_thesis/HildesheimThesis/CSWin_Transformer_main/HAB_dataset/cropped_2024/{filename}.jpg",directory_name)
        else:
            print(f"{filename} not available")

Shortlisted 175 images for Stimmer, Tobias
A_268 available
A_110 available
A_321 available
A_16539 available
A_115 available
A_404 available
A_203 available
A_1398 available
A_81 available
A_690 available
A_16540 available
A_466 available
A_16558 available
A_16559 available
A_2253 available
A_2821 available
A_3052 available
A_5143 available
A_2979 available
A_3668 available
A_3128 available
A_3318 available
A_4879 not available
A_3965 available
A_5144 available
A_4142 available
A_3144 available
A_3756 available
A_4193 available
A_2980 available
A_3557 available
A_2822 available
A_2770 available
A_4065 available
A_3317 available
A_5369 available
A_6030 available
A_6031 available
A_6029 available
A_6300 available
A_6593 available
A_7174 available
A_6528 available
A_7765 available
A_7691 available
A_7690 available
A_8490 available
A_7198 available
A_7199 available
A_8338 available
A_8114 available
A_6725 available
A_9617 available
A_10338 available
A_9616 available
A_10362 available
A_103

In [45]:
uniq_art_name = more_filter_17th['artist_name'].unique()
for artname in uniq_art_name:
    try:
        directory_name = f"/data/vifapi/ameed_ahmed_thesis/HildesheimThesis/CSWin_Transformer_main/PortraitDataset/17th_century_ogdataset"
        os.mkdir(directory_name)
    except FileExistsError:
        print(f"Directory {directory_name} already exists.")
    filename_shortlisted = more_filter_17th[more_filter_17th['artist_name']== str(artname)]["anumber"].values
    all_filename = os.listdir("/data/vifapi/ameed_ahmed_thesis/HildesheimThesis/CSWin_Transformer_main/HAB_dataset/cropped_2024")
    print(f"Shortlisted {len(filename_shortlisted)} images for {artname}")
    for filename in filename_shortlisted:
        if f"{filename}.jpg" in all_filename:
            print(f"{filename} available")
            shutil.copy2(f"/data/vifapi/ameed_ahmed_thesis/HildesheimThesis/CSWin_Transformer_main/HAB_dataset/cropped_2024/{filename}.jpg",directory_name)
        else:
            print(f"{filename} not available")

Shortlisted 870 images for Bernigeroth, Martin
A_936 available
A_189 available
A_245 available
A_297 available
A_296 available
A_450 available
A_744 available
A_53 available
A_393 available
A_1051 available
A_395 available
A_419 available
A_411 available
A_422 available
A_447 available
A_937 available
A_638 available
A_610 available
A_16621 not available
A_2716 available
A_803 available
A_1820 available
A_190 available
A_830 available
A_2287 available
A_974 available
A_1053 available
A_997 available
A_1045 available
A_1842 available
A_1276 available
A_16451 available
A_1410 available
A_448 available
A_2733 available
A_1695 available
A_1529 available
A_1824 available
A_1586 available
A_1780 available
A_1880 available
A_1691 available
A_2300 available
A_1777 available
A_1817 available
A_1938 available
A_970 not available
A_1947 available
A_2036 available
A_1956 available
A_398 available
A_967 available
A_485 available
A_814 available
A_2215 available
A_2401 available
A_1256 available
A_2

In [46]:
uniq_art_name = more_filter_18th['artist_name'].unique()
for artname in uniq_art_name:
    try:
        directory_name = f"/data/vifapi/ameed_ahmed_thesis/HildesheimThesis/CSWin_Transformer_main/PortraitDataset/18th_century_ogdataset"
        os.mkdir(directory_name)
    except FileExistsError:
        print(f"Directory {directory_name} already exists.")
    filename_shortlisted = more_filter_18th[more_filter_18th['artist_name']== str(artname)]["anumber"].values
    all_filename = os.listdir("/data/vifapi/ameed_ahmed_thesis/HildesheimThesis/CSWin_Transformer_main/HAB_dataset/cropped_2024")
    print(f"Shortlisted {len(filename_shortlisted)} images for {artname}")
    for filename in filename_shortlisted:
        if f"{filename}.jpg" in all_filename:
            print(f"{filename} available")
            shutil.copy2(f"/data/vifapi/ameed_ahmed_thesis/HildesheimThesis/CSWin_Transformer_main/HAB_dataset/cropped_2024/{filename}.jpg",directory_name)
        else:
            print(f"{filename} not available")

Shortlisted 382 images for Bernigeroth, Johann Martin
A_130 available
A_127 available
A_129 available
A_185 available
A_1333 available
A_521 available
A_976 available
A_670 available
A_1031 available
A_695 available
A_852 available
A_881 available
A_770 available
A_774 available
A_2589 available
A_1032 available
A_1026 available
A_1376 available
A_1335 available
A_1402 available
A_2618 available
A_1422 available
A_1617 available
A_1791 available
A_1593 available
A_1299 available
A_2277 not available
A_2703 available
A_2166 available
A_1604 available
A_2045 available
A_23 available
A_1628 available
A_61 available
A_626 available
A_728 available
A_1315 available
A_1419 available
A_1427 available
A_410 available
A_712 available
A_561 available
A_1123 available
A_1510 available
A_16448 available
A_16488 available
A_16498 available
A_16565 available
A_16626 available
A_165 available
A_2301 available
A_510 available
A_1108 available
A_3245 available
A_4273 not available
A_2911 available
A_28