In [None]:
"""
This notebook parses the metadata about the Heinrich Zille photo collection into a csv file.

The metadata originally comes as an XML file in BIG LIDO format (a metadata schema used in museums).
Here it is first parsed into a JSON-like dictionary and then into a list of images with their metadata,
each of which eventually becomes one row in the final csv file.
"""

In [1]:
import os

import pandas as pd
import smbclient
import xmltodict

In [2]:
# Address of the SMB server that is registered below and used to build the
# metadata file path later in the notebook.
server_address = '192.168.178.35'

# Credentials are taken from the environment so that they never have to be
# typed into (and saved with) the notebook. Both fall back to the empty
# string, which is the value the notebook used before.
user_name = os.environ.get('SMB_USERNAME', '')
pw = os.environ.get('SMB_PASSWORD', '')

# Optional - register the server with explicit credentials
smbclient.register_session(server_address, username=user_name, password=pw)

<smbprotocol.session.Session at 0x10be784f0>

In [3]:
# Path of the LIDO export, built from the server address registered above.
meta_data_address = f'{server_address}/DataCStore/HeinrichZille/Metadaten/BG_LIDO_Zille_CdV_20170816.xml'

# Read the complete document in text mode, then convert the XML into nested
# dictionaries with xmltodict.
with smbclient.open_file(meta_data_address, 'r') as meta_data_file:
    xml_text = meta_data_file.read()

xml_as_dict = xmltodict.parse(xml_text)

In [4]:
def list_get(a_list, index, fallback_val=None):
    """Look up ``a_list[index]`` without failing on an out-of-range index.

    Args:
        a_list: the sequence to read from.
        index: position to read; it is passed straight to ``a_list[...]``.
        fallback_val: value returned when the lookup raises ``IndexError``.

    Returns:
        The element at ``index``, or ``fallback_val`` if that position does
        not exist. Only ``IndexError`` is handled; any other exception
        raised by the lookup propagates to the caller.
    """
    try:
        item = a_list[index]
    except IndexError:
        item = fallback_val
    return item

def _as_list(value):
    """Wrap ``value`` so that it can always be iterated as a list of items.

    xmltodict represents a repeated element as a list, but an element that
    occurs only once as a single dict/string and an empty element as None.
    """
    if value is None:
        return []
    if isinstance(value, list):
        return value
    return [value]


def parse_single_image(single_image_meta_data, max_measurements=5):
    """Flatten the metadata of one image into a flat dict (one csv row).

    Args:
        single_image_meta_data: nested dict of a single ``lido:lido`` record
            as produced by xmltodict.
        max_measurements: number of ``measurement_<n>`` entries to emit.
            Defaults to 5, the column count used so far.

    Returns:
        A dict with the keys ``id``, ``title``, ``date``, ``author``,
        ``technique`` and ``measurement_1`` .. ``measurement_<max_measurements>``.
        Measurement slots for which the record has no value are ``None``.

    Raises:
        KeyError: if one of the LIDO elements read below is missing from the
            record.
    """
    # Shared sub-trees, looked up once instead of on every field access.
    descriptive = single_image_meta_data['lido:descriptiveMetadata']
    identification = descriptive['lido:objectIdentificationWrap']
    # NOTE(review): assumes exactly one eventSet/event per record, as the
    # original code did - confirm against the data.
    event = descriptive['lido:eventWrap']['lido:eventSet']['lido:event']

    image_id = single_image_meta_data['lido:administrativeMetadata']['lido:recordWrap']['lido:recordID']['#text']
    title = identification['lido:titleWrap']['lido:titleSet']['lido:appellationValue']
    date = event['lido:eventDate']['lido:displayDate']
    author = event['lido:eventActor']['lido:displayActorInRole']
    technique = event['lido:eventMaterialsTech']['lido:displayMaterialsTech']

    # A record with a single measurement set arrives as a dict, not a list;
    # iterating that dict would yield its keys and break the lookup below.
    measurement_sets = _as_list(
        identification['lido:objectMeasurementsWrap']['lido:objectMeasurementsSet']
    )
    measurements = [
        measurement['lido:displayObjectMeasurements']
        for measurement in measurement_sets
    ]

    parsed_image_meta_data = {
        'id': image_id,
        'title': title,
        'date': date,
        'author': author,
        'technique': technique
    }

    # Always emit the same number of measurement keys so every row has the
    # same columns; missing positions are filled with None by list_get.
    for i in range(max_measurements):
        parsed_image_meta_data[f'measurement_{i + 1}'] = list_get(measurements, i)

    return parsed_image_meta_data


# The records to parse sit under lido:lidoWrap -> lido:lido.
# NOTE(review): xmltodict yields a list here only when the element repeats;
# a file with a single record would give a dict instead - confirm if needed.
image_meta_data_list = xml_as_dict['lido:lidoWrap']['lido:lido']
print(f'{len(image_meta_data_list)} images')

# Flatten every record into a row dict and print the resulting count.
parsed_images = list(map(parse_single_image, image_meta_data_list))
print(len(parsed_images))

624 images
624


In [5]:
# Build the table from the parsed row dicts and print its text
# representation.
df = pd.DataFrame(data=parsed_images)
print(df)

         id                                              title  \
0    153911         Ohne Titel (Zuschauer vor einer Schaubude)   
1    156704              Ohne Titel (Blick aus Zilles Wohnung)   
2    156721  Ohne Titel (Das Haus Nr. 88 in der Sophie-Char...   
3    156722  Ohne Titel (Fritz Klimsch in seinem Atelier, r...   
4    156723  Ohne Titel (In der Wohnung, Blick vom Wohn- zu...   
..      ...                                                ...   
619  228778       Ohne Titel (Mädchen mit Kleinkind und Junge)   
620  228780  Ohne Titel (Kegelabend, vorn rechts August Kla...   
621  228782               Ohne Titel (Zuschauer vor Schaubude)   
622  228792              Ohne Titel (Portrait Margarete Zille)   
623  228793       Ohne Titel (Portrait Hulda Zille, Brustbild)   

                 date                       author             technique  \
0         Sommer 1897  Zille, Heinrich (1858-1929)  Silbergelatinepapier   
1         Sommer 1893  Zille, Heinrich (1858-1929)  Sil

In [6]:
# Write the table to a csv file in the current working directory. With the
# pandas defaults the DataFrame index is written as the first, unnamed column.
df.to_csv(path_or_buf='parsed_image_meta_data.csv')