In [8]:
#package imports
import os
import pandas as pd
import numpy as np
from xml.dom import minidom
import shutil
import sys
from datetime import datetime

#start validation:

#find input folder
def findinput():
    """Confirm that an ``input`` folder exists in the current working directory.

    Prints a confirmation when the folder is present. When it is missing the
    script is terminated through ``sys.exit`` with a message that names the
    directory that was searched.
    """
    # Guard clause: stop straight away when the folder is missing.
    if not os.path.isdir('input'):
        sys.exit('Error: The input folder cannot be found. Ensure it is in the '+os.getcwd()+' directory.')
    print('The input folder has been found.')
        

#find csv
def findcsv():
    """Confirm that ``metadata.csv`` exists inside the ``input`` folder.

    Prints a confirmation when the file is present. When it is missing the
    script is terminated through ``sys.exit`` with a message naming the
    missing file.

    The path is tested relative to the current working directory, so the
    working directory is never changed by this check.
    """
    csv_path = os.path.join('input', 'metadata.csv')
    if not os.path.isfile(csv_path):
        # The message names the CSV; it is the file, not the folder, that is missing.
        sys.exit('Error: metadata.csv cannot be found. Ensure it is in the input directory.')
    print('metadata.csv has been found.')
        
        
#check that an output folder does not already exist
def existoutput():
    """Check that no ``output`` folder is already present inside ``input``.

    The check is made from inside the ``input`` folder and the working
    directory is moved back up one level afterwards. A confirmation is printed
    when no folder is found; otherwise the script is terminated through
    ``sys.exit``.
    """
    os.chdir('input')
    already_present = os.path.isdir('output')
    if not already_present:
        print('The output directory does not exist. This is correct.')
    # Step back out before deciding whether to stop the script.
    os.chdir('..')
    if already_present:
        sys.exit('Error: An output directory already exists. Please delete it and run the script again.')

#check that an output.zip file does not already exist
def existoutputzip():
    """Check that no ``output.zip`` file is already present inside ``input``.

    The check is made from inside the ``input`` folder and the working
    directory is moved back up one level afterwards. A confirmation is printed
    when no file is found; otherwise the script is terminated through
    ``sys.exit``.
    """
    os.chdir('input')
    already_present = os.path.isfile('output.zip')
    if not already_present:
        print('The output zip file does not exist. This is correct.')
    # Step back out before deciding whether to stop the script.
    os.chdir('..')
    if already_present:
        sys.exit('Error: An output zip file already exists. Please delete it and run the script again.')

#check the csv headers for correctness

def headervalidation():
    """Check that the metadata CSV contains every expected column header.

    Reads the module-level DataFrame ``df``. Extra columns are ignored; only
    the presence of each expected header is tested. A confirmation is printed
    when all are present; otherwise the script is terminated through
    ``sys.exit`` with a message listing the expected headers.
    """
    expected = (
        'filename',
        'dc.title',
        'dc.type',
        'dc.publisher',
        'dc.language.iso',
        'dc.description.abstract',
        'dc.date.issued',
        'dc.subject',
        'description',
    )
    present = list(df.columns)

    if all(name in present for name in expected):
        print('The Metadata CSV has the correct headers.')
    else:
        sys.exit('Error The Metadata CSV does not have the correct headers. Please use these headers: filename, dc.title, dc.type, dc.publisher, dc.language.iso, dc.description.abstract, dc.date.issued, dc.subject, description.')

#check csv for unacceptable blanks

def blankvalidation():
    """Check that the required ``dc.title`` and ``filename`` columns have no blanks.

    Reads the module-level DataFrame ``df``. A confirmation is printed when
    every row has a value in both columns; otherwise the script is terminated
    through ``sys.exit``.

    Blanks are detected with ``isna()`` rather than by comparing the value's
    type to ``float``. When every column of the frame is float64 the row
    values are ``numpy.float64`` objects, whose type is not exactly ``float``,
    so a type comparison lets those blanks through.
    """
    required = ['dc.title', 'filename']
    if df[required].isna().any().any():
        sys.exit('Error: The Metadata CSV has blank values for dc.title or filename. These fields are required.')
    print('The Metadata CSV has no blank values in required fields.')

#check date format
def datevalidation():
    """Check that every non-blank ``dc.date.issued`` value is a ``YYYY-MM-DD`` string.

    Reads the module-level DataFrame ``df``. Blank (NaN) dates are skipped.
    A value is accepted only when it parses with ``%Y-%m-%d`` and formatting
    the parsed date back gives the identical text, which rejects unpadded
    forms such as ``2020-1-5``. Each offending value is printed, and the
    script is then terminated through ``sys.exit``; a confirmation is printed
    when every date is valid.
    """
    all_valid = True

    for index, row in df.iterrows():
        issued = row['dc.date.issued']
        if pd.isna(issued):
            continue
        try:
            canonical = datetime.strptime(issued, '%Y-%m-%d').strftime('%Y-%m-%d')
        except (TypeError, ValueError):
            # strptime raises ValueError for text that does not match the
            # format and TypeError for non-string values (e.g. a bare year
            # read as a number); both are formatting errors, not crashes.
            canonical = None
        if issued != canonical:
            print('Error: Incorrect date formatting. The offending value is '+str(issued))
            all_valid = False

    if not all_valid:
        sys.exit('Error: The Metadata CSV has incorrect date formatting.')
    print('The Metadata CSV has correct date formatting.')

#check that no folder named after an item title already exists
def itemexist():
    """Check that no folder named after any item's ``dc.title`` already exists.

    Reads the module-level DataFrame ``df`` and tests each title as a path
    relative to the current working directory. Every clashing folder is
    reported as it is found; if there was at least one clash the script is
    terminated through ``sys.exit``, otherwise a confirmation is printed.
    """
    clash_found = False

    for _, record in df.iterrows():
        if os.path.isdir(record['dc.title']):
            clash_found = True
            print('A folder with the title of an item exists. Please delete it and run the script again. The offending folder is '+record['dc.title'])

    if clash_found:
        sys.exit('Error: Item folder nonexistence failed. See above for offending file')
    print('No folder with the title of an item exists. This is correct.')

    
#file not found

def filenotfound():
    """Check that every file listed in the ``filename`` column exists.

    Reads the module-level DataFrame ``df`` and tests each filename as a path
    relative to the current working directory. Every missing file is reported
    as it is found; if at least one was missing the script is terminated
    through ``sys.exit``, otherwise a confirmation is printed.
    """
    any_missing = False

    for _, record in df.iterrows():
        if not os.path.isfile(record['filename']):
            print('Error: File not found. Please ensure the '+record['filename']+' is in the '+os.getcwd()+' directory.')
            any_missing = True

    if any_missing:
        sys.exit('Error: Item not found. See above for offending file')
    print('All Items Found')

#Run the first stage of validation (folder and file layout checks)
findinput()
findcsv()
existoutput()
existoutputzip()

#enter the input folder
os.chdir('input')

try:
    #read the csv
    df = pd.read_csv(r"metadata.csv")

    #Run the second stage of validation (CSV content checks).
    #The header and blank checks go first because the later checks index
    #into those columns and pass their values to os.path functions.
    headervalidation()
    blankvalidation()
    datevalidation()
    itemexist()
    filenotfound()
except BaseException:
    #Leave the input folder again before the error propagates, so that
    #re-running in the same session starts from the original directory.
    os.chdir('..')
    raise

print('Validation has passed.')

#Make the temporary output folder only once validation has passed. Creating
#it earlier leaves an output folder behind whenever validation fails, and
#the next run is then stopped by the "output directory already exists" check.
os.mkdir('output')

#XML Tag Blocks

def titletag():
    """Append a ``dcvalue`` title element to the current item's XML.

    Reads the module-level names ``row`` (current CSV row), ``root`` (the XML
    document) and ``xml`` (the ``dublin_core`` element). Nothing is added when
    the ``dc.title`` value is a float, which is how a blank cell is detected.
    """
    value = row['dc.title']
    if type(value) == float:
        return
    node = root.createElement('dcvalue')
    node.setAttribute('element', 'title')
    node.setAttribute('qualifier', 'none')
    node.appendChild(root.createTextNode(value))
    xml.appendChild(node)

def typetag():
    """Append a ``dcvalue`` type element to the current item's XML.

    Reads the module-level names ``row`` (current CSV row), ``root`` (the XML
    document) and ``xml`` (the ``dublin_core`` element). Nothing is added when
    the ``dc.type`` value is a float, which is how a blank cell is detected.
    """
    value = row['dc.type']
    if type(value) == float:
        return
    node = root.createElement('dcvalue')
    node.setAttribute('element', 'type')
    node.setAttribute('qualifier', 'none')
    node.appendChild(root.createTextNode(value))
    xml.appendChild(node)
 
def subjecttag():
    """Append one ``dcvalue`` subject element per subject to the current item's XML.

    Reads the module-level names ``row`` (current CSV row), ``root`` (the XML
    document) and ``xml`` (the ``dublin_core`` element). The ``dc.subject``
    cell is split on ``#`` and each piece becomes its own element. Nothing is
    added when the value is a float, which is how a blank cell is detected.
    """
    raw = row['dc.subject']
    if type(raw) == float:
        return
    for term in raw.split('#'):
        node = root.createElement('dcvalue')
        node.setAttribute('element', 'subject')
        node.setAttribute('qualifier', 'none')
        node.appendChild(root.createTextNode(term))
        xml.appendChild(node)

def publishertag():
    """Append a ``dcvalue`` publisher element to the current item's XML.

    Reads the module-level names ``row`` (current CSV row), ``root`` (the XML
    document) and ``xml`` (the ``dublin_core`` element). Nothing is added when
    the ``dc.publisher`` value is a float, which is how a blank cell is
    detected.
    """
    value = row['dc.publisher']
    if type(value) == float:
        return
    node = root.createElement('dcvalue')
    node.setAttribute('element', 'publisher')
    node.setAttribute('qualifier', 'none')
    node.appendChild(root.createTextNode(value))
    xml.appendChild(node)

def languagetag():
    """Append a ``dcvalue`` language element (qualifier ``iso``) to the current item's XML.

    Reads the module-level names ``row`` (current CSV row), ``root`` (the XML
    document) and ``xml`` (the ``dublin_core`` element). Nothing is added when
    the ``dc.language.iso`` value is a float, which is how a blank cell is
    detected.
    """
    value = row['dc.language.iso']
    if type(value) == float:
        return
    node = root.createElement('dcvalue')
    node.setAttribute('element', 'language')
    node.setAttribute('qualifier', 'iso')
    node.appendChild(root.createTextNode(value))
    xml.appendChild(node)

def descriptiontag():
    """Append a ``dcvalue`` description element (qualifier ``abstract``) to the current item's XML.

    Reads the module-level names ``row`` (current CSV row), ``root`` (the XML
    document) and ``xml`` (the ``dublin_core`` element). Nothing is added when
    the ``dc.description.abstract`` value is a float, which is how a blank
    cell is detected.
    """
    value = row['dc.description.abstract']
    if type(value) == float:
        return
    node = root.createElement('dcvalue')
    node.setAttribute('element', 'description')
    node.setAttribute('qualifier', 'abstract')
    node.appendChild(root.createTextNode(value))
    xml.appendChild(node)

def datetag():
    """Append a ``dcvalue`` date element (qualifier ``issued``) to the current item's XML.

    Reads the module-level names ``row`` (current CSV row), ``root`` (the XML
    document) and ``xml`` (the ``dublin_core`` element). Nothing is added when
    the ``dc.date.issued`` value is a float, which is how a blank cell is
    detected.
    """
    value = row['dc.date.issued']
    if type(value) == float:
        return
    node = root.createElement('dcvalue')
    node.setAttribute('element', 'date')
    node.setAttribute('qualifier', 'issued')
    node.appendChild(root.createTextNode(value))
    xml.appendChild(node)

#Iterate through the metadata csv.
#The tag functions read the module-level names row, root and xml, so the
#loop must keep assigning exactly those names.
for n, row in df.iterrows():
    title = row['dc.title']
    filename = row['filename']

    #Make directory for each item
    os.mkdir(title)
    print(title+' folder created')

    #copy file into item's directory
    shutil.copy(filename, os.path.join(title, filename))
    print(title+' copied into folder')

    #create the xml document
    root = minidom.Document()

    #create the dublin core schema tag
    xml = root.createElement('dublin_core')
    root.appendChild(xml)

    #Run the tag blocks
    titletag()
    typetag()
    subjecttag()
    publishertag()
    languagetag()
    descriptiontag()
    datetag()

    #make the xml pretty (for troubleshooting purposes); passing an encoding
    #makes minidom return bytes and declare that encoding in the XML header
    xml_bytes = root.toprettyxml(indent="\t", encoding="utf-8")

    #save the xml file; the item folder is addressed by path instead of
    #changing into it, so an error cannot leave the script inside that folder
    with open(os.path.join(title, "dublin_core.xml"), "wb") as f:
        f.write(xml_bytes)
    print(title+' XML file created')

    #Create the contents file, always as UTF-8 rather than the platform default
    if type(row['description'])!=float:
        contents_line = filename+'\t'+'description:'+row['description']
    else:
        contents_line = filename+'\t'+'bundle:ORIGINAL'
    with open(os.path.join(title, "contents"), "a", encoding="utf-8") as f:
        f.write(contents_line)
    print(title+' contents file created')

    #move the completed item folders to output
    shutil.move(title, os.path.join('output', title))
    print(title+' folder moved to output folder')

#Make the zip file
print('Generating zip file')
shutil.make_archive('output', 'zip', 'output')
print('Zip file created')

#Remove the temporary output folder (commented out for testing purposes)
#shutil.rmtree('output')
#print('Output folder deleted')

#return to the original level
os.chdir('..')

#Print operation successful
print('Operation successful. Please check your input folder for the zip file')


The input folder has been found.
metadata.csv has been found.


SystemExit: Error: An output directory already exists. Please delete it and run the script again.

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
