From ba1be711bd6b947ff8eed7705ea7721e4c5e58d4 Mon Sep 17 00:00:00 2001 From: RohanBhattaraiNP <152933030+RohanBhattaraiNP@users.noreply.github.com> Date: Tue, 24 Sep 2024 01:23:48 +0545 Subject: [PATCH 1/9] Update cli.py --- caltechdata_api/cli.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/caltechdata_api/cli.py b/caltechdata_api/cli.py index 4fbdea9..ed4a7a6 100644 --- a/caltechdata_api/cli.py +++ b/caltechdata_api/cli.py @@ -60,10 +60,13 @@ def decrypt_token(encrypted_token, key): # Function to get or set token -def get_or_set_token(): - +def get_or_set_token(production=True): key = load_or_generate_key() - token_file = os.path.join(caltechdata_directory, "token.txt") + + # Use different token files for production and test environments + token_filename = "token.txt" if production else "token_test.txt" + token_file = os.path.join(caltechdata_directory, token_filename) + try: with open(token_file, "rb") as f: encrypted_token = f.read() @@ -71,8 +74,8 @@ def get_or_set_token(): return token except FileNotFoundError: while True: - token = input("Enter your CaltechDATA token: ").strip() - confirm_token = input("Confirm your CaltechDATA token: ").strip() + token = input(f"Enter your {'Production' if production else 'Test'} CaltechDATA token: ").strip() + confirm_token = input(f"Confirm your {'Production' if production else 'Test'} CaltechDATA token: ").strip() if token == confirm_token: encrypted_token = encrypt_token(token, key) with open(token_file, "wb") as f: @@ -403,7 +406,7 @@ def main(): def create_record(production): - token = get_or_set_token() + token = get_or_set_token(production) print("Using CaltechDATA token:", token) while True: choice = get_user_input( @@ -526,7 +529,7 @@ def print_upload_message(rec_id, production): def edit_record(production): record_id = input("Enter the CaltechDATA record ID: ") - token = get_or_set_token() + token = get_or_set_token(production) file_name = download_file_by_id(record_id, token) if file_name: From 1bff778de59dbdd834fb7aa948b8339d09576340 Mon Sep 17 00:00:00 2001 From: RohanBhattaraiNP <152933030+RohanBhattaraiNP@users.noreply.github.com> Date: Tue, 24 Sep 2024 02:03:26 +0545 Subject: [PATCH 2/9] Update caltechdata_write.py --- caltechdata_api/caltechdata_write.py | 36 +++++++++++----------------- 1 file changed, 14 insertions(+), 22 deletions(-) diff --git a/caltechdata_api/caltechdata_write.py b/caltechdata_api/caltechdata_write.py index 68a1da9..d14d80d 100644 --- a/caltechdata_api/caltechdata_write.py +++ b/caltechdata_api/caltechdata_write.py @@ -1,7 +1,7 @@ import copy import json -import os, requests - +import os +import requests import s3fs from requests import session from json.decoder import JSONDecodeError @@ -49,8 +49,6 @@ def write_files_rdm(files, file_link, headers, f_headers, s3=None, keepfiles=Fal infile = open(name, "rb") else: infile = open(f_list[name], "rb") - # size = infile.seek(0, 2) - # infile.seek(0, 0) # reset at beginning result = requests.put(link, headers=f_headers, data=infile) if result.status_code != 200: raise Exception(result.text) @@ -65,10 +63,11 @@ def write_files_rdm(files, file_link, headers, f_headers, s3=None, keepfiles=Fal raise Exception(result.text) + def add_file_links( metadata, file_links, file_descriptions=[], additional_descriptions="", s3_link=None ): - # Currently configured for S3 links, assuming all are at same endpoint + # Currently configured for S3 links, assuming all are at the same endpoint link_string = "" endpoint = "https://" + file_links[0].split("/")[2] s3 = s3fs.S3FileSystem(anon=True, client_kwargs={"endpoint_url": endpoint}) @@ -152,13 +151,8 @@ def caltechdata_write( s3_link=None, default_preview=None, review_message=None, + keep_file=False, # New parameter ): - """ - File links are links to files existing in external systems that will - be added directly in a CaltechDATA record, instead of uploading the file. - - S3 is a s3sf object for directly opening files - """ # Make a copy so that none of our changes leak out metadata = copy.deepcopy(metadata) @@ -167,7 +161,7 @@ def caltechdata_write( token = os.environ["RDMTOK"] # If files is a string - change to single value array - if isinstance(files, str) == True: + if isinstance(files, str): files = [files] if file_links: @@ -176,14 +170,13 @@ def caltechdata_write( ) # Pull out pid information - if production == True: + if production: repo_prefix = "10.22002" else: repo_prefix = "10.33569" pids = {} identifiers = [] if "metadata" in metadata: - # we have rdm schema if "identifiers" in metadata["metadata"]: identifiers = metadata["metadata"]["identifiers"] elif "identifiers" in metadata: @@ -200,11 +193,10 @@ def caltechdata_write( "provider": "oai", } elif "scheme" in identifier: - # We have RDM internal metadata if identifier["scheme"] == "doi": doi = identifier["identifier"] prefix = doi.split("/")[0] - if doi != False: + if doi: if prefix == repo_prefix: pids["doi"] = { "identifier": doi, @@ -220,25 +212,25 @@ def caltechdata_write( if "pids" not in metadata: metadata["pids"] = pids - if authors == False: + if not authors: data = customize_schema.customize_schema(metadata, schema=schema) - if production == True: + if production: url = "https://data.caltech.edu/" else: url = "https://data.caltechlibrary.dev/" else: data = metadata - if production == True: + if production: url = "https://authors.library.caltech.edu/" else: url = "https://authors.caltechlibrary.dev/" headers = { - "Authorization": "Bearer %s" % token, + "Authorization": f"Bearer {token}", "Content-type": "application/json", } f_headers = { - "Authorization": "Bearer %s" % token, + "Authorization": f"Bearer {token}", "Content-type": "application/octet-stream", } @@ -256,7 +248,7 @@ def caltechdata_write( if files: file_link = result.json()["links"]["files"] - write_files_rdm(files, file_link, headers, f_headers, s3) + write_files_rdm(files, file_link, headers, f_headers, s3, keep_file) if community: review_link = result.json()["links"]["review"] From a31d86f1c6c67648ff8aca705e1e6f697470777d Mon Sep 17 00:00:00 2001 From: RohanBhattaraiNP <152933030+RohanBhattaraiNP@users.noreply.github.com> Date: Tue, 24 Sep 2024 22:11:46 +0545 Subject: [PATCH 3/9] Update caltechdata_write.py --- caltechdata_api/caltechdata_write.py | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/caltechdata_api/caltechdata_write.py b/caltechdata_api/caltechdata_write.py index d14d80d..1b25f80 100644 --- a/caltechdata_api/caltechdata_write.py +++ b/caltechdata_api/caltechdata_write.py @@ -151,8 +151,13 @@ def caltechdata_write( s3_link=None, default_preview=None, review_message=None, - keep_file=False, # New parameter ): + """ + File links are links to files existing in external systems that will + be added directly in a CaltechDATA record, instead of uploading the file. + + S3 is a s3sf object for directly opening files + """ # Make a copy so that none of our changes leak out metadata = copy.deepcopy(metadata) @@ -161,7 +166,7 @@ def caltechdata_write( token = os.environ["RDMTOK"] # If files is a string - change to single value array - if isinstance(files, str): + if isinstance(files, str) == True: files = [files] if file_links: @@ -170,13 +175,14 @@ def caltechdata_write( ) # Pull out pid information - if production: + if production == True: repo_prefix = "10.22002" else: repo_prefix = "10.33569" pids = {} identifiers = [] if "metadata" in metadata: + # we have rdm schema if "identifiers" in metadata["metadata"]: identifiers = metadata["metadata"]["identifiers"] elif "identifiers" in metadata: @@ -193,10 +199,11 @@ def caltechdata_write( "provider": "oai", } elif "scheme" in identifier: + # We have RDM internal metadata if identifier["scheme"] == "doi": doi = identifier["identifier"] prefix = doi.split("/")[0] - if doi: + if doi != False: if prefix == repo_prefix: pids["doi"] = { "identifier": doi, @@ -212,25 +219,25 @@ def caltechdata_write( if "pids" not in metadata: metadata["pids"] = pids - if not authors: + if authors == False: data = customize_schema.customize_schema(metadata, schema=schema) - if production: + if production == True: url = "https://data.caltech.edu/" else: url = "https://data.caltechlibrary.dev/" else: data = metadata - if production: + if production == True: url = "https://authors.library.caltech.edu/" else: url = "https://authors.caltechlibrary.dev/" headers = { - "Authorization": f"Bearer {token}", + "Authorization": "Bearer %s" % token, "Content-type": "application/json", } f_headers = { - "Authorization": f"Bearer {token}", + "Authorization": "Bearer %s" % token, "Content-type": "application/octet-stream", } @@ -248,7 +255,7 @@ def caltechdata_write( if files: file_link = result.json()["links"]["files"] - write_files_rdm(files, file_link, headers, f_headers, s3, keep_file) + write_files_rdm(files, file_link, headers, f_headers, s3) if community: review_link = result.json()["links"]["review"] From 8b742ccf532c42f0df0dec22f1d91cf786131d58 Mon Sep 17 00:00:00 2001 From: RohanBhattaraiNP <152933030+RohanBhattaraiNP@users.noreply.github.com> Date: Thu, 26 Sep 2024 01:22:42 +0545 Subject: [PATCH 4/9] Update cli.py --- caltechdata_api/cli.py | 115 +++++++++++++++++++++++------------------ 1 file changed, 66 insertions(+), 49 deletions(-) diff --git a/caltechdata_api/cli.py b/caltechdata_api/cli.py index ed4a7a6..e63db47 100644 --- a/caltechdata_api/cli.py +++ b/caltechdata_api/cli.py @@ -59,7 +59,7 @@ def decrypt_token(encrypted_token, key): return f.decrypt(encrypted_token).decode() -# Function to get or set token +# Function to get or set token with support for test system def get_or_set_token(production=True): key = load_or_generate_key() @@ -85,6 +85,7 @@ def get_or_set_token(production=True): print("Tokens do not match. Please try again.") + def welcome_message(): print("Welcome to CaltechDATA CLI") @@ -378,22 +379,22 @@ def upload_data_from_file(): except json.JSONDecodeError as e: print(f"Error: Invalid JSON format in the file '{filename}'. {str(e)}") - def parse_args(): """Parse command-line arguments.""" parser = argparse.ArgumentParser(description="CaltechDATA CLI tool.") parser.add_argument( - "-test", action="store_true", help="Use test mode, sets production to False" + "-test", + action="store_true", + help="Use test mode, sets production to False" ) args = parser.parse_args() return args - def main(): args = parse_args() - + production = not args.test # Set production to False if -test flag is provided - + choice = get_user_input( "Do you want to create or edit a CaltechDATA record? (create/edit): " ).lower() @@ -407,6 +408,7 @@ def main(): def create_record(production): token = get_or_set_token(production) + #keep_file = input("Do you want to keep your existing files? (yes/no): ").lower() == "yes" print("Using CaltechDATA token:", token) while True: choice = get_user_input( @@ -418,11 +420,7 @@ def create_record(production): if existing_data: if filepath != "": response = caltechdata_write( - existing_data, - token, - filepath, - production=production, - publish=False, + existing_data, token, filepath, production=production, publish=False ) elif file_link != "": response = caltechdata_write( @@ -499,6 +497,7 @@ def create_record(production): metadata, token, production=production, publish=False ) rec_id = response + print_upload_message(rec_id, production) with open(response + ".json", "w") as file: @@ -509,29 +508,20 @@ def create_record(production): else: print("Invalid choice. Please enter 'existing' or 'create'.") - def print_upload_message(rec_id, production): - base_url = ( - "https://data.caltech.edu/uploads/" - if production - else "https://data.caltechlibrary.dev/uploads/" - ) + base_url = "https://data.caltech.edu/uploads/" if production else "https://data.caltechlibrary.dev/uploads/" print( - f""" - You can view and publish this record at - + f"""You can view and publish this record at {base_url}{rec_id} - - If you need to upload large files to S3, you can type `s3cmd put DATA_FILE s3://ini230004-bucket01/{rec_id}/` - """ + If you need to upload large files to S3, you can type + `s3cmd put DATA_FILE s3://ini230004-bucket01/{rec_id}/`""" ) - def edit_record(production): record_id = input("Enter the CaltechDATA record ID: ") token = get_or_set_token(production) file_name = download_file_by_id(record_id, token) - + if file_name: try: # Read the edited metadata file @@ -548,38 +538,51 @@ def edit_record(production): print(f"An error occurred during metadata editing: {e}") else: print("No metadata file found.") - choice = get_user_input("Do you want to add files? (y/n): ").lower() if choice == "y": if production: API_URL_TEMPLATE = "https://data.caltech.edu/api/records/{record_id}/files" - API_URL_TEMPLATE_DRAFT = ( - "https://data.caltech.edu/api/records/{record_id}/draft/files" - ) + API_URL_TEMPLATE_DRAFT = "https://data.caltech.edu/api/records/{record_id}/draft/files" else: - API_URL_TEMPLATE = ( - "https://data.caltechlibrary.dev/api/records/{record_id}/files" - ) - API_URL_TEMPLATE_DRAFT = ( - "https://data.caltechlibrary.dev/api/records/{record_id}/draft/files" - ) - + API_URL_TEMPLATE = "https://data.caltechlibrary.dev/api/records/{record_id}/files" + API_URL_TEMPLATE_DRAFT = "https://data.caltechlibrary.dev/api/records/{record_id}/draft/files" + url = API_URL_TEMPLATE.format(record_id=record_id) url_draft = API_URL_TEMPLATE_DRAFT.format(record_id=record_id) + + headers = { + "accept": "application/json", + } - response = requests.get(url) - response_draft = requests.get(url_draft) + if token: + headers["Authorization"] = "Bearer %s" % token - filepath, file_link = upload_supporting_file(record_id) - print(file_link) + response = requests.get(url, headers=headers) + response_draft = requests.get(url_draft, headers=headers) + + #print(production, response, response_draft) + #print(response.status_code, response_draft.status_code) - if response.status_code == 404 and response_draft.status_code == 404: + data = response.json() + data_draft = response_draft.json() + + #print(data_draft) + # Check if 'entries' exists and its length + if len(data.get('entries', [])) == 0 and len(data_draft.get('entries', [])) == 0: keepfile = False else: - keepfile = ( - input("Do you want to keep existing files? (y/n): ").lower() == "y" - ) - + keepfile = input("Do you want to keep existing files? (y/n): ").lower() == "y" + + # if response.status_code == 404 and response_draft.status_code == 404: + # keepfile = False + # else: + + # keepfile = input("Do you want to keep existing files? (y/n): ").lower() == "y" + + filepath, file_link = upload_supporting_file(record_id) + if file_link: + print(file_link) + if filepath != "": response = caltechdata_edit( record_id, @@ -599,9 +602,12 @@ def edit_record(production): publish=False, keepfile=keepfile, ) - + rec_id = response print_upload_message(rec_id, production) + + + def download_file_by_id(record_id, token=None): @@ -616,15 +622,26 @@ def download_file_by_id(record_id, token=None): try: response = requests.get(url, headers=headers) - if response.status_code != 200: # Might have a draft response = requests.get( url + "/draft", headers=headers, ) - if response.status_code != 200: - raise Exception(f"Record {record_id} does not exist, cannot edit") + if response.status_code != 200: + url = f"https://data.caltechlibrary.dev/api/records/{record_id}" + response = requests.get( + url, + headers=headers, + ) + if response.status_code != 200: + # Might have a draft + response = requests.get( + url + "/draft", + headers=headers, + ) + if response.status_code != 200: + raise Exception(f"Record {record_id} does not exist, cannot edit") file_content = response.content file_name = f"downloaded_data_{record_id}.json" with open(file_name, "wb") as file: From bf5ca3c98b93247e497637c0cd1b665995d7262d Mon Sep 17 00:00:00 2001 From: RohanBhattaraiNP <152933030+RohanBhattaraiNP@users.noreply.github.com> Date: Thu, 26 Sep 2024 02:14:25 +0545 Subject: [PATCH 5/9] Update customize_schema.py --- caltechdata_api/customize_schema.py | 126 +++++++++++++++++++++++++++- 1 file changed, 125 insertions(+), 1 deletion(-) diff --git a/caltechdata_api/customize_schema.py b/caltechdata_api/customize_schema.py index c379e58..1e7e18c 100644 --- a/caltechdata_api/customize_schema.py +++ b/caltechdata_api/customize_schema.py @@ -134,8 +134,9 @@ def rdm_creators_contributors(person_list, peopleroles): def customize_schema_rdm(json_record): # Get vocabularies used in InvenioRDM + vocabularies = get_vocabularies() - + validate_metadata(json_record) peopleroles = vocabularies["crr"] resourcetypes = vocabularies["rsrct"] descriptiontypes = vocabularies["dty"] @@ -385,6 +386,129 @@ def customize_schema_rdm(json_record): return final +def validate_metadata(json_record): + """ + Validates the presence and structure of required fields in a CaltechDATA JSON record. + Raises an exception if any required field is missing or structured incorrectly. + """ + errors = [] + + # Check for 'types' and 'resourceTypeGeneral' + if 'types' not in json_record: + errors.append("'types' field is missing.") + elif not isinstance(json_record['types'], dict): + errors.append("'types' field should be a dictionary.") + elif 'resourceTypeGeneral' not in json_record['types']: + errors.append("'resourceTypeGeneral' field is missing in 'types'.") + + # Check for 'title' + if 'titles' not in json_record: + errors.append("'titles' field is missing.") + elif not isinstance(json_record['titles'], list) or len(json_record['titles']) == 0: + errors.append("'titles' should be a non-empty list.") + else: + # Ensure each title is a dictionary with 'title' field + for title in json_record['titles']: + if not isinstance(title, dict) or 'title' not in title: + errors.append("Each entry in 'titles' must be a dictionary with a 'title' key.") + + # Check for 'publication_date' + if 'publicationYear' not in json_record and 'dates' not in json_record: + errors.append("A publication date is required ('publicationYear' or 'dates' field is missing).") + if 'dates' in json_record: + if not isinstance(json_record['dates'], list): + errors.append("'dates' should be a list.") + else: + for date_entry in json_record['dates']: + if not isinstance(date_entry, dict) or 'dateType' not in date_entry or 'date' not in date_entry: + errors.append("Each entry in 'dates' must be a dictionary with 'dateType' and 'date' keys.") + + # Check for 'creators' + if 'creators' not in json_record: + errors.append("'creators' field is missing.") + elif not isinstance(json_record['creators'], list) or len(json_record['creators']) == 0: + errors.append("'creators' should be a non-empty list.") + else: + for creator in json_record['creators']: + if not isinstance(creator, dict) or 'name' not in creator: + errors.append("Each creator in 'creators' must be a dictionary with a 'name' key.") + + # Check for 'contributors' + if 'contributors' in json_record: + if not isinstance(json_record['contributors'], list): + errors.append("'contributors' should be a list.") + else: + for contributor in json_record['contributors']: + if not isinstance(contributor, dict) or 'name' not in contributor: + errors.append("Each contributor must be a dictionary with a 'name' key.") + + # Check for 'resourceType' + if 'resourceType' not in json_record['types']: + errors.append("'resourceType' field is missing in 'types'.") + elif not isinstance(json_record['types']['resourceType'], str): + errors.append("'resourceType' should be a string.") + + # Check for 'identifiers' + if 'identifiers' in json_record: + if not isinstance(json_record['identifiers'], list): + errors.append("'identifiers' should be a list.") + else: + for identifier in json_record['identifiers']: + if not isinstance(identifier, dict) or 'identifier' not in identifier or 'identifierType' not in identifier: + errors.append("Each identifier must be a dictionary with 'identifier' and 'identifierType' keys.") + + # Check for 'subjects' + if 'subjects' in json_record: + if not isinstance(json_record['subjects'], list): + errors.append("'subjects' should be a list.") + else: + for subject in json_record['subjects']: + if not isinstance(subject, dict) or 'subject' not in subject: + errors.append("Each subject must be a dictionary with a 'subject' key.") + + # Check for 'relatedIdentifiers' + if 'relatedIdentifiers' in json_record: + if not isinstance(json_record['relatedIdentifiers'], list): + errors.append("'relatedIdentifiers' should be a list.") + else: + for related_id in json_record['relatedIdentifiers']: + if not isinstance(related_id, dict) or 'relatedIdentifier' not in related_id: + errors.append("Each relatedIdentifier must be a dictionary with a 'relatedIdentifier' key.") + + # Check for 'rightsList' + if 'rightsList' in json_record: + if not isinstance(json_record['rightsList'], list): + errors.append("'rightsList' should be a list.") + else: + for rights in json_record['rightsList']: + if not isinstance(rights, dict) or 'rights' not in rights: + errors.append("Each entry in 'rightsList' must be a dictionary with a 'rights' key.") + + # Check for 'geoLocations' + if 'geoLocations' in json_record: + if not isinstance(json_record['geoLocations'], list): + errors.append("'geoLocations' should be a list.") + else: + for location in json_record['geoLocations']: + if not isinstance(location, dict): + errors.append("Each entry in 'geoLocations' must be a dictionary.") + elif 'geoLocationPoint' not in location and 'geoLocationBox' not in location and 'geoLocationPlace' not in location: + errors.append("Each geoLocation entry must contain at least one of 'geoLocationPoint', 'geoLocationBox', or 'geoLocationPlace'.") + + # Check for 'fundingReferences' + if 'fundingReferences' in json_record: + if not isinstance(json_record['fundingReferences'], list): + errors.append("'fundingReferences' should be a list.") + else: + for funding in json_record['fundingReferences']: + if not isinstance(funding, dict): + errors.append("Each funding reference must be a dictionary.") + if 'funderName' not in funding: + errors.append("Each funding reference must contain 'funderName'.") + + # Return errors if any are found + if errors: + raise ValueError(f"Validation errors in metadata: {', '.join(errors)}") if __name__ == "__main__": # Read in from file for demo purposes From f46700a919708cff55dad0491daa6a5b57d438e7 Mon Sep 17 00:00:00 2001 From: RohanBhattaraiNP <152933030+RohanBhattaraiNP@users.noreply.github.com> Date: Thu, 26 Sep 2024 22:52:02 +0545 Subject: [PATCH 6/9] Update customize_schema.py --- caltechdata_api/customize_schema.py | 154 ++++++++++++++++++---------- 1 file changed, 97 insertions(+), 57 deletions(-) diff --git a/caltechdata_api/customize_schema.py b/caltechdata_api/customize_schema.py index 1e7e18c..b3ff9ab 100644 --- a/caltechdata_api/customize_schema.py +++ b/caltechdata_api/customize_schema.py @@ -134,7 +134,7 @@ def rdm_creators_contributors(person_list, peopleroles): def customize_schema_rdm(json_record): # Get vocabularies used in InvenioRDM - + vocabularies = get_vocabularies() validate_metadata(json_record) peopleroles = vocabularies["crr"] @@ -386,6 +386,7 @@ def customize_schema_rdm(json_record): return final + def validate_metadata(json_record): """ Validates the presence and structure of required fields in a CaltechDATA JSON record. @@ -394,122 +395,161 @@ def validate_metadata(json_record): errors = [] # Check for 'types' and 'resourceTypeGeneral' - if 'types' not in json_record: + if "types" not in json_record: errors.append("'types' field is missing.") - elif not isinstance(json_record['types'], dict): + elif not isinstance(json_record["types"], dict): errors.append("'types' field should be a dictionary.") - elif 'resourceTypeGeneral' not in json_record['types']: + elif "resourceTypeGeneral" not in json_record["types"]: errors.append("'resourceTypeGeneral' field is missing in 'types'.") # Check for 'title' - if 'titles' not in json_record: + if "titles" not in json_record: errors.append("'titles' field is missing.") - elif not isinstance(json_record['titles'], list) or len(json_record['titles']) == 0: + elif not isinstance(json_record["titles"], list) or len(json_record["titles"]) == 0: errors.append("'titles' should be a non-empty list.") else: # Ensure each title is a dictionary with 'title' field - for title in json_record['titles']: - if not isinstance(title, dict) or 'title' not in title: - errors.append("Each entry in 'titles' must be a dictionary with a 'title' key.") + for title in json_record["titles"]: + if not isinstance(title, dict) or "title" not in title: + errors.append( + "Each entry in 'titles' must be a dictionary with a 'title' key." + ) # Check for 'publication_date' - if 'publicationYear' not in json_record and 'dates' not in json_record: - errors.append("A publication date is required ('publicationYear' or 'dates' field is missing).") - if 'dates' in json_record: - if not isinstance(json_record['dates'], list): + if "publicationYear" not in json_record and "dates" not in json_record: + errors.append( + "A publication date is required ('publicationYear' or 'dates' field is missing)." + ) + if "dates" in json_record: + if not isinstance(json_record["dates"], list): errors.append("'dates' should be a list.") else: - for date_entry in json_record['dates']: - if not isinstance(date_entry, dict) or 'dateType' not in date_entry or 'date' not in date_entry: - errors.append("Each entry in 'dates' must be a dictionary with 'dateType' and 'date' keys.") + for date_entry in json_record["dates"]: + if ( + not isinstance(date_entry, dict) + or "dateType" not in date_entry + or "date" not in date_entry + ): + errors.append( + "Each entry in 'dates' must be a dictionary with 'dateType' and 'date' keys." + ) # Check for 'creators' - if 'creators' not in json_record: + if "creators" not in json_record: errors.append("'creators' field is missing.") - elif not isinstance(json_record['creators'], list) or len(json_record['creators']) == 0: + elif ( + not isinstance(json_record["creators"], list) + or len(json_record["creators"]) == 0 + ): errors.append("'creators' should be a non-empty list.") else: - for creator in json_record['creators']: - if not isinstance(creator, dict) or 'name' not in creator: - errors.append("Each creator in 'creators' must be a dictionary with a 'name' key.") + for creator in json_record["creators"]: + if not isinstance(creator, dict) or "name" not in creator: + errors.append( + "Each creator in 'creators' must be a dictionary with a 'name' key." + ) # Check for 'contributors' - if 'contributors' in json_record: - if not isinstance(json_record['contributors'], list): + if "contributors" in json_record: + if not isinstance(json_record["contributors"], list): errors.append("'contributors' should be a list.") else: - for contributor in json_record['contributors']: - if not isinstance(contributor, dict) or 'name' not in contributor: - errors.append("Each contributor must be a dictionary with a 'name' key.") + for contributor in json_record["contributors"]: + if not isinstance(contributor, dict) or "name" not in contributor: + errors.append( + "Each contributor must be a dictionary with a 'name' key." + ) # Check for 'resourceType' - if 'resourceType' not in json_record['types']: + if "resourceType" not in json_record["types"]: errors.append("'resourceType' field is missing in 'types'.") - elif not isinstance(json_record['types']['resourceType'], str): + elif not isinstance(json_record["types"]["resourceType"], str): errors.append("'resourceType' should be a string.") # Check for 'identifiers' - if 'identifiers' in json_record: - if not isinstance(json_record['identifiers'], list): + if "identifiers" in json_record: + if not isinstance(json_record["identifiers"], list): errors.append("'identifiers' should be a list.") else: - for identifier in json_record['identifiers']: - if not isinstance(identifier, dict) or 'identifier' not in identifier or 'identifierType' not in identifier: - errors.append("Each identifier must be a dictionary with 'identifier' and 'identifierType' keys.") + for identifier in json_record["identifiers"]: + if ( + not isinstance(identifier, dict) + or "identifier" not in identifier + or "identifierType" not in identifier + ): + errors.append( + "Each identifier must be a dictionary with 'identifier' and 'identifierType' keys." + ) # Check for 'subjects' - if 'subjects' in json_record: - if not isinstance(json_record['subjects'], list): + if "subjects" in json_record: + if not isinstance(json_record["subjects"], list): errors.append("'subjects' should be a list.") else: - for subject in json_record['subjects']: - if not isinstance(subject, dict) or 'subject' not in subject: - errors.append("Each subject must be a dictionary with a 'subject' key.") + for subject in json_record["subjects"]: + if not isinstance(subject, dict) or "subject" not in subject: + errors.append( + "Each subject must be a dictionary with a 'subject' key." + ) # Check for 'relatedIdentifiers' - if 'relatedIdentifiers' in json_record: - if not isinstance(json_record['relatedIdentifiers'], list): + if "relatedIdentifiers" in json_record: + if not isinstance(json_record["relatedIdentifiers"], list): errors.append("'relatedIdentifiers' should be a list.") else: - for related_id in json_record['relatedIdentifiers']: - if not isinstance(related_id, dict) or 'relatedIdentifier' not in related_id: - errors.append("Each relatedIdentifier must be a dictionary with a 'relatedIdentifier' key.") + for related_id in json_record["relatedIdentifiers"]: + if ( + not isinstance(related_id, dict) + or "relatedIdentifier" not in related_id + ): + errors.append( + "Each relatedIdentifier must be a dictionary with a 'relatedIdentifier' key." + ) # Check for 'rightsList' - if 'rightsList' in json_record: - if not isinstance(json_record['rightsList'], list): + if "rightsList" in json_record: + if not isinstance(json_record["rightsList"], list): errors.append("'rightsList' should be a list.") else: - for rights in json_record['rightsList']: - if not isinstance(rights, dict) or 'rights' not in rights: - errors.append("Each entry in 'rightsList' must be a dictionary with a 'rights' key.") + for rights in json_record["rightsList"]: + if not isinstance(rights, dict) or "rights" not in rights: + errors.append( + "Each entry in 'rightsList' must be a dictionary with a 'rights' key." + ) # Check for 'geoLocations' - if 'geoLocations' in json_record: - if not isinstance(json_record['geoLocations'], list): + if "geoLocations" in json_record: + if not isinstance(json_record["geoLocations"], list): errors.append("'geoLocations' should be a list.") else: - for location in json_record['geoLocations']: + for location in json_record["geoLocations"]: if not isinstance(location, dict): errors.append("Each entry in 'geoLocations' must be a dictionary.") - elif 'geoLocationPoint' not in location and 'geoLocationBox' not in location and 'geoLocationPlace' not in location: - errors.append("Each geoLocation entry must contain at least one of 'geoLocationPoint', 'geoLocationBox', or 'geoLocationPlace'.") + elif ( + "geoLocationPoint" not in location + and "geoLocationBox" not in location + and "geoLocationPlace" not in location + ): + errors.append( + "Each geoLocation entry must contain at least one of 'geoLocationPoint', 'geoLocationBox', or 'geoLocationPlace'." + ) # Check for 'fundingReferences' - if 'fundingReferences' in json_record: - if not isinstance(json_record['fundingReferences'], list): + if "fundingReferences" in json_record: + if not isinstance(json_record["fundingReferences"], list): errors.append("'fundingReferences' should be a list.") else: - for funding in json_record['fundingReferences']: + for funding in json_record["fundingReferences"]: if not isinstance(funding, dict): errors.append("Each funding reference must be a dictionary.") - if 'funderName' not in funding: + if "funderName" not in funding: errors.append("Each funding reference must contain 'funderName'.") # Return errors if any are found if errors: raise ValueError(f"Validation errors in metadata: {', '.join(errors)}") + if __name__ == "__main__": # Read in from file for demo purposes From d62c2781f35f5ad5d54366560b19b33ac168045f Mon Sep 17 00:00:00 2001 From: RohanBhattaraiNP <152933030+RohanBhattaraiNP@users.noreply.github.com> Date: Thu, 26 Sep 2024 22:52:40 +0545 Subject: [PATCH 7/9] Update cli.py --- caltechdata_api/cli.py | 92 +++++++++++++++++++++++++----------------- 1 file changed, 55 insertions(+), 37 deletions(-) diff --git a/caltechdata_api/cli.py b/caltechdata_api/cli.py index 2286497..cd7cb26 100644 --- a/caltechdata_api/cli.py +++ b/caltechdata_api/cli.py @@ -89,7 +89,6 @@ def get_or_set_token(production=True): print("Tokens do not match. Please try again.") - def welcome_message(): print("Welcome to CaltechDATA CLI") @@ -383,22 +382,22 @@ def upload_data_from_file(): except json.JSONDecodeError as e: print(f"Error: Invalid JSON format in the file '{filename}'. {str(e)}") + def parse_args(): """Parse command-line arguments.""" parser = argparse.ArgumentParser(description="CaltechDATA CLI tool.") parser.add_argument( - "-test", - action="store_true", - help="Use test mode, sets production to False" + "-test", action="store_true", help="Use test mode, sets production to False" ) args = parser.parse_args() return args + def main(): args = parse_args() - + production = not args.test # Set production to False if -test flag is provided - + choice = get_user_input( "Do you want to create or edit a CaltechDATA record? (create/edit): " ).lower() @@ -412,7 +411,7 @@ def main(): def create_record(production): token = get_or_set_token(production) - #keep_file = input("Do you want to keep your existing files? (yes/no): ").lower() == "yes" + # keep_file = input("Do you want to keep your existing files? (yes/no): ").lower() == "yes" print("Using CaltechDATA token:", token) while True: choice = get_user_input( @@ -424,7 +423,11 @@ def create_record(production): if existing_data: if filepath != "": response = caltechdata_write( - existing_data, token, filepath, production=production, publish=False + existing_data, + token, + filepath, + production=production, + publish=False, ) elif file_link != "": response = caltechdata_write( @@ -501,7 +504,6 @@ def create_record(production): metadata, token, production=production, publish=False ) rec_id = response - print_upload_message(rec_id, production) with open(response + ".json", "w") as file: @@ -512,8 +514,13 @@ def create_record(production): else: print("Invalid choice. Please enter 'existing' or 'create'.") + def print_upload_message(rec_id, production): - base_url = "https://data.caltech.edu/uploads/" if production else "https://data.caltechlibrary.dev/uploads/" + base_url = ( + "https://data.caltech.edu/uploads/" + if production + else "https://data.caltechlibrary.dev/uploads/" + ) print( f"""You can view and publish this record at {base_url}{rec_id} @@ -521,11 +528,12 @@ def print_upload_message(rec_id, production): `s3cmd put DATA_FILE s3://ini230004-bucket01/{rec_id}/`""" ) + def edit_record(production): record_id = input("Enter the CaltechDATA record ID: ") token = get_or_set_token(production) file_name = download_file_by_id(record_id, token) - + if file_name: try: # Read the edited metadata file @@ -546,16 +554,22 @@ def edit_record(production): if choice == "y": if production: API_URL_TEMPLATE = "https://data.caltech.edu/api/records/{record_id}/files" - API_URL_TEMPLATE_DRAFT = "https://data.caltech.edu/api/records/{record_id}/draft/files" + API_URL_TEMPLATE_DRAFT = ( + "https://data.caltech.edu/api/records/{record_id}/draft/files" + ) else: - API_URL_TEMPLATE = "https://data.caltechlibrary.dev/api/records/{record_id}/files" - API_URL_TEMPLATE_DRAFT = "https://data.caltechlibrary.dev/api/records/{record_id}/draft/files" - + API_URL_TEMPLATE = ( + "https://data.caltechlibrary.dev/api/records/{record_id}/files" + ) + API_URL_TEMPLATE_DRAFT = ( + "https://data.caltechlibrary.dev/api/records/{record_id}/draft/files" + ) + url = API_URL_TEMPLATE.format(record_id=record_id) url_draft = API_URL_TEMPLATE_DRAFT.format(record_id=record_id) - + headers = { - "accept": "application/json", + "accept": "application/json", } if token: @@ -563,30 +577,35 @@ def edit_record(production): response = requests.get(url, headers=headers) response_draft = requests.get(url_draft, headers=headers) - - #print(production, response, response_draft) - #print(response.status_code, response_draft.status_code) + + # print(production, response, response_draft) + # print(response.status_code, response_draft.status_code) data = response.json() data_draft = response_draft.json() - #print(data_draft) + # print(data_draft) # Check if 'entries' exists and its length - if len(data.get('entries', [])) == 0 and len(data_draft.get('entries', [])) == 0: + if ( + len(data.get("entries", [])) == 0 + and len(data_draft.get("entries", [])) == 0 + ): keepfile = False else: - keepfile = input("Do you want to keep existing files? (y/n): ").lower() == "y" - + keepfile = ( + input("Do you want to keep existing files? (y/n): ").lower() == "y" + ) + # if response.status_code == 404 and response_draft.status_code == 404: # keepfile = False # else: - + # keepfile = input("Do you want to keep existing files? (y/n): ").lower() == "y" - + filepath, file_link = upload_supporting_file(record_id) if file_link: print(file_link) - + if filepath != "": response = caltechdata_edit( record_id, @@ -604,14 +623,11 @@ def edit_record(production): file_links=file_link, production=production, publish=False, - keepfile=keepfile, + keepfiles=keepfile, ) - + rec_id = response print_upload_message(rec_id, production) - - - def download_file_by_id(record_id, token=None): @@ -632,12 +648,12 @@ def download_file_by_id(record_id, token=None): url + "/draft", headers=headers, ) - if response.status_code != 200: + if response.status_code != 200: url = f"https://data.caltechlibrary.dev/api/records/{record_id}" response = requests.get( - url, - headers=headers, - ) + url, + headers=headers, + ) if response.status_code != 200: # Might have a draft response = requests.get( @@ -645,7 +661,9 @@ def download_file_by_id(record_id, token=None): headers=headers, ) if response.status_code != 200: - raise Exception(f"Record {record_id} does not exist, cannot edit") + raise Exception( + f"Record {record_id} does not exist, cannot edit" + ) file_content = response.content file_name = f"downloaded_data_{record_id}.json" with open(file_name, "wb") as file: From a57a075f922e6e660e5c3dc5ff0c1117bbe7a74f Mon Sep 17 00:00:00 2001 From: RohanBhattaraiNP <152933030+RohanBhattaraiNP@users.noreply.github.com> Date: Thu, 26 Sep 2024 22:53:15 +0545 Subject: [PATCH 8/9] Update caltechdata_write.py --- caltechdata_api/caltechdata_write.py | 1 - 1 file changed, 1 deletion(-) diff --git a/caltechdata_api/caltechdata_write.py b/caltechdata_api/caltechdata_write.py index 1b25f80..e0cb0dd 100644 --- a/caltechdata_api/caltechdata_write.py +++ b/caltechdata_api/caltechdata_write.py @@ -63,7 +63,6 @@ def write_files_rdm(files, file_link, headers, f_headers, s3=None, keepfiles=Fal raise Exception(result.text) - def add_file_links( metadata, file_links, file_descriptions=[], additional_descriptions="", s3_link=None ): From f0f40e3631b5c840f15d4db695e033eee8b4457a Mon Sep 17 00:00:00 2001 From: RohanBhattaraiNP <152933030+RohanBhattaraiNP@users.noreply.github.com> Date: Thu, 26 Sep 2024 22:56:28 +0545 Subject: [PATCH 9/9] Update cli.py --- caltechdata_api/cli.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/caltechdata_api/cli.py b/caltechdata_api/cli.py index cd7cb26..3222c09 100644 --- a/caltechdata_api/cli.py +++ b/caltechdata_api/cli.py @@ -577,14 +577,8 @@ def edit_record(production): response = requests.get(url, headers=headers) response_draft = requests.get(url_draft, headers=headers) - - # print(production, response, response_draft) - # print(response.status_code, response_draft.status_code) - data = response.json() data_draft = response_draft.json() - - # print(data_draft) # Check if 'entries' exists and its length if ( len(data.get("entries", [])) == 0 @@ -596,12 +590,6 @@ def edit_record(production): input("Do you want to keep existing files? (y/n): ").lower() == "y" ) - # if response.status_code == 404 and response_draft.status_code == 404: - # keepfile = False - # else: - - # keepfile = input("Do you want to keep existing files? (y/n): ").lower() == "y" - filepath, file_link = upload_supporting_file(record_id) if file_link: print(file_link)