From ba1be711bd6b947ff8eed7705ea7721e4c5e58d4 Mon Sep 17 00:00:00 2001
From: RohanBhattaraiNP <152933030+RohanBhattaraiNP@users.noreply.github.com>
Date: Tue, 24 Sep 2024 01:23:48 +0545
Subject: [PATCH 1/9] Update cli.py

---
 caltechdata_api/cli.py | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/caltechdata_api/cli.py b/caltechdata_api/cli.py
index 4fbdea9..ed4a7a6 100644
--- a/caltechdata_api/cli.py
+++ b/caltechdata_api/cli.py
@@ -60,10 +60,13 @@ def decrypt_token(encrypted_token, key):
 
 
 # Function to get or set token
-def get_or_set_token():
-
+def get_or_set_token(production=True):
     key = load_or_generate_key()
-    token_file = os.path.join(caltechdata_directory, "token.txt")
+    
+    # Use different token files for production and test environments
+    token_filename = "token.txt" if production else "token_test.txt"
+    token_file = os.path.join(caltechdata_directory, token_filename)
+
     try:
         with open(token_file, "rb") as f:
             encrypted_token = f.read()
@@ -71,8 +74,8 @@ def get_or_set_token():
             return token
     except FileNotFoundError:
         while True:
-            token = input("Enter your CaltechDATA token: ").strip()
-            confirm_token = input("Confirm your CaltechDATA token: ").strip()
+            token = input(f"Enter your {'Production' if production else 'Test'} CaltechDATA token: ").strip()
+            confirm_token = input(f"Confirm your {'Production' if production else 'Test'} CaltechDATA token: ").strip()
             if token == confirm_token:
                 encrypted_token = encrypt_token(token, key)
                 with open(token_file, "wb") as f:
@@ -403,7 +406,7 @@ def main():
 
 
 def create_record(production):
-    token = get_or_set_token()
+    token = get_or_set_token(production)
     print("Using CaltechDATA token:", token)
     while True:
         choice = get_user_input(
@@ -526,7 +529,7 @@ def print_upload_message(rec_id, production):
 
 def edit_record(production):
     record_id = input("Enter the CaltechDATA record ID: ")
-    token = get_or_set_token()
+    token = get_or_set_token(production)
     file_name = download_file_by_id(record_id, token)
 
     if file_name:

From 1bff778de59dbdd834fb7aa948b8339d09576340 Mon Sep 17 00:00:00 2001
From: RohanBhattaraiNP <152933030+RohanBhattaraiNP@users.noreply.github.com>
Date: Tue, 24 Sep 2024 02:03:26 +0545
Subject: [PATCH 2/9] Update caltechdata_write.py

---
 caltechdata_api/caltechdata_write.py | 36 +++++++++++-----------------
 1 file changed, 14 insertions(+), 22 deletions(-)

diff --git a/caltechdata_api/caltechdata_write.py b/caltechdata_api/caltechdata_write.py
index 68a1da9..d14d80d 100644
--- a/caltechdata_api/caltechdata_write.py
+++ b/caltechdata_api/caltechdata_write.py
@@ -1,7 +1,7 @@
 import copy
 import json
-import os, requests
-
+import os
+import requests
 import s3fs
 from requests import session
 from json.decoder import JSONDecodeError
@@ -49,8 +49,6 @@ def write_files_rdm(files, file_link, headers, f_headers, s3=None, keepfiles=Fal
                 infile = open(name, "rb")
             else:
                 infile = open(f_list[name], "rb")
-            # size = infile.seek(0, 2)
-            # infile.seek(0, 0)  # reset at beginning
             result = requests.put(link, headers=f_headers, data=infile)
             if result.status_code != 200:
                 raise Exception(result.text)
@@ -65,10 +63,11 @@ def write_files_rdm(files, file_link, headers, f_headers, s3=None, keepfiles=Fal
                     raise Exception(result.text)
 
 
+
 def add_file_links(
     metadata, file_links, file_descriptions=[], additional_descriptions="", s3_link=None
 ):
-    # Currently configured for S3 links, assuming all are at same endpoint
+    # Currently configured for S3 links, assuming all are at the same endpoint
     link_string = ""
     endpoint = "https://" + file_links[0].split("/")[2]
     s3 = s3fs.S3FileSystem(anon=True, client_kwargs={"endpoint_url": endpoint})
@@ -152,13 +151,8 @@ def caltechdata_write(
     s3_link=None,
     default_preview=None,
     review_message=None,
+    keep_file=False,  # New parameter
 ):
-    """
-    File links are links to files existing in external systems that will
-    be added directly in a CaltechDATA record, instead of uploading the file.
-
-    S3 is a s3sf object for directly opening files
-    """
     # Make a copy so that none of our changes leak out
     metadata = copy.deepcopy(metadata)
 
@@ -167,7 +161,7 @@ def caltechdata_write(
         token = os.environ["RDMTOK"]
 
     # If files is a string - change to single value array
-    if isinstance(files, str) == True:
+    if isinstance(files, str):
         files = [files]
 
     if file_links:
@@ -176,14 +170,13 @@ def caltechdata_write(
         )
 
     # Pull out pid information
-    if production == True:
+    if production:
         repo_prefix = "10.22002"
     else:
         repo_prefix = "10.33569"
     pids = {}
     identifiers = []
     if "metadata" in metadata:
-        # we have rdm schema
         if "identifiers" in metadata["metadata"]:
             identifiers = metadata["metadata"]["identifiers"]
     elif "identifiers" in metadata:
@@ -200,11 +193,10 @@ def caltechdata_write(
                     "provider": "oai",
                 }
         elif "scheme" in identifier:
-            # We have RDM internal metadata
             if identifier["scheme"] == "doi":
                 doi = identifier["identifier"]
                 prefix = doi.split("/")[0]
-        if doi != False:
+        if doi:
             if prefix == repo_prefix:
                 pids["doi"] = {
                     "identifier": doi,
@@ -220,25 +212,25 @@ def caltechdata_write(
     if "pids" not in metadata:
         metadata["pids"] = pids
 
-    if authors == False:
+    if not authors:
         data = customize_schema.customize_schema(metadata, schema=schema)
-        if production == True:
+        if production:
             url = "https://data.caltech.edu/"
         else:
             url = "https://data.caltechlibrary.dev/"
     else:
         data = metadata
-        if production == True:
+        if production:
             url = "https://authors.library.caltech.edu/"
         else:
             url = "https://authors.caltechlibrary.dev/"
 
     headers = {
-        "Authorization": "Bearer %s" % token,
+        "Authorization": f"Bearer {token}",
         "Content-type": "application/json",
     }
     f_headers = {
-        "Authorization": "Bearer %s" % token,
+        "Authorization": f"Bearer {token}",
         "Content-type": "application/octet-stream",
     }
 
@@ -256,7 +248,7 @@ def caltechdata_write(
 
     if files:
         file_link = result.json()["links"]["files"]
-        write_files_rdm(files, file_link, headers, f_headers, s3)
+        write_files_rdm(files, file_link, headers, f_headers, s3, keep_file)
 
     if community:
         review_link = result.json()["links"]["review"]

From a31d86f1c6c67648ff8aca705e1e6f697470777d Mon Sep 17 00:00:00 2001
From: RohanBhattaraiNP <152933030+RohanBhattaraiNP@users.noreply.github.com>
Date: Tue, 24 Sep 2024 22:11:46 +0545
Subject: [PATCH 3/9] Update caltechdata_write.py

---
 caltechdata_api/caltechdata_write.py | 27 +++++++++++++++++----------
 1 file changed, 17 insertions(+), 10 deletions(-)

diff --git a/caltechdata_api/caltechdata_write.py b/caltechdata_api/caltechdata_write.py
index d14d80d..1b25f80 100644
--- a/caltechdata_api/caltechdata_write.py
+++ b/caltechdata_api/caltechdata_write.py
@@ -151,8 +151,13 @@ def caltechdata_write(
     s3_link=None,
     default_preview=None,
     review_message=None,
-    keep_file=False,  # New parameter
 ):
+    """
+    File links are links to files existing in external systems that will
+    be added directly in a CaltechDATA record, instead of uploading the file.
+
+    S3 is a s3sf object for directly opening files
+    """
     # Make a copy so that none of our changes leak out
     metadata = copy.deepcopy(metadata)
 
@@ -161,7 +166,7 @@ def caltechdata_write(
         token = os.environ["RDMTOK"]
 
     # If files is a string - change to single value array
-    if isinstance(files, str):
+    if isinstance(files, str) == True:
         files = [files]
 
     if file_links:
@@ -170,13 +175,14 @@ def caltechdata_write(
         )
 
     # Pull out pid information
-    if production:
+    if production == True:
         repo_prefix = "10.22002"
     else:
         repo_prefix = "10.33569"
     pids = {}
     identifiers = []
     if "metadata" in metadata:
+        # we have rdm schema
         if "identifiers" in metadata["metadata"]:
             identifiers = metadata["metadata"]["identifiers"]
     elif "identifiers" in metadata:
@@ -193,10 +199,11 @@ def caltechdata_write(
                     "provider": "oai",
                 }
         elif "scheme" in identifier:
+            # We have RDM internal metadata
             if identifier["scheme"] == "doi":
                 doi = identifier["identifier"]
                 prefix = doi.split("/")[0]
-        if doi:
+        if doi != False:
             if prefix == repo_prefix:
                 pids["doi"] = {
                     "identifier": doi,
@@ -212,25 +219,25 @@ def caltechdata_write(
     if "pids" not in metadata:
         metadata["pids"] = pids
 
-    if not authors:
+    if authors == False:
         data = customize_schema.customize_schema(metadata, schema=schema)
-        if production:
+        if production == True:
             url = "https://data.caltech.edu/"
         else:
             url = "https://data.caltechlibrary.dev/"
     else:
         data = metadata
-        if production:
+        if production == True:
             url = "https://authors.library.caltech.edu/"
         else:
             url = "https://authors.caltechlibrary.dev/"
 
     headers = {
-        "Authorization": f"Bearer {token}",
+        "Authorization": "Bearer %s" % token,
         "Content-type": "application/json",
     }
     f_headers = {
-        "Authorization": f"Bearer {token}",
+        "Authorization": "Bearer %s" % token,
         "Content-type": "application/octet-stream",
     }
 
@@ -248,7 +255,7 @@ def caltechdata_write(
 
     if files:
         file_link = result.json()["links"]["files"]
-        write_files_rdm(files, file_link, headers, f_headers, s3, keep_file)
+        write_files_rdm(files, file_link, headers, f_headers, s3)
 
     if community:
         review_link = result.json()["links"]["review"]

From 8b742ccf532c42f0df0dec22f1d91cf786131d58 Mon Sep 17 00:00:00 2001
From: RohanBhattaraiNP <152933030+RohanBhattaraiNP@users.noreply.github.com>
Date: Thu, 26 Sep 2024 01:22:42 +0545
Subject: [PATCH 4/9] Update cli.py

---
 caltechdata_api/cli.py | 115 +++++++++++++++++++++++------------------
 1 file changed, 66 insertions(+), 49 deletions(-)

diff --git a/caltechdata_api/cli.py b/caltechdata_api/cli.py
index ed4a7a6..e63db47 100644
--- a/caltechdata_api/cli.py
+++ b/caltechdata_api/cli.py
@@ -59,7 +59,7 @@ def decrypt_token(encrypted_token, key):
     return f.decrypt(encrypted_token).decode()
 
 
-# Function to get or set token
+# Function to get or set token with support for test system
 def get_or_set_token(production=True):
     key = load_or_generate_key()
     
@@ -85,6 +85,7 @@ def get_or_set_token(production=True):
                 print("Tokens do not match. Please try again.")
 
 
+
 def welcome_message():
     print("Welcome to CaltechDATA CLI")
 
@@ -378,22 +379,22 @@ def upload_data_from_file():
             except json.JSONDecodeError as e:
                 print(f"Error: Invalid JSON format in the file '{filename}'. {str(e)}")
 
-
 def parse_args():
     """Parse command-line arguments."""
     parser = argparse.ArgumentParser(description="CaltechDATA CLI tool.")
     parser.add_argument(
-        "-test", action="store_true", help="Use test mode, sets production to False"
+        "-test", 
+        action="store_true", 
+        help="Use test mode, sets production to False"
     )
     args = parser.parse_args()
     return args
 
-
 def main():
     args = parse_args()
-
+    
     production = not args.test  # Set production to False if -test flag is provided
-
+    
     choice = get_user_input(
         "Do you want to create or edit a CaltechDATA record? (create/edit): "
     ).lower()
@@ -407,6 +408,7 @@ def main():
 
 def create_record(production):
     token = get_or_set_token(production)
+    #keep_file = input("Do you want to keep your existing files? (yes/no): ").lower() == "yes"
     print("Using CaltechDATA token:", token)
     while True:
         choice = get_user_input(
@@ -418,11 +420,7 @@ def create_record(production):
             if existing_data:
                 if filepath != "":
                     response = caltechdata_write(
-                        existing_data,
-                        token,
-                        filepath,
-                        production=production,
-                        publish=False,
+                        existing_data, token, filepath, production=production, publish=False
                     )
                 elif file_link != "":
                     response = caltechdata_write(
@@ -499,6 +497,7 @@ def create_record(production):
                         metadata, token, production=production, publish=False
                     )
                 rec_id = response
+                
 
                 print_upload_message(rec_id, production)
                 with open(response + ".json", "w") as file:
@@ -509,29 +508,20 @@ def create_record(production):
         else:
             print("Invalid choice. Please enter 'existing' or 'create'.")
 
-
 def print_upload_message(rec_id, production):
-    base_url = (
-        "https://data.caltech.edu/uploads/"
-        if production
-        else "https://data.caltechlibrary.dev/uploads/"
-    )
+    base_url = "https://data.caltech.edu/uploads/" if production else "https://data.caltechlibrary.dev/uploads/"
     print(
-        f"""
-        You can view and publish this record at
-        
+        f"""You can view and publish this record at
         {base_url}{rec_id}
-        
-        If you need to upload large files to S3, you can type `s3cmd put DATA_FILE s3://ini230004-bucket01/{rec_id}/`
-        """
+        If you need to upload large files to S3, you can type
+        `s3cmd put DATA_FILE s3://ini230004-bucket01/{rec_id}/`"""
     )
 
-
 def edit_record(production):
     record_id = input("Enter the CaltechDATA record ID: ")
     token = get_or_set_token(production)
     file_name = download_file_by_id(record_id, token)
-
+    
     if file_name:
         try:
             # Read the edited metadata file
@@ -548,38 +538,51 @@ def edit_record(production):
             print(f"An error occurred during metadata editing: {e}")
     else:
         print("No metadata file found.")
-
     choice = get_user_input("Do you want to add files? (y/n): ").lower()
     if choice == "y":
         if production:
             API_URL_TEMPLATE = "https://data.caltech.edu/api/records/{record_id}/files"
-            API_URL_TEMPLATE_DRAFT = (
-                "https://data.caltech.edu/api/records/{record_id}/draft/files"
-            )
+            API_URL_TEMPLATE_DRAFT = "https://data.caltech.edu/api/records/{record_id}/draft/files"
         else:
-            API_URL_TEMPLATE = (
-                "https://data.caltechlibrary.dev/api/records/{record_id}/files"
-            )
-            API_URL_TEMPLATE_DRAFT = (
-                "https://data.caltechlibrary.dev/api/records/{record_id}/draft/files"
-            )
-
+            API_URL_TEMPLATE = "https://data.caltechlibrary.dev/api/records/{record_id}/files"
+            API_URL_TEMPLATE_DRAFT = "https://data.caltechlibrary.dev/api/records/{record_id}/draft/files"
+        
         url = API_URL_TEMPLATE.format(record_id=record_id)
         url_draft = API_URL_TEMPLATE_DRAFT.format(record_id=record_id)
+        
+        headers = {
+        "accept": "application/json",
+        }
 
-        response = requests.get(url)
-        response_draft = requests.get(url_draft)
+        if token:
+            headers["Authorization"] = "Bearer %s" % token
 
-        filepath, file_link = upload_supporting_file(record_id)
-        print(file_link)
+        response = requests.get(url, headers=headers)
+        response_draft = requests.get(url_draft, headers=headers)
+        
+        #print(production, response, response_draft)
+        #print(response.status_code, response_draft.status_code)
 
-        if response.status_code == 404 and response_draft.status_code == 404:
+        data = response.json()
+        data_draft = response_draft.json()
+
+        #print(data_draft)
+        # Check if 'entries' exists and its length
+        if len(data.get('entries', [])) == 0 and len(data_draft.get('entries', [])) == 0:
             keepfile = False
         else:
-            keepfile = (
-                input("Do you want to keep existing files? (y/n): ").lower() == "y"
-            )
-
+            keepfile = input("Do you want to keep existing files? (y/n): ").lower() == "y"
+   
+        # if response.status_code == 404 and response_draft.status_code == 404:
+        #     keepfile = False
+        # else:
+            
+        #     keepfile = input("Do you want to keep existing files? (y/n): ").lower() == "y"
+        
+        filepath, file_link = upload_supporting_file(record_id)
+        if file_link:
+            print(file_link)
+        
         if filepath != "":
             response = caltechdata_edit(
                 record_id,
@@ -599,9 +602,12 @@ def edit_record(production):
                 publish=False,
                 keepfile=keepfile,
             )
-
+        
         rec_id = response
         print_upload_message(rec_id, production)
+        
+        
+
 
 
 def download_file_by_id(record_id, token=None):
@@ -616,15 +622,26 @@ def download_file_by_id(record_id, token=None):
 
     try:
         response = requests.get(url, headers=headers)
-
         if response.status_code != 200:
             # Might have a draft
             response = requests.get(
                 url + "/draft",
                 headers=headers,
             )
-            if response.status_code != 200:
-                raise Exception(f"Record {record_id} does not exist, cannot edit")
+            if response.status_code != 200: 
+                url = f"https://data.caltechlibrary.dev/api/records/{record_id}"
+                response = requests.get(
+                url,
+                headers=headers,
+            )
+                if response.status_code != 200:
+                    # Might have a draft
+                    response = requests.get(
+                        url + "/draft",
+                        headers=headers,
+                    )
+                    if response.status_code != 200:
+                        raise Exception(f"Record {record_id} does not exist, cannot edit")
         file_content = response.content
         file_name = f"downloaded_data_{record_id}.json"
         with open(file_name, "wb") as file:

From bf5ca3c98b93247e497637c0cd1b665995d7262d Mon Sep 17 00:00:00 2001
From: RohanBhattaraiNP <152933030+RohanBhattaraiNP@users.noreply.github.com>
Date: Thu, 26 Sep 2024 02:14:25 +0545
Subject: [PATCH 5/9] Update customize_schema.py

---
 caltechdata_api/customize_schema.py | 126 +++++++++++++++++++++++++++-
 1 file changed, 125 insertions(+), 1 deletion(-)

diff --git a/caltechdata_api/customize_schema.py b/caltechdata_api/customize_schema.py
index c379e58..1e7e18c 100644
--- a/caltechdata_api/customize_schema.py
+++ b/caltechdata_api/customize_schema.py
@@ -134,8 +134,9 @@ def rdm_creators_contributors(person_list, peopleroles):
 
 def customize_schema_rdm(json_record):
     # Get vocabularies used in InvenioRDM
+    
     vocabularies = get_vocabularies()
-
+    validate_metadata(json_record)
     peopleroles = vocabularies["crr"]
     resourcetypes = vocabularies["rsrct"]
     descriptiontypes = vocabularies["dty"]
@@ -385,6 +386,129 @@ def customize_schema_rdm(json_record):
 
     return final
 
+def validate_metadata(json_record):
+    """
+    Validates the presence and structure of required fields in a CaltechDATA JSON record.
+    Raises an exception if any required field is missing or structured incorrectly.
+    """
+    errors = []
+
+    # Check for 'types' and 'resourceTypeGeneral'
+    if 'types' not in json_record:
+        errors.append("'types' field is missing.")
+    elif not isinstance(json_record['types'], dict):
+        errors.append("'types' field should be a dictionary.")
+    elif 'resourceTypeGeneral' not in json_record['types']:
+        errors.append("'resourceTypeGeneral' field is missing in 'types'.")
+
+    # Check for 'title'
+    if 'titles' not in json_record:
+        errors.append("'titles' field is missing.")
+    elif not isinstance(json_record['titles'], list) or len(json_record['titles']) == 0:
+        errors.append("'titles' should be a non-empty list.")
+    else:
+        # Ensure each title is a dictionary with 'title' field
+        for title in json_record['titles']:
+            if not isinstance(title, dict) or 'title' not in title:
+                errors.append("Each entry in 'titles' must be a dictionary with a 'title' key.")
+
+    # Check for 'publication_date'
+    if 'publicationYear' not in json_record and 'dates' not in json_record:
+        errors.append("A publication date is required ('publicationYear' or 'dates' field is missing).")
+    if 'dates' in json_record:
+        if not isinstance(json_record['dates'], list):
+            errors.append("'dates' should be a list.")
+        else:
+            for date_entry in json_record['dates']:
+                if not isinstance(date_entry, dict) or 'dateType' not in date_entry or 'date' not in date_entry:
+                    errors.append("Each entry in 'dates' must be a dictionary with 'dateType' and 'date' keys.")
+
+    # Check for 'creators'
+    if 'creators' not in json_record:
+        errors.append("'creators' field is missing.")
+    elif not isinstance(json_record['creators'], list) or len(json_record['creators']) == 0:
+        errors.append("'creators' should be a non-empty list.")
+    else:
+        for creator in json_record['creators']:
+            if not isinstance(creator, dict) or 'name' not in creator:
+                errors.append("Each creator in 'creators' must be a dictionary with a 'name' key.")
+
+    # Check for 'contributors'
+    if 'contributors' in json_record:
+        if not isinstance(json_record['contributors'], list):
+            errors.append("'contributors' should be a list.")
+        else:
+            for contributor in json_record['contributors']:
+                if not isinstance(contributor, dict) or 'name' not in contributor:
+                    errors.append("Each contributor must be a dictionary with a 'name' key.")
+
+    # Check for 'resourceType'
+    if 'resourceType' not in json_record['types']:
+        errors.append("'resourceType' field is missing in 'types'.")
+    elif not isinstance(json_record['types']['resourceType'], str):
+        errors.append("'resourceType' should be a string.")
+
+    # Check for 'identifiers'
+    if 'identifiers' in json_record:
+        if not isinstance(json_record['identifiers'], list):
+            errors.append("'identifiers' should be a list.")
+        else:
+            for identifier in json_record['identifiers']:
+                if not isinstance(identifier, dict) or 'identifier' not in identifier or 'identifierType' not in identifier:
+                    errors.append("Each identifier must be a dictionary with 'identifier' and 'identifierType' keys.")
+
+    # Check for 'subjects'
+    if 'subjects' in json_record:
+        if not isinstance(json_record['subjects'], list):
+            errors.append("'subjects' should be a list.")
+        else:
+            for subject in json_record['subjects']:
+                if not isinstance(subject, dict) or 'subject' not in subject:
+                    errors.append("Each subject must be a dictionary with a 'subject' key.")
+
+    # Check for 'relatedIdentifiers'
+    if 'relatedIdentifiers' in json_record:
+        if not isinstance(json_record['relatedIdentifiers'], list):
+            errors.append("'relatedIdentifiers' should be a list.")
+        else:
+            for related_id in json_record['relatedIdentifiers']:
+                if not isinstance(related_id, dict) or 'relatedIdentifier' not in related_id:
+                    errors.append("Each relatedIdentifier must be a dictionary with a 'relatedIdentifier' key.")
+
+    # Check for 'rightsList'
+    if 'rightsList' in json_record:
+        if not isinstance(json_record['rightsList'], list):
+            errors.append("'rightsList' should be a list.")
+        else:
+            for rights in json_record['rightsList']:
+                if not isinstance(rights, dict) or 'rights' not in rights:
+                    errors.append("Each entry in 'rightsList' must be a dictionary with a 'rights' key.")
+
+    # Check for 'geoLocations'
+    if 'geoLocations' in json_record:
+        if not isinstance(json_record['geoLocations'], list):
+            errors.append("'geoLocations' should be a list.")
+        else:
+            for location in json_record['geoLocations']:
+                if not isinstance(location, dict):
+                    errors.append("Each entry in 'geoLocations' must be a dictionary.")
+                elif 'geoLocationPoint' not in location and 'geoLocationBox' not in location and 'geoLocationPlace' not in location:
+                    errors.append("Each geoLocation entry must contain at least one of 'geoLocationPoint', 'geoLocationBox', or 'geoLocationPlace'.")
+
+    # Check for 'fundingReferences'
+    if 'fundingReferences' in json_record:
+        if not isinstance(json_record['fundingReferences'], list):
+            errors.append("'fundingReferences' should be a list.")
+        else:
+            for funding in json_record['fundingReferences']:
+                if not isinstance(funding, dict):
+                    errors.append("Each funding reference must be a dictionary.")
+                if 'funderName' not in funding:
+                    errors.append("Each funding reference must contain 'funderName'.")
+
+    # Return errors if any are found
+    if errors:
+        raise ValueError(f"Validation errors in metadata: {', '.join(errors)}")
 
 if __name__ == "__main__":
     # Read in from file for demo purposes

From f46700a919708cff55dad0491daa6a5b57d438e7 Mon Sep 17 00:00:00 2001
From: RohanBhattaraiNP <152933030+RohanBhattaraiNP@users.noreply.github.com>
Date: Thu, 26 Sep 2024 22:52:02 +0545
Subject: [PATCH 6/9] Update customize_schema.py

---
 caltechdata_api/customize_schema.py | 154 ++++++++++++++++++----------
 1 file changed, 97 insertions(+), 57 deletions(-)

diff --git a/caltechdata_api/customize_schema.py b/caltechdata_api/customize_schema.py
index 1e7e18c..b3ff9ab 100644
--- a/caltechdata_api/customize_schema.py
+++ b/caltechdata_api/customize_schema.py
@@ -134,7 +134,7 @@ def rdm_creators_contributors(person_list, peopleroles):
 
 def customize_schema_rdm(json_record):
     # Get vocabularies used in InvenioRDM
-    
+
     vocabularies = get_vocabularies()
     validate_metadata(json_record)
     peopleroles = vocabularies["crr"]
@@ -386,6 +386,7 @@ def customize_schema_rdm(json_record):
 
     return final
 
+
 def validate_metadata(json_record):
     """
     Validates the presence and structure of required fields in a CaltechDATA JSON record.
@@ -394,122 +395,161 @@ def validate_metadata(json_record):
     errors = []
 
     # Check for 'types' and 'resourceTypeGeneral'
-    if 'types' not in json_record:
+    if "types" not in json_record:
         errors.append("'types' field is missing.")
-    elif not isinstance(json_record['types'], dict):
+    elif not isinstance(json_record["types"], dict):
         errors.append("'types' field should be a dictionary.")
-    elif 'resourceTypeGeneral' not in json_record['types']:
+    elif "resourceTypeGeneral" not in json_record["types"]:
         errors.append("'resourceTypeGeneral' field is missing in 'types'.")
 
     # Check for 'title'
-    if 'titles' not in json_record:
+    if "titles" not in json_record:
         errors.append("'titles' field is missing.")
-    elif not isinstance(json_record['titles'], list) or len(json_record['titles']) == 0:
+    elif not isinstance(json_record["titles"], list) or len(json_record["titles"]) == 0:
         errors.append("'titles' should be a non-empty list.")
     else:
         # Ensure each title is a dictionary with 'title' field
-        for title in json_record['titles']:
-            if not isinstance(title, dict) or 'title' not in title:
-                errors.append("Each entry in 'titles' must be a dictionary with a 'title' key.")
+        for title in json_record["titles"]:
+            if not isinstance(title, dict) or "title" not in title:
+                errors.append(
+                    "Each entry in 'titles' must be a dictionary with a 'title' key."
+                )
 
     # Check for 'publication_date'
-    if 'publicationYear' not in json_record and 'dates' not in json_record:
-        errors.append("A publication date is required ('publicationYear' or 'dates' field is missing).")
-    if 'dates' in json_record:
-        if not isinstance(json_record['dates'], list):
+    if "publicationYear" not in json_record and "dates" not in json_record:
+        errors.append(
+            "A publication date is required ('publicationYear' or 'dates' field is missing)."
+        )
+    if "dates" in json_record:
+        if not isinstance(json_record["dates"], list):
             errors.append("'dates' should be a list.")
         else:
-            for date_entry in json_record['dates']:
-                if not isinstance(date_entry, dict) or 'dateType' not in date_entry or 'date' not in date_entry:
-                    errors.append("Each entry in 'dates' must be a dictionary with 'dateType' and 'date' keys.")
+            for date_entry in json_record["dates"]:
+                if (
+                    not isinstance(date_entry, dict)
+                    or "dateType" not in date_entry
+                    or "date" not in date_entry
+                ):
+                    errors.append(
+                        "Each entry in 'dates' must be a dictionary with 'dateType' and 'date' keys."
+                    )
 
     # Check for 'creators'
-    if 'creators' not in json_record:
+    if "creators" not in json_record:
         errors.append("'creators' field is missing.")
-    elif not isinstance(json_record['creators'], list) or len(json_record['creators']) == 0:
+    elif (
+        not isinstance(json_record["creators"], list)
+        or len(json_record["creators"]) == 0
+    ):
         errors.append("'creators' should be a non-empty list.")
     else:
-        for creator in json_record['creators']:
-            if not isinstance(creator, dict) or 'name' not in creator:
-                errors.append("Each creator in 'creators' must be a dictionary with a 'name' key.")
+        for creator in json_record["creators"]:
+            if not isinstance(creator, dict) or "name" not in creator:
+                errors.append(
+                    "Each creator in 'creators' must be a dictionary with a 'name' key."
+                )
 
     # Check for 'contributors'
-    if 'contributors' in json_record:
-        if not isinstance(json_record['contributors'], list):
+    if "contributors" in json_record:
+        if not isinstance(json_record["contributors"], list):
             errors.append("'contributors' should be a list.")
         else:
-            for contributor in json_record['contributors']:
-                if not isinstance(contributor, dict) or 'name' not in contributor:
-                    errors.append("Each contributor must be a dictionary with a 'name' key.")
+            for contributor in json_record["contributors"]:
+                if not isinstance(contributor, dict) or "name" not in contributor:
+                    errors.append(
+                        "Each contributor must be a dictionary with a 'name' key."
+                    )
 
     # Check for 'resourceType'
-    if 'resourceType' not in json_record['types']:
+    if not isinstance(json_record.get("types"), dict) or "resourceType" not in json_record["types"]:
         errors.append("'resourceType' field is missing in 'types'.")
-    elif not isinstance(json_record['types']['resourceType'], str):
+    elif not isinstance(json_record["types"]["resourceType"], str):
         errors.append("'resourceType' should be a string.")
 
     # Check for 'identifiers'
-    if 'identifiers' in json_record:
-        if not isinstance(json_record['identifiers'], list):
+    if "identifiers" in json_record:
+        if not isinstance(json_record["identifiers"], list):
             errors.append("'identifiers' should be a list.")
         else:
-            for identifier in json_record['identifiers']:
-                if not isinstance(identifier, dict) or 'identifier' not in identifier or 'identifierType' not in identifier:
-                    errors.append("Each identifier must be a dictionary with 'identifier' and 'identifierType' keys.")
+            for identifier in json_record["identifiers"]:
+                if (
+                    not isinstance(identifier, dict)
+                    or "identifier" not in identifier
+                    or "identifierType" not in identifier
+                ):
+                    errors.append(
+                        "Each identifier must be a dictionary with 'identifier' and 'identifierType' keys."
+                    )
 
     # Check for 'subjects'
-    if 'subjects' in json_record:
-        if not isinstance(json_record['subjects'], list):
+    if "subjects" in json_record:
+        if not isinstance(json_record["subjects"], list):
             errors.append("'subjects' should be a list.")
         else:
-            for subject in json_record['subjects']:
-                if not isinstance(subject, dict) or 'subject' not in subject:
-                    errors.append("Each subject must be a dictionary with a 'subject' key.")
+            for subject in json_record["subjects"]:
+                if not isinstance(subject, dict) or "subject" not in subject:
+                    errors.append(
+                        "Each subject must be a dictionary with a 'subject' key."
+                    )
 
     # Check for 'relatedIdentifiers'
-    if 'relatedIdentifiers' in json_record:
-        if not isinstance(json_record['relatedIdentifiers'], list):
+    if "relatedIdentifiers" in json_record:
+        if not isinstance(json_record["relatedIdentifiers"], list):
             errors.append("'relatedIdentifiers' should be a list.")
         else:
-            for related_id in json_record['relatedIdentifiers']:
-                if not isinstance(related_id, dict) or 'relatedIdentifier' not in related_id:
-                    errors.append("Each relatedIdentifier must be a dictionary with a 'relatedIdentifier' key.")
+            for related_id in json_record["relatedIdentifiers"]:
+                if (
+                    not isinstance(related_id, dict)
+                    or "relatedIdentifier" not in related_id
+                ):
+                    errors.append(
+                        "Each relatedIdentifier must be a dictionary with a 'relatedIdentifier' key."
+                    )
 
     # Check for 'rightsList'
-    if 'rightsList' in json_record:
-        if not isinstance(json_record['rightsList'], list):
+    if "rightsList" in json_record:
+        if not isinstance(json_record["rightsList"], list):
             errors.append("'rightsList' should be a list.")
         else:
-            for rights in json_record['rightsList']:
-                if not isinstance(rights, dict) or 'rights' not in rights:
-                    errors.append("Each entry in 'rightsList' must be a dictionary with a 'rights' key.")
+            for rights in json_record["rightsList"]:
+                if not isinstance(rights, dict) or "rights" not in rights:
+                    errors.append(
+                        "Each entry in 'rightsList' must be a dictionary with a 'rights' key."
+                    )
 
     # Check for 'geoLocations'
-    if 'geoLocations' in json_record:
-        if not isinstance(json_record['geoLocations'], list):
+    if "geoLocations" in json_record:
+        if not isinstance(json_record["geoLocations"], list):
             errors.append("'geoLocations' should be a list.")
         else:
-            for location in json_record['geoLocations']:
+            for location in json_record["geoLocations"]:
                 if not isinstance(location, dict):
                     errors.append("Each entry in 'geoLocations' must be a dictionary.")
-                elif 'geoLocationPoint' not in location and 'geoLocationBox' not in location and 'geoLocationPlace' not in location:
-                    errors.append("Each geoLocation entry must contain at least one of 'geoLocationPoint', 'geoLocationBox', or 'geoLocationPlace'.")
+                elif (
+                    "geoLocationPoint" not in location
+                    and "geoLocationBox" not in location
+                    and "geoLocationPlace" not in location
+                ):
+                    errors.append(
+                        "Each geoLocation entry must contain at least one of 'geoLocationPoint', 'geoLocationBox', or 'geoLocationPlace'."
+                    )
 
     # Check for 'fundingReferences'
-    if 'fundingReferences' in json_record:
-        if not isinstance(json_record['fundingReferences'], list):
+    if "fundingReferences" in json_record:
+        if not isinstance(json_record["fundingReferences"], list):
             errors.append("'fundingReferences' should be a list.")
         else:
-            for funding in json_record['fundingReferences']:
+            for funding in json_record["fundingReferences"]:
                 if not isinstance(funding, dict):
                     errors.append("Each funding reference must be a dictionary.")
-                if 'funderName' not in funding:
+                elif "funderName" not in funding:  # skip non-dict entries
                     errors.append("Each funding reference must contain 'funderName'.")
 
     # Return errors if any are found
     if errors:
         raise ValueError(f"Validation errors in metadata: {', '.join(errors)}")
 
+
 if __name__ == "__main__":
     # Read in from file for demo purposes
 

From d62c2781f35f5ad5d54366560b19b33ac168045f Mon Sep 17 00:00:00 2001
From: RohanBhattaraiNP <152933030+RohanBhattaraiNP@users.noreply.github.com>
Date: Thu, 26 Sep 2024 22:52:40 +0545
Subject: [PATCH 7/9] Reformat cli.py and fix keepfiles keyword in edit_record

---
 caltechdata_api/cli.py | 92 +++++++++++++++++++++++++-----------------
 1 file changed, 55 insertions(+), 37 deletions(-)

diff --git a/caltechdata_api/cli.py b/caltechdata_api/cli.py
index 2286497..cd7cb26 100644
--- a/caltechdata_api/cli.py
+++ b/caltechdata_api/cli.py
@@ -89,7 +89,6 @@ def get_or_set_token(production=True):
                 print("Tokens do not match. Please try again.")
 
 
-
 def welcome_message():
     print("Welcome to CaltechDATA CLI")
 
@@ -383,22 +382,22 @@ def upload_data_from_file():
             except json.JSONDecodeError as e:
                 print(f"Error: Invalid JSON format in the file '{filename}'. {str(e)}")
 
+
 def parse_args():
     """Parse command-line arguments."""
     parser = argparse.ArgumentParser(description="CaltechDATA CLI tool.")
     parser.add_argument(
-        "-test", 
-        action="store_true", 
-        help="Use test mode, sets production to False"
+        "-test", action="store_true", help="Use test mode, sets production to False"
     )
     args = parser.parse_args()
     return args
 
+
 def main():
     args = parse_args()
-    
+
     production = not args.test  # Set production to False if -test flag is provided
-    
+
     choice = get_user_input(
         "Do you want to create or edit a CaltechDATA record? (create/edit): "
     ).lower()
@@ -412,7 +411,7 @@ def main():
 
 def create_record(production):
     token = get_or_set_token(production)
-    #keep_file = input("Do you want to keep your existing files? (yes/no): ").lower() == "yes"
+    # keep_file = input("Do you want to keep your existing files? (yes/no): ").lower() == "yes"
     print("Using CaltechDATA token:", token)
     while True:
         choice = get_user_input(
@@ -424,7 +423,11 @@ def create_record(production):
             if existing_data:
                 if filepath != "":
                     response = caltechdata_write(
-                        existing_data, token, filepath, production=production, publish=False
+                        existing_data,
+                        token,
+                        filepath,
+                        production=production,
+                        publish=False,
                     )
                 elif file_link != "":
                     response = caltechdata_write(
@@ -501,7 +504,6 @@ def create_record(production):
                         metadata, token, production=production, publish=False
                     )
                 rec_id = response
-                
 
                 print_upload_message(rec_id, production)
                 with open(response + ".json", "w") as file:
@@ -512,8 +514,13 @@ def create_record(production):
         else:
             print("Invalid choice. Please enter 'existing' or 'create'.")
 
+
 def print_upload_message(rec_id, production):
-    base_url = "https://data.caltech.edu/uploads/" if production else "https://data.caltechlibrary.dev/uploads/"
+    base_url = (
+        "https://data.caltech.edu/uploads/"
+        if production
+        else "https://data.caltechlibrary.dev/uploads/"
+    )
     print(
         f"""You can view and publish this record at
         {base_url}{rec_id}
@@ -521,11 +528,12 @@ def print_upload_message(rec_id, production):
         `s3cmd put DATA_FILE s3://ini230004-bucket01/{rec_id}/`"""
     )
 
+
 def edit_record(production):
     record_id = input("Enter the CaltechDATA record ID: ")
     token = get_or_set_token(production)
     file_name = download_file_by_id(record_id, token)
-    
+
     if file_name:
         try:
             # Read the edited metadata file
@@ -546,16 +554,22 @@ def edit_record(production):
     if choice == "y":
         if production:
             API_URL_TEMPLATE = "https://data.caltech.edu/api/records/{record_id}/files"
-            API_URL_TEMPLATE_DRAFT = "https://data.caltech.edu/api/records/{record_id}/draft/files"
+            API_URL_TEMPLATE_DRAFT = (
+                "https://data.caltech.edu/api/records/{record_id}/draft/files"
+            )
         else:
-            API_URL_TEMPLATE = "https://data.caltechlibrary.dev/api/records/{record_id}/files"
-            API_URL_TEMPLATE_DRAFT = "https://data.caltechlibrary.dev/api/records/{record_id}/draft/files"
-        
+            API_URL_TEMPLATE = (
+                "https://data.caltechlibrary.dev/api/records/{record_id}/files"
+            )
+            API_URL_TEMPLATE_DRAFT = (
+                "https://data.caltechlibrary.dev/api/records/{record_id}/draft/files"
+            )
+
         url = API_URL_TEMPLATE.format(record_id=record_id)
         url_draft = API_URL_TEMPLATE_DRAFT.format(record_id=record_id)
-        
+
         headers = {
-        "accept": "application/json",
+            "accept": "application/json",
         }
 
         if token:
@@ -563,30 +577,35 @@ def edit_record(production):
 
         response = requests.get(url, headers=headers)
         response_draft = requests.get(url_draft, headers=headers)
-        
-        #print(production, response, response_draft)
-        #print(response.status_code, response_draft.status_code)
+
+        # print(production, response, response_draft)
+        # print(response.status_code, response_draft.status_code)
 
         data = response.json()
         data_draft = response_draft.json()
 
-        #print(data_draft)
+        # print(data_draft)
         # Check if 'entries' exists and its length
-        if len(data.get('entries', [])) == 0 and len(data_draft.get('entries', [])) == 0:
+        if (
+            len(data.get("entries", [])) == 0
+            and len(data_draft.get("entries", [])) == 0
+        ):
             keepfile = False
         else:
-            keepfile = input("Do you want to keep existing files? (y/n): ").lower() == "y"
-   
+            keepfile = (
+                input("Do you want to keep existing files? (y/n): ").lower() == "y"
+            )
+
         # if response.status_code == 404 and response_draft.status_code == 404:
         #     keepfile = False
         # else:
-            
+
         #     keepfile = input("Do you want to keep existing files? (y/n): ").lower() == "y"
-        
+
         filepath, file_link = upload_supporting_file(record_id)
         if file_link:
             print(file_link)
-        
+
         if filepath != "":
             response = caltechdata_edit(
                 record_id,
@@ -604,14 +623,11 @@ def edit_record(production):
                 file_links=file_link,
                 production=production,
                 publish=False,
-                keepfile=keepfile,
+                keepfiles=keepfile,
             )
-        
+
         rec_id = response
         print_upload_message(rec_id, production)
-        
-        
-
 
 
 def download_file_by_id(record_id, token=None):
@@ -632,12 +648,12 @@ def download_file_by_id(record_id, token=None):
                 url + "/draft",
                 headers=headers,
             )
-            if response.status_code != 200: 
+            if response.status_code != 200:
                 url = f"https://data.caltechlibrary.dev/api/records/{record_id}"
                 response = requests.get(
-                url,
-                headers=headers,
-            )
+                    url,
+                    headers=headers,
+                )
                 if response.status_code != 200:
                     # Might have a draft
                     response = requests.get(
@@ -645,7 +661,9 @@ def download_file_by_id(record_id, token=None):
                         headers=headers,
                     )
                     if response.status_code != 200:
-                        raise Exception(f"Record {record_id} does not exist, cannot edit")
+                        raise Exception(
+                            f"Record {record_id} does not exist, cannot edit"
+                        )
         file_content = response.content
         file_name = f"downloaded_data_{record_id}.json"
         with open(file_name, "wb") as file:

From a57a075f922e6e660e5c3dc5ff0c1117bbe7a74f Mon Sep 17 00:00:00 2001
From: RohanBhattaraiNP <152933030+RohanBhattaraiNP@users.noreply.github.com>
Date: Thu, 26 Sep 2024 22:53:15 +0545
Subject: [PATCH 8/9] Remove extra blank line in caltechdata_write.py

---
 caltechdata_api/caltechdata_write.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/caltechdata_api/caltechdata_write.py b/caltechdata_api/caltechdata_write.py
index 1b25f80..e0cb0dd 100644
--- a/caltechdata_api/caltechdata_write.py
+++ b/caltechdata_api/caltechdata_write.py
@@ -63,7 +63,6 @@ def write_files_rdm(files, file_link, headers, f_headers, s3=None, keepfiles=Fal
                     raise Exception(result.text)
 
 
-
 def add_file_links(
     metadata, file_links, file_descriptions=[], additional_descriptions="", s3_link=None
 ):

From f0f40e3631b5c840f15d4db695e033eee8b4457a Mon Sep 17 00:00:00 2001
From: RohanBhattaraiNP <152933030+RohanBhattaraiNP@users.noreply.github.com>
Date: Thu, 26 Sep 2024 22:56:28 +0545
Subject: [PATCH 9/9] Remove commented-out debug code from edit_record

---
 caltechdata_api/cli.py | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/caltechdata_api/cli.py b/caltechdata_api/cli.py
index cd7cb26..3222c09 100644
--- a/caltechdata_api/cli.py
+++ b/caltechdata_api/cli.py
@@ -577,14 +577,8 @@ def edit_record(production):
 
         response = requests.get(url, headers=headers)
         response_draft = requests.get(url_draft, headers=headers)
-
-        # print(production, response, response_draft)
-        # print(response.status_code, response_draft.status_code)
-
         data = response.json()
         data_draft = response_draft.json()
-
-        # print(data_draft)
         # Check if 'entries' exists and its length
         if (
             len(data.get("entries", [])) == 0
@@ -596,12 +590,6 @@ def edit_record(production):
                 input("Do you want to keep existing files? (y/n): ").lower() == "y"
             )
 
-        # if response.status_code == 404 and response_draft.status_code == 404:
-        #     keepfile = False
-        # else:
-
-        #     keepfile = input("Do you want to keep existing files? (y/n): ").lower() == "y"
-
         filepath, file_link = upload_supporting_file(record_id)
         if file_link:
             print(file_link)