caltechlibrary · tmorrell · Dec 20, 2024 · Dec 9, 2024 · Dec 9, 2024 · Dec 13, 2024
diff --git a/.github/workflows/bot.yaml b/.github/workflows/bot.yaml
@@ -0,0 +1,57 @@
+name: Bot validation
+
+on:
+  push:
+    paths:
+      - 'caltechdata_api/cli.py'
+      - 'caltechdata_api/customize_schema.py'
+      - 'caltechdata_api/caltechdata_write.py'
+      - 'caltechdata_api/caltechdata_edit.py'
+      - 'README.md'
+  pull_request:  # NOTE: repository secrets are not passed to runs triggered from forks
+    paths:
+      - 'caltechdata_api/cli.py'
+      - 'caltechdata_api/customize_schema.py'
+      - 'caltechdata_api/caltechdata_write.py'
+      - 'caltechdata_api/caltechdata_edit.py'
+      - 'README.md'
+
+jobs:
+  validate-metadata:
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: actions/checkout@v4
+      with:
+        fetch-depth: 0  # fetch full history rather than a shallow clone
+
+    - name: Set up Python
+      uses: actions/setup-python@v5  # v5 runs on Node 20, matching checkout@v4
+      with:
+        python-version: '3.x'
+
+    - name: Check for Required Environment Variables  # fail fast with a clear message
+      env:
+        CALTECHDATA_TOKEN: ${{ secrets.CALTECHDATA_TOKEN }}
+      run: |
+        if [ -z "$CALTECHDATA_TOKEN" ]; then
+          echo "Error: CALTECHDATA_TOKEN environment variable is not set"
+          exit 1
+        fi
+
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install pytest requests s3fs cryptography
+        pip install .
+
+    - name: Run CaltechDATA Metadata Validation
+      env:
+        CALTECHDATA_TOKEN: ${{ secrets.CALTECHDATA_TOKEN }}
+      run: |
+        python tests/bot_yaml.py
+    - name: Run Unit Tests  # NOTE(review): no CALTECHDATA_TOKEN here; confirm test_rdm.py needs none
+      run: |
+        cd tests
+        pytest test_unit.py
+        pytest test_rdm.py
diff --git a/CITATION.cff b/CITATION.cff
@@ -22,4 +22,4 @@ keywords:
   - metadata
   - software
   - InvenioRDM
-date-released: 2024-12-18
+
diff --git a/caltechdata_api/__init__.py b/caltechdata_api/__init__.py
@@ -9,7 +9,8 @@
     caltechdata_unembargo,
     caltechdata_accept,
 )
-from .customize_schema import customize_schema
+from .customize_schema import customize_schema, validate_metadata
 from .get_metadata import get_metadata
 from .download_file import download_file, download_url
 from .utils import humanbytes
+from .md_to_json import parse_readme_to_json
diff --git a/caltechdata_api/cli.py b/caltechdata_api/cli.py
@@ -1,8 +1,7 @@
 import argparse
 import requests
 import s3fs
-from caltechdata_api import caltechdata_write, caltechdata_edit
-from md_to_json import parse_readme_to_json
+from caltechdata_api import caltechdata_write, caltechdata_edit, parse_readme_to_json
 import json
 import os
 from cryptography.fernet import Fernet
@@ -34,7 +33,7 @@ def generate_key():
     return Fernet.generate_key()
 
 
-# Load the key from a file or generate a new one if not present
+# Load the key from a file or generate a new one if not present.
 def load_or_generate_key():
     key_file = os.path.join(caltechdata_directory, "key.key")
     if os.path.exists(key_file):
@@ -59,11 +58,17 @@ def decrypt_token(encrypted_token, key):
     return f.decrypt(encrypted_token).decode()
 
 
-# Function to get or set token with support for test system
+# Function to get or set token with support for test systems
 def get_or_set_token(production=True):
+    # First check for environment variable
+    env_token = os.environ.get("CALTECHDATA_TOKEN")
+    if env_token:
+        print("Using token from environment variable")
+        return env_token
+
     key = load_or_generate_key()
 
-    # Use different token files for production and test environments
+    # Use separate token files for the production and test environments
     token_filename = "token.txt" if production else "token_test.txt"
     token_file = os.path.join(caltechdata_directory, token_filename)
 

diff --git a/caltechdata_api/customize_schema.py b/caltechdata_api/customize_schema.py
@@ -480,18 +480,17 @@ def validate_metadata(json_record):
                     errors.append("Each 'subject' must have a 'subject' key.")
 
     # Check for 'dates'
-    if "dates" not in json_record:
-        errors.append("'dates' field is missing.")
-    elif not isinstance(json_record["dates"], list) or len(json_record["dates"]) == 0:
-        errors.append("'dates' should be a non-empty list.")
-    else:
-        for date in json_record["dates"]:
-            if (
-                not isinstance(date, dict)
-                or "date" not in date
-                or "dateType" not in date
-            ):
-                errors.append("Each 'date' must have 'date' and 'dateType'.")
+    if "dates" in json_record:
+        if not isinstance(json_record["dates"], list) or len(json_record["dates"]) == 0:
+            errors.append("'dates' should be a non-empty list.")
+        else:
+            for date in json_record["dates"]:
+                if (
+                    not isinstance(date, dict)
+                    or "date" not in date
+                    or "dateType" not in date
+                ):
+                    errors.append("Each 'date' must have 'date' and 'dateType'.")
 
     # Check for 'creators'
     if "creators" not in json_record:
@@ -601,10 +600,9 @@ def validate_metadata(json_record):
         errors.append("'publisher' should be a string.")
 
     # Check for 'publicationYear'
-    if "publicationYear" not in json_record:
-        errors.append("'publicationYear' field is missing.")
-    elif not isinstance(json_record["publicationYear"], str):
-        errors.append("'publicationYear' should be a string.")
+    if "publicationYear" in json_record:
+        if not isinstance(json_record["publicationYear"], str):
+            errors.append("'publicationYear' should be a string.")
 
     # Check for 'types'
     if "types" not in json_record:

diff --git a/tests/10.22002-D1.1098 b/tests/10.22002-D1.1098
diff --git a/tests/bot_yaml.py b/tests/bot_yaml.py
@@ -0,0 +1,157 @@
+import subprocess
+import time
+from unittest.mock import patch
+import sys
+import os
+import json
+import requests
+from datetime import datetime
+import pytest
+import importlib.util
+import traceback
+
+
+class CaltechDataTester:
+    """Drive the caltechdata_api CLI end to end with scripted prompt answers.
+
+    Each instance creates a timestamped run directory (under GITHUB_WORKSPACE
+    when set, otherwise ./caltech_test_data) holding a log and sample data.
+    """
+
+    def __init__(self):
+        # Use GitHub Actions environment or create a local test directory
+        self.test_dir = os.environ.get(
+            "GITHUB_WORKSPACE", os.path.join(os.getcwd(), "caltech_test_data")
+        )
+        self.timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+
+        # makedirs creates self.test_dir too, so one call covers both levels
+        self.test_run_dir = os.path.join(self.test_dir, f"test_run_{self.timestamp}")
+        os.makedirs(self.test_run_dir, exist_ok=True)
+
+        # Initialize logging
+        self.log_file = os.path.join(self.test_run_dir, "test_log.txt")
+
+    def log(self, message):
+        """Print message and append it to the log (UTF-8: messages use emoji)."""
+        print(message)
+        with open(self.log_file, "a", encoding="utf-8") as f:
+            f.write(f"{datetime.now()}: {message}\n")
+
+    def create_test_files(self):
+        """Write a small sample CSV into the run directory and return its path."""
+        csv_path = os.path.join(self.test_run_dir, "test_data.csv")
+        with open(csv_path, "w") as f:
+            f.write("date,temperature,humidity\n")
+            f.write("2023-01-01,25.5,60\n")
+            f.write("2023-01-02,26.0,62\n")
+            f.write("2023-01-03,24.8,65\n")
+
+        self.log(f"Created test CSV file: {csv_path}")
+        return csv_path
+
+    def import_cli_module(self):
+        """Load caltechdata_api/cli.py by path (from GITHUB_WORKSPACE or the cwd)."""
+        cli_path = os.path.join(
+            os.environ.get("GITHUB_WORKSPACE", os.getcwd()), "caltechdata_api", "cli.py"
+        )
+        spec = importlib.util.spec_from_file_location("cli", cli_path)
+        cli_module = importlib.util.module_from_spec(spec)
+        spec.loader.exec_module(cli_module)
+        return cli_module
+
+    def generate_test_responses(self):
+        """Return a dict mapping exact CLI prompt strings to scripted answers."""
+        return {
+            "Do you want to create or edit a CaltechDATA record? (create/edit): ": "create",
+            "Do you want to use metadata from an existing file or create new metadata? (existing/create): ": "create",
+            "Enter the title of the dataset: ": f"Test Dataset {self.timestamp}",
+            "Enter the abstract or description of the dataset: ": "This is an automated test dataset containing sample climate data for validation purposes.",
+            "Enter the number corresponding to the desired license: ": "1",
+            "Enter your ORCID identifier: ": os.environ.get(
+                "TEST_ORCID", "0000-0002-1825-0097"
+            ),
+            "How many funding entries do you want to provide? ": "1",
+            "Enter the award number for funding: ": "NSF-1234567",
+            "Enter the award title for funding: ": "Automated Testing Grant",
+            "Enter the funder ROR (https://ror.org): ": "021nxhr62",
+            "Do you want to upload or link data files? (upload/link/n): ": "upload",
+            "Enter the filename to upload as a supporting file (or 'n' to finish): ": "test_data.csv",
+            "Do you want to add more files? (y/n): ": "n",
+            "Do you want to send this record to CaltechDATA? (y/n): ": "y",
+        }
+
+    def run_test_submission(self):
+        """Run the CLI once in test mode with mocked input.
+
+        Returns True if cli.main() returns without raising an Exception, False
+        otherwise. sys.stdout and sys.argv are restored on every exit path.
+        """
+        original_stdout, original_argv = sys.stdout, sys.argv
+        test_csv = None
+        try:
+            self.log("Starting test submission process...")
+
+            test_csv = self.create_test_files()
+            cli_module = self.import_cli_module()
+            responses = self.generate_test_responses()
+
+            # Tee stdout: echo to the real stream while recording every write
+            class OutputCapture:
+                def __init__(self):
+                    self.output = []
+
+                def write(self, text):
+                    self.output.append(text)
+                    sys.__stdout__.write(text)
+
+                def flush(self):
+                    pass
+
+                def get_output(self):
+                    return "".join(self.output)
+
+            output_capture = OutputCapture()
+            sys.stdout = output_capture
+
+            # Mock input and run CLI
+            def mock_input(prompt):
+                self.log(f"Prompt: {prompt}")
+                if prompt not in responses:
+                    return ""  # prompts without a scripted answer get an empty reply
+                self.log(f"Response: {responses[prompt]}")
+                return responses[prompt]
+
+            with patch("builtins.input", side_effect=mock_input):
+                # Use -test flag to use test mode
+                sys.argv = [sys.argv[0], "-test"]
+                cli_module.main()
+
+            return True
+        except Exception as e:
+            self.log(f"Error in test submission: {e}")
+            traceback.print_exc()
+            return False
+        finally:
+            # Undo global changes even when the CLI raised or called sys.exit()
+            sys.stdout, sys.argv = original_stdout, original_argv
+            if test_csv is not None and os.path.exists(test_csv):
+                os.remove(test_csv)
+            self.log("Test files cleaned up")
+
+
+def main():
+    """Run one scripted submission, log the outcome, and exit with 0 or 1."""
+    tester = CaltechDataTester()
+
+    if tester.run_test_submission():
+        tester.log("\n🎉 Test submission completed successfully!")
+        exit_code = 0
+    else:
+        tester.log("\n❌ Test submission failed - check logs for details")
+        exit_code = 1
+    sys.exit(exit_code)
+
+
+if __name__ == "__main__":
+    main()