From 2798e282e4b6344e73e41149ad6a0f0335fad24f Mon Sep 17 00:00:00 2001 From: Shujing Yang Date: Mon, 28 Jul 2025 15:58:18 -0700 Subject: [PATCH 1/9] init --- README.md | 1 + docs/datasources/salesforce.md | 6 + docs/index.md | 1 + mkdocs.yml | 1 + pyproject.toml | 4 +- pyspark_datasources/__init__.py | 1 + pyspark_datasources/salesforce.py | 294 ++++++++++++++++++++++++++++++ tests/test_data_sources.py | 102 +++++++++++ 8 files changed, 409 insertions(+), 1 deletion(-) create mode 100644 docs/datasources/salesforce.md create mode 100644 pyspark_datasources/salesforce.py diff --git a/README.md b/README.md index 27c53cf..08d1141 100644 --- a/README.md +++ b/README.md @@ -46,6 +46,7 @@ spark.readStream.format("fake").load().writeStream.format("console").start() | [KaggleDataSource](pyspark_datasources/kaggle.py) | `kaggle` | Read datasets from Kaggle | `kagglehub`, `pandas` | | [SimpleJsonDataSource](pyspark_datasources/simplejson.py) | `simplejson` | Write JSON data to Databricks DBFS | `databricks-sdk` | | [OpenSkyDataSource](pyspark_datasources/opensky.py) | `opensky` | Read from OpenSky Network. | None | +| [SalesforceDataSource](pyspark_datasources/salesforce.py) | `salesforce` | Write streaming data to Salesforce objects | `simple-salesforce` | See more here: https://allisonwang-db.github.io/pyspark-data-sources/. diff --git a/docs/datasources/salesforce.md b/docs/datasources/salesforce.md new file mode 100644 index 0000000..688b61c --- /dev/null +++ b/docs/datasources/salesforce.md @@ -0,0 +1,6 @@ +# SalesforceDataSource + +> Requires the [`simple-salesforce`](https://github.com/simple-salesforce/simple-salesforce) library. You can install it manually: `pip install simple-salesforce` +> or use `pip install pyspark-data-sources[salesforce]`. + +::: pyspark_datasources.salesforce.SalesforceDataSource \ No newline at end of file diff --git a/docs/index.md b/docs/index.md index d470b0f..d78a743 100644 --- a/docs/index.md +++ b/docs/index.md @@ -38,5 +38,6 @@ spark.readStream.format("fake").load().writeStream.format("console").start() | [HuggingFaceDatasets](./datasources/huggingface.md) | `huggingface` | Read datasets from the HuggingFace Hub | `datasets` | | [StockDataSource](./datasources/stock.md) | `stock` | Read stock data from Alpha Vantage | None | | [SimpleJsonDataSource](./datasources/simplejson.md) | `simplejson` | Write JSON data to Databricks DBFS | `databricks-sdk` | +| [SalesforceDataSource](./datasources/salesforce.md) | `salesforce` | Write streaming data to Salesforce objects | None | | [GoogleSheetsDataSource](./datasources/googlesheets.md) | `googlesheets` | Read table from public Google Sheets document | None | | [KaggleDataSource](./datasources/kaggle.md) | `kaggle` | Read datasets from Kaggle | `kagglehub`, `pandas` | diff --git a/mkdocs.yml b/mkdocs.yml index e66b9d4..8fde0b5 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -23,6 +23,7 @@ nav: - datasources/huggingface.md - datasources/stock.md - datasources/simplejson.md + - datasources/salesforce.md - datasources/googlesheets.md - datasources/kaggle.md diff --git a/pyproject.toml b/pyproject.toml index c736d03..4ec0815 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,6 +18,7 @@ mkdocstrings = {extras = ["python"], version = "^0.28.0"} datasets = {version = "^2.17.0", optional = true} databricks-sdk = {version = "^0.28.0", optional = true} kagglehub = {extras = ["pandas-datasets"], version = "^0.3.10", optional = true} +simple-salesforce = {version = "^1.12.0", optional = true} [tool.poetry.extras] faker = ["faker"] @@ -25,7 +26,8 @@ datasets = ["datasets"] databricks = ["databricks-sdk"] kaggle = ["kagglehub"] lance = ["pylance"] -all = ["faker", "datasets", "databricks-sdk", "kagglehub"] +salesforce = ["simple-salesforce"] +all = ["faker", "datasets", "databricks-sdk", "kagglehub", "simple-salesforce"] [tool.poetry.group.dev.dependencies] pytest = "^8.0.0" diff --git a/pyspark_datasources/__init__.py b/pyspark_datasources/__init__.py index 1540dda..680dcda 100644 --- a/pyspark_datasources/__init__.py +++ b/pyspark_datasources/__init__.py @@ -4,5 +4,6 @@ from .huggingface import HuggingFaceDatasets from .kaggle import KaggleDataSource from .opensky import OpenSkyDataSource +from .salesforce import SalesforceDataSource from .simplejson import SimpleJsonDataSource from .stock import StockDataSource diff --git a/pyspark_datasources/salesforce.py b/pyspark_datasources/salesforce.py new file mode 100644 index 0000000..0ed76d0 --- /dev/null +++ b/pyspark_datasources/salesforce.py @@ -0,0 +1,294 @@ +import logging +from dataclasses import dataclass +from typing import Dict, List, Any + +from pyspark.sql.types import StructType +from pyspark.sql.datasource import DataSource, DataSourceStreamWriter, WriterCommitMessage + +logger = logging.getLogger(__name__) + + +@dataclass +class SalesforceCommitMessage(WriterCommitMessage): + """Commit message for Salesforce write operations.""" + records_written: int + batch_id: int + + +class SalesforceDataSource(DataSource): + """ + A Salesforce streaming data source for PySpark to write data to Salesforce objects. + + This data source enables writing streaming data from Spark to Salesforce using the + Salesforce REST API. It supports common Salesforce objects like Account, Contact, + Opportunity, and custom objects. + + Name: `salesforce` + + Notes + ----- + - Requires the `simple-salesforce` library for Salesforce API integration + - Only supports streaming write operations (not read operations) + - Uses Salesforce username/password/security token authentication + - Supports streaming processing for efficient API usage + + Parameters + ---------- + username : str + Salesforce username (email address) + password : str + Salesforce password + security_token : str + Salesforce security token (obtained from Salesforce setup) + salesforce_object : str, optional + Target Salesforce object name (default: "Account") + batch_size : str, optional + Number of records to process per batch (default: "200") + instance_url : str, optional + Custom Salesforce instance URL (auto-detected if not provided) + + Examples + -------- + Register the data source: + + >>> from pyspark_datasources import SalesforceDataSource + >>> spark.dataSource.register(SalesforceDataSource) + + Write streaming data to Salesforce Accounts: + + >>> from pyspark.sql import SparkSession + >>> from pyspark.sql.functions import col, lit + >>> + >>> spark = SparkSession.builder.appName("SalesforceExample").getOrCreate() + >>> spark.dataSource.register(SalesforceDataSource) + >>> + >>> # Create sample streaming data + >>> streaming_df = spark.readStream.format("rate").load() + >>> account_data = streaming_df.select( + ... col("value").cast("string").alias("Name"), + ... lit("Technology").alias("Industry"), + ... (col("value") * 100000).cast("double").alias("AnnualRevenue") + ... ) + >>> + >>> # Write to Salesforce + >>> query = account_data.writeStream \\ + ... .format("salesforce") \\ + ... .option("username", "your-username@company.com") \\ + ... .option("password", "your-password") \\ + ... .option("security_token", "your-security-token") \\ + ... .option("salesforce_object", "Account") \\ + ... .option("batch_size", "100") \\ + ... .start() + + Write to Salesforce Contacts: + + >>> contact_data = streaming_df.select( + ... col("value").cast("string").alias("FirstName"), + ... lit("Doe").alias("LastName"), + ... lit("contact@example.com").alias("Email") + ... ) + >>> + >>> query = contact_data.writeStream \\ + ... .format("salesforce") \\ + ... .option("username", "your-username@company.com") \\ + ... .option("password", "your-password") \\ + ... .option("security_token", "your-security-token") \\ + ... .option("salesforce_object", "Contact") \\ + ... .start() + + Write to custom Salesforce objects: + + >>> custom_data = streaming_df.select( + ... col("value").cast("string").alias("Custom_Field__c"), + ... lit("Custom Value").alias("Another_Field__c") + ... ) + >>> + >>> query = custom_data.writeStream \\ + ... .format("salesforce") \\ + ... .option("username", "your-username@company.com") \\ + ... .option("password", "your-password") \\ + ... .option("security_token", "your-security-token") \\ + ... .option("salesforce_object", "Custom_Object__c") \\ + ... .start() + """ + + @classmethod + def name(cls) -> str: + """Return the short name for this data source.""" + return "salesforce" + + def schema(self) -> str: + """ + Define the default schema for Salesforce Account objects. + + This schema can be overridden by users when creating their DataFrame. + """ + return """ + Name STRING NOT NULL, + Industry STRING, + Phone STRING, + Website STRING, + AnnualRevenue DOUBLE, + NumberOfEmployees INT, + BillingStreet STRING, + BillingCity STRING, + BillingState STRING, + BillingPostalCode STRING, + BillingCountry STRING + """ + + def streamWriter(self, schema: StructType, overwrite: bool) -> "SalesforceStreamWriter": + """Create a stream writer for Salesforce integration.""" + return SalesforceStreamWriter(schema, self.options) + + +class SalesforceStreamWriter(DataSourceStreamWriter): + """Stream writer implementation for Salesforce integration.""" + + def __init__(self, schema: StructType, options: Dict[str, str]): + self.schema = schema + self.options = options + + # Extract Salesforce configuration + self.username = options.get("username") + self.password = options.get("password") + self.security_token = options.get("security_token") + self.instance_url = options.get("instance_url") + self.salesforce_object = options.get("salesforce_object", "Account") + self.batch_size = int(options.get("batch_size", "200")) + + # Validate required options + if not all([self.username, self.password, self.security_token]): + raise ValueError( + "Salesforce username, password, and security_token are required. " + "Set them using .option() method in your streaming query." + ) + + logger.info(f"Initializing Salesforce writer for object '{self.salesforce_object}'") + + def write(self, iterator) -> SalesforceCommitMessage: + """Write data to Salesforce.""" + # Import here to avoid serialization issues + try: + from simple_salesforce import Salesforce + except ImportError: + raise ImportError( + "simple-salesforce library is required for Salesforce integration. " + "Install it with: pip install simple-salesforce" + ) + + from pyspark import TaskContext + + # Get task context for batch identification + context = TaskContext.get() + batch_id = context.taskAttemptId() + + # Connect to Salesforce + try: + sf_kwargs = { + 'username': self.username, + 'password': self.password, + 'security_token': self.security_token + } + if self.instance_url: + sf_kwargs['instance_url'] = self.instance_url + + sf = Salesforce(**sf_kwargs) + logger.info(f"✓ Connected to Salesforce (batch {batch_id})") + except Exception as e: + logger.error(f"Failed to connect to Salesforce: {str(e)}") + raise ConnectionError(f"Salesforce connection failed: {str(e)}") + + # Convert rows to Salesforce records + records = [] + for row in iterator: + try: + record = self._convert_row_to_salesforce_record(row) + if record: # Only add non-empty records + records.append(record) + except Exception as e: + logger.warning(f"Failed to convert row to Salesforce record: {str(e)}") + + if not records: + logger.info(f"No valid records to write in batch {batch_id}") + return SalesforceCommitMessage(records_written=0, batch_id=batch_id) + + # Write records to Salesforce + try: + records_written = self._write_to_salesforce(sf, records, batch_id) + logger.info(f"✅ Batch {batch_id}: Successfully wrote {records_written} records") + return SalesforceCommitMessage(records_written=records_written, batch_id=batch_id) + except Exception as e: + logger.error(f"❌ Batch {batch_id}: Failed to write records: {str(e)}") + raise + + def _convert_row_to_salesforce_record(self, row) -> Dict[str, Any]: + """Convert a Spark Row to a Salesforce record format.""" + record = {} + + for field in self.schema.fields: + field_name = field.name + try: + # Use getattr for safe field access + value = getattr(row, field_name, None) + + if value is not None: + # Convert value based on field type + if hasattr(value, 'isoformat'): # datetime objects + record[field_name] = value.isoformat() + elif isinstance(value, (int, float)): + record[field_name] = value + else: + record[field_name] = str(value) + + except Exception as e: + logger.warning(f"Failed to convert field '{field_name}': {str(e)}") + + return record + + def _write_to_salesforce(self, sf, records: List[Dict[str, Any]], batch_id: int) -> int: + """Write records to Salesforce using REST API.""" + success_count = 0 + + # Get the Salesforce object API + try: + sf_object = getattr(sf, self.salesforce_object) + except AttributeError: + raise ValueError(f"Salesforce object '{self.salesforce_object}' not found") + + # Process records in batches + for i in range(0, len(records), self.batch_size): + batch_records = records[i:i + self.batch_size] + + for j, record in enumerate(batch_records): + try: + # Create the record in Salesforce + result = sf_object.create(record) + + if result.get('success'): + success_count += 1 + else: + logger.warning(f"Failed to create record {i+j}: {result.get('errors', 'Unknown error')}") + + except Exception as e: + logger.error(f"Error creating record {i+j}: {str(e)}") + + # Log progress for large batches + if len(records) > 50 and (i + self.batch_size) % 100 == 0: + logger.info(f"Batch {batch_id}: Processed {i + self.batch_size}/{len(records)} records") + + return success_count + + def commit(self, messages: List[SalesforceCommitMessage], batch_id: int) -> None: + """Commit the write operation.""" + total_records = sum(msg.records_written for msg in messages if msg is not None) + total_batches = len([msg for msg in messages if msg is not None]) + + logger.info(f"✅ Commit batch {batch_id}: Successfully wrote {total_records} records across {total_batches} batches") + + def abort(self, messages: List[SalesforceCommitMessage], batch_id: int) -> None: + """Abort the write operation.""" + total_batches = len([msg for msg in messages if msg is not None]) + logger.warning(f"❌ Abort batch {batch_id}: Rolling back {total_batches} batches") + # Note: Salesforce doesn't support transaction rollback for individual records + # Records that were successfully created will remain in Salesforce diff --git a/tests/test_data_sources.py b/tests/test_data_sources.py index ad6a2b0..6b29d62 100644 --- a/tests/test_data_sources.py +++ b/tests/test_data_sources.py @@ -64,3 +64,105 @@ def test_opensky_datasource_stream(spark): result.show() assert len(result.columns) == 18 # Check schema has expected number of fields assert result.count() > 0 # Verify we got some data + + +def test_salesforce_datasource_registration(spark): + """Test that Salesforce DataSource can be registered and validates required options.""" + spark.dataSource.register(SalesforceDataSource) + + # Test that the datasource is registered with correct name + assert SalesforceDataSource.name() == "salesforce" + + # Test that the data source is streaming-only (no batch writer) + from pyspark.sql.functions import lit + + try: + # Try to use batch write - should fail since we only support streaming + df = spark.range(1).select( + lit("Test Company").alias("Name"), + lit("Technology").alias("Industry"), + lit(50000.0).alias("AnnualRevenue") + ) + + df.write.format("salesforce").mode("append").save() + assert False, "Should have raised error - Salesforce DataSource only supports streaming" + except Exception as e: + # This is expected - Salesforce DataSource only supports streaming writes + error_msg = str(e).lower() + # The error can be about unsupported mode or missing writer + assert "unsupported" in error_msg or "writer" in error_msg or "not implemented" in error_msg + + +def test_salesforce_datasource_stream_write(spark): + """Test Salesforce streaming write functionality with real credentials.""" + import os + import pytest + import tempfile + import shutil + + # Check for Salesforce credentials in environment variables + username = os.getenv('SALESFORCE_USERNAME') + password = os.getenv('SALESFORCE_PASSWORD') + security_token = os.getenv('SALESFORCE_SECURITY_TOKEN') + + if not all([username, password, security_token]): + pytest.skip("Salesforce credentials not found in environment variables. " + "Set SALESFORCE_USERNAME, SALESFORCE_PASSWORD, and SALESFORCE_SECURITY_TOKEN to run this test.") + + # Register the Salesforce DataSource + spark.dataSource.register(SalesforceDataSource) + + # Create test streaming data + checkpoint_dir = tempfile.mkdtemp(prefix="sf_test_") + + try: + streaming_df = spark.readStream \ + .format("rate") \ + .option("rowsPerSecond", 1) \ + .load() \ + .selectExpr( + "CONCAT('PySparkTest_', CAST(value as STRING)) as Name", + "'Technology' as Industry" + ) + + # Write to Salesforce with streaming query + query = streaming_df.writeStream \ + .format("salesforce") \ + .option("username", username) \ + .option("password", password) \ + .option("security_token", security_token) \ + .option("salesforce_object", "Account") \ + .option("batch_size", "5") \ + .option("checkpointLocation", checkpoint_dir) \ + .trigger(once=True) \ + .start() + + # Wait for completion + query.awaitTermination(timeout=15) + + if query.isActive: + query.stop() + + # Verify the query ran without critical errors + exception = query.exception() + if exception: + # Allow for environment issues but ensure DataSource was properly initialized + exception_str = str(exception) + if not any(env_issue in exception_str for env_issue in [ + "PYTHON_VERSION_MISMATCH", "PYTHON_DATA_SOURCE_ERROR" + ]): + pytest.fail(f"Unexpected streaming error: {exception_str}") + + # Clean up any test records created + try: + from simple_salesforce import Salesforce + sf = Salesforce(username=username, password=password, security_token=security_token) + results = sf.query("SELECT Id, Name FROM Account WHERE Name LIKE 'PySparkTest_%'") + for record in results['records']: + sf.Account.delete(record['Id']) + except: + pass # Clean-up is best effort + + finally: + # Clean up checkpoint directory + shutil.rmtree(checkpoint_dir, ignore_errors=True) From 8a3ec8b9d84a747d62e83bf91ac5f78c1dc9f17b Mon Sep 17 00:00:00 2001 From: Shujing Yang <135740748+shujingyang-db@users.noreply.github.com> Date: Wed, 30 Jul 2025 10:11:52 -0700 Subject: [PATCH 2/9] Update index.md --- docs/index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/index.md b/docs/index.md index d78a743..1470257 100644 --- a/docs/index.md +++ b/docs/index.md @@ -38,6 +38,6 @@ spark.readStream.format("fake").load().writeStream.format("console").start() | [HuggingFaceDatasets](./datasources/huggingface.md) | `huggingface` | Read datasets from the HuggingFace Hub | `datasets` | | [StockDataSource](./datasources/stock.md) | `stock` | Read stock data from Alpha Vantage | None | | [SimpleJsonDataSource](./datasources/simplejson.md) | `simplejson` | Write JSON data to Databricks DBFS | `databricks-sdk` | -| [SalesforceDataSource](./datasources/salesforce.md) | `salesforce` | Write streaming data to Salesforce objects | None | +| [SalesforceDataSource](./datasources/salesforce.md) | `salesforce` | Write streaming data to Salesforce objects |`simple-salesforce` | | [GoogleSheetsDataSource](./datasources/googlesheets.md) | `googlesheets` | Read table from public Google Sheets document | None | | [KaggleDataSource](./datasources/kaggle.md) | `kaggle` | Read datasets from Kaggle | `kagglehub`, `pandas` | From a0f51a2804d59a6e02ae674b07094e265d87afe4 Mon Sep 17 00:00:00 2001 From: Shujing Yang Date: Thu, 31 Jul 2025 14:39:22 -0700 Subject: [PATCH 3/9] add schema option, batch write to salesforce --- pyspark_datasources/salesforce.py | 153 ++++++++++++++++++++---------- 1 file changed, 103 insertions(+), 50 deletions(-) diff --git a/pyspark_datasources/salesforce.py b/pyspark_datasources/salesforce.py index 0ed76d0..ddc441b 100644 --- a/pyspark_datasources/salesforce.py +++ b/pyspark_datasources/salesforce.py @@ -78,6 +78,7 @@ class SalesforceDataSource(DataSource): ... .option("security_token", "your-security-token") \\ ... .option("salesforce_object", "Account") \\ ... .option("batch_size", "100") \\ + ... .option("schema", "Name STRING NOT NULL, Industry STRING, Phone STRING, Website STRING, AnnualRevenue DOUBLE, NumberOfEmployees INT, BillingStreet STRING, BillingCity STRING, BillingState STRING, BillingPostalCode STRING, BillingCountry STRING") \\ ... .start() Write to Salesforce Contacts: @@ -119,10 +120,14 @@ def name(cls) -> str: def schema(self) -> str: """ - Define the default schema for Salesforce Account objects. - - This schema can be overridden by users when creating their DataFrame. + Return the schema for Salesforce objects. + + If the user provides a 'schema' option, use it. + Otherwise, return the default Account schema. """ + user_schema = self.options.get("schema") + if user_schema: + return user_schema return """ Name STRING NOT NULL, Industry STRING, @@ -199,28 +204,42 @@ def write(self, iterator) -> SalesforceCommitMessage: logger.error(f"Failed to connect to Salesforce: {str(e)}") raise ConnectionError(f"Salesforce connection failed: {str(e)}") - # Convert rows to Salesforce records - records = [] + # Convert rows to Salesforce records and write in batches to avoid memory issues + records_buffer = [] + total_records_written = 0 + + def flush_buffer(): + nonlocal total_records_written + if records_buffer: + try: + written = self._write_to_salesforce(sf, records_buffer, batch_id) + logger.info(f"✅ Batch {batch_id}: Successfully wrote {written} records (buffer flush)") + total_records_written += written + except Exception as e: + logger.error(f"❌ Batch {batch_id}: Failed to write records during buffer flush: {str(e)}") + raise + records_buffer.clear() + for row in iterator: try: record = self._convert_row_to_salesforce_record(row) if record: # Only add non-empty records - records.append(record) + records_buffer.append(record) + if len(records_buffer) >= self.batch_size: + flush_buffer() except Exception as e: logger.warning(f"Failed to convert row to Salesforce record: {str(e)}") - - if not records: + + # Flush any remaining records in the buffer + if records_buffer: + flush_buffer() + + if total_records_written == 0: logger.info(f"No valid records to write in batch {batch_id}") - return SalesforceCommitMessage(records_written=0, batch_id=batch_id) - - # Write records to Salesforce - try: - records_written = self._write_to_salesforce(sf, records, batch_id) - logger.info(f"✅ Batch {batch_id}: Successfully wrote {records_written} records") - return SalesforceCommitMessage(records_written=records_written, batch_id=batch_id) - except Exception as e: - logger.error(f"❌ Batch {batch_id}: Failed to write records: {str(e)}") - raise + else: + logger.info(f"✅ Batch {batch_id}: Successfully wrote {total_records_written} records (total)") + + return SalesforceCommitMessage(records_written=total_records_written, batch_id=batch_id) def _convert_row_to_salesforce_record(self, row) -> Dict[str, Any]: """Convert a Spark Row to a Salesforce record format.""" @@ -250,45 +269,79 @@ def _write_to_salesforce(self, sf, records: List[Dict[str, Any]], batch_id: int) """Write records to Salesforce using REST API.""" success_count = 0 - # Get the Salesforce object API - try: - sf_object = getattr(sf, self.salesforce_object) - except AttributeError: - raise ValueError(f"Salesforce object '{self.salesforce_object}' not found") - - # Process records in batches + # Process records in batches using sObject Collections API for i in range(0, len(records), self.batch_size): batch_records = records[i:i + self.batch_size] - for j, record in enumerate(batch_records): - try: - # Create the record in Salesforce - result = sf_object.create(record) - - if result.get('success'): - success_count += 1 + try: + # Use Composite Tree API for batch creation (up to 200 records) + # Prepare records for batch API + collection_records = [] + for idx, record in enumerate(batch_records): + # Add required attributes for Composite Tree API + record_with_attributes = { + "attributes": { + "type": self.salesforce_object, + "referenceId": f"ref{i + idx}" + }, + **record + } + collection_records.append(record_with_attributes) + + # Make batch API call using Composite Tree API + # This API is specifically designed for batch inserts + payload = { + "records": collection_records + } + + response = sf.restful( + f'composite/tree/{self.salesforce_object}', + method='POST', + json=payload + ) + + # Count successful records + # Composite Tree API returns a different response format + if isinstance(response, dict): + # Check if the batch was successful + if response.get('hasErrors', True) is False: + # All records in the batch were created successfully + success_count += len(batch_records) else: - logger.warning(f"Failed to create record {i+j}: {result.get('errors', 'Unknown error')}") - - except Exception as e: - logger.error(f"Error creating record {i+j}: {str(e)}") + # Some records failed, check individual results + results = response.get('results', []) + for result in results: + if 'id' in result: + success_count += 1 + else: + errors = result.get('errors', []) + for error in errors: + logger.warning(f"Failed to create record {result.get('referenceId', 'unknown')}: {error.get('message', 'Unknown error')}") + else: + logger.error(f"Unexpected response format: {response}") + + except Exception as e: + logger.error(f"Error in batch creation for batch {i//self.batch_size + 1}: {str(e)}") + # Fallback to individual record creation for this batch + try: + sf_object = getattr(sf, self.salesforce_object) + for j, record in enumerate(batch_records): + try: + # Create the record in Salesforce + result = sf_object.create(record) + + if result.get('success'): + success_count += 1 + else: + logger.warning(f"Failed to create record {i+j}: {result.get('errors', 'Unknown error')}") + + except Exception as e: + logger.error(f"Error creating record {i+j}: {str(e)}") + except AttributeError: + raise ValueError(f"Salesforce object '{self.salesforce_object}' not found") # Log progress for large batches if len(records) > 50 and (i + self.batch_size) % 100 == 0: logger.info(f"Batch {batch_id}: Processed {i + self.batch_size}/{len(records)} records") return success_count - - def commit(self, messages: List[SalesforceCommitMessage], batch_id: int) -> None: - """Commit the write operation.""" - total_records = sum(msg.records_written for msg in messages if msg is not None) - total_batches = len([msg for msg in messages if msg is not None]) - - logger.info(f"✅ Commit batch {batch_id}: Successfully wrote {total_records} records across {total_batches} batches") - - def abort(self, messages: List[SalesforceCommitMessage], batch_id: int) -> None: - """Abort the write operation.""" - total_batches = len([msg for msg in messages if msg is not None]) - logger.warning(f"❌ Abort batch {batch_id}: Rolling back {total_batches} batches") - # Note: Salesforce doesn't support transaction rollback for individual records - # Records that were successfully created will remain in Salesforce From 478fccaa775a630afcb2c3b501f5151a0a69ba86 Mon Sep 17 00:00:00 2001 From: Shujing Yang Date: Thu, 31 Jul 2025 14:43:10 -0700 Subject: [PATCH 4/9] update doc: salesforce sink --- pyspark_datasources/salesforce.py | 33 +++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/pyspark_datasources/salesforce.py b/pyspark_datasources/salesforce.py index ddc441b..b4edf85 100644 --- a/pyspark_datasources/salesforce.py +++ b/pyspark_datasources/salesforce.py @@ -17,20 +17,23 @@ class SalesforceCommitMessage(WriterCommitMessage): class SalesforceDataSource(DataSource): """ - A Salesforce streaming data source for PySpark to write data to Salesforce objects. + A Salesforce streaming sink for PySpark to write data to Salesforce objects. - This data source enables writing streaming data from Spark to Salesforce using the + This data sink enables writing streaming data from Spark to Salesforce using the Salesforce REST API. It supports common Salesforce objects like Account, Contact, Opportunity, and custom objects. + Note: This is a write-only sink, not a full bidirectional data source. + Name: `salesforce` Notes ----- - Requires the `simple-salesforce` library for Salesforce API integration - - Only supports streaming write operations (not read operations) + - **Write-only sink**: Only supports streaming write operations (no read operations) - Uses Salesforce username/password/security token authentication - - Supports streaming processing for efficient API usage + - Supports batch writing with Salesforce Composite Tree API for efficient processing + - Implements exactly-once semantics through Spark's checkpoint mechanism Parameters ---------- @@ -49,7 +52,7 @@ class SalesforceDataSource(DataSource): Examples -------- - Register the data source: + Register the Salesforce sink: >>> from pyspark_datasources import SalesforceDataSource >>> spark.dataSource.register(SalesforceDataSource) @@ -70,7 +73,7 @@ class SalesforceDataSource(DataSource): ... (col("value") * 100000).cast("double").alias("AnnualRevenue") ... ) >>> - >>> # Write to Salesforce + >>> # Write to Salesforce using the sink >>> query = account_data.writeStream \\ ... .format("salesforce") \\ ... .option("username", "your-username@company.com") \\ @@ -78,7 +81,7 @@ class SalesforceDataSource(DataSource): ... .option("security_token", "your-security-token") \\ ... .option("salesforce_object", "Account") \\ ... .option("batch_size", "100") \\ - ... .option("schema", "Name STRING NOT NULL, Industry STRING, Phone STRING, Website STRING, AnnualRevenue DOUBLE, NumberOfEmployees INT, BillingStreet STRING, BillingCity STRING, BillingState STRING, BillingPostalCode STRING, BillingCountry STRING") \\ + ... .option("checkpointLocation", "/path/to/checkpoint") \\ ... .start() Write to Salesforce Contacts: @@ -95,6 +98,7 @@ class SalesforceDataSource(DataSource): ... .option("password", "your-password") \\ ... .option("security_token", "your-security-token") \\ ... .option("salesforce_object", "Contact") \\ + ... .option("checkpointLocation", "/path/to/checkpoint") \\ ... .start() Write to custom Salesforce objects: @@ -110,12 +114,21 @@ class SalesforceDataSource(DataSource): ... .option("password", "your-password") \\ ... .option("security_token", "your-security-token") \\ ... .option("salesforce_object", "Custom_Object__c") \\ + ... .option("checkpointLocation", "/path/to/checkpoint") \\ ... .start() + + Key Features: + + - **Write-only sink**: Designed specifically for writing data to Salesforce + - **Batch processing**: Uses Salesforce Composite Tree API for efficient bulk writes + - **Exactly-once semantics**: Integrates with Spark's checkpoint mechanism + - **Error handling**: Graceful fallback to individual record creation if batch fails + - **Flexible schema**: Supports any Salesforce object with proper field mapping """ @classmethod def name(cls) -> str: - """Return the short name for this data source.""" + """Return the short name for this Salesforce sink.""" return "salesforce" def schema(self) -> str: @@ -143,12 +156,12 @@ def schema(self) -> str: """ def streamWriter(self, schema: StructType, overwrite: bool) -> "SalesforceStreamWriter": - """Create a stream writer for Salesforce integration.""" + """Create a stream writer for Salesforce sink integration.""" return SalesforceStreamWriter(schema, self.options) class SalesforceStreamWriter(DataSourceStreamWriter): - """Stream writer implementation for Salesforce integration.""" + """Stream writer implementation for Salesforce sink integration.""" def __init__(self, schema: StructType, options: Dict[str, str]): self.schema = schema From 510ff303fc296ba0459182d7efa52aa374975520 Mon Sep 17 00:00:00 2001 From: Shujing Yang Date: Thu, 31 Jul 2025 14:47:09 -0700 Subject: [PATCH 5/9] update option: schema --- README.md | 2 +- pyspark_datasources/salesforce.py | 41 ++++++++++++++++++++++++++++++- 2 files changed, 41 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 08d1141..925eaa6 100644 --- a/README.md +++ b/README.md @@ -46,7 +46,7 @@ spark.readStream.format("fake").load().writeStream.format("console").start() | [KaggleDataSource](pyspark_datasources/kaggle.py) | `kaggle` | Read datasets from Kaggle | `kagglehub`, `pandas` | | [SimpleJsonDataSource](pyspark_datasources/simplejson.py) | `simplejson` | Write JSON data to Databricks DBFS | `databricks-sdk` | | [OpenSkyDataSource](pyspark_datasources/opensky.py) | `opensky` | Read from OpenSky Network. | None | -| [SalesforceDataSource](pyspark_datasources/salesforce.py) | `salesforce` | Write streaming data to Salesforce objects | `simple-salesforce` | +| [SalesforceDataSource](pyspark_datasources/salesforce.py) | `salesforce` | Streaming sink for writing data to Salesforce | `simple-salesforce` | See more here: https://allisonwang-db.github.io/pyspark-data-sources/. diff --git a/pyspark_datasources/salesforce.py b/pyspark_datasources/salesforce.py index b4edf85..2a6a85e 100644 --- a/pyspark_datasources/salesforce.py +++ b/pyspark_datasources/salesforce.py @@ -49,6 +49,10 @@ class SalesforceDataSource(DataSource): Number of records to process per batch (default: "200") instance_url : str, optional Custom Salesforce instance URL (auto-detected if not provided) + schema : str, optional + Custom schema definition for the Salesforce object. If not provided, + uses the default Account schema. Should be in Spark SQL DDL format. + Example: "Name STRING NOT NULL, Industry STRING, AnnualRevenue DOUBLE" Examples -------- @@ -116,6 +120,41 @@ class SalesforceDataSource(DataSource): ... .option("salesforce_object", "Custom_Object__c") \\ ... .option("checkpointLocation", "/path/to/checkpoint") \\ ... .start() + + Using custom schema for specific Salesforce objects: + + >>> # Define schema for Contact object as a DDL string + >>> contact_schema = "FirstName STRING NOT NULL, LastName STRING NOT NULL, Email STRING, Phone STRING" + >>> + >>> query = contact_data.writeStream \\ + ... .format("salesforce") \\ + ... .option("username", "your-username@company.com") \\ + ... .option("password", "your-password") \\ + ... .option("security_token", "your-security-token") \\ + ... .option("salesforce_object", "Contact") \\ + ... .option("schema", "FirstName STRING NOT NULL, LastName STRING NOT NULL, Email STRING, Phone STRING") \\ + ... .option("batch_size", "50") \\ + ... .option("checkpointLocation", "/path/to/checkpoint") \\ + ... .start() + + Using schema with Opportunity object: + + >>> opportunity_data = streaming_df.select( + ... col("name").alias("Name"), + ... col("amount").alias("Amount"), + ... col("stage").alias("StageName"), + ... col("close_date").alias("CloseDate") + ... ) + >>> + >>> query = opportunity_data.writeStream \\ + ... .format("salesforce") \\ + ... .option("username", "your-username@company.com") \\ + ... .option("password", "your-password") \\ + ... .option("security_token", "your-security-token") \\ + ... .option("salesforce_object", "Opportunity") \\ + ... .option("schema", "Name STRING NOT NULL, Amount DOUBLE, StageName STRING NOT NULL, CloseDate DATE") \\ + ... .option("checkpointLocation", "/path/to/checkpoint") \\ + ... .start() Key Features: @@ -123,7 +162,7 @@ class SalesforceDataSource(DataSource): - **Batch processing**: Uses Salesforce Composite Tree API for efficient bulk writes - **Exactly-once semantics**: Integrates with Spark's checkpoint mechanism - **Error handling**: Graceful fallback to individual record creation if batch fails - - **Flexible schema**: Supports any Salesforce object with proper field mapping + - **Flexible schema**: Supports any Salesforce object with custom schema definition """ @classmethod From c9fe38a09ca9e1b50dd88672e2167e293106ab35 Mon Sep 17 00:00:00 2001 From: Shujing Yang Date: Thu, 31 Jul 2025 14:57:05 -0700 Subject: [PATCH 6/9] add tests --- tests/test_salesforce_sink.py | 309 ++++++++++++++++++++++++++++++++++ 1 file changed, 309 insertions(+) create mode 100644 tests/test_salesforce_sink.py diff --git a/tests/test_salesforce_sink.py b/tests/test_salesforce_sink.py new file mode 100644 index 0000000..39fee72 --- /dev/null +++ b/tests/test_salesforce_sink.py @@ -0,0 +1,309 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Test Salesforce DataSource checkpoint functionality with CSV file streaming (Auto Loader style) +""" +import os +import sys +import shutil +import time +import tempfile +import csv +sys.path.append('.') + +from pyspark.sql import SparkSession +from pyspark.sql.functions import col, lit +from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType + +def create_csv_file(file_path, data, headers): + """Create a CSV file with the given data""" + with open(file_path, 'w', newline='') as csvfile: + writer = csv.writer(csvfile) + writer.writerow(headers) + writer.writerows(data) + +def test_csv_checkpoint(): + """Test checkpoint functionality using Salesforce streaming write""" + + # Check credentials + username = os.getenv('SALESFORCE_USERNAME') + password = os.getenv('SALESFORCE_PASSWORD') + security_token = os.getenv('SALESFORCE_SECURITY_TOKEN') + + if not all([username, password, security_token]): + print("Missing Salesforce credentials in environment variables") + print("Please set: SALESFORCE_USERNAME, SALESFORCE_PASSWORD, SALESFORCE_SECURITY_TOKEN") + return False + + print("Using Salesforce credentials for user: {}".format(username)) + + # Create temporary directories + csv_dir = tempfile.mkdtemp(prefix="test_csv_") + checkpoint_location = tempfile.mkdtemp(prefix="test_checkpoint_csv_") + + print("CSV directory: {}".format(csv_dir)) + print("Checkpoint directory: {}".format(checkpoint_location)) + + try: + print("\n=== SETUP: Create Spark Session ===") + spark = SparkSession.builder \ + .appName("SalesforceCSVCheckpointTest") \ + .config("spark.sql.shuffle.partitions", "2") \ + .getOrCreate() + + print("Spark session created") + + # Register Salesforce DataSource + from pyspark_datasources.salesforce import SalesforceDataSource + spark.dataSource.register(SalesforceDataSource) + print("SalesforceDataSource registered") + + # Define schema for our CSV data + schema = StructType([ + StructField("id", IntegerType(), True), + StructField("name", StringType(), True), + StructField("industry", StringType(), True), + StructField("revenue", DoubleType(), True) + ]) + + headers = ["id", "name", "industry", "revenue"] + + print("\n=== PHASE 1: Create Initial CSV with 5 Records ===") + + # Create initial CSV data + initial_data = [ + [1, "CSV_Company_A", "Technology", 100000.0], + [2, "CSV_Company_B", "Finance", 200000.0], + [3, "CSV_Company_C", "Healthcare", 300000.0], + [4, "CSV_Company_D", "Manufacturing", 400000.0], + [5, "CSV_Company_E", "Retail", 500000.0] + ] + + # Create the first CSV file + initial_csv_path = os.path.join(csv_dir, "batch_001.csv") + create_csv_file(initial_csv_path, initial_data, headers) + + print("Created initial CSV file with 5 records:") + print("File: {}".format(initial_csv_path)) + + # Verify the CSV file was created correctly + test_df = spark.read.format("csv").option("header", "true").schema(schema).load(initial_csv_path) + print("Initial CSV content:") + test_df.show() + + print("\n=== PHASE 2: First Stream - Read Initial CSV and Write to Salesforce ===") + + # Create streaming read from CSV directory + streaming_df = spark.readStream \ + .format("csv") \ + .option("header", "true") \ + .schema(schema) \ + .load(csv_dir) + + # Transform for Salesforce Account format + salesforce_df = streaming_df.select( + col("name").alias("Name"), + col("industry").alias("Industry"), + col("revenue").alias("AnnualRevenue") + ) + + print("Starting first stream (should read 5 initial records)...") + + # Start first streaming query + query1 = salesforce_df.writeStream \ + .format("salesforce") \ + .option("username", username) \ + .option("password", password) \ + .option("security_token", security_token) \ + .option("salesforce_object", "Account") \ + .option("batch_size", "10") \ + .option("checkpointLocation", checkpoint_location) \ + .trigger(once=True) \ + .start() + + # Wait for completion + query1.awaitTermination(timeout=60) + + # Check for exceptions + if query1.exception() is not None: + print("First stream failed with exception: {}".format(query1.exception())) + return False + + # Get progress from first stream + first_progress = query1.lastProgress + first_records_processed = 0 + if first_progress: + print("\nFirst stream progress:") + print(" - Batch ID: {}".format(first_progress.get('batchId', 'N/A'))) + sources = first_progress.get('sources', []) + if sources: + source = sources[0] + first_records_processed = source.get('numInputRows', 0) + print(" - Records processed: {}".format(first_records_processed)) + print(" - End offset: {}".format(source.get('endOffset', 'N/A'))) + + print("First stream completed - processed {} records".format(first_records_processed)) + + print("\n=== PHASE 3: Add New CSV File with 5 New Records ===") + + # Wait a moment to ensure first stream is fully complete + time.sleep(3) + + # Create new CSV data + new_data = [ + [6, "CSV_Company_F", "Energy", 600000.0], + [7, "CSV_Company_G", "Education", 700000.0], + [8, "CSV_Company_H", "Agriculture", 800000.0], + [9, "CSV_Company_I", "Transportation", 900000.0], + [10, "CSV_Company_J", "Entertainment", 1000000.0] + ] + + # Create the second CSV file (simulating new data arrival) + new_csv_path = os.path.join(csv_dir, "batch_002.csv") + create_csv_file(new_csv_path, new_data, headers) + + print("Added new CSV file with 5 records:") + print("File: {}".format(new_csv_path)) + + # Verify total files in directory + csv_files = [f for f in os.listdir(csv_dir) if f.endswith('.csv')] + print("CSV files in directory: {}".format(csv_files)) + + # Verify total records across all files + all_df = spark.read.format("csv").option("header", "true").schema(schema).load(csv_dir) + total_records = all_df.count() + print("Total records across all CSV files: {}".format(total_records)) + + print("\n=== PHASE 4: Second Stream - Should Only Process New CSV File ===") + + # Create new streaming read from the same CSV directory + streaming_df2 = spark.readStream \ + .format("csv") \ + .option("header", "true") \ + .schema(schema) \ + .load(csv_dir) + + # Transform for Salesforce (same as before) + salesforce_df2 = streaming_df2.select( + col("name").alias("Name"), + col("industry").alias("Industry"), + col("revenue").alias("AnnualRevenue") + ) + + print("Starting second stream with same checkpoint (should only read new CSV file)...") + + # Start second streaming query with SAME checkpoint + query2 = salesforce_df2.writeStream \ + .format("salesforce") \ + .option("username", username) \ + .option("password", password) \ + .option("security_token", security_token) \ + .option("salesforce_object", "Account") \ + .option("batch_size", "10") \ + .option("checkpointLocation", checkpoint_location) \ + .trigger(once=True) \ + .start() + + # Wait for completion + query2.awaitTermination(timeout=60) + + # Check for exceptions + if query2.exception() is not None: + print("Second stream failed with exception: {}".format(query2.exception())) + return False + + # Get progress from second stream + second_progress = query2.lastProgress + second_records_processed = 0 + if second_progress: + print("\nSecond stream progress:") + print(" - Batch ID: {}".format(second_progress.get('batchId', 'N/A'))) + sources = second_progress.get('sources', []) + if sources: + source = sources[0] + second_records_processed = source.get('numInputRows', 0) + print(" - Records processed: {}".format(second_records_processed)) + print(" - Start offset: {}".format(source.get('startOffset', 'N/A'))) + print(" - End offset: {}".format(source.get('endOffset', 'N/A'))) + + print("Second stream completed - processed {} records".format(second_records_processed)) + + print("\n" + "=" * 60) + print("CHECKPOINT VERIFICATION RESULTS") + print("=" * 60) + print("Total CSV files created: {}".format(len(csv_files))) + print("Total records in all CSV files: {}".format(total_records)) + print("First stream processed: {} records".format(first_records_processed)) + print("Second stream processed: {} records".format(second_records_processed)) + print("Total records processed: {}".format(first_records_processed + second_records_processed)) + + # Analyze results - Only one case can be considered true success + if first_records_processed == 5 and second_records_processed == 5: + print("\n✅ SUCCESS: Checkpoint functionality working perfectly!") + print(" - First stream processed initial 5 records from batch_001.csv") + print(" - Second stream processed only the new 5 records from batch_002.csv") + print(" - No data was reprocessed (exactly-once semantics)") + print(" - Total: 10 unique records written to Salesforce") + return True + elif first_records_processed == 5 and second_records_processed == 0: + print("\n⚠️ PARTIAL: Checkpoint prevented reprocessing but no new data detected") + print(" - First stream processed initial 5 records correctly") + print(" - Second stream found no new records (timing issue or file not detected)") + print(" - This shows checkpoint prevents duplicates but doesn't prove full functionality") + return False + elif second_records_processed == total_records: + print("\n❌ CHECKPOINT FAILURE: Second stream reprocessed all data") + print(" - This indicates checkpoint was not properly restored") + print(" - Expected: Second stream should only process new files") + return False + else: + print("\n❌ UNEXPECTED RESULT:") + print(" - First stream: {} records (expected: 5)".format(first_records_processed)) + print(" - Second stream: {} records (expected: 5)".format(second_records_processed)) + print(" - Total available: {} records".format(total_records)) + print(" - Only success case: first=5, second=5 (exactly-once processing)") + return False + + except Exception as e: + print("Test failed with error: {}".format(str(e))) + import traceback + traceback.print_exc() + return False + + finally: + # Clean up + try: + if 'spark' in locals(): + spark.stop() + print("\nSpark session stopped") + except: + pass + + # Clean up directories + if os.path.exists(csv_dir): + shutil.rmtree(csv_dir) + print("Cleaned up CSV directory") + + if os.path.exists(checkpoint_location): + shutil.rmtree(checkpoint_location) + print("Cleaned up checkpoint directory") + +if __name__ == "__main__": + print("Testing Salesforce DataSource CSV-Based Checkpoint Functionality") + print("(Similar to Auto Loader pattern)") + print("=" * 60) + + success = test_csv_checkpoint() + + print("\n" + "=" * 60) + print("FINAL RESULTS") + print("=" * 60) + + if success: + print("✅ CSV checkpoint test PASSED!") + print(" The SalesforceDataSource correctly handles checkpoints with file-based streaming") + print(" This validates exactly-once processing semantics") + else: + print("❌ CSV checkpoint test FAILED!") + print(" Please check the error messages above") + sys.exit(1) \ No newline at end of file From 340ef03bd630cf92bcc81d5ed4cfbd36afb4786c Mon Sep 17 00:00:00 2001 From: Shujing Yang Date: Thu, 31 Jul 2025 15:30:47 -0700 Subject: [PATCH 7/9] rm tests --- examples/salesforce_sink_example.py | 458 ++++++++++++++++++++++++++++ tests/test_data_sources.py | 75 ----- tests/test_salesforce_sink.py | 309 ------------------- 3 files changed, 458 insertions(+), 384 deletions(-) create mode 100644 examples/salesforce_sink_example.py delete mode 100644 tests/test_salesforce_sink.py diff --git a/examples/salesforce_sink_example.py b/examples/salesforce_sink_example.py new file mode 100644 index 0000000..1269d03 --- /dev/null +++ b/examples/salesforce_sink_example.py @@ -0,0 +1,458 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Salesforce Sink Example + +This example demonstrates how to use the SalesforceDataSource as a streaming sink +to write data from various sources to Salesforce objects. + +Requirements: +- PySpark 4.0+ +- simple-salesforce library +- Valid Salesforce credentials + +Setup: + pip install pyspark simple-salesforce + +Environment Variables: + export SALESFORCE_USERNAME="your-username@company.com" + export SALESFORCE_PASSWORD="your-password" + export SALESFORCE_SECURITY_TOKEN="your-security-token" +""" + +import os +import sys +import tempfile +import csv +from pyspark.sql import SparkSession +from pyspark.sql.functions import col, lit, current_timestamp +from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType + +# Add the project root to Python path +sys.path.append('..') + +def check_credentials(): + """Check if Salesforce credentials are available""" + username = os.getenv('SALESFORCE_USERNAME') + password = os.getenv('SALESFORCE_PASSWORD') + security_token = os.getenv('SALESFORCE_SECURITY_TOKEN') + + if not all([username, password, security_token]): + print("❌ Missing Salesforce credentials!") + print("Please set the following environment variables:") + print(" export SALESFORCE_USERNAME='your-username@company.com'") + print(" export SALESFORCE_PASSWORD='your-password'") + print(" export SALESFORCE_SECURITY_TOKEN='your-security-token'") + return False, None, None, None + + print(f"✅ Using Salesforce credentials for: {username}") + return True, username, password, security_token + +def example_1_rate_source_to_accounts(): + """Example 1: Stream from rate source to Salesforce Accounts""" + print("\n" + "="*60) + print("EXAMPLE 1: Rate Source → Salesforce Accounts") + print("="*60) + + has_creds, username, password, security_token = check_credentials() + if not has_creds: + return + + spark = SparkSession.builder \ + .appName("SalesforceExample1") \ + .config("spark.sql.shuffle.partitions", "2") \ + .getOrCreate() + + try: + # Register Salesforce sink + from pyspark_datasources.salesforce import SalesforceDataSource + spark.dataSource.register(SalesforceDataSource) + print("✅ Salesforce sink registered") + + # Create streaming data from rate source + streaming_df = spark.readStream \ + .format("rate") \ + .option("rowsPerSecond", 2) \ + .load() + + # Transform to Account format + account_data = streaming_df.select( + col("timestamp").cast("string").alias("Name"), + lit("Technology").alias("Industry"), + (col("value") * 10000).cast("double").alias("AnnualRevenue") + ) + + print("📊 Starting streaming write to Salesforce Accounts...") + + # Write to Salesforce + query = account_data.writeStream \ + .format("salesforce") \ + .option("username", username) \ + .option("password", password) \ + .option("security_token", security_token) \ + .option("salesforce_object", "Account") \ + .option("batch_size", "10") \ + .option("checkpointLocation", "/tmp/salesforce_example1_checkpoint") \ + .trigger(once=True) \ + .start() + + # Wait for completion + query.awaitTermination(timeout=60) + + # Show results + progress = query.lastProgress + if progress: + sources = progress.get('sources', []) + if sources: + records = sources[0].get('numInputRows', 0) + print(f"✅ Successfully wrote {records} Account records to Salesforce") + else: + print("✅ Streaming completed successfully") + + except Exception as e: + print(f"❌ Error: {e}") + finally: + spark.stop() + +def example_2_csv_to_contacts(): + """Example 2: Stream from CSV files to Salesforce Contacts""" + print("\n" + "="*60) + print("EXAMPLE 2: CSV Files → Salesforce Contacts") + print("="*60) + + has_creds, username, password, security_token = check_credentials() + if not has_creds: + return + + # Create temporary CSV directory + csv_dir = tempfile.mkdtemp(prefix="salesforce_contacts_") + print(f"📁 CSV directory: {csv_dir}") + + spark = SparkSession.builder \ + .appName("SalesforceExample2") \ + .config("spark.sql.shuffle.partitions", "2") \ + .getOrCreate() + + try: + # Register Salesforce sink + from pyspark_datasources.salesforce import SalesforceDataSource + spark.dataSource.register(SalesforceDataSource) + + # Create sample CSV data + contact_data = [ + ["John", "Doe", "john.doe@example.com", "555-1234"], + ["Jane", "Smith", "jane.smith@example.com", "555-5678"], + ["Bob", "Johnson", "bob.johnson@example.com", "555-9012"] + ] + + headers = ["FirstName", "LastName", "Email", "Phone"] + csv_file = os.path.join(csv_dir, "contacts.csv") + + # Write CSV file + with open(csv_file, 'w', newline='') as f: + writer = csv.writer(f) + writer.writerow(headers) + writer.writerows(contact_data) + + print(f"📝 Created CSV file with {len(contact_data)} contacts") + + # Define schema for CSV + schema = StructType([ + StructField("FirstName", StringType(), True), + StructField("LastName", StringType(), True), + StructField("Email", StringType(), True), + StructField("Phone", StringType(), True) + ]) + + # Stream from CSV + streaming_df = spark.readStream \ + .format("csv") \ + .option("header", "true") \ + .schema(schema) \ + .load(csv_dir) + + print("📊 Starting streaming write to Salesforce Contacts...") + + # Write to Salesforce with custom schema + query = streaming_df.writeStream \ + .format("salesforce") \ + .option("username", username) \ + .option("password", password) \ + .option("security_token", security_token) \ + .option("salesforce_object", "Contact") \ + .option("schema", "FirstName STRING, LastName STRING NOT NULL, Email STRING, Phone STRING") \ + .option("batch_size", "5") \ + .option("checkpointLocation", "/tmp/salesforce_example2_checkpoint") \ + .trigger(once=True) \ + .start() + + # Wait for completion + query.awaitTermination(timeout=60) + + # Show results + progress = query.lastProgress + if progress: + sources = progress.get('sources', []) + if sources: + records = sources[0].get('numInputRows', 0) + print(f"✅ Successfully wrote {records} Contact records to Salesforce") + else: + print("✅ Streaming completed successfully") + + except Exception as e: + print(f"❌ Error: {e}") + finally: + spark.stop() + # Cleanup + import shutil + if os.path.exists(csv_dir): + shutil.rmtree(csv_dir) + +def example_3_checkpoint_demonstration(): + """Example 3: Demonstrate checkpoint functionality with incremental data""" + print("\n" + "="*60) + print("EXAMPLE 3: Checkpoint Functionality Demonstration") + print("="*60) + + has_creds, username, password, security_token = check_credentials() + if not has_creds: + return + + # Create temporary directories + csv_dir = tempfile.mkdtemp(prefix="salesforce_checkpoint_") + checkpoint_dir = tempfile.mkdtemp(prefix="checkpoint_") + + print(f"📁 CSV directory: {csv_dir}") + print(f"📁 Checkpoint directory: {checkpoint_dir}") + + try: + # Phase 1: Create initial data and first stream + print("\n📊 Phase 1: Creating initial batch and first stream...") + + initial_data = [ + [1, "InitialCorp_A", "Tech", 100000.0], + [2, "InitialCorp_B", "Finance", 200000.0], + [3, "InitialCorp_C", "Healthcare", 300000.0] + ] + + headers = ["id", "name", "industry", "revenue"] + csv_file1 = os.path.join(csv_dir, "batch_001.csv") + + with open(csv_file1, 'w', newline='') as f: + writer = csv.writer(f) + writer.writerow(headers) + writer.writerows(initial_data) + + # First stream + spark = SparkSession.builder \ + .appName("SalesforceCheckpointDemo") \ + .config("spark.sql.shuffle.partitions", "2") \ + .getOrCreate() + + from pyspark_datasources.salesforce import SalesforceDataSource + spark.dataSource.register(SalesforceDataSource) + + schema = StructType([ + StructField("id", IntegerType(), True), + StructField("name", StringType(), True), + StructField("industry", StringType(), True), + StructField("revenue", DoubleType(), True) + ]) + + streaming_df1 = spark.readStream \ + .format("csv") \ + .option("header", "true") \ + .schema(schema) \ + .load(csv_dir) + + account_df1 = streaming_df1.select( + col("name").alias("Name"), + col("industry").alias("Industry"), + col("revenue").alias("AnnualRevenue") + ) + + query1 = account_df1.writeStream \ + .format("salesforce") \ + .option("username", username) \ + .option("password", password) \ + .option("security_token", security_token) \ + .option("salesforce_object", "Account") \ + .option("batch_size", "10") \ + .option("checkpointLocation", checkpoint_dir) \ + .trigger(once=True) \ + .start() + + query1.awaitTermination(timeout=60) + + progress1 = query1.lastProgress + records1 = 0 + if progress1 and progress1.get('sources'): + records1 = progress1['sources'][0].get('numInputRows', 0) + + print(f" ✅ First stream processed {records1} records") + + # Phase 2: Add new data and second stream + print("\n📊 Phase 2: Adding new batch and second stream...") + + import time + time.sleep(2) # Brief pause + + new_data = [ + [4, "NewCorp_D", "Energy", 400000.0], + [5, "NewCorp_E", "Retail", 500000.0] + ] + + csv_file2 = os.path.join(csv_dir, "batch_002.csv") + with open(csv_file2, 'w', newline='') as f: + writer = csv.writer(f) + writer.writerow(headers) + writer.writerows(new_data) + + # Second stream with same checkpoint + streaming_df2 = spark.readStream \ + .format("csv") \ + .option("header", "true") \ + .schema(schema) \ + .load(csv_dir) + + account_df2 = streaming_df2.select( + col("name").alias("Name"), + col("industry").alias("Industry"), + col("revenue").alias("AnnualRevenue") + ) + + query2 = account_df2.writeStream \ + .format("salesforce") \ + .option("username", username) \ + .option("password", password) \ + .option("security_token", security_token) \ + .option("salesforce_object", "Account") \ + .option("batch_size", "10") \ + .option("checkpointLocation", checkpoint_dir) \ + .trigger(once=True) \ + .start() + + query2.awaitTermination(timeout=60) + + progress2 = query2.lastProgress + records2 = 0 + if progress2 and progress2.get('sources'): + records2 = progress2['sources'][0].get('numInputRows', 0) + + print(f" ✅ Second stream processed {records2} records") + + # Analyze checkpoint functionality + print(f"\n📈 Checkpoint Analysis:") + print(f" - First stream: {records1} records (initial batch)") + print(f" - Second stream: {records2} records (new batch)") + print(f" - Total: {records1 + records2} records") + + if records1 == 3 and records2 == 2: + print(" ✅ PERFECT: Exactly-once processing achieved!") + print(" ✅ Checkpoint functionality working correctly") + else: + print(" ⚠️ Results may vary due to timing or file detection") + + except Exception as e: + print(f"❌ Error: {e}") + finally: + spark.stop() + # Cleanup + import shutil + if os.path.exists(csv_dir): + shutil.rmtree(csv_dir) + if os.path.exists(checkpoint_dir): + shutil.rmtree(checkpoint_dir) + +def example_4_custom_object(): + """Example 4: Write to custom Salesforce object""" + print("\n" + "="*60) + print("EXAMPLE 4: Custom Salesforce Object") + print("="*60) + + has_creds, username, password, security_token = check_credentials() + if not has_creds: + return + + print("📝 This example shows how to write to custom Salesforce objects") + print(" Note: Make sure your custom object exists in Salesforce") + + spark = SparkSession.builder \ + .appName("SalesforceCustomObjectExample") \ + .config("spark.sql.shuffle.partitions", "2") \ + .getOrCreate() + + try: + from pyspark_datasources.salesforce import SalesforceDataSource + spark.dataSource.register(SalesforceDataSource) + + # Create sample data for custom object + streaming_df = spark.readStream \ + .format("rate") \ + .option("rowsPerSecond", 1) \ + .load() + + # Transform for custom object (example: Product__c) + custom_data = streaming_df.select( + col("value").cast("string").alias("Product_Code__c"), + lit("Sample Product").alias("Name"), + (col("value") * 29.99).cast("double").alias("Price__c"), + current_timestamp().alias("Created_Date__c") + ) + + print("📊 Example configuration for custom object...") + print(" Custom Object: Product__c") + print(" Fields: Product_Code__c, Name, Price__c, Created_Date__c") + print("\n Note: Uncomment and modify the following code for your custom object:") + + # Example code (commented out since custom object may not exist) + print(""" + query = custom_data.writeStream \\ + .format("salesforce") \\ + .option("username", username) \\ + .option("password", password) \\ + .option("security_token", security_token) \\ + .option("salesforce_object", "Product__c") \\ + .option("schema", "Product_Code__c STRING, Name STRING, Price__c DOUBLE, Created_Date__c TIMESTAMP") \\ + .option("batch_size", "20") \\ + .option("checkpointLocation", "/tmp/custom_object_checkpoint") \\ + .trigger(processingTime="10 seconds") \\ + .start() + """) + + print("✅ Custom object example configuration shown") + + except Exception as e: + print(f"❌ Error: {e}") + finally: + spark.stop() + +def main(): + """Run all examples""" + print("🚀 Salesforce Sink Examples") + print("This demonstrates various ways to use the Salesforce streaming sink") + + try: + # Run examples + example_1_rate_source_to_accounts() + example_2_csv_to_contacts() + example_3_checkpoint_demonstration() + example_4_custom_object() + + print("\n" + "="*60) + print("✅ All examples completed!") + print("="*60) + print("\n💡 Key takeaways:") + print(" - Salesforce sink supports various input sources (rate, CSV, etc.)") + print(" - Checkpoint functionality enables exactly-once processing") + print(" - Custom schemas allow flexibility for different Salesforce objects") + print(" - Batch processing optimizes Salesforce API usage") + print(" - Error handling provides fallback to individual record creation") + + except KeyboardInterrupt: + print("\n⏹️ Examples interrupted by user") + except Exception as e: + print(f"\n❌ Unexpected error: {e}") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/tests/test_data_sources.py b/tests/test_data_sources.py index 6b29d62..54cb133 100644 --- a/tests/test_data_sources.py +++ b/tests/test_data_sources.py @@ -91,78 +91,3 @@ def test_salesforce_datasource_registration(spark): error_msg = str(e).lower() # The error can be about unsupported mode or missing writer assert "unsupported" in error_msg or "writer" in error_msg or "not implemented" in error_msg - - -def test_salesforce_datasource_stream_write(spark): - """Test Salesforce streaming write functionality with real credentials.""" - import os - import pytest - import tempfile - import shutil - - # Check for Salesforce credentials in environment variables - username = os.getenv('SALESFORCE_USERNAME') - password = os.getenv('SALESFORCE_PASSWORD') - security_token = os.getenv('SALESFORCE_SECURITY_TOKEN') - - if not all([username, password, security_token]): - pytest.skip("Salesforce credentials not found in environment variables. " - "Set SALESFORCE_USERNAME, SALESFORCE_PASSWORD, and SALESFORCE_SECURITY_TOKEN to run this test.") - - # Register the Salesforce DataSource - spark.dataSource.register(SalesforceDataSource) - - # Create test streaming data - checkpoint_dir = tempfile.mkdtemp(prefix="sf_test_") - - try: - streaming_df = spark.readStream \ - .format("rate") \ - .option("rowsPerSecond", 1) \ - .load() \ - .selectExpr( - "CONCAT('PySparkTest_', CAST(value as STRING)) as Name", - "'Technology' as Industry" - ) - - # Write to Salesforce with streaming query - query = streaming_df.writeStream \ - .format("salesforce") \ - .option("username", username) \ - .option("password", password) \ - .option("security_token", security_token) \ - .option("salesforce_object", "Account") \ - .option("batch_size", "5") \ - .option("checkpointLocation", checkpoint_dir) \ - .trigger(once=True) \ - .start() - - # Wait for completion - query.awaitTermination(timeout=15) - - if query.isActive: - query.stop() - - # Verify the query ran without critical errors - exception = query.exception() - if exception: - # Allow for environment issues but ensure DataSource was properly initialized - exception_str = str(exception) - if not any(env_issue in exception_str for env_issue in [ - "PYTHON_VERSION_MISMATCH", "PYTHON_DATA_SOURCE_ERROR" - ]): - pytest.fail(f"Unexpected streaming error: {exception_str}") - - # Clean up any test records created - try: - from simple_salesforce import Salesforce - sf = Salesforce(username=username, password=password, security_token=security_token) - results = sf.query("SELECT Id, Name FROM Account WHERE Name LIKE 'PySparkTest_%'") - for record in results['records']: - sf.Account.delete(record['Id']) - except: - pass # Clean-up is best effort - - finally: - # Clean up checkpoint directory - shutil.rmtree(checkpoint_dir, ignore_errors=True) diff --git a/tests/test_salesforce_sink.py b/tests/test_salesforce_sink.py deleted file mode 100644 index 39fee72..0000000 --- a/tests/test_salesforce_sink.py +++ /dev/null @@ -1,309 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -Test Salesforce DataSource checkpoint functionality with CSV file streaming (Auto Loader style) -""" -import os -import sys -import shutil -import time -import tempfile -import csv -sys.path.append('.') - -from pyspark.sql import SparkSession -from pyspark.sql.functions import col, lit -from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType - -def create_csv_file(file_path, data, headers): - """Create a CSV file with the given data""" - with open(file_path, 'w', newline='') as csvfile: - writer = csv.writer(csvfile) - writer.writerow(headers) - writer.writerows(data) - -def test_csv_checkpoint(): - """Test checkpoint functionality using Salesforce streaming write""" - - # Check credentials - username = os.getenv('SALESFORCE_USERNAME') - password = os.getenv('SALESFORCE_PASSWORD') - security_token = os.getenv('SALESFORCE_SECURITY_TOKEN') - - if not all([username, password, security_token]): - print("Missing Salesforce credentials in environment variables") - print("Please set: SALESFORCE_USERNAME, SALESFORCE_PASSWORD, SALESFORCE_SECURITY_TOKEN") - return False - - print("Using Salesforce credentials for user: {}".format(username)) - - # Create temporary directories - csv_dir = tempfile.mkdtemp(prefix="test_csv_") - checkpoint_location = tempfile.mkdtemp(prefix="test_checkpoint_csv_") - - print("CSV directory: {}".format(csv_dir)) - print("Checkpoint directory: {}".format(checkpoint_location)) - - try: - print("\n=== SETUP: Create Spark Session ===") - spark = SparkSession.builder \ - .appName("SalesforceCSVCheckpointTest") \ - .config("spark.sql.shuffle.partitions", "2") \ - .getOrCreate() - - print("Spark session created") - - # Register Salesforce DataSource - from pyspark_datasources.salesforce import SalesforceDataSource - spark.dataSource.register(SalesforceDataSource) - print("SalesforceDataSource registered") - - # Define schema for our CSV data - schema = StructType([ - StructField("id", IntegerType(), True), - StructField("name", StringType(), True), - StructField("industry", StringType(), True), - StructField("revenue", DoubleType(), True) - ]) - - headers = ["id", "name", "industry", "revenue"] - - print("\n=== PHASE 1: Create Initial CSV with 5 Records ===") - - # Create initial CSV data - initial_data = [ - [1, "CSV_Company_A", "Technology", 100000.0], - [2, "CSV_Company_B", "Finance", 200000.0], - [3, "CSV_Company_C", "Healthcare", 300000.0], - [4, "CSV_Company_D", "Manufacturing", 400000.0], - [5, "CSV_Company_E", "Retail", 500000.0] - ] - - # Create the first CSV file - initial_csv_path = os.path.join(csv_dir, "batch_001.csv") - create_csv_file(initial_csv_path, initial_data, headers) - - print("Created initial CSV file with 5 records:") - print("File: {}".format(initial_csv_path)) - - # Verify the CSV file was created correctly - test_df = spark.read.format("csv").option("header", "true").schema(schema).load(initial_csv_path) - print("Initial CSV content:") - test_df.show() - - print("\n=== PHASE 2: First Stream - Read Initial CSV and Write to Salesforce ===") - - # Create streaming read from CSV directory - streaming_df = spark.readStream \ - .format("csv") \ - .option("header", "true") \ - .schema(schema) \ - .load(csv_dir) - - # Transform for Salesforce Account format - salesforce_df = streaming_df.select( - col("name").alias("Name"), - col("industry").alias("Industry"), - col("revenue").alias("AnnualRevenue") - ) - - print("Starting first stream (should read 5 initial records)...") - - # Start first streaming query - query1 = salesforce_df.writeStream \ - .format("salesforce") \ - .option("username", username) \ - .option("password", password) \ - .option("security_token", security_token) \ - .option("salesforce_object", "Account") \ - .option("batch_size", "10") \ - .option("checkpointLocation", checkpoint_location) \ - .trigger(once=True) \ - .start() - - # Wait for completion - query1.awaitTermination(timeout=60) - - # Check for exceptions - if query1.exception() is not None: - print("First stream failed with exception: {}".format(query1.exception())) - return False - - # Get progress from first stream - first_progress = query1.lastProgress - first_records_processed = 0 - if first_progress: - print("\nFirst stream progress:") - print(" - Batch ID: {}".format(first_progress.get('batchId', 'N/A'))) - sources = first_progress.get('sources', []) - if sources: - source = sources[0] - first_records_processed = source.get('numInputRows', 0) - print(" - Records processed: {}".format(first_records_processed)) - print(" - End offset: {}".format(source.get('endOffset', 'N/A'))) - - print("First stream completed - processed {} records".format(first_records_processed)) - - print("\n=== PHASE 3: Add New CSV File with 5 New Records ===") - - # Wait a moment to ensure first stream is fully complete - time.sleep(3) - - # Create new CSV data - new_data = [ - [6, "CSV_Company_F", "Energy", 600000.0], - [7, "CSV_Company_G", "Education", 700000.0], - [8, "CSV_Company_H", "Agriculture", 800000.0], - [9, "CSV_Company_I", "Transportation", 900000.0], - [10, "CSV_Company_J", "Entertainment", 1000000.0] - ] - - # Create the second CSV file (simulating new data arrival) - new_csv_path = os.path.join(csv_dir, "batch_002.csv") - create_csv_file(new_csv_path, new_data, headers) - - print("Added new CSV file with 5 records:") - print("File: {}".format(new_csv_path)) - - # Verify total files in directory - csv_files = [f for f in os.listdir(csv_dir) if f.endswith('.csv')] - print("CSV files in directory: {}".format(csv_files)) - - # Verify total records across all files - all_df = spark.read.format("csv").option("header", "true").schema(schema).load(csv_dir) - total_records = all_df.count() - print("Total records across all CSV files: {}".format(total_records)) - - print("\n=== PHASE 4: Second Stream - Should Only Process New CSV File ===") - - # Create new streaming read from the same CSV directory - streaming_df2 = spark.readStream \ - .format("csv") \ - .option("header", "true") \ - .schema(schema) \ - .load(csv_dir) - - # Transform for Salesforce (same as before) - salesforce_df2 = streaming_df2.select( - col("name").alias("Name"), - col("industry").alias("Industry"), - col("revenue").alias("AnnualRevenue") - ) - - print("Starting second stream with same checkpoint (should only read new CSV file)...") - - # Start second streaming query with SAME checkpoint - query2 = salesforce_df2.writeStream \ - .format("salesforce") \ - .option("username", username) \ - .option("password", password) \ - .option("security_token", security_token) \ - .option("salesforce_object", "Account") \ - .option("batch_size", "10") \ - .option("checkpointLocation", checkpoint_location) \ - .trigger(once=True) \ - .start() - - # Wait for completion - query2.awaitTermination(timeout=60) - - # Check for exceptions - if query2.exception() is not None: - print("Second stream failed with exception: {}".format(query2.exception())) - return False - - # Get progress from second stream - second_progress = query2.lastProgress - second_records_processed = 0 - if second_progress: - print("\nSecond stream progress:") - print(" - Batch ID: {}".format(second_progress.get('batchId', 'N/A'))) - sources = second_progress.get('sources', []) - if sources: - source = sources[0] - second_records_processed = source.get('numInputRows', 0) - print(" - Records processed: {}".format(second_records_processed)) - print(" - Start offset: {}".format(source.get('startOffset', 'N/A'))) - print(" - End offset: {}".format(source.get('endOffset', 'N/A'))) - - print("Second stream completed - processed {} records".format(second_records_processed)) - - print("\n" + "=" * 60) - print("CHECKPOINT VERIFICATION RESULTS") - print("=" * 60) - print("Total CSV files created: {}".format(len(csv_files))) - print("Total records in all CSV files: {}".format(total_records)) - print("First stream processed: {} records".format(first_records_processed)) - print("Second stream processed: {} records".format(second_records_processed)) - print("Total records processed: {}".format(first_records_processed + second_records_processed)) - - # Analyze results - Only one case can be considered true success - if first_records_processed == 5 and second_records_processed == 5: - print("\n✅ SUCCESS: Checkpoint functionality working perfectly!") - print(" - First stream processed initial 5 records from batch_001.csv") - print(" - Second stream processed only the new 5 records from batch_002.csv") - print(" - No data was reprocessed (exactly-once semantics)") - print(" - Total: 10 unique records written to Salesforce") - return True - elif first_records_processed == 5 and second_records_processed == 0: - print("\n⚠️ PARTIAL: Checkpoint prevented reprocessing but no new data detected") - print(" - First stream processed initial 5 records correctly") - print(" - Second stream found no new records (timing issue or file not detected)") - print(" - This shows checkpoint prevents duplicates but doesn't prove full functionality") - return False - elif second_records_processed == total_records: - print("\n❌ CHECKPOINT FAILURE: Second stream reprocessed all data") - print(" - This indicates checkpoint was not properly restored") - print(" - Expected: Second stream should only process new files") - return False - else: - print("\n❌ UNEXPECTED RESULT:") - print(" - First stream: {} records (expected: 5)".format(first_records_processed)) - print(" - Second stream: {} records (expected: 5)".format(second_records_processed)) - print(" - Total available: {} records".format(total_records)) - print(" - Only success case: first=5, second=5 (exactly-once processing)") - return False - - except Exception as e: - print("Test failed with error: {}".format(str(e))) - import traceback - traceback.print_exc() - return False - - finally: - # Clean up - try: - if 'spark' in locals(): - spark.stop() - print("\nSpark session stopped") - except: - pass - - # Clean up directories - if os.path.exists(csv_dir): - shutil.rmtree(csv_dir) - print("Cleaned up CSV directory") - - if os.path.exists(checkpoint_location): - shutil.rmtree(checkpoint_location) - print("Cleaned up checkpoint directory") - -if __name__ == "__main__": - print("Testing Salesforce DataSource CSV-Based Checkpoint Functionality") - print("(Similar to Auto Loader pattern)") - print("=" * 60) - - success = test_csv_checkpoint() - - print("\n" + "=" * 60) - print("FINAL RESULTS") - print("=" * 60) - - if success: - print("✅ CSV checkpoint test PASSED!") - print(" The SalesforceDataSource correctly handles checkpoints with file-based streaming") - print(" This validates exactly-once processing semantics") - else: - print("❌ CSV checkpoint test FAILED!") - print(" Please check the error messages above") - sys.exit(1) \ No newline at end of file From 8e4e8c94a257f14dcf9d6555e4177e2ea7dad6a9 Mon Sep 17 00:00:00 2001 From: Shujing Yang Date: Fri, 1 Aug 2025 12:48:02 -0700 Subject: [PATCH 8/9] rm sys --- examples/salesforce_sink_example.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/examples/salesforce_sink_example.py b/examples/salesforce_sink_example.py index 1269d03..09a549f 100644 --- a/examples/salesforce_sink_example.py +++ b/examples/salesforce_sink_example.py @@ -28,9 +28,6 @@ from pyspark.sql.functions import col, lit, current_timestamp from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType -# Add the project root to Python path -sys.path.append('..') - def check_credentials(): """Check if Salesforce credentials are available""" username = os.getenv('SALESFORCE_USERNAME') From 9889a72c9bef03978118d1e0f2fd0d6e16b53d82 Mon Sep 17 00:00:00 2001 From: Shujing Yang Date: Fri, 1 Aug 2025 12:53:11 -0700 Subject: [PATCH 9/9] fix poetry.lock --- poetry.lock | 455 +++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 451 insertions(+), 4 deletions(-) diff --git a/poetry.lock b/poetry.lock index 65dd70a..529ae8b 100644 --- a/poetry.lock +++ b/poetry.lock @@ -154,7 +154,7 @@ description = "Classes Without Boilerplate" optional = true python-versions = ">=3.8" groups = ["main"] -markers = "extra == \"datasets\" or extra == \"all\"" +markers = "extra == \"datasets\" or extra == \"all\" or extra == \"salesforce\"" files = [ {file = "attrs-25.3.0-py3-none-any.whl", hash = "sha256:427318ce031701fea540783410126f03899a97ffc6f61596ad581ac2e40e3bc3"}, {file = "attrs-25.3.0.tar.gz", hash = "sha256:75d7cefc7fb576747b2c81b4442d4d4a1ce0900973527c011d1030fd3bf4af1b"}, @@ -227,6 +227,87 @@ files = [ {file = "certifi-2025.1.31.tar.gz", hash = "sha256:3d5da6925056f6f18f119200434a4780a94263f10d1c21d032a6f6b2baa20651"}, ] +[[package]] +name = "cffi" +version = "1.17.1" +description = "Foreign Function Interface for Python calling C code." +optional = true +python-versions = ">=3.8" +groups = ["main"] +markers = "(extra == \"salesforce\" or extra == \"all\") and platform_python_implementation != \"PyPy\"" +files = [ + {file = "cffi-1.17.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:df8b1c11f177bc2313ec4b2d46baec87a5f3e71fc8b45dab2ee7cae86d9aba14"}, + {file = "cffi-1.17.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8f2cdc858323644ab277e9bb925ad72ae0e67f69e804f4898c070998d50b1a67"}, + {file = "cffi-1.17.1-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:edae79245293e15384b51f88b00613ba9f7198016a5948b5dddf4917d4d26382"}, + {file = "cffi-1.17.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:45398b671ac6d70e67da8e4224a065cec6a93541bb7aebe1b198a61b58c7b702"}, + {file = "cffi-1.17.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ad9413ccdeda48c5afdae7e4fa2192157e991ff761e7ab8fdd8926f40b160cc3"}, + {file = "cffi-1.17.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5da5719280082ac6bd9aa7becb3938dc9f9cbd57fac7d2871717b1feb0902ab6"}, + {file = "cffi-1.17.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2bb1a08b8008b281856e5971307cc386a8e9c5b625ac297e853d36da6efe9c17"}, + {file = "cffi-1.17.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:045d61c734659cc045141be4bae381a41d89b741f795af1dd018bfb532fd0df8"}, + {file = "cffi-1.17.1-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:6883e737d7d9e4899a8a695e00ec36bd4e5e4f18fabe0aca0efe0a4b44cdb13e"}, + {file = "cffi-1.17.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:6b8b4a92e1c65048ff98cfe1f735ef8f1ceb72e3d5f0c25fdb12087a23da22be"}, + {file = "cffi-1.17.1-cp310-cp310-win32.whl", hash = "sha256:c9c3d058ebabb74db66e431095118094d06abf53284d9c81f27300d0e0d8bc7c"}, + {file = "cffi-1.17.1-cp310-cp310-win_amd64.whl", hash = "sha256:0f048dcf80db46f0098ccac01132761580d28e28bc0f78ae0d58048063317e15"}, + {file = "cffi-1.17.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:a45e3c6913c5b87b3ff120dcdc03f6131fa0065027d0ed7ee6190736a74cd401"}, + {file = "cffi-1.17.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:30c5e0cb5ae493c04c8b42916e52ca38079f1b235c2f8ae5f4527b963c401caf"}, + {file = "cffi-1.17.1-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f75c7ab1f9e4aca5414ed4d8e5c0e303a34f4421f8a0d47a4d019ceff0ab6af4"}, + {file = "cffi-1.17.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a1ed2dd2972641495a3ec98445e09766f077aee98a1c896dcb4ad0d303628e41"}, + {file = "cffi-1.17.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:46bf43160c1a35f7ec506d254e5c890f3c03648a4dbac12d624e4490a7046cd1"}, + {file = "cffi-1.17.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a24ed04c8ffd54b0729c07cee15a81d964e6fee0e3d4d342a27b020d22959dc6"}, + {file = "cffi-1.17.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:610faea79c43e44c71e1ec53a554553fa22321b65fae24889706c0a84d4ad86d"}, + {file = "cffi-1.17.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:a9b15d491f3ad5d692e11f6b71f7857e7835eb677955c00cc0aefcd0669adaf6"}, + {file = "cffi-1.17.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:de2ea4b5833625383e464549fec1bc395c1bdeeb5f25c4a3a82b5a8c756ec22f"}, + {file = "cffi-1.17.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:fc48c783f9c87e60831201f2cce7f3b2e4846bf4d8728eabe54d60700b318a0b"}, + {file = "cffi-1.17.1-cp311-cp311-win32.whl", hash = "sha256:85a950a4ac9c359340d5963966e3e0a94a676bd6245a4b55bc43949eee26a655"}, + {file = "cffi-1.17.1-cp311-cp311-win_amd64.whl", hash = "sha256:caaf0640ef5f5517f49bc275eca1406b0ffa6aa184892812030f04c2abf589a0"}, + {file = "cffi-1.17.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:805b4371bf7197c329fcb3ead37e710d1bca9da5d583f5073b799d5c5bd1eee4"}, + {file = "cffi-1.17.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:733e99bc2df47476e3848417c5a4540522f234dfd4ef3ab7fafdf555b082ec0c"}, + {file = "cffi-1.17.1-cp312-cp312-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1257bdabf294dceb59f5e70c64a3e2f462c30c7ad68092d01bbbfb1c16b1ba36"}, + {file = "cffi-1.17.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:da95af8214998d77a98cc14e3a3bd00aa191526343078b530ceb0bd710fb48a5"}, + {file = "cffi-1.17.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d63afe322132c194cf832bfec0dc69a99fb9bb6bbd550f161a49e9e855cc78ff"}, + {file = "cffi-1.17.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f79fc4fc25f1c8698ff97788206bb3c2598949bfe0fef03d299eb1b5356ada99"}, + {file = "cffi-1.17.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b62ce867176a75d03a665bad002af8e6d54644fad99a3c70905c543130e39d93"}, + {file = "cffi-1.17.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:386c8bf53c502fff58903061338ce4f4950cbdcb23e2902d86c0f722b786bbe3"}, + {file = "cffi-1.17.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:4ceb10419a9adf4460ea14cfd6bc43d08701f0835e979bf821052f1805850fe8"}, + {file = "cffi-1.17.1-cp312-cp312-win32.whl", hash = "sha256:a08d7e755f8ed21095a310a693525137cfe756ce62d066e53f502a83dc550f65"}, + {file = "cffi-1.17.1-cp312-cp312-win_amd64.whl", hash = "sha256:51392eae71afec0d0c8fb1a53b204dbb3bcabcb3c9b807eedf3e1e6ccf2de903"}, + {file = "cffi-1.17.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:f3a2b4222ce6b60e2e8b337bb9596923045681d71e5a082783484d845390938e"}, + {file = "cffi-1.17.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:0984a4925a435b1da406122d4d7968dd861c1385afe3b45ba82b750f229811e2"}, + {file = "cffi-1.17.1-cp313-cp313-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d01b12eeeb4427d3110de311e1774046ad344f5b1a7403101878976ecd7a10f3"}, + {file = "cffi-1.17.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:706510fe141c86a69c8ddc029c7910003a17353970cff3b904ff0686a5927683"}, + {file = "cffi-1.17.1-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:de55b766c7aa2e2a3092c51e0483d700341182f08e67c63630d5b6f200bb28e5"}, + {file = "cffi-1.17.1-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c59d6e989d07460165cc5ad3c61f9fd8f1b4796eacbd81cee78957842b834af4"}, + {file = "cffi-1.17.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dd398dbc6773384a17fe0d3e7eeb8d1a21c2200473ee6806bb5e6a8e62bb73dd"}, + {file = "cffi-1.17.1-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:3edc8d958eb099c634dace3c7e16560ae474aa3803a5df240542b305d14e14ed"}, + {file = "cffi-1.17.1-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:72e72408cad3d5419375fc87d289076ee319835bdfa2caad331e377589aebba9"}, + {file = "cffi-1.17.1-cp313-cp313-win32.whl", hash = "sha256:e03eab0a8677fa80d646b5ddece1cbeaf556c313dcfac435ba11f107ba117b5d"}, + {file = "cffi-1.17.1-cp313-cp313-win_amd64.whl", hash = "sha256:f6a16c31041f09ead72d69f583767292f750d24913dadacf5756b966aacb3f1a"}, + {file = "cffi-1.17.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:636062ea65bd0195bc012fea9321aca499c0504409f413dc88af450b57ffd03b"}, + {file = "cffi-1.17.1-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c7eac2ef9b63c79431bc4b25f1cd649d7f061a28808cbc6c47b534bd789ef964"}, + {file = "cffi-1.17.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e221cf152cff04059d011ee126477f0d9588303eb57e88923578ace7baad17f9"}, + {file = "cffi-1.17.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:31000ec67d4221a71bd3f67df918b1f88f676f1c3b535a7eb473255fdc0b83fc"}, + {file = "cffi-1.17.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6f17be4345073b0a7b8ea599688f692ac3ef23ce28e5df79c04de519dbc4912c"}, + {file = "cffi-1.17.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0e2b1fac190ae3ebfe37b979cc1ce69c81f4e4fe5746bb401dca63a9062cdaf1"}, + {file = "cffi-1.17.1-cp38-cp38-win32.whl", hash = "sha256:7596d6620d3fa590f677e9ee430df2958d2d6d6de2feeae5b20e82c00b76fbf8"}, + {file = "cffi-1.17.1-cp38-cp38-win_amd64.whl", hash = "sha256:78122be759c3f8a014ce010908ae03364d00a1f81ab5c7f4a7a5120607ea56e1"}, + {file = "cffi-1.17.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:b2ab587605f4ba0bf81dc0cb08a41bd1c0a5906bd59243d56bad7668a6fc6c16"}, + {file = "cffi-1.17.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:28b16024becceed8c6dfbc75629e27788d8a3f9030691a1dbf9821a128b22c36"}, + {file = "cffi-1.17.1-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1d599671f396c4723d016dbddb72fe8e0397082b0a77a4fab8028923bec050e8"}, + {file = "cffi-1.17.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ca74b8dbe6e8e8263c0ffd60277de77dcee6c837a3d0881d8c1ead7268c9e576"}, + {file = "cffi-1.17.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f7f5baafcc48261359e14bcd6d9bff6d4b28d9103847c9e136694cb0501aef87"}, + {file = "cffi-1.17.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:98e3969bcff97cae1b2def8ba499ea3d6f31ddfdb7635374834cf89a1a08ecf0"}, + {file = "cffi-1.17.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cdf5ce3acdfd1661132f2a9c19cac174758dc2352bfe37d98aa7512c6b7178b3"}, + {file = "cffi-1.17.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:9755e4345d1ec879e3849e62222a18c7174d65a6a92d5b346b1863912168b595"}, + {file = "cffi-1.17.1-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:f1e22e8c4419538cb197e4dd60acc919d7696e5ef98ee4da4e01d3f8cfa4cc5a"}, + {file = "cffi-1.17.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:c03e868a0b3bc35839ba98e74211ed2b05d2119be4e8a0f224fba9384f1fe02e"}, + {file = "cffi-1.17.1-cp39-cp39-win32.whl", hash = "sha256:e31ae45bc2e29f6b2abd0de1cc3b9d5205aa847cafaecb8af1476a609a2f6eb7"}, + {file = "cffi-1.17.1-cp39-cp39-win_amd64.whl", hash = "sha256:d016c76bdd850f3c626af19b0542c9677ba156e4ee4fccfdd7848803533ef662"}, + {file = "cffi-1.17.1.tar.gz", hash = "sha256:1c39c6016c32bc48dd54561950ebd6836e1670f2ae46128f67cf49e789c52824"}, +] + +[package.dependencies] +pycparser = "*" + [[package]] name = "charset-normalizer" version = "3.4.1" @@ -439,6 +520,118 @@ tomli = {version = "*", optional = true, markers = "python_full_version <= \"3.1 [package.extras] toml = ["tomli ; python_full_version <= \"3.11.0a6\""] +[[package]] +name = "cryptography" +version = "43.0.3" +description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers." +optional = true +python-versions = ">=3.7" +groups = ["main"] +markers = "python_version < \"3.11\" and (extra == \"salesforce\" or extra == \"all\")" +files = [ + {file = "cryptography-43.0.3-cp37-abi3-macosx_10_9_universal2.whl", hash = "sha256:bf7a1932ac4176486eab36a19ed4c0492da5d97123f1406cf15e41b05e787d2e"}, + {file = "cryptography-43.0.3-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:63efa177ff54aec6e1c0aefaa1a241232dcd37413835a9b674b6e3f0ae2bfd3e"}, + {file = "cryptography-43.0.3-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7e1ce50266f4f70bf41a2c6dc4358afadae90e2a1e5342d3c08883df1675374f"}, + {file = "cryptography-43.0.3-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:443c4a81bb10daed9a8f334365fe52542771f25aedaf889fd323a853ce7377d6"}, + {file = "cryptography-43.0.3-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:74f57f24754fe349223792466a709f8e0c093205ff0dca557af51072ff47ab18"}, + {file = "cryptography-43.0.3-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:9762ea51a8fc2a88b70cf2995e5675b38d93bf36bd67d91721c309df184f49bd"}, + {file = "cryptography-43.0.3-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:81ef806b1fef6b06dcebad789f988d3b37ccaee225695cf3e07648eee0fc6b73"}, + {file = "cryptography-43.0.3-cp37-abi3-win32.whl", hash = "sha256:cbeb489927bd7af4aa98d4b261af9a5bc025bd87f0e3547e11584be9e9427be2"}, + {file = "cryptography-43.0.3-cp37-abi3-win_amd64.whl", hash = "sha256:f46304d6f0c6ab8e52770addfa2fc41e6629495548862279641972b6215451cd"}, + {file = "cryptography-43.0.3-cp39-abi3-macosx_10_9_universal2.whl", hash = "sha256:8ac43ae87929a5982f5948ceda07001ee5e83227fd69cf55b109144938d96984"}, + {file = "cryptography-43.0.3-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:846da004a5804145a5f441b8530b4bf35afbf7da70f82409f151695b127213d5"}, + {file = "cryptography-43.0.3-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0f996e7268af62598f2fc1204afa98a3b5712313a55c4c9d434aef49cadc91d4"}, + {file = "cryptography-43.0.3-cp39-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:f7b178f11ed3664fd0e995a47ed2b5ff0a12d893e41dd0494f406d1cf555cab7"}, + {file = "cryptography-43.0.3-cp39-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:c2e6fc39c4ab499049df3bdf567f768a723a5e8464816e8f009f121a5a9f4405"}, + {file = "cryptography-43.0.3-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:e1be4655c7ef6e1bbe6b5d0403526601323420bcf414598955968c9ef3eb7d16"}, + {file = "cryptography-43.0.3-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:df6b6c6d742395dd77a23ea3728ab62f98379eff8fb61be2744d4679ab678f73"}, + {file = "cryptography-43.0.3-cp39-abi3-win32.whl", hash = "sha256:d56e96520b1020449bbace2b78b603442e7e378a9b3bd68de65c782db1507995"}, + {file = "cryptography-43.0.3-cp39-abi3-win_amd64.whl", hash = "sha256:0c580952eef9bf68c4747774cde7ec1d85a6e61de97281f2dba83c7d2c806362"}, + {file = "cryptography-43.0.3-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:d03b5621a135bffecad2c73e9f4deb1a0f977b9a8ffe6f8e002bf6c9d07b918c"}, + {file = "cryptography-43.0.3-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:a2a431ee15799d6db9fe80c82b055bae5a752bef645bba795e8e52687c69efe3"}, + {file = "cryptography-43.0.3-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:281c945d0e28c92ca5e5930664c1cefd85efe80e5c0d2bc58dd63383fda29f83"}, + {file = "cryptography-43.0.3-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:f18c716be16bc1fea8e95def49edf46b82fccaa88587a45f8dc0ff6ab5d8e0a7"}, + {file = "cryptography-43.0.3-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:4a02ded6cd4f0a5562a8887df8b3bd14e822a90f97ac5e544c162899bc467664"}, + {file = "cryptography-43.0.3-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:53a583b6637ab4c4e3591a15bc9db855b8d9dee9a669b550f311480acab6eb08"}, + {file = "cryptography-43.0.3-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:1ec0bcf7e17c0c5669d881b1cd38c4972fade441b27bda1051665faaa89bdcaa"}, + {file = "cryptography-43.0.3-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:2ce6fae5bdad59577b44e4dfed356944fbf1d925269114c28be377692643b4ff"}, + {file = "cryptography-43.0.3.tar.gz", hash = "sha256:315b9001266a492a6ff443b61238f956b214dbec9910a081ba5b6646a055a805"}, +] + +[package.dependencies] +cffi = {version = ">=1.12", markers = "platform_python_implementation != \"PyPy\""} + +[package.extras] +docs = ["sphinx (>=5.3.0)", "sphinx-rtd-theme (>=1.1.1)"] +docstest = ["pyenchant (>=1.6.11)", "readme-renderer", "sphinxcontrib-spelling (>=4.0.1)"] +nox = ["nox"] +pep8test = ["check-sdist", "click", "mypy", "ruff"] +sdist = ["build"] +ssh = ["bcrypt (>=3.1.5)"] +test = ["certifi", "cryptography-vectors (==43.0.3)", "pretend", "pytest (>=6.2.0)", "pytest-benchmark", "pytest-cov", "pytest-xdist"] +test-randomorder = ["pytest-randomly"] + +[[package]] +name = "cryptography" +version = "45.0.5" +description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers." +optional = true +python-versions = "!=3.9.0,!=3.9.1,>=3.7" +groups = ["main"] +markers = "python_version >= \"3.11\" and (extra == \"salesforce\" or extra == \"all\")" +files = [ + {file = "cryptography-45.0.5-cp311-abi3-macosx_10_9_universal2.whl", hash = "sha256:101ee65078f6dd3e5a028d4f19c07ffa4dd22cce6a20eaa160f8b5219911e7d8"}, + {file = "cryptography-45.0.5-cp311-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:3a264aae5f7fbb089dbc01e0242d3b67dffe3e6292e1f5182122bdf58e65215d"}, + {file = "cryptography-45.0.5-cp311-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:e74d30ec9c7cb2f404af331d5b4099a9b322a8a6b25c4632755c8757345baac5"}, + {file = "cryptography-45.0.5-cp311-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:3af26738f2db354aafe492fb3869e955b12b2ef2e16908c8b9cb928128d42c57"}, + {file = "cryptography-45.0.5-cp311-abi3-manylinux_2_28_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:e6c00130ed423201c5bc5544c23359141660b07999ad82e34e7bb8f882bb78e0"}, + {file = "cryptography-45.0.5-cp311-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:dd420e577921c8c2d31289536c386aaa30140b473835e97f83bc71ea9d2baf2d"}, + {file = "cryptography-45.0.5-cp311-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:d05a38884db2ba215218745f0781775806bde4f32e07b135348355fe8e4991d9"}, + {file = "cryptography-45.0.5-cp311-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:ad0caded895a00261a5b4aa9af828baede54638754b51955a0ac75576b831b27"}, + {file = "cryptography-45.0.5-cp311-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:9024beb59aca9d31d36fcdc1604dd9bbeed0a55bface9f1908df19178e2f116e"}, + {file = "cryptography-45.0.5-cp311-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:91098f02ca81579c85f66df8a588c78f331ca19089763d733e34ad359f474174"}, + {file = "cryptography-45.0.5-cp311-abi3-win32.whl", hash = "sha256:926c3ea71a6043921050eaa639137e13dbe7b4ab25800932a8498364fc1abec9"}, + {file = "cryptography-45.0.5-cp311-abi3-win_amd64.whl", hash = "sha256:b85980d1e345fe769cfc57c57db2b59cff5464ee0c045d52c0df087e926fbe63"}, + {file = "cryptography-45.0.5-cp37-abi3-macosx_10_9_universal2.whl", hash = "sha256:f3562c2f23c612f2e4a6964a61d942f891d29ee320edb62ff48ffb99f3de9ae8"}, + {file = "cryptography-45.0.5-cp37-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:3fcfbefc4a7f332dece7272a88e410f611e79458fab97b5efe14e54fe476f4fd"}, + {file = "cryptography-45.0.5-cp37-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:460f8c39ba66af7db0545a8c6f2eabcbc5a5528fc1cf6c3fa9a1e44cec33385e"}, + {file = "cryptography-45.0.5-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:9b4cf6318915dccfe218e69bbec417fdd7c7185aa7aab139a2c0beb7468c89f0"}, + {file = "cryptography-45.0.5-cp37-abi3-manylinux_2_28_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:2089cc8f70a6e454601525e5bf2779e665d7865af002a5dec8d14e561002e135"}, + {file = "cryptography-45.0.5-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:0027d566d65a38497bc37e0dd7c2f8ceda73597d2ac9ba93810204f56f52ebc7"}, + {file = "cryptography-45.0.5-cp37-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:be97d3a19c16a9be00edf79dca949c8fa7eff621763666a145f9f9535a5d7f42"}, + {file = "cryptography-45.0.5-cp37-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:7760c1c2e1a7084153a0f68fab76e754083b126a47d0117c9ed15e69e2103492"}, + {file = "cryptography-45.0.5-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:6ff8728d8d890b3dda5765276d1bc6fb099252915a2cd3aff960c4c195745dd0"}, + {file = "cryptography-45.0.5-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:7259038202a47fdecee7e62e0fd0b0738b6daa335354396c6ddebdbe1206af2a"}, + {file = "cryptography-45.0.5-cp37-abi3-win32.whl", hash = "sha256:1e1da5accc0c750056c556a93c3e9cb828970206c68867712ca5805e46dc806f"}, + {file = "cryptography-45.0.5-cp37-abi3-win_amd64.whl", hash = "sha256:90cb0a7bb35959f37e23303b7eed0a32280510030daba3f7fdfbb65defde6a97"}, + {file = "cryptography-45.0.5-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:206210d03c1193f4e1ff681d22885181d47efa1ab3018766a7b32a7b3d6e6afd"}, + {file = "cryptography-45.0.5-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:c648025b6840fe62e57107e0a25f604db740e728bd67da4f6f060f03017d5097"}, + {file = "cryptography-45.0.5-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:b8fa8b0a35a9982a3c60ec79905ba5bb090fc0b9addcfd3dc2dd04267e45f25e"}, + {file = "cryptography-45.0.5-pp310-pypy310_pp73-manylinux_2_34_aarch64.whl", hash = "sha256:14d96584701a887763384f3c47f0ca7c1cce322aa1c31172680eb596b890ec30"}, + {file = "cryptography-45.0.5-pp310-pypy310_pp73-manylinux_2_34_x86_64.whl", hash = "sha256:57c816dfbd1659a367831baca4b775b2a5b43c003daf52e9d57e1d30bc2e1b0e"}, + {file = "cryptography-45.0.5-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:b9e38e0a83cd51e07f5a48ff9691cae95a79bea28fe4ded168a8e5c6c77e819d"}, + {file = "cryptography-45.0.5-pp311-pypy311_pp73-macosx_10_9_x86_64.whl", hash = "sha256:8c4a6ff8a30e9e3d38ac0539e9a9e02540ab3f827a3394f8852432f6b0ea152e"}, + {file = "cryptography-45.0.5-pp311-pypy311_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:bd4c45986472694e5121084c6ebbd112aa919a25e783b87eb95953c9573906d6"}, + {file = "cryptography-45.0.5-pp311-pypy311_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:982518cd64c54fcada9d7e5cf28eabd3ee76bd03ab18e08a48cad7e8b6f31b18"}, + {file = "cryptography-45.0.5-pp311-pypy311_pp73-manylinux_2_34_aarch64.whl", hash = "sha256:12e55281d993a793b0e883066f590c1ae1e802e3acb67f8b442e721e475e6463"}, + {file = "cryptography-45.0.5-pp311-pypy311_pp73-manylinux_2_34_x86_64.whl", hash = "sha256:5aa1e32983d4443e310f726ee4b071ab7569f58eedfdd65e9675484a4eb67bd1"}, + {file = "cryptography-45.0.5-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:e357286c1b76403dd384d938f93c46b2b058ed4dfcdce64a770f0537ed3feb6f"}, + {file = "cryptography-45.0.5.tar.gz", hash = "sha256:72e76caa004ab63accdf26023fccd1d087f6d90ec6048ff33ad0445abf7f605a"}, +] + +[package.dependencies] +cffi = {version = ">=1.14", markers = "platform_python_implementation != \"PyPy\""} + +[package.extras] +docs = ["sphinx (>=5.3.0)", "sphinx-inline-tabs ; python_full_version >= \"3.8.0\"", "sphinx-rtd-theme (>=3.0.0) ; python_full_version >= \"3.8.0\""] +docstest = ["pyenchant (>=3)", "readme-renderer (>=30.0)", "sphinxcontrib-spelling (>=7.3.1)"] +nox = ["nox (>=2024.4.15)", "nox[uv] (>=2024.3.2) ; python_full_version >= \"3.8.0\""] +pep8test = ["check-sdist ; python_full_version >= \"3.8.0\"", "click (>=8.0.1)", "mypy (>=1.4)", "ruff (>=0.3.6)"] +sdist = ["build (>=1.0.0)"] +ssh = ["bcrypt (>=3.1.5)"] +test = ["certifi (>=2024)", "cryptography-vectors (==45.0.5)", "pretend (>=0.7)", "pytest (>=7.4.0)", "pytest-benchmark (>=4.0)", "pytest-cov (>=2.10.1)", "pytest-xdist (>=3.5.0)"] +test-randomorder = ["pytest-randomly"] + [[package]] name = "databricks-sdk" version = "0.28.0" @@ -966,6 +1159,19 @@ files = [ {file = "iniconfig-2.1.0.tar.gz", hash = "sha256:3abbd2e30b36733fee78f9c7f7308f2d0050e88f0087fd25c2645f63c773e1c7"}, ] +[[package]] +name = "isodate" +version = "0.7.2" +description = "An ISO 8601 date/time/duration parser and formatter" +optional = true +python-versions = ">=3.7" +groups = ["main"] +markers = "extra == \"salesforce\" or extra == \"all\"" +files = [ + {file = "isodate-0.7.2-py3-none-any.whl", hash = "sha256:28009937d8031054830160fce6d409ed342816b543597cece116d966c6d99e15"}, + {file = "isodate-0.7.2.tar.gz", hash = "sha256:4cd1aa0f43ca76f4a6c6c0292a85f40b35ec2e43e315b59f06e6d32171a953e6"}, +] + [[package]] name = "jinja2" version = "3.1.6" @@ -1009,6 +1215,117 @@ hf-datasets = ["datasets", "pandas"] pandas-datasets = ["pandas"] signing = ["betterproto (>=2.0.0b6)", "model-signing", "sigstore (>=3.6.1)"] +[[package]] +name = "lxml" +version = "6.0.0" +description = "Powerful and Pythonic XML processing library combining libxml2/libxslt with the ElementTree API." +optional = true +python-versions = ">=3.8" +groups = ["main"] +markers = "extra == \"salesforce\" or extra == \"all\"" +files = [ + {file = "lxml-6.0.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:35bc626eec405f745199200ccb5c6b36f202675d204aa29bb52e27ba2b71dea8"}, + {file = "lxml-6.0.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:246b40f8a4aec341cbbf52617cad8ab7c888d944bfe12a6abd2b1f6cfb6f6082"}, + {file = "lxml-6.0.0-cp310-cp310-manylinux2010_i686.manylinux2014_i686.manylinux_2_12_i686.manylinux_2_17_i686.whl", hash = "sha256:2793a627e95d119e9f1e19720730472f5543a6d84c50ea33313ce328d870f2dd"}, + {file = "lxml-6.0.0-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:46b9ed911f36bfeb6338e0b482e7fe7c27d362c52fde29f221fddbc9ee2227e7"}, + {file = "lxml-6.0.0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:2b4790b558bee331a933e08883c423f65bbcd07e278f91b2272489e31ab1e2b4"}, + {file = "lxml-6.0.0-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e2030956cf4886b10be9a0285c6802e078ec2391e1dd7ff3eb509c2c95a69b76"}, + {file = "lxml-6.0.0-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4d23854ecf381ab1facc8f353dcd9adeddef3652268ee75297c1164c987c11dc"}, + {file = "lxml-6.0.0-cp310-cp310-manylinux_2_31_armv7l.whl", hash = "sha256:43fe5af2d590bf4691531b1d9a2495d7aab2090547eaacd224a3afec95706d76"}, + {file = "lxml-6.0.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:74e748012f8c19b47f7d6321ac929a9a94ee92ef12bc4298c47e8b7219b26541"}, + {file = "lxml-6.0.0-cp310-cp310-musllinux_1_2_armv7l.whl", hash = "sha256:43cfbb7db02b30ad3926e8fceaef260ba2fb7df787e38fa2df890c1ca7966c3b"}, + {file = "lxml-6.0.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:34190a1ec4f1e84af256495436b2d196529c3f2094f0af80202947567fdbf2e7"}, + {file = "lxml-6.0.0-cp310-cp310-win32.whl", hash = "sha256:5967fe415b1920a3877a4195e9a2b779249630ee49ece22021c690320ff07452"}, + {file = "lxml-6.0.0-cp310-cp310-win_amd64.whl", hash = "sha256:f3389924581d9a770c6caa4df4e74b606180869043b9073e2cec324bad6e306e"}, + {file = "lxml-6.0.0-cp310-cp310-win_arm64.whl", hash = "sha256:522fe7abb41309e9543b0d9b8b434f2b630c5fdaf6482bee642b34c8c70079c8"}, + {file = "lxml-6.0.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:4ee56288d0df919e4aac43b539dd0e34bb55d6a12a6562038e8d6f3ed07f9e36"}, + {file = "lxml-6.0.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b8dd6dd0e9c1992613ccda2bcb74fc9d49159dbe0f0ca4753f37527749885c25"}, + {file = "lxml-6.0.0-cp311-cp311-manylinux2010_i686.manylinux2014_i686.manylinux_2_12_i686.manylinux_2_17_i686.whl", hash = "sha256:d7ae472f74afcc47320238b5dbfd363aba111a525943c8a34a1b657c6be934c3"}, + {file = "lxml-6.0.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:5592401cdf3dc682194727c1ddaa8aa0f3ddc57ca64fd03226a430b955eab6f6"}, + {file = "lxml-6.0.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:58ffd35bd5425c3c3b9692d078bf7ab851441434531a7e517c4984d5634cd65b"}, + {file = "lxml-6.0.0-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f720a14aa102a38907c6d5030e3d66b3b680c3e6f6bc95473931ea3c00c59967"}, + {file = "lxml-6.0.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c2a5e8d207311a0170aca0eb6b160af91adc29ec121832e4ac151a57743a1e1e"}, + {file = "lxml-6.0.0-cp311-cp311-manylinux_2_31_armv7l.whl", hash = "sha256:2dd1cc3ea7e60bfb31ff32cafe07e24839df573a5e7c2d33304082a5019bcd58"}, + {file = "lxml-6.0.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:2cfcf84f1defed7e5798ef4f88aa25fcc52d279be731ce904789aa7ccfb7e8d2"}, + {file = "lxml-6.0.0-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:a52a4704811e2623b0324a18d41ad4b9fabf43ce5ff99b14e40a520e2190c851"}, + {file = "lxml-6.0.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:c16304bba98f48a28ae10e32a8e75c349dd742c45156f297e16eeb1ba9287a1f"}, + {file = "lxml-6.0.0-cp311-cp311-win32.whl", hash = "sha256:f8d19565ae3eb956d84da3ef367aa7def14a2735d05bd275cd54c0301f0d0d6c"}, + {file = "lxml-6.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:b2d71cdefda9424adff9a3607ba5bbfc60ee972d73c21c7e3c19e71037574816"}, + {file = "lxml-6.0.0-cp311-cp311-win_arm64.whl", hash = "sha256:8a2e76efbf8772add72d002d67a4c3d0958638696f541734304c7f28217a9cab"}, + {file = "lxml-6.0.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:78718d8454a6e928470d511bf8ac93f469283a45c354995f7d19e77292f26108"}, + {file = "lxml-6.0.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:84ef591495ffd3f9dcabffd6391db7bb70d7230b5c35ef5148354a134f56f2be"}, + {file = "lxml-6.0.0-cp312-cp312-manylinux2010_i686.manylinux2014_i686.manylinux_2_12_i686.manylinux_2_17_i686.whl", hash = "sha256:2930aa001a3776c3e2601cb8e0a15d21b8270528d89cc308be4843ade546b9ab"}, + {file = "lxml-6.0.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:219e0431ea8006e15005767f0351e3f7f9143e793e58519dc97fe9e07fae5563"}, + {file = "lxml-6.0.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:bd5913b4972681ffc9718bc2d4c53cde39ef81415e1671ff93e9aa30b46595e7"}, + {file = "lxml-6.0.0-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:390240baeb9f415a82eefc2e13285016f9c8b5ad71ec80574ae8fa9605093cd7"}, + {file = "lxml-6.0.0-cp312-cp312-manylinux_2_27_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:d6e200909a119626744dd81bae409fc44134389e03fbf1d68ed2a55a2fb10991"}, + {file = "lxml-6.0.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ca50bd612438258a91b5b3788c6621c1f05c8c478e7951899f492be42defc0da"}, + {file = "lxml-6.0.0-cp312-cp312-manylinux_2_31_armv7l.whl", hash = "sha256:c24b8efd9c0f62bad0439283c2c795ef916c5a6b75f03c17799775c7ae3c0c9e"}, + {file = "lxml-6.0.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:afd27d8629ae94c5d863e32ab0e1d5590371d296b87dae0a751fb22bf3685741"}, + {file = "lxml-6.0.0-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:54c4855eabd9fc29707d30141be99e5cd1102e7d2258d2892314cf4c110726c3"}, + {file = "lxml-6.0.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:c907516d49f77f6cd8ead1322198bdfd902003c3c330c77a1c5f3cc32a0e4d16"}, + {file = "lxml-6.0.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:36531f81c8214e293097cd2b7873f178997dae33d3667caaae8bdfb9666b76c0"}, + {file = "lxml-6.0.0-cp312-cp312-win32.whl", hash = "sha256:690b20e3388a7ec98e899fd54c924e50ba6693874aa65ef9cb53de7f7de9d64a"}, + {file = "lxml-6.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:310b719b695b3dd442cdfbbe64936b2f2e231bb91d998e99e6f0daf991a3eba3"}, + {file = "lxml-6.0.0-cp312-cp312-win_arm64.whl", hash = "sha256:8cb26f51c82d77483cdcd2b4a53cda55bbee29b3c2f3ddeb47182a2a9064e4eb"}, + {file = "lxml-6.0.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:6da7cd4f405fd7db56e51e96bff0865b9853ae70df0e6720624049da76bde2da"}, + {file = "lxml-6.0.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:b34339898bb556a2351a1830f88f751679f343eabf9cf05841c95b165152c9e7"}, + {file = "lxml-6.0.0-cp313-cp313-manylinux2010_i686.manylinux2014_i686.manylinux_2_12_i686.manylinux_2_17_i686.whl", hash = "sha256:51a5e4c61a4541bd1cd3ba74766d0c9b6c12d6a1a4964ef60026832aac8e79b3"}, + {file = "lxml-6.0.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:d18a25b19ca7307045581b18b3ec9ead2b1db5ccd8719c291f0cd0a5cec6cb81"}, + {file = "lxml-6.0.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:d4f0c66df4386b75d2ab1e20a489f30dc7fd9a06a896d64980541506086be1f1"}, + {file = "lxml-6.0.0-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9f4b481b6cc3a897adb4279216695150bbe7a44c03daba3c894f49d2037e0a24"}, + {file = "lxml-6.0.0-cp313-cp313-manylinux_2_27_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:8a78d6c9168f5bcb20971bf3329c2b83078611fbe1f807baadc64afc70523b3a"}, + {file = "lxml-6.0.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2ae06fbab4f1bb7db4f7c8ca9897dc8db4447d1a2b9bee78474ad403437bcc29"}, + {file = "lxml-6.0.0-cp313-cp313-manylinux_2_31_armv7l.whl", hash = "sha256:1fa377b827ca2023244a06554c6e7dc6828a10aaf74ca41965c5d8a4925aebb4"}, + {file = "lxml-6.0.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:1676b56d48048a62ef77a250428d1f31f610763636e0784ba67a9740823988ca"}, + {file = "lxml-6.0.0-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:0e32698462aacc5c1cf6bdfebc9c781821b7e74c79f13e5ffc8bfe27c42b1abf"}, + {file = "lxml-6.0.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:4d6036c3a296707357efb375cfc24bb64cd955b9ec731abf11ebb1e40063949f"}, + {file = "lxml-6.0.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:7488a43033c958637b1a08cddc9188eb06d3ad36582cebc7d4815980b47e27ef"}, + {file = "lxml-6.0.0-cp313-cp313-win32.whl", hash = "sha256:5fcd7d3b1d8ecb91445bd71b9c88bdbeae528fefee4f379895becfc72298d181"}, + {file = "lxml-6.0.0-cp313-cp313-win_amd64.whl", hash = "sha256:2f34687222b78fff795feeb799a7d44eca2477c3d9d3a46ce17d51a4f383e32e"}, + {file = "lxml-6.0.0-cp313-cp313-win_arm64.whl", hash = "sha256:21db1ec5525780fd07251636eb5f7acb84003e9382c72c18c542a87c416ade03"}, + {file = "lxml-6.0.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:4eb114a0754fd00075c12648d991ec7a4357f9cb873042cc9a77bf3a7e30c9db"}, + {file = "lxml-6.0.0-cp38-cp38-manylinux2010_i686.manylinux2014_i686.manylinux_2_12_i686.manylinux_2_17_i686.whl", hash = "sha256:7da298e1659e45d151b4028ad5c7974917e108afb48731f4ed785d02b6818994"}, + {file = "lxml-6.0.0-cp38-cp38-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:7bf61bc4345c1895221357af8f3e89f8c103d93156ef326532d35c707e2fb19d"}, + {file = "lxml-6.0.0-cp38-cp38-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:63b634facdfbad421d4b61c90735688465d4ab3a8853ac22c76ccac2baf98d97"}, + {file = "lxml-6.0.0-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:e380e85b93f148ad28ac15f8117e2fd8e5437aa7732d65e260134f83ce67911b"}, + {file = "lxml-6.0.0-cp38-cp38-win32.whl", hash = "sha256:185efc2fed89cdd97552585c624d3c908f0464090f4b91f7d92f8ed2f3b18f54"}, + {file = "lxml-6.0.0-cp38-cp38-win_amd64.whl", hash = "sha256:f97487996a39cb18278ca33f7be98198f278d0bc3c5d0fd4d7b3d63646ca3c8a"}, + {file = "lxml-6.0.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:85b14a4689d5cff426c12eefe750738648706ea2753b20c2f973b2a000d3d261"}, + {file = "lxml-6.0.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:f64ccf593916e93b8d36ed55401bb7fe9c7d5de3180ce2e10b08f82a8f397316"}, + {file = "lxml-6.0.0-cp39-cp39-manylinux2010_i686.manylinux2014_i686.manylinux_2_12_i686.manylinux_2_17_i686.whl", hash = "sha256:b372d10d17a701b0945f67be58fae4664fd056b85e0ff0fbc1e6c951cdbc0512"}, + {file = "lxml-6.0.0-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:a674c0948789e9136d69065cc28009c1b1874c6ea340253db58be7622ce6398f"}, + {file = "lxml-6.0.0-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:edf6e4c8fe14dfe316939711e3ece3f9a20760aabf686051b537a7562f4da91a"}, + {file = "lxml-6.0.0-cp39-cp39-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:048a930eb4572829604982e39a0c7289ab5dc8abc7fc9f5aabd6fbc08c154e93"}, + {file = "lxml-6.0.0-cp39-cp39-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c0b5fa5eda84057a4f1bbb4bb77a8c28ff20ae7ce211588d698ae453e13c6281"}, + {file = "lxml-6.0.0-cp39-cp39-manylinux_2_31_armv7l.whl", hash = "sha256:c352fc8f36f7e9727db17adbf93f82499457b3d7e5511368569b4c5bd155a922"}, + {file = "lxml-6.0.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:8db5dc617cb937ae17ff3403c3a70a7de9df4852a046f93e71edaec678f721d0"}, + {file = "lxml-6.0.0-cp39-cp39-musllinux_1_2_armv7l.whl", hash = "sha256:2181e4b1d07dde53986023482673c0f1fba5178ef800f9ab95ad791e8bdded6a"}, + {file = "lxml-6.0.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:b3c98d5b24c6095e89e03d65d5c574705be3d49c0d8ca10c17a8a4b5201b72f5"}, + {file = "lxml-6.0.0-cp39-cp39-win32.whl", hash = "sha256:04d67ceee6db4bcb92987ccb16e53bef6b42ced872509f333c04fb58a3315256"}, + {file = "lxml-6.0.0-cp39-cp39-win_amd64.whl", hash = "sha256:e0b1520ef900e9ef62e392dd3d7ae4f5fa224d1dd62897a792cf353eb20b6cae"}, + {file = "lxml-6.0.0-cp39-cp39-win_arm64.whl", hash = "sha256:e35e8aaaf3981489f42884b59726693de32dabfc438ac10ef4eb3409961fd402"}, + {file = "lxml-6.0.0-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:dbdd7679a6f4f08152818043dbb39491d1af3332128b3752c3ec5cebc0011a72"}, + {file = "lxml-6.0.0-pp310-pypy310_pp73-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:40442e2a4456e9910875ac12951476d36c0870dcb38a68719f8c4686609897c4"}, + {file = "lxml-6.0.0-pp310-pypy310_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:db0efd6bae1c4730b9c863fc4f5f3c0fa3e8f05cae2c44ae141cb9dfc7d091dc"}, + {file = "lxml-6.0.0-pp310-pypy310_pp73-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9ab542c91f5a47aaa58abdd8ea84b498e8e49fe4b883d67800017757a3eb78e8"}, + {file = "lxml-6.0.0-pp310-pypy310_pp73-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:013090383863b72c62a702d07678b658fa2567aa58d373d963cca245b017e065"}, + {file = "lxml-6.0.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:c86df1c9af35d903d2b52d22ea3e66db8058d21dc0f59842ca5deb0595921141"}, + {file = "lxml-6.0.0-pp39-pypy39_pp73-macosx_10_15_x86_64.whl", hash = "sha256:4337e4aec93b7c011f7ee2e357b0d30562edd1955620fdd4aeab6aacd90d43c5"}, + {file = "lxml-6.0.0-pp39-pypy39_pp73-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:ae74f7c762270196d2dda56f8dd7309411f08a4084ff2dfcc0b095a218df2e06"}, + {file = "lxml-6.0.0-pp39-pypy39_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:059c4cbf3973a621b62ea3132934ae737da2c132a788e6cfb9b08d63a0ef73f9"}, + {file = "lxml-6.0.0-pp39-pypy39_pp73-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:17f090a9bc0ce8da51a5632092f98a7e7f84bca26f33d161a98b57f7fb0004ca"}, + {file = "lxml-6.0.0-pp39-pypy39_pp73-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9da022c14baeec36edfcc8daf0e281e2f55b950249a455776f0d1adeeada4734"}, + {file = "lxml-6.0.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:a55da151d0b0c6ab176b4e761670ac0e2667817a1e0dadd04a01d0561a219349"}, + {file = "lxml-6.0.0.tar.gz", hash = "sha256:032e65120339d44cdc3efc326c9f660f5f7205f3a535c1fdbf898b29ea01fb72"}, +] + +[package.extras] +cssselect = ["cssselect (>=0.7)"] +html-clean = ["lxml_html_clean"] +html5 = ["html5lib"] +htmlsoup = ["BeautifulSoup4"] + [[package]] name = "markdown" version = "3.7" @@ -1267,6 +1584,19 @@ mkdocs-autorefs = ">=1.4" mkdocstrings = ">=0.28.3" typing-extensions = {version = ">=4.0", markers = "python_version < \"3.11\""} +[[package]] +name = "more-itertools" +version = "10.7.0" +description = "More routines for operating on iterables, beyond itertools" +optional = true +python-versions = ">=3.9" +groups = ["main"] +markers = "extra == \"salesforce\" or extra == \"all\"" +files = [ + {file = "more_itertools-10.7.0-py3-none-any.whl", hash = "sha256:d43980384673cb07d2f7d2d918c616b30c659c089ee23953f601d6609c67510e"}, + {file = "more_itertools-10.7.0.tar.gz", hash = "sha256:9fddd5403be01a94b204faadcff459ec3568cf110265d3c54323e1e866ad29d3"}, +] + [[package]] name = "multidict" version = "6.2.0" @@ -1825,6 +2155,19 @@ files = [ [package.dependencies] pyasn1 = ">=0.4.6,<0.7.0" +[[package]] +name = "pycparser" +version = "2.22" +description = "C parser in Python" +optional = true +python-versions = ">=3.8" +groups = ["main"] +markers = "(extra == \"salesforce\" or extra == \"all\") and platform_python_implementation != \"PyPy\"" +files = [ + {file = "pycparser-2.22-py3-none-any.whl", hash = "sha256:c3702b6d3dd8c7abc1afa565d7e63d53a1d0bd86cdc24edd75470f4de499cfcc"}, + {file = "pycparser-2.22.tar.gz", hash = "sha256:491c8be9c040f5390f5bf44a5b07752bd07f56edf992381b05c701439eec10f6"}, +] + [[package]] name = "pygments" version = "2.19.1" @@ -1840,6 +2183,28 @@ files = [ [package.extras] windows-terminal = ["colorama (>=0.4.6)"] +[[package]] +name = "pyjwt" +version = "2.10.1" +description = "JSON Web Token implementation in Python" +optional = true +python-versions = ">=3.9" +groups = ["main"] +markers = "extra == \"salesforce\" or extra == \"all\"" +files = [ + {file = "PyJWT-2.10.1-py3-none-any.whl", hash = "sha256:dcdd193e30abefd5debf142f9adfcdd2b58004e644f25406ffaebd50bd98dacb"}, + {file = "pyjwt-2.10.1.tar.gz", hash = "sha256:3cc5772eb20009233caf06e9d8a0577824723b44e6648ee0a2aedb6cf9381953"}, +] + +[package.dependencies] +cryptography = {version = ">=3.4.0", optional = true, markers = "extra == \"crypto\""} + +[package.extras] +crypto = ["cryptography (>=3.4.0)"] +dev = ["coverage[toml] (==5.0.4)", "cryptography (>=3.4.0)", "pre-commit", "pytest (>=6.0.0,<7.0.0)", "sphinx", "sphinx-rtd-theme", "zope.interface"] +docs = ["sphinx", "sphinx-rtd-theme", "zope.interface"] +tests = ["coverage[toml] (==5.0.4)", "pytest (>=6.0.0,<7.0.0)"] + [[package]] name = "pymdown-extensions" version = "10.14.3" @@ -2049,6 +2414,38 @@ urllib3 = ">=1.21.1,<3" socks = ["PySocks (>=1.5.6,!=1.5.7)"] use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"] +[[package]] +name = "requests-file" +version = "2.1.0" +description = "File transport adapter for Requests" +optional = true +python-versions = "*" +groups = ["main"] +markers = "extra == \"salesforce\" or extra == \"all\"" +files = [ + {file = "requests_file-2.1.0-py2.py3-none-any.whl", hash = "sha256:cf270de5a4c5874e84599fc5778303d496c10ae5e870bfa378818f35d21bda5c"}, + {file = "requests_file-2.1.0.tar.gz", hash = "sha256:0f549a3f3b0699415ac04d167e9cb39bccfb730cb832b4d20be3d9867356e658"}, +] + +[package.dependencies] +requests = ">=1.0.0" + +[[package]] +name = "requests-toolbelt" +version = "1.0.0" +description = "A utility belt for advanced users of python-requests" +optional = true +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +groups = ["main"] +markers = "extra == \"salesforce\" or extra == \"all\"" +files = [ + {file = "requests-toolbelt-1.0.0.tar.gz", hash = "sha256:7681a0a3d047012b5bdc0ee37d7f8f07ebe76ab08caeccfc3921ce23c88d5bc6"}, + {file = "requests_toolbelt-1.0.0-py2.py3-none-any.whl", hash = "sha256:cccfdd665f0a24fcf4726e690f65639d272bb0637b9b92dfd91a5568ccf6bd06"}, +] + +[package.dependencies] +requests = ">=2.0.1,<3.0.0" + [[package]] name = "rsa" version = "4.9" @@ -2065,6 +2462,26 @@ files = [ [package.dependencies] pyasn1 = ">=0.1.3" +[[package]] +name = "simple-salesforce" +version = "1.12.6" +description = "A basic Salesforce.com REST API client." +optional = true +python-versions = "*" +groups = ["main"] +markers = "extra == \"salesforce\" or extra == \"all\"" +files = [ + {file = "simple-salesforce-1.12.6.tar.gz", hash = "sha256:77590606c781905f6b75430562951dd2b062438da7f55fca2b61e4cde31df15b"}, + {file = "simple_salesforce-1.12.6-py2.py3-none-any.whl", hash = "sha256:66c74bee88d09ace46e4fc9c2f6b47c0d012817a764f70a5455d6dc2c7ed635c"}, +] + +[package.dependencies] +more-itertools = "*" +pyjwt = {version = "*", extras = ["crypto"]} +requests = ">=2.22.0" +typing-extensions = "*" +zeep = "*" + [[package]] name = "six" version = "1.17.0" @@ -2150,7 +2567,7 @@ description = "Backported and Experimental Type Hints for Python 3.8+" optional = false python-versions = ">=3.8" groups = ["main"] -markers = "extra == \"datasets\" or extra == \"all\" or python_version < \"3.11\"" +markers = "extra == \"datasets\" or extra == \"all\" or extra == \"salesforce\" or python_version < \"3.11\"" files = [ {file = "typing_extensions-4.13.0-py3-none-any.whl", hash = "sha256:c8dd92cc0d6425a97c18fbb9d1954e5ff92c1ca881a309c45f06ebc0b79058e5"}, {file = "typing_extensions-4.13.0.tar.gz", hash = "sha256:0a4ac55a5820789d87e297727d229866c9650f6521b64206413c4fbada24d95b"}, @@ -2461,6 +2878,35 @@ idna = ">=2.0" multidict = ">=4.0" propcache = ">=0.2.0" +[[package]] +name = "zeep" +version = "4.3.1" +description = "A Python SOAP client" +optional = true +python-versions = ">=3.8" +groups = ["main"] +markers = "extra == \"salesforce\" or extra == \"all\"" +files = [ + {file = "zeep-4.3.1-py3-none-any.whl", hash = "sha256:a637aa7eedb6330bb27e8c94c5233ddf23553904323adf9398f8cf5025acb216"}, + {file = "zeep-4.3.1.tar.gz", hash = "sha256:f45385e9e1b09d5550e0f51ab9fa7c6842713cab7194139372fd82a99c56a06e"}, +] + +[package.dependencies] +attrs = ">=17.2.0" +isodate = ">=0.5.4" +lxml = ">=4.6.0" +platformdirs = ">=1.4.0" +pytz = "*" +requests = ">=2.7.0" +requests-file = ">=1.5.1" +requests-toolbelt = ">=0.7.1" + +[package.extras] +async = ["httpx (>=0.15.0)"] +docs = ["sphinx (>=1.4.0)"] +test = ["coverage[toml] (==7.6.2)", "flake8 (==7.1.1)", "flake8-blind-except (==0.2.1)", "flake8-debugger (==4.1.2)", "flake8-imports (==0.1.1)", "freezegun (==1.5.1)", "isort (==5.13.2)", "pretend (==1.0.9)", "pytest (==8.3.3)", "pytest-asyncio", "pytest-cov (==5.0.0)", "pytest-httpx", "requests-mock (==1.12.1)"] +xmlsec = ["xmlsec (>=0.6.1)"] + [[package]] name = "zipp" version = "3.21.0" @@ -2483,14 +2929,15 @@ test = ["big-O", "importlib-resources ; python_version < \"3.9\"", "jaraco.funct type = ["pytest-mypy"] [extras] -all = ["databricks-sdk", "datasets", "faker", "kagglehub"] +all = ["databricks-sdk", "datasets", "faker", "kagglehub", "simple-salesforce"] databricks = ["databricks-sdk"] datasets = ["datasets"] faker = ["faker"] kaggle = ["kagglehub"] lance = [] +salesforce = ["simple-salesforce"] [metadata] lock-version = "2.1" python-versions = ">=3.9,<3.13" -content-hash = "568b719c5b65713e7ed935ddac5d501f69f94c1f055addebfaae02e63bb17a23" +content-hash = "3fdb397bb1f9ea104625d5eb8f379396fabf5e30fb0923cd29b5638bdc9fc11d"