In [1]:
from datahub.emitter.rest_emitter import DatahubRestEmitter

# Point to your DataHub REST endpoint
emitter = DatahubRestEmitter("http://localhost:8080")

In [2]:
from datahub.metadata.schema_classes import DatasetSnapshotClass, DatasetPropertiesClass, MetadataChangeEventClass

# Create a dataset URN
dataset_urn = "urn:li:dataset:(urn:li:dataPlatform:spark,example_dataset,PROD)"

# Define dataset properties
snapshot = DatasetSnapshotClass(
    urn=dataset_urn,
    aspects=[
        DatasetPropertiesClass(
            description="This dataset is an example generated by PySpark",
            customProperties={"key": "value"}
        )
    ]
)

# Wrap in a Metadata Change Event (MCE)
mce = MetadataChangeEventClass(proposedSnapshot=snapshot)

# Emit the MCE
emitter.emit_mce(mce)


In [3]:
from datahub.metadata.schema_classes import UpstreamLineageClass, UpstreamClass

# Define input and output datasets
input_urn = "urn:li:dataset:(urn:li:dataPlatform:spark,input_dataset,PROD)"
output_urn = "urn:li:dataset:(urn:li:dataPlatform:spark,output_dataset,PROD)"

# Define upstream lineage
lineage = UpstreamLineageClass(
    upstreams=[
        UpstreamClass(
            dataset=input_urn,
            type="TRANSFORMED"
        )
    ]
)

# Add lineage to the output dataset
snapshot = DatasetSnapshotClass(
    urn=output_urn,
    aspects=[lineage]
)

# Emit the lineage
mce = MetadataChangeEventClass(proposedSnapshot=snapshot)
emitter.emit_mce(mce)

In [4]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
          .master("local[*]") \
          .appName("spark-datahub-example") \
          .config("spark.jars.packages","io.acryl:datahub-spark-lineage:0.14.0") \
          .config("spark.extraListeners","datahub.spark.DatahubSparkListener") \
          .config("spark.datahub.rest.server", "http://localhost:8080") \
          .enableHiveSupport() \
          .getOrCreate()

25/01/09 18:08:55 WARN Utils: Your hostname, baptvit resolves to a loopback address: 127.0.1.1; using 192.168.2.129 instead (on interface wlp4s0)
25/01/09 18:08:55 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
:: loading settings :: url = jar:file:/opt/spark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/baptvit/.ivy2/cache
The jars for the packages stored in: /home/baptvit/.ivy2/jars
io.acryl#datahub-spark-lineage added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-1753987f-c714-466c-b522-7cb6afefe29a;1.0
	confs: [default]
	found io.acryl#datahub-spark-lineage;0.14.0 in central
:: resolution report :: resolve 61ms :: artifacts dl 2ms
	:: modules in use:
	io.acryl#datahub-spark-lineage;0.14.0 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   1   |   0   |   0   |   0   ||   1   |   0   |
	---------------------------------------------------------------------
:: retrieving :: org.apache.spark#spark-submit-parent-1753987f-c714-466c-b522-7cb6af

25/01/09 18:08:56 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


25/01/09 18:08:56 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


SLF4J: Failed to load class "org.slf4j.impl.StaticLoggerBinder".
SLF4J: Defaulting to no-operation (NOP) logger implementation
SLF4J: See http://www.slf4j.org/codes.html#StaticLoggerBinder for further details.


In [5]:
spark

In [6]:
from datahub.emitter.mce_builder import make_dataset_urn
from datahub.metadata.schema_classes import DatasetSnapshotClass, MetadataChangeEventClass

# Load a dataset
df = spark.read.csv("industry.csv", header=True, inferSchema=True)

# # Capture dataset metadata
# dataset_urn = make_dataset_urn("csv", "industry.csv", "PROD")

# # Create a DatasetSnapshot
# dataset_snapshot = DatasetSnapshotClass(
#     urn=dataset_urn,
#     aspects=[],  # Add relevant aspects like schema, ownership, etc.
# )

# # Create a MetadataChangeEvent
# mce = MetadataChangeEventClass(proposedSnapshot=dataset_snapshot)

# # Emit the metadata to DataHub
# emitter.emit(mce)

SLF4J: Failed to load class "org.slf4j.impl.StaticLoggerBinder".
SLF4J: Defaulting to no-operation (NOP) logger implementation
SLF4J: See http://www.slf4j.org/codes.html#StaticLoggerBinder for further details.


In [4]:
df.show()

+--------------------+
|            Industry|
+--------------------+
|  Accounting/Finance|
|Advertising/Publi...|
|  Aerospace/Aviation|
|Arts/Entertainmen...|
|          Automotive|
|    Banking/Mortgage|
|Business Development|
|Business Opportunity|
|Clerical/Administ...|
|Construction/Faci...|
|      Consumer Goods|
|    Customer Service|
|  Education/Training|
|    Energy/Utilities|
|         Engineering|
| Government/Military|
|               Green|
|          Healthcare|
|  Hospitality/Travel|
|     Human Resources|
+--------------------+
only showing top 20 rows



In [7]:
df_transformed = transformed_df = df.withColumn("new_column_Industry", df["Industry"] * 2)
df_transformed.write.csv("output.csv")

In [8]:
# Emit input and output datasets
input_urn = "urn:li:dataset:(urn:li:dataPlatform:file,input_csv,PROD)"
output_urn = "urn:li:dataset:(urn:li:dataPlatform:file,output_csv,PROD)"

# Log lineage
lineage = UpstreamLineageClass(
    upstreams=[
        UpstreamClass(
            dataset=input_urn,
            type="TRANSFORMED"
        )
    ]
)

snapshot = DatasetSnapshotClass(
    urn=output_urn,
    aspects=[lineage]
)
mce = MetadataChangeEventClass(proposedSnapshot=snapshot)
emitter.emit_mce(mce)

In [5]:
from datahub.emitter.rest_emitter import DatahubRestEmitter

# Initialize the DataHub REST emitter
emitter = DatahubRestEmitter(gms_server="http://localhost:8080")

In [6]:
# Capture dataset metadata
dataset_urn = make_dataset_urn("csv", "industry.csv", "PROD")

# Create a DatasetSnapshot
dataset_snapshot = DatasetSnapshotClass(
    urn=dataset_urn,
    aspects=[],  # Add relevant aspects like schema, ownership, etc.
)

# Create a MetadataChangeEvent
mce = MetadataChangeEventClass(proposedSnapshot=dataset_snapshot)

# Emit the metadata to DataHub
emitter.emit(mce)

In [7]:
from datahub.metadata.schema_classes import SchemaMetadataClass, SchemaFieldClass

# Extract schema from the DataFrame
schema_fields = [
    SchemaFieldClass(fieldPath=field.name, type=str(field.dataType), nativeDataType=str(field.dataType))
    for field in df.schema.fields
]

# Create schema metadata
schema_metadata = SchemaMetadataClass(
    schemaName="industry_example_test",
    platform="csv",
    version=0,
    fields=schema_fields,
    hash="123",
    platformSchema="spark",
)

# Add schema metadata to the dataset snapshot
dataset_snapshot.aspects.append(schema_metadata)

# Emit the updated metadata
emitter.emit(MetadataChangeEventClass(proposedSnapshot=dataset_snapshot))

AvroTypeException: (<avro.schema.RecordSchema object at 0x723daf589ed0>, DatasetSnapshotClass({'urn': 'urn:li:dataset:(urn:li:dataPlatform:csv,industry.csv,PROD)', 'aspects': [SchemaMetadataClass({'schemaName': 'industry_example_test', 'platform': 'csv', 'version': 0, 'created': AuditStampClass({'time': 0, 'actor': 'urn:li:corpuser:unknown', 'impersonator': None, 'message': None}), 'lastModified': AuditStampClass({'time': 0, 'actor': 'urn:li:corpuser:unknown', 'impersonator': None, 'message': None}), 'deleted': None, 'dataset': None, 'cluster': None, 'hash': '123', 'platformSchema': 'spark', 'fields': [SchemaFieldClass({'fieldPath': 'Industry', 'jsonPath': None, 'nullable': False, 'description': None, 'label': None, 'created': None, 'lastModified': None, 'type': 'StringType()', 'nativeDataType': 'StringType()', 'recursive': False, 'globalTags': None, 'glossaryTerms': None, 'isPartOfKey': False, 'isPartitioningKey': None, 'jsonProps': None})], 'primaryKeys': None, 'foreignKeysSpecs': None, 'foreignKeys': None})]}))

In [8]:
from datahub.metadata.schema_classes import UpstreamClass, DatasetLineageTypeClass

# Example: Transform the dataset
transformed_df = df.withColumn("new_column_Industry", df["Industry"] * 2)

# Capture lineage information
upstream = UpstreamClass(
    dataset=dataset_urn,
    type=DatasetLineageTypeClass.TRANSFORMED,
)

# Create a lineage snapshot for the transformed dataset
transformed_dataset_urn = make_dataset_urn("csv", "path/to/transformed_dataset.csv", "PROD")
lineage_snapshot = DatasetSnapshotClass(
    urn=transformed_dataset_urn,
    aspects=[upstream],
)

# Emit the lineage metadata
emitter.emit(MetadataChangeEventClass(proposedSnapshot=lineage_snapshot))

AvroTypeException: (<avro.schema.RecordSchema object at 0x723daf589ed0>, DatasetSnapshotClass({'urn': 'urn:li:dataset:(urn:li:dataPlatform:csv,path/to/transformed_dataset.csv,PROD)', 'aspects': [UpstreamClass({'auditStamp': AuditStampClass({'time': 0, 'actor': 'urn:li:corpuser:unknown', 'impersonator': None, 'message': None}), 'created': None, 'dataset': 'urn:li:dataset:(urn:li:dataPlatform:csv,industry.csv,PROD)', 'type': 'TRANSFORMED', 'properties': None, 'query': None})]}))

In [17]:
df.write.parquet("output_v2") # or df.write.saveAsTable("my_table")

In [15]:
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.metadata.schema_classes import DatasetPropertiesClass, OtherSchemaClass, SchemaMetadataClass, SchemaFieldClass, DataPlatformInstanceClass
from datahub.metadata.schema_classes import DataProcessInstancePropertiesClass, DataProcessInstanceRunEventClass, StatusClass
from datahub.metadata.schema_classes import DataFlowClass, DataJobClass

# Perform some Spark operations
df.write.parquet("/tmp/output_v2") # or df.write.saveAsTable("my_table")

# Emit dataflow and datajob manually for more control
dataflow_urn = "urn:li:dataFlow:(spark,my_spark_dataflow,prod)"
datajob_urn = "urn:li:dataJob:(spark,my_spark_dataflow,my_spark_job)"

dataflow_mcp = MetadataChangeProposalWrapper(
    entityUrn=dataflow_urn,
    aspect=DataFlowClass(name="my_spark_dataflow", platform="spark"),
    changeCategory=ChangeCategory.CREATE,
)
emitter.emit_mcp(dataflow_mcp)

datajob_mcp = MetadataChangeProposalWrapper(
    entityUrn=datajob_urn,
    aspect=DataJobClass(name="my_spark_job", flow=dataflow_urn, platform="spark"),
    changeCategory=ChangeCategory.CREATE,
)
emitter.emit_mcp(datajob_mcp)

# Stop Spark Session
spark.stop()

# Flush the emitter to ensure all metadata is sent
emitter.flush()


ImportError: cannot import name 'DataFlowClass' from 'datahub.metadata.schema_classes' (/home/baptvit/Documents/github/lakehouse-labs/.venv/lib/python3.11/site-packages/datahub/metadata/schema_classes.py)