In [1]:
import os
from dotenv import load_dotenv
from typing import List
from datahub.emitter.mcp import MetadataChangeProposalWrapper
import datahub.metadata.schema_classes as models
import datahub.emitter.mce_builder as builder
from datahub.emitter.rest_emitter import DatahubRestEmitter
# Load variables from a local .env file before they are read below.
load_dotenv()

# Copy .env_example to a new file named .env and fill in your own values.
# Note: the misspelt name `datashub_gms_server` is kept because the emitter cell reads it.
datashub_gms_server = os.environ.get("DATAHUB_GMS_SERVER", "")
datahub_token = os.environ.get("DATAHUB_TOKEN", "")
datahub_actor = os.environ.get("DATAHUB_ACTOR", "urn:li:corpuser:admin")

# Write to DEV first so PROD stays clean until you are comfortable with the workflow.
datahub_env = "DEV"

### Create emitter

In [2]:
# Build the REST emitter used by the cells below; the actor URN is passed as an extra header.
emitter = DatahubRestEmitter(
    gms_server=datashub_gms_server,
    token=datahub_token,
    extra_headers={"X-DataHub-Actor": datahub_actor},
)

### Create a tag

In [3]:
# Define the tag's name and description, then upsert its "tagProperties" aspect.
tag_name = "healthy"
tag_description = "Healthy"
mcpw = MetadataChangeProposalWrapper(
    entityType="tag",
    changeType=models.ChangeTypeClass.UPSERT,
    entityUrn=builder.make_tag_urn(tag_name),
    aspectName="tagProperties",
    aspect=models.TagPropertiesClass(name=tag_name, description=tag_description),
)
emitter.emit_mcp(mcp=mcpw)

### Create a user

In [4]:
# Register a user by upserting its "corpUserInfo" aspect.
# NOTE(review): the display name (which contains a space) is also used as the URN id --
# confirm that is intended; the ownership cell builds the same URN from user_name.
user_name = "Joe Bloggs"
user_email = "joe.bloggs@fake.com"
mcpw = MetadataChangeProposalWrapper(
    entityType="corpUser",
    changeType=models.ChangeTypeClass.UPSERT,
    entityUrn=builder.make_user_urn(username=user_name),
    aspectName="corpUserInfo",
    aspect=models.CorpUserInfoClass(
        active=True,
        displayName=user_name,
        email=user_email,
    ),
)
emitter.emit_mcp(mcp=mcpw)

### Create a dataset

In [5]:
# Describe the dataset, derive its URN, and upsert the "datasetProperties" aspect.
dataset_platform = "bigquery"
dataset_name = "project_a.dataset_a.table_a"
dataset_description = "my great dataset"
dataset_url = "https://www.google.ie/"
dataset_urn = builder.make_dataset_urn(
    platform=dataset_platform,
    name=dataset_name,
    env=datahub_env,
)
mcpw = MetadataChangeProposalWrapper(
    entityType="dataset",
    changeType=models.ChangeTypeClass.UPSERT,
    entityUrn=dataset_urn,
    aspectName="datasetProperties",
    aspect=models.DatasetPropertiesClass(
        description=dataset_description,
        externalUrl=dataset_url,
    ),
)
emitter.emit_mcp(mcp=mcpw)

### Tag a dataset

In [6]:
# Tag the dataset with the tag created earlier in this notebook.
# Reuses `dataset_urn` and `tag_name` from the cells above so the URN and tag stay in sync
# with their definitions instead of being rebuilt / hard-coded here.
# NOTE(review): an UPSERT of "globalTags" presumably replaces any tags already on the
# dataset rather than appending -- confirm before running against a dataset with existing tags.
mcpw = MetadataChangeProposalWrapper(
    entityType="dataset",
    changeType=models.ChangeTypeClass.UPSERT,
    entityUrn=dataset_urn,
    aspectName="globalTags",
    aspect=models.GlobalTagsClass(
        tags=[models.TagAssociationClass(tag=builder.make_tag_urn(tag_name))],
    ),
)
emitter.emit_mcp(mcp=mcpw)

### Add owner to a dataset

In [7]:
# Set "Joe Bloggs" (the user created above) as the dataset's owner.
# Reuses `dataset_urn` from the dataset cell instead of rebuilding the same URN here.
# NOTE(review): newer DataHub releases appear to deprecate DATAOWNER in favour of more
# specific ownership types -- confirm against your server version before changing it.
mcpw = MetadataChangeProposalWrapper(
    entityType="dataset",
    changeType=models.ChangeTypeClass.UPSERT,
    entityUrn=dataset_urn,
    aspectName="ownership",
    aspect=models.OwnershipClass(
        owners=[
            models.OwnerClass(
                owner=builder.make_user_urn(user_name),
                # Use the schema enum rather than a bare string, matching the lineage cell.
                type=models.OwnershipTypeClass.DATAOWNER,
            )
        ],
    ),
)
emitter.emit_mcp(mcp=mcpw)

### Add lineage to a dataset

In [8]:
# Create a second dataset (table_b) that will act as the upstream in the lineage below.
dataset_platform_b = "bigquery"
dataset_name_b = "project_a.dataset_a.table_b"
dataset_description_b = "my great dataset b"
dataset_url_b = "https://www.google.ie/"
dataset_urn_b = builder.make_dataset_urn(
    platform=dataset_platform_b,
    name=dataset_name_b,
    env=datahub_env,
)
mcpw = MetadataChangeProposalWrapper(
    entityType="dataset",
    changeType=models.ChangeTypeClass.UPSERT,
    entityUrn=dataset_urn_b,
    aspectName="datasetProperties",
    aspect=models.DatasetPropertiesClass(
        description=dataset_description_b,
        externalUrl=dataset_url_b,
    ),
)
emitter.emit_mcp(mcp=mcpw)

# Attach every URN in `upstream_datasets` to `dataset_urn` via the "upstreamLineage" aspect.
upstream_datasets = [dataset_urn_b]
mcpw = MetadataChangeProposalWrapper(
    entityType="dataset",
    changeType=models.ChangeTypeClass.UPSERT,
    entityUrn=dataset_urn,
    aspectName="upstreamLineage",
    aspect=models.UpstreamLineageClass(
        upstreams=[
            models.UpstreamClass(
                dataset=upstream_urn,
                type=models.DatasetLineageTypeClass.TRANSFORMED,
            )
            for upstream_urn in upstream_datasets
        ],
    ),
)
emitter.emit_mcp(mcp=mcpw)

### Add link to a dataset

In [9]:
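# add a link to dataset
# TODO: not implemented yet. "addLink" is not an aspect name; dataset links appear to be
# stored in the "institutionalMemory" aspect (models.InstitutionalMemoryClass) -- confirm
# against the DataHub metadata model before enabling this cell.
#mcpw = MetadataChangeProposalWrapper(
#    "dataset",
#    models.ChangeTypeClass.UPSERT,
#    entityUrn=builder.make_dataset_urn(platform=dataset_platform, name=dataset_name, env=datahub_env),
#    aspectName="institutionalMemory",
#    aspect=??? TODO
#)
#emitter.emit_mcp(mcp=mcpw)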