In [47]:
from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential
from azure.ai.ml.entities import Data
from azure.ai.ml.constants import AssetTypes

# Authenticate using the default Azure credential.
credential = DefaultAzureCredential()

# Identifiers of the Azure ML workspace this notebook works against.
SUBSCRIPTION_ID = "c6fb9fd9-644b-44c5-8e1f-2ea146326c95"
RESOURCE_GROUP_NAME = "Alexander.Dunnett-rg"
WORKSPACE_NAME = "demo-alexplore"

# Workspace handle; later cells use it for every data-asset operation.
ml_client = MLClient(
    credential=credential,
    subscription_id=SUBSCRIPTION_ID,
    resource_group_name=RESOURCE_GROUP_NAME,
    workspace_name=WORKSPACE_NAME,
)

In [48]:
# Echo the client: its repr shows the subscription, resource group and workspace it targets.
ml_client

MLClient(credential=<azure.identity._credentials.default.DefaultAzureCredential object at 0x7fd7e5c69ed0>,
         subscription_id=c6fb9fd9-644b-44c5-8e1f-2ea146326c95,
         resource_group_name=Alexander.Dunnett-rg,
         workspace_name=demo-alexplore)

In [5]:
from azure.ai.ml.entities import Data
from azure.ai.ml.constants import AssetTypes
import time

In [31]:
# Describe the locally stored CSV as a data asset so compute can reference it once created.
# The version label is the current UTC time, so re-running this cell produces a
# different version string (at one-second resolution).
v1 = time.strftime("%Y.%m.%d.%H.%M.%S", time.gmtime())
my_path = "./data/default_of_credit_card_clients.csv"

my_data = Data(
    path=my_path,
    type=AssetTypes.URI_FILE,
    name="credit_card",
    version=v1,
    description="Credit card data",
)

In [32]:
# Sanity-check the asset metadata: version on the first line, description on the second.
print(my_data.version, my_data.description, sep="\n")

2023.07.06.22.03.34
Credit card data


In [33]:
# Create (or update) the data asset in the workspace from the `my_data` definition.
# The call is the cell's last expression, so the returned Data entity is displayed.
ml_client.data.create_or_update(my_data)

Data({'skip_validation': False, 'mltable_schema_url': None, 'referenced_uris': None, 'type': 'uri_file', 'is_anonymous': False, 'auto_increment_version': False, 'name': 'credit_card', 'description': 'Credit card data', 'tags': {}, 'properties': {}, 'print_as_yaml': True, 'id': '/subscriptions/c6fb9fd9-644b-44c5-8e1f-2ea146326c95/resourceGroups/Alexander.Dunnett-rg/providers/Microsoft.MachineLearningServices/workspaces/demo-alexplore/data/credit_card/versions/2023.07.06.22.03.34', 'Resource__source_path': None, 'base_path': '/mnt/batch/tasks/shared/LS_root/mounts/clusters/cpu1-alex/code/Users/Alexander.Dunnett', 'creation_context': <azure.ai.ml.entities._system_data.SystemData object at 0x7fd7ef9a7910>, 'serialize': <msrest.serialization.Serializer object at 0x7fd7e771e920>, 'version': '2023.07.06.22.03.34', 'latest_version': None, 'path': 'azureml://subscriptions/c6fb9fd9-644b-44c5-8e1f-2ea146326c95/resourcegroups/Alexander.Dunnett-rg/workspaces/demo-alexplore/datastores/workspaceblobs

**Import Data into Pandas:**

In [34]:
import pandas as pd

# Reading the asset path directly may require the azureml-fsspec package; if the read
# fails, install it first:
#   %pip install -U azureml-fsspec
data_asset = ml_client.data.get(
    name="credit_card",
    version=v1,
)

# header=1: row index 1 (the second line of the file) supplies the column names.
df = pd.read_csv(data_asset.path, header=1)
df.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,1,20000,2,2,1,24,2,2,-1,-1,...,0,0,0,0,689,0,0,0,0,1
1,2,120000,2,2,2,26,-1,2,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,3,90000,2,2,2,34,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,4,50000,2,2,1,37,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,5,50000,1,2,1,57,-1,0,-1,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0


In [38]:
# Wrangle the frame and persist it as Parquet, a typed columnar format that suits tabular data.
#
# The frame is rebuilt with a method chain rather than inplace=True edits so the cell
# can be re-run safely: with the in-place version, a second run raised KeyError because
# the "ID" column had already been removed. errors="ignore" turns the drop into a
# no-op when "ID" is absent; rename already ignores missing labels by default.
df = (
    df.rename(columns={"default payment next month": "default"})
    .drop(columns="ID", errors="ignore")
)
df.to_parquet("./data/cleaned-credit-card.parquet")

In [40]:
# Register the cleaned Parquet file as a further version of the same "credit_card" asset.
my_path = "./data/cleaned-credit-card.parquet"
v2 = v1 + "_cleaned"  # version label = raw-data version plus a "_cleaned" suffix

my_data = Data(
    path=my_path,
    type=AssetTypes.URI_FILE,
    name="credit_card",
    version=v2,
    description="Default of credit card clients data (cleaned)",
)
# Replace the local definition with the entity returned by the workspace.
my_data = ml_client.data.create_or_update(my_data)

Uploading cleaned-credit-card.parquet (< 1 MB): 0.00B [00:00, ?B/s]Uploading cleaned-credit-card.parquet (< 1 MB): 100%|██████████| 1.58M/1.58M [00:00<00:00, 26.6MB/s]




In [44]:
# Fetch the cleaned asset version and load its Parquet file into pandas.
data_asset = ml_client.data.get(
    name="credit_card",
    version=v2,
)
df = pd.read_parquet(data_asset.path)
df.head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default
0,20000,2,2,1,24,2,2,-1,-1,-2,...,0,0,0,0,689,0,0,0,0,1
1,120000,2,2,2,26,-1,2,0,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,90000,2,2,2,34,0,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,50000,2,2,1,37,0,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,50000,1,2,1,57,-1,0,-1,0,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0
