# Create a Data Asset
Azure Machine Learning supports three types of data assets: URI_FILE points to a single file, URI_FOLDER points to a folder, and MLTABLE points to an MLTable.

In [1]:
pip show azure-ai-ml

Name: azure-ai-ml
Version: 1.16.1
Summary: Microsoft Azure Machine Learning Client Library for Python
Home-page: https://github.com/Azure/azure-sdk-for-python
Author: Microsoft Corporation
Author-email: azuresdkengsysadmins@microsoft.com
License: MIT License
Location: /anaconda/envs/azureml_py310_sdkv2/lib/python3.10/site-packages
Requires: azure-common, azure-core, azure-mgmt-core, azure-storage-blob, azure-storage-file-datalake, azure-storage-file-share, colorama, isodate, jsonschema, marshmallow, msrest, opencensus-ext-azure, opencensus-ext-logging, pydash, pyjwt, pyyaml, strictyaml, tqdm, typing-extensions
Required-by: 
Note: you may need to restart the kernel to use updated packages.


### Connect to your Workspace
Before doing anything, you should establish a connection with your workspace in the cloud.

In [1]:
from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential

# My subscription id, resource group and workspace name are all stored in the config module imported below.
import config

In [2]:
# Build the workspace client; the identifiers are read from the local config module.
ml_client = MLClient(
    credential=DefaultAzureCredential(),
    subscription_id=config.subscription_id,
    resource_group_name=config.resource_group,
    workspace_name=config.workspace,
)

In [3]:
# Print the name of every datastore returned by the workspace client
stores = ml_client.datastores.list()
for datastore in stores:
    print(datastore.name)

workspaceblobstore
workspaceworkingdirectory
workspaceartifactstore
workspacefilestore


In [4]:
# Print the name of every data asset returned by the workspace client
datasets = ml_client.data.list()
for data_asset in datasets:
    print(data_asset.name)

titanic_ds
titanic_ds2
titanic_ds4


## Creating a dataset first
I first download the data from the web, process it (dropping columns etc.), and then save it to a folder. I will register this dataset as a data asset on Azure later.

In [8]:
from pathlib import Path

import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Read the raw Titanic dataset from the web
df = pd.read_csv('https://web.stanford.edu/class/archive/cs/cs109/cs109.1166/stuff/titanic.csv')

# One-hot encode the Sex column and drop the Name column.
df = pd.get_dummies(df, columns=['Sex'], dtype=int)
df = df.drop('Name', axis=1)

# Encode the Survived target column as integer class labels
le = LabelEncoder()
df['Survived'] = le.fit_transform(df['Survived'])

# to_csv does not create missing directories, so make sure the output
# folder exists before writing (otherwise a fresh checkout raises OSError).
Path('data').mkdir(parents=True, exist_ok=True)
df.to_csv('data/titanic_ds.csv', index=False)

## Creating an MLTable
This code prepares an MLTable and makes it available for model training.

In [16]:
import mltable
from mltable import MLTableHeaders, MLTableFileEncoding, DataType
from azure.ai.ml.entities import Data
from azure.ai.ml.constants import AssetTypes

# CSV file(s) the MLTable should load
paths = [{"file": "data/titanic_ds.csv"}]

# Parsing options for the delimited files: comma-separated, UTF-8 encoded,
# same header row in every file, column types inferred, no path column added.
csv_options = dict(
    delimiter=",",
    header=MLTableHeaders.all_files_same_headers,
    infer_column_types=True,
    include_path_column=False,
    encoding=MLTableFileEncoding.utf8,
)

# Build the MLTable from the delimited files
tbl = mltable.from_delimited_files(paths=paths, **csv_options)

# Preview the first records of the table
preview = tbl.show()
print(preview)

# Persist the data loading steps as an MLTable file inside this folder
mltable_folder = "./titanic"
tbl.save(mltable_folder)

# Describe the data asset that points at the saved MLTable folder
my_data = Data(
    name="titanic_ds5",
    version='1',
    type=AssetTypes.MLTABLE,
    path=mltable_folder,
    description="titanic data in an mltable",
)

# Register the data asset in the workspace (the returned asset is the cell output)
ml_client.data.create_or_update(my_data)

    Survived  Pclass   Age  Siblings/Spouses Aboard  Parents/Children Aboard  \
0      False       3  22.0                        1                        0   
1       True       1  38.0                        1                        0   
2       True       3  26.0                        0                        0   
3       True       1  35.0                        1                        0   
4      False       3  35.0                        0                        0   
5      False       3  27.0                        0                        0   
6      False       1  54.0                        0                        0   
7      False       3   2.0                        3                        1   
8       True       3  27.0                        0                        2   
9       True       2  14.0                        1                        0   
10      True       3   4.0                        1                        1   
11      True       1  58.0              

Uploading titanic (0.0 MBs): 100%|██████████| 328/328 [00:00<00:00, 17654.34it/s]




Data({'path': 'azureml://subscriptions/a54b1e51-86a2-4073-b2a5-1a79c43cf955/resourcegroups/model_dep/workspaces/ml-workspace/datastores/workspaceblobstore/paths/LocalUpload/d9e580932cff2df8d3d5af3abd16e778/titanic/', 'skip_validation': False, 'mltable_schema_url': None, 'referenced_uris': ['../data/titanic_ds.csv'], 'type': 'mltable', 'is_anonymous': False, 'auto_increment_version': False, 'auto_delete_setting': None, 'name': 'titanic_ds5', 'description': 'titanic data in an mltable', 'tags': {}, 'properties': {}, 'print_as_yaml': False, 'id': '/subscriptions/a54b1e51-86a2-4073-b2a5-1a79c43cf955/resourceGroups/model_dep/providers/Microsoft.MachineLearningServices/workspaces/ml-workspace/data/titanic_ds5/versions/1', 'Resource__source_path': '', 'base_path': '/mnt/batch/tasks/shared/LS_root/mounts/clusters/sckaraman1/code/Users/sckaraman', 'creation_context': <azure.ai.ml.entities._system_data.SystemData object at 0x7f8a5e753010>, 'serialize': <msrest.serialization.Serializer object at 

In [11]:
# Register an MLTable data asset from a local folder.
# NOTE(review): 'data/titanic' is not created by any cell visible in this
# notebook; presumably it already contains an MLTable file - confirm before re-running.

from azure.ai.ml.constants import AssetTypes
from azure.ai.ml.entities import Data

local_path = 'data/titanic'

# No explicit version is passed here, unlike the titanic_ds5 asset above
my_data = Data(
    name="titanic_ds2",
    type=AssetTypes.MLTABLE,
    path=local_path,
    description="MLTable pointing to titanic_ds.csv in data folder",
)

# Register the data asset in the workspace (the returned asset is the cell output)
ml_client.data.create_or_update(my_data)

Data({'path': 'azureml://subscriptions/a54b1e51-86a2-4073-b2a5-1a79c43cf955/resourcegroups/model_dep/workspaces/ml-workspace/datastores/workspaceblobstore/paths/LocalUpload/4205165e1818a7060b11cd2dad95f4fe/titanic/', 'skip_validation': False, 'mltable_schema_url': None, 'referenced_uris': ['../titanic_ds.csv'], 'type': 'mltable', 'is_anonymous': False, 'auto_increment_version': False, 'auto_delete_setting': None, 'name': 'titanic_ds2', 'description': 'MLTable pointing to titanic_ds.csv in data folder', 'tags': {}, 'properties': {}, 'print_as_yaml': False, 'id': '/subscriptions/a54b1e51-86a2-4073-b2a5-1a79c43cf955/resourceGroups/model_dep/providers/Microsoft.MachineLearningServices/workspaces/ml-workspace/data/titanic_ds2/versions/1', 'Resource__source_path': '', 'base_path': '/mnt/batch/tasks/shared/LS_root/mounts/clusters/sckaraman1/code/Users/sckaraman', 'creation_context': <azure.ai.ml.entities._system_data.SystemData object at 0x7f88df64f550>, 'serialize': <msrest.serialization.Ser