In [1]:
# Necessary imports

import os

import pandas as pd
from azure.identity import DefaultAzureCredential
from azure.ai.ml import MLClient
from azure.ai.ml.entities import Data
from azure.ai.ml.constants import AssetTypes

##### Authentication details

In [2]:
# Credential object handed to MLClient below.
credential = DefaultAzureCredential()

# Workspace coordinates. Each value can be overridden through an environment
# variable so the notebook can target a different subscription/workspace
# without editing code; an unset or empty variable falls back to the value
# this notebook was originally written against.
subscription_id = (
    os.environ.get("AZURE_SUBSCRIPTION_ID")
    or "d33b5db8-15df-4c49-9e44-ff5ac32d6bb6"  # Azure for Students subscription
)
workspace_name = os.environ.get("AZURE_ML_WORKSPACE") or "AzureMLOps"
resource_group_name = os.environ.get("AZURE_RESOURCE_GROUP") or "bandsmlopsfall2025"

# Client used by every later cell to register and fetch data assets.
ml_client = MLClient(
    credential=credential,
    subscription_id=subscription_id,
    workspace_name=workspace_name,
    resource_group_name=resource_group_name,
)

##### Create data asset - with two versions for two CSVs

In [3]:
# First dataset version: register data/telecom_churn_v1.csv under the shared
# asset name "telecom_churn_dataset".

dataset_path = 'data/telecom_churn_v1.csv'

load_data = Data(
    path=dataset_path,
    type=AssetTypes.URI_FILE,
    description="Telecom churn dataset with multiple CSV versions",
    name="telecom_churn_dataset",
    # Pin the version explicitly so it always matches the version="1" lookup
    # used when this file is read back, instead of depending on whatever
    # number the service would assign on this particular run.
    version="1",
)

# Register the asset in the workspace; the returned object is kept in
# `data_asset_job` for inspection.
data_asset_job = ml_client.data.create_or_update(
    data=load_data
)

[32mUploading telecom_churn_v1.csv[32m (< 1 MB): 0.00B [00:00, ?B/s][32mUploading telecom_churn_v1.csv[32m (< 1 MB): 100%|██████████| 84.7k/84.7k [00:00<00:00, 3.44MB/s]
[39m



In [4]:
# Second dataset version: register data/telecom_churn_v2.csv under the same
# asset name as the first CSV.

dataset_path = 'data/telecom_churn_v2.csv'

load_data = Data(
    path=dataset_path,
    type=AssetTypes.URI_FILE,
    description="Telecom churn dataset with multiple CSV versions",
    name="telecom_churn_dataset", # using same dataset name
    # Pin the version explicitly so it always matches the version="2" lookup
    # used when this file is read back, instead of depending on whatever
    # number the service would assign on this particular run.
    version="2",
)

# Register the asset in the workspace; the returned object is kept in
# `data_asset_job` for inspection.
data_asset_job = ml_client.data.create_or_update(
    data=load_data
)

[32mUploading telecom_churn_v2.csv[32m (< 1 MB): 0.00B [00:00, ?B/s][32mUploading telecom_churn_v2.csv[32m (< 1 MB): 100%|██████████| 129k/129k [00:00<00:00, 10.9MB/s]
[39m



##### Access both dataset versions with basic exploration

In [8]:
# Version 1 of the registered asset (telecom_churn_v1.csv)
data_asset = ml_client.data.get(name="telecom_churn_dataset", version="1")

# Load the CSV behind the asset's path, report its dimensions, and preview
# the first ten rows as the cell output.
v1_df = pd.read_csv(filepath_or_buffer=data_asset.path)
print("Shape: ", v1_df.shape)
v1_df.head(n=10)

  mlflow.mismatch._check_version_mismatch()


Shape:  (2192, 11)


Unnamed: 0,Churn,AccountWeeks,ContractRenewal,DataPlan,DataUsage,CustServCalls,DayMins,DayCalls,MonthlyCharge,OverageFee,RoamMins
0,0,128,1,1,2.7,1,265.1,110,89.0,9.87,10.0
1,0,107,1,1,3.7,1,161.6,123,82.0,9.78,13.7
2,0,137,1,0,0.0,0,243.4,114,52.0,6.06,12.2
3,0,84,0,0,0.0,2,299.4,71,57.0,3.1,6.6
4,0,75,0,0,0.0,3,166.7,113,41.0,7.42,10.1
5,0,118,0,0,0.0,0,223.4,98,57.0,11.03,6.3
6,0,121,1,1,2.03,3,218.2,88,87.3,17.43,7.5
7,0,147,0,0,0.0,0,157.0,79,36.0,5.16,7.1
8,0,117,1,0,0.19,1,184.5,97,63.9,17.58,8.7
9,0,141,0,1,3.02,0,258.6,84,93.2,11.1,11.2


In [9]:
# Row count only for version 1 (first entry of the frame's shape)
v1_df.shape[0]

2192

In [10]:
# Version 2 of the registered asset (telecom_churn_v2.csv)
data_asset = ml_client.data.get(name="telecom_churn_dataset", version="2")

# Load the CSV behind the asset's path, report its dimensions, and preview
# the first ten rows as the cell output.
v2_df = pd.read_csv(filepath_or_buffer=data_asset.path)
print("Shape: ", v2_df.shape)
v2_df.head(n=10)

Shape:  (3333, 11)


Unnamed: 0,Churn,AccountWeeks,ContractRenewal,DataPlan,DataUsage,CustServCalls,DayMins,DayCalls,MonthlyCharge,OverageFee,RoamMins
0,0,128,1,1,2.7,1,265.1,110,89.0,9.87,10.0
1,0,107,1,1,3.7,1,161.6,123,82.0,9.78,13.7
2,0,137,1,0,0.0,0,243.4,114,52.0,6.06,12.2
3,0,84,0,0,0.0,2,299.4,71,57.0,3.1,6.6
4,0,75,0,0,0.0,3,166.7,113,41.0,7.42,10.1
5,0,118,0,0,0.0,0,223.4,98,57.0,11.03,6.3
6,0,121,1,1,2.03,3,218.2,88,87.3,17.43,7.5
7,0,147,0,0,0.0,0,157.0,79,36.0,5.16,7.1
8,0,117,1,0,0.19,1,184.5,97,63.9,17.58,8.7
9,0,141,0,1,3.02,0,258.6,84,93.2,11.1,11.2


In [11]:
# Row count only for version 2 (first entry of the frame's shape)
v2_df.shape[0]

3333