# Overview

The following example shows how to create a DataCard, register it, and then load it (or other DataCards) for collaboration and reproducibility.

In [1]:
import os

# Set placeholder values for the credential environment variables.
# Replace them locally with real values before running; never commit real secrets.
os.environ.update(
    {
        "GOOGLE_ACCOUNT_JSON_BASE64": "service account credentials (see pinned chat in slack or use gcloud sdk)",
        "POETRY_HTTP_BASIC_SHIPT_RESOLVE_USERNAME": "secret username",
        "POETRY_HTTP_BASIC_SHIPT_RESOLVE_PASSWORD": "secret pass",
    }
)

In [2]:
!pip install -r requirements.txt

Looking in indexes: https://pypi.org/simple, https://steven.forrester:****@artifactory.shipt.com/artifactory/api/pypi/pypi-virtual/simple






In [4]:
from opsml_data import DataCard, DataRegistry
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

### Instantiate data registry

In [6]:
# Registry handle used by the later cells to register, list and load DataCards.
registry = DataRegistry()

#### Example 1: Creating data (with index splits) and registering it to the data registry
Create fake data

In [7]:
# Seed a local generator so the synthetic data is identical on every run.
SEED = 42
rng = np.random.default_rng(SEED)

mu_1, mu_2 = -4, 4  # only mu_1 is used in this cell

# 1000 rows x 10 normally distributed features (mean mu_1, std 2.0)
X_data = rng.normal(mu_1, 2.0, size=(1000, 10))
# Integer target drawn uniformly from [2, 100)
y_data = rng.integers(2, 100, size=(1000, 1))

# One generated name per feature column: col_0 ... col_9
col_names = [f"col_{i}" for i in range(X_data.shape[1])]

# Create dataframe
data = pd.DataFrame(X_data, columns=col_names)
data["target"] = y_data
data.head()

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9,target
0,-6.59036,-5.233745,-5.924657,-2.015092,-4.711773,-4.926473,-2.738224,-6.734288,-9.754382,-1.089184,15
1,-0.883489,-1.165218,-4.247211,-4.914167,-6.794264,-1.35362,-2.107597,-3.812282,-3.456895,-2.081096,89
2,-3.225146,-4.748191,-6.401323,-3.619873,-2.747665,-0.990086,-2.232408,-1.829894,-2.661967,-4.343444,33
3,-0.921779,-6.464419,-5.204225,-3.3894,-3.625992,-5.302237,-2.650269,-2.989051,-3.026891,-5.136381,72
4,-2.342103,-2.851189,-4.924048,-3.340289,-1.976849,-2.500482,-2.819632,0.279637,-5.222875,-6.352272,83


##### Create train test splits

In [8]:
# Split row positions (not the frame itself) 70/30 into train and test index arrays.
# random_state is fixed so the split indices stored with the DataCard are the same on every run.
train_idx, test_idx = train_test_split(
    np.arange(data.shape[0]),
    test_size=0.3,
    random_state=42,
)

##### Create DataCard
- check out the docstring for input specifications
- Required arguments:
    - data: pandas dataframe, numpy array, or pyarrow table
    - data_name: Name for the data
    - team: team name
    - user_email: User email

In [10]:
# Metadata attached to the card
DATA_NAME = "synthetic_data"
TEAM = "SPMS"
USER_EMAIL = "steven.forrester@shipt.com"

# One labelled entry per split, each carrying the row indices for that split
DATA_SPLITS = [
    {"label": split_label, "indices": split_indices}
    for split_label, split_indices in (("train", train_idx), ("test", test_idx))
]

data_card = DataCard(
    data=data,
    data_name=DATA_NAME,
    team=TEAM,
    user_email=USER_EMAIL,
    data_splits=DATA_SPLITS,
)

# confirm data
data_card.data

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9,target
0,-6.590360,-5.233745,-5.924657,-2.015092,-4.711773,-4.926473,-2.738224,-6.734288,-9.754382,-1.089184,15
1,-0.883489,-1.165218,-4.247211,-4.914167,-6.794264,-1.353620,-2.107597,-3.812282,-3.456895,-2.081096,89
2,-3.225146,-4.748191,-6.401323,-3.619873,-2.747665,-0.990086,-2.232408,-1.829894,-2.661967,-4.343444,33
3,-0.921779,-6.464419,-5.204225,-3.389400,-3.625992,-5.302237,-2.650269,-2.989051,-3.026891,-5.136381,72
4,-2.342103,-2.851189,-4.924048,-3.340289,-1.976849,-2.500482,-2.819632,0.279637,-5.222875,-6.352272,83
...,...,...,...,...,...,...,...,...,...,...,...
995,-5.025207,-2.186259,-1.376174,-5.716522,-2.010443,-3.211734,-4.132141,-6.132935,-4.826085,-6.392173,92
996,-0.113847,-4.881833,1.036072,-6.108453,-4.323369,-3.695348,-3.761904,-5.312885,-4.776925,-6.400060,78
997,-3.242393,-5.326544,-5.883825,-0.758710,-0.988258,-3.305807,-2.692970,-4.409379,-5.329565,-2.583064,64
998,-4.844521,-1.359723,-2.479724,-7.182807,-0.769411,-7.325498,-3.026096,0.395371,-5.380634,-5.840678,54


##### Confirm data splits
- `data_card.split_data()` returns the splits in the form of a pydantic model

In [11]:
# Materialise the labelled splits, then display the shape of each one
splits = data_card.split_data()
tuple(part.shape for part in (splits.train, splits.test))

((700, 11), (300, 11))

#### Save data to registry

In [12]:
# Register the card with the registry; the log line emitted by this call reports the assigned version.
# NOTE(review): later cells read data_card.uid and data_card.version, so presumably
# register_data populates them on the card — confirm.
registry.register_data(data_card=data_card)

{"level": "INFO", "message": "Table: synthetic_data registered as version 3", "timestamp": "2023-01-09T02:57:29.874955Z", "app_env": "staging", "host": null, "version": null}


In [14]:
# List the registry entries matching this card's name, team and version
registry_data = registry.list_data(
    data_name=DATA_NAME,
    team=TEAM,
    version=data_card.version,
)
# The first returned row must carry the same uid as the local card
assert registry_data["uid"].iloc[0] == data_card.uid

registry_data.head()

Unnamed: 0,uid,date,timestamp,app_env,data_name,team,data_uri,drift_uri,feature_map,data_splits,data_type,version,user_email,dependent_vars
0,d641649db0e64b0a9fcfa9c8178b90ef,2023-01-08,1673232819415,staging,synthetic_data,SPMS,gs://ds-opsml-stg/data_registry/SPMS/synthetic...,,"{'col_0': 'double', 'col_1': 'double', 'col_2'...","{'splits': [{'label': 'train', 'indices': [61,...",DataFrame,3,steven.forrester@shipt.com,


### Loading DataCards from the registry

In [15]:
# load data_card
# No version is passed here; the assertion expects the loaded card to have the
# same uid as the local data_card.
# NOTE(review): presumably load_data returns the latest version when none is given,
# so this could fail if a newer version is registered in between — confirm.
new_data_card = registry.load_data(data_name=DATA_NAME, team=TEAM)
assert new_data_card.uid == data_card.uid