#### Install `datasets` library

In [1]:
%%capture 

!pip install datasets

#### Imports 

In [2]:
from datasets.filesystems import S3FileSystem
import pandas as pd
import sagemaker
import botocore
import datasets
import logging

In [3]:
logger = logging.getLogger('__name__')
logger.setLevel(logging.DEBUG)
logger.addHandler(logging.StreamHandler())

In [4]:
logger.info(f'Using datasets version: {datasets.__version__}')

Using datasets version: 1.11.0


#### Loading a local CSV file

https://huggingface.co/docs/datasets/ <br>
https://huggingface.co/docs/datasets/loading_datasets.html

In [5]:
data = datasets.load_dataset('csv', data_files='./data/raw.csv', column_names=['text', 'label'], delimiter=',')
data

Using custom data configuration default-066394f1dbd0b6da
Reusing dataset csv (/root/.cache/huggingface/datasets/csv/default-066394f1dbd0b6da/0.0.0/9144e0a4e8435090117cea53e6c7537173ef2304525df4a077c435d8ee7828ff)


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 5
    })
})

In [6]:
type(data)

datasets.dataset_dict.DatasetDict

#### Load a dataset from dict 

In [7]:
data = {'id': [0, 1, 2], 
        'name': ['mary', 'bob', 'eve'],  
        'age': [24, 53, 19]
       }

In [8]:
dataset = datasets.Dataset.from_dict(data)
dataset

Dataset({
    features: ['id', 'name', 'age'],
    num_rows: 3
})

### Load a dataset from a Pandas dataframe

In [9]:
data = {'id': [0, 1, 2], 
        'name': ['mary', 'bob', 'eve'],  
        'age': [24, 53, 19]
       }

In [10]:
df = pd.DataFrame(data)
df

Unnamed: 0,id,name,age
0,0,mary,24
1,1,bob,53
2,2,eve,19


In [11]:
dataset = datasets.Dataset.from_pandas(df)
dataset

Dataset({
    features: ['id', 'name', 'age'],
    num_rows: 3
})

### Saving to disk

#### Save to local disk

In [12]:
dataset.save_to_disk('./data/dataset')

In [13]:
reloaded_dataset = datasets.load_from_disk('./data/dataset')
reloaded_dataset

Dataset({
    features: ['id', 'name', 'age'],
    num_rows: 3
})

#### Save to local as CSV

In [14]:
data = {'id': [0, 1, 2], 
        'name': ['mary', 'bob', 'eve'],  
        'age': [24, 53, 19]
       }

In [15]:
dataset = datasets.Dataset.from_dict(data)
dataset

Dataset({
    features: ['id', 'name', 'age'],
    num_rows: 3
})

In [16]:
dataset.to_csv('./data/dataset.csv', index=False)

40

#### Load the locally saved CSV file back as a dataset

In [17]:
data = datasets.load_dataset('csv', data_files='./data/dataset.csv', column_names=['text', 'label'], delimiter=',')
data

Using custom data configuration default-87f96bdc188f0b42


Downloading and preparing dataset csv/default (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to /root/.cache/huggingface/datasets/csv/default-87f96bdc188f0b42/0.0.0/9144e0a4e8435090117cea53e6c7537173ef2304525df4a077c435d8ee7828ff...


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-87f96bdc188f0b42/0.0.0/9144e0a4e8435090117cea53e6c7537173ef2304525df4a077c435d8ee7828ff. Subsequent calls will reuse this data.


DatasetDict({
    train: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 4
    })
})

#### Save to local as JSON

In [18]:
dataset.to_json('./data/dataset.json')

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




94

#### Load the locally saved JSON file back as a dataset

In [19]:
data = datasets.load_dataset('json', data_files='./data/dataset.json')
data

Using custom data configuration default-bc11f913bbead297


Downloading and preparing dataset json/default (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to /root/.cache/huggingface/datasets/json/default-bc11f913bbead297/0.0.0/45636811569ec4a6630521c18235dfbbab83b7ab572e3393c5ba68ccabe98264...


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-bc11f913bbead297/0.0.0/45636811569ec4a6630521c18235dfbbab83b7ab572e3393c5ba68ccabe98264. Subsequent calls will reuse this data.


DatasetDict({
    train: Dataset({
        features: ['id', 'name', 'age'],
        num_rows: 3
    })
})

### Saving and loading from S3 

In [20]:
sagemaker_session = sagemaker.Session()
default_bucket = sagemaker_session.default_bucket()
logger.info(f'Default bucket = {default_bucket}')

Default bucket = sagemaker-us-east-1-892313895307


#### Save and load to/from S3 

In [21]:
!aws s3 cp ./data/raw.csv s3://{default_bucket}/datasets/raw.csv

upload: data/raw.csv to s3://sagemaker-us-east-1-892313895307/datasets/raw.csv


In [22]:
s3_session = botocore.session.Session()
s3 = S3FileSystem(session=s3_session)

In [23]:
s3.ls(f'{default_bucket}/datasets/')  

['sagemaker-us-east-1-892313895307/datasets/my-dataset',
 'sagemaker-us-east-1-892313895307/datasets/raw.csv']

In [24]:
data = {'id': [0, 1, 2], 
        'name': ['mary', 'bob', 'eve'],  
        'age': [24, 53, 19]
       }

In [25]:
dataset = datasets.Dataset.from_dict(data)
dataset

Dataset({
    features: ['id', 'name', 'age'],
    num_rows: 3
})

#### Save to S3 in Dataset format 

In [26]:
dataset.save_to_disk(f's3://{default_bucket}/datasets/my-dataset', fs=s3) 

**Note:** You can only load data that is already persisted in `Dataset` format from S3. You cannot load CSVs or other formatted files directly from S3 using datasets library.

In [27]:
dataset = datasets.load_from_disk(f's3://{default_bucket}/datasets/my-dataset', fs=s3) 
dataset

Dataset({
    features: ['id', 'name', 'age'],
    num_rows: 3
})