#### Install `datasets` library

In [None]:
%%capture 

!pip install datasets

#### Imports 

In [1]:
import pandas as pd
import datasets
import logging

In [2]:
# Name the logger after the current module (the `__name__` variable, not the
# literal string '__name__') so its records are attributed correctly.
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

# Attach the stream handler only once: re-running this cell would otherwise
# stack another handler on the same logger and print every message repeatedly.
if not logger.handlers:
    logger.addHandler(logging.StreamHandler())

In [3]:
# Record which version of the `datasets` library this notebook ran against.
version_message = f'Using datasets version: {datasets.__version__}'
logger.info(version_message)

Using datasets version: 1.11.0


#### Loading a local CSV file

Documentation: https://huggingface.co/docs/datasets/ <br>
Loading datasets: https://huggingface.co/docs/datasets/loading_datasets.html

In [4]:
# Load a local CSV file through the 'csv' loader, supplying the two column
# names explicitly. The result is displayed as the cell output.
data = datasets.load_dataset(
    'csv',
    data_files='./data/raw.csv',
    column_names=['text', 'label'],
    delimiter=',',
)
data

Using custom data configuration default-066394f1dbd0b6da
Reusing dataset csv (/root/.cache/huggingface/datasets/csv/default-066394f1dbd0b6da/0.0.0/9144e0a4e8435090117cea53e6c7537173ef2304525df4a077c435d8ee7828ff)


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 5
    })
})

In [5]:
# Inspect the container type returned by load_dataset
# (a DatasetDict in the recorded output below).
type(data)

datasets.dataset_dict.DatasetDict

#### Load a dataset from a Python dict

In [6]:
# Column-oriented sample data: each key is a column name mapped to the list of
# values for that column (three rows in total).
data = dict(
    id=[0, 1, 2],
    name=['mary', 'bob', 'eve'],
    age=[24, 53, 19],
)

In [7]:
# Build a Dataset directly from the column-oriented dict defined in the
# previous cell, then display it (features and row count) as the cell output.
dataset = datasets.Dataset.from_dict(data)
dataset

Dataset({
    features: ['id', 'name', 'age'],
    num_rows: 3
})

#### Load a dataset from a pandas DataFrame

In [8]:
# Same three-row, column-oriented sample data, redefined here so this section
# can be followed on its own.
data = dict(
    id=[0, 1, 2],
    name=['mary', 'bob', 'eve'],
    age=[24, 53, 19],
)

In [9]:
# Turn the column-oriented dict into a pandas DataFrame (one column per key)
# and display it as the cell output.
df = pd.DataFrame(data=data)
df

Unnamed: 0,id,name,age
0,0,mary,24
1,1,bob,53
2,2,eve,19


In [10]:
# Convert the pandas DataFrame from the previous cell into a Dataset and
# display it (features and row count) as the cell output.
dataset = datasets.Dataset.from_pandas(df)
dataset

Dataset({
    features: ['id', 'name', 'age'],
    num_rows: 3
})

### Saving to disk

#### Save to local disk

In [11]:
# Persist the dataset under './data/dataset'; the next cell reads it back with
# datasets.load_from_disk using the same path.
dataset.save_to_disk('./data/dataset')

In [13]:
# Read back the dataset written by save_to_disk and display it to confirm the
# features and row count match what was saved.
reloaded_dataset = datasets.load_from_disk('./data/dataset')
reloaded_dataset

Dataset({
    features: ['id', 'name', 'age'],
    num_rows: 3
})

#### Save locally as CSV

In [14]:
# Three-row, column-oriented sample data used for the CSV/JSON export examples.
data = dict(
    id=[0, 1, 2],
    name=['mary', 'bob', 'eve'],
    age=[24, 53, 19],
)

In [15]:
# Rebuild the Dataset from the dict above so the export cells that follow
# operate on a known three-row dataset; display it as the cell output.
dataset = datasets.Dataset.from_dict(data)
dataset

Dataset({
    features: ['id', 'name', 'age'],
    num_rows: 3
})

In [19]:
# Export the dataset to a CSV file. `index=False` is presumably forwarded to
# pandas' to_csv so no row-index column is written — confirm against the
# datasets docs. The displayed return value (40 in the recorded run) looks like
# the number of bytes/characters written — TODO confirm.
dataset.to_csv('./data/dataset.csv', index=False)

40

#### Save locally as JSON

In [21]:
# Export the dataset to a JSON file. The recorded run shows a progress widget
# followed by the return value (94), which looks like the number of
# bytes/characters written — TODO confirm against the datasets docs.
dataset.to_json('./data/dataset.json')

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




94