In [1]:
# This notebook demonstrates creating a deeplake datset from Multi NLI dataset from hugging face

In [2]:
# First we will import the dataset from huggingface's website using the load_dataset function
from datasets import load_dataset
dataset = load_dataset("glue","mnli")

Found cached dataset glue (C:/Users/USER/.cache/huggingface/datasets/glue/mnli/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/5 [00:00<?, ?it/s]

In [8]:
# We will then use deep lake to initialize an empty deeplake dataset locally
import deeplake
import numpy as np
ds = deeplake.empty('./mnli_deeplake',overwrite=True) 

In [9]:
# Let us observe the mnli dataset.
# It is comprised of seperate dictionaries for training, testing and validating our model
# Let us observe the train part
dataset

DatasetDict({
    train: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 392702
    })
    validation_matched: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 9815
    })
    validation_mismatched: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 9832
    })
    test_matched: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 9796
    })
    test_mismatched: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 9847
    })
})

In [10]:
dataset['train']
# It contains some attributes like premise, hypothesis, label and idx
# We will extract premise and hypothesis data so that a model can be trained on those two attributes

Dataset({
    features: ['premise', 'hypothesis', 'label', 'idx'],
    num_rows: 392702
})

In [11]:
# Lets create the tensors and name them premise and hypothesis
with ds:
    ds.create_tensor("premise", htype="text")
    ds.create_tensor("hypothesis",htype='text')

In [12]:
from tqdm import tqdm # using tqdm gives us a loading bar that helps us keep track of iterations
# We will iterate through the list and append to Deep Lake dataset. This may take a while.
with ds:
    for i in tqdm(range(1,len(dataset['train']))):
        # Append the data into respective tensors
        ds.premise.append(dataset['train'][i]['premise'])
        ds.hypothesis.append(dataset['train'][i]['hypothesis'])

100%|████████████████████████████████████████████████████████████████████████| 392701/392701 [05:14<00:00, 1246.77it/s]


In [13]:
# Print the summary of dataset
ds.summary()

Dataset(path='./mnli_deeplake', tensors=['premise', 'hypothesis'])

   tensor     htype      shape      dtype  compression
  -------    -------    -------    -------  ------- 
  premise     text    (392701, 1)    str     None   
 hypothesis   text    (392701, 1)    str     None   


In [23]:
# This would help us visualize our dataset
ds.visualize()

In [17]:
# Inorder to use the data, we will convert tensors into numpy array
# Lets look at the first text in premise and hypothesis tensors
ds.premise.numpy()[0],

(array(['you know during the season and i guess at at your level uh you lose them to the next level if if they decide to recall the the parent team the Braves decide to call to recall a guy from triple A then a double A guy goes up to replace him and a single A guy goes up to replace him'],
       dtype='<U1815'),)

In [18]:
ds.hypothesis.numpy()[0]

array(['You lose the things to the following level if the people recall.'],
      dtype='<U393')