# operations

In [2]:
import datafun as dfn
import pydlib as dl

In [2]:
# Sample data: (index, word) pairs
an_iterable = [
    (1, "The"),
    (2, "cat"),
    (3, "is"),
    (4, "on"),
    (5, "the"),
    (6, "table"),
]
# Load the iterable so it can be used with the operations shown in this notebook
ds = dfn.load(an_iterable)

## Basics (loop, show, collect)


In [16]:
# A loaded dataset is iterable: a plain for-loop yields its elements one at a time
for x in ds:
    print(x)

(1, 'The')
(2, 'cat')
(3, 'is')
(4, 'on')
(5, 'the')
(6, 'table')


In [17]:
# show(n) renders the first n elements as a single string
ds.show(3) # returns a str

"(1, 'The'), (2, 'cat'), (3, 'is')"

In [18]:
# take(n) returns the first n elements
ds.take(3) # returns a list

[(1, 'The'), (2, 'cat'), (3, 'is')]

In [19]:
# take_while returns the leading elements for which the predicate holds
# (presumably stopping at the first element that fails it — TODO confirm)
ds.take_while(lambda x: x[0] <= 4)

[(1, 'The'), (2, 'cat'), (3, 'is'), (4, 'on')]

In [20]:
# collect() reads every element into a list (a progress bar is printed while reading)
ds.collect()

Collecting examples: 6it [00:00, 3750.50it/s, total_read=6]


[(1, 'The'), (2, 'cat'), (3, 'is'), (4, 'on'), (5, 'the'), (6, 'table')]

## filter

In [21]:
# Keep only the elements whose word is "the", ignoring case;
# take(3) returns just two elements here because only two match
ds.filter(lambda x: x[1].lower() == "the").take(3)

[(1, 'The'), (5, 'the')]

## map


In [22]:
# Transform every tuple into its first field (the index)
ds.map(lambda x: x[0]).take(3)

[1, 2, 3]

## flat_map

In [23]:
# Example 1: called without a function, flat_map flattens the original tuples into their items
ds.flat_map().show(5)

'1, The, 2, cat, 3'

In [24]:
# Example 2: extract a list stored under nested dictionary fields
ds2 = dfn.load([
    {'path': {'to': {'list': [1, 2, 3]}}},
    {'path': {'to': {'list': [40, 50, 60]}}},
])
# Each list found inside a dict is emitted one element at a time
ds2.flat_map(lambda x: dl.get(x, "path.to.list")).collect()


Collecting examples: 6it [00:00, 4378.95it/s, total_read=2]


[1, 2, 3, 40, 50, 60]

## unique


In [25]:
# Drop duplicated tuples, using the lower-cased string x[1] as the uniqueness key
# (the first occurrence wins: (5, 'the') is dropped because (1, 'The') came first)
ds.unique(lambda x: x[1].lower()).collect()

Collecting examples: 5it [00:00, 4420.64it/s, total_read=6]


[(1, 'The'), (2, 'cat'), (3, 'is'), (4, 'on'), (6, 'table')]

## sampling


In [26]:
# Sample the dataset, then compute COUNT by materializing the sample into a list
len(dfn.load(range(10000)).sampling(p=0.5, seed=1).collect())

Collecting examples: 5023it [00:00, 74658.09it/s, total_read=10.0K]


5023

## aggregate

In [27]:
# Compute COUNT of the sampled dataset, as above, but with 'aggregate'
ds3 = dfn.load(range(10000)).sampling(p=0.5, seed=1).aggregate(
    init=lambda: 0,
    # x is the current element (ignored here); agg is the running aggregate,
    # which has the same type as the value produced by init
    agg=lambda x, agg: agg+1,
)
ds3.take()

[5023]

In [28]:
# Compute SUM of the first n natural numbers, then add 1 with a reduce
n = 100
print(
    dfn.load(range(1, n+1))\
        .aggregate(
            init=lambda: 0,
            agg=lambda curr, agg: curr+agg, # curr is the current element, agg is the running aggregate (same type as the value from init)
            reduce=lambda x: x+1
        )\
        .show()
)

# Closed-form check; '/' is true division, which is why the second printed value is a float
sum_of_n = n*(n+1)/2
print(sum_of_n + 1) # To check correctness

5051
5051.0


## limit

In [29]:
# limit returns a Dataset, so a very large source can be looped over as if it were small
ds = dfn.load(range(1000000000000000)).limit(5).map(lambda x: x**2)
for x in ds:
    print(x)

0
1
4
9
16


## basic operations: +, -, /, *

In [10]:
# Sample datasets (ints, floats, strings) used by the arithmetic examples below
ds_int = dfn.load(range(10))
ds_float = dfn.load([0.3, 0.1, 0.4, 0.3])
ds_str = dfn.load(["hello", "world"])

In [11]:
# Arithmetic between a dataset and a scalar is applied to every element
print("ds_int      :", ds_int.take(10))
print("ds_int   + 1:", (ds_int + 1).take(10))
print("ds_int   * 2:", (ds_int * 2).take(10))
print("ds_float    :", ds_float.take(10))
print("ds_float / 2:", (ds_float / 2.0).take(10))

ds_int      : [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
ds_int   + 1: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
ds_int   * 2: [0, 2, 4, 6, 8, 10, 12, 14, 16, 18]
ds_float    : [0.3, 0.1, 0.4, 0.3]
ds_float / 2: [0.15, 0.05, 0.2, 0.15]


In [12]:
# Adding a str to a dataset of strings appends it to every element
(ds_str + " the cat").take(10)

['hello the cat', 'world the cat']

In [16]:
# Adding two datasets combines them element by element
(ds_int + ds_int).take(10)

[0, 2, 4, 6, 8, 10, 12, 14, 16, 18]

## zip

In [7]:
# zip pairs up the elements of two datasets position by position
dfn.load(["he", "wo"]).zip(dfn.load(["llo", "rld"])).take(2)

[('he', 'llo'), ('wo', 'rld')]

In [2]:
# Zip two datasets, then concatenate the two fragments of each pair
ds = dfn.load(["he", "wo"]).zip(dfn.load(["llo", "rld"])).map(lambda x: x[0]+x[1])
ds.take(2)

['hello', 'world']

In [9]:
# Zip two datasets, then fold every pair into one concatenated string
ds = dfn.load(["he", "wo"]).zip(dfn.load(["llo", "rld"])).aggregate(
    init=lambda: "",
    agg=lambda x, agg: agg+x[0]+x[1],
)
ds.take(1)

['helloworld']

## join

In [1]:
# Join two datasets by key (full outer, the default).
# The result holds a single dict that maps each key to the list of records carrying it.
import datafun as dfn
ds1 = dfn.load([
    {"id": "0", "name": "foo"},
    {"id": "1", "name": "bar"},
    {"id": "2", "name": "zoo"},
    {"id": "1", "name": "bar_dup"}
])
ds2 = dfn.load([
    {"id": "2", "name": "second_zoo"},
    {"id": "10", "name": "ten"}
])
ds1.join(other=ds2, key=lambda x: x['id'], type="full").take(10)

[{'0': [{'id': '0', 'name': 'foo'}],
  '1': [{'id': '1', 'name': 'bar'}, {'id': '1', 'name': 'bar_dup'}],
  '2': [{'id': '2', 'name': 'zoo'}, {'id': '2', 'name': 'second_zoo'}],
  '10': [{'id': '10', 'name': 'ten'}]}]

In [2]:
# Left join: only keys present in ds1 (the left side) are kept
ds1.join(other=ds2, key=lambda x: x['id'], type='left').take(10)

[{'0': [{'id': '0', 'name': 'foo'}],
  '1': [{'id': '1', 'name': 'bar'}, {'id': '1', 'name': 'bar_dup'}],
  '2': [{'id': '2', 'name': 'zoo'}, {'id': '2', 'name': 'second_zoo'}]}]

In [3]:
# Right join: only keys present in ds2 (the right side) are kept
ds1.join(other=ds2, key=lambda x: x['id'], type='right').take(10)

[{'2': [{'id': '2', 'name': 'second_zoo'}, {'id': '2', 'name': 'zoo'}],
  '10': [{'id': '10', 'name': 'ten'}]}]

In [4]:
# Inner join: only keys present in both datasets are kept
ds1.join(other=ds2, key=lambda x: x['id'], type='inner').take(10)

[{'2': [{'id': '2', 'name': 'zoo'}, {'id': '2', 'name': 'second_zoo'}]}]

In [5]:
# The two datasets keep their keys under different (possibly nested) paths,
# so a separate key function is given for each side.
# Warning: keys of different types can be mixed, but they are not unified —
# as the output shows, the str '0' and the int 0 end up as two distinct entries.
ds1 = dfn.load([
    {"root": {"chatId": "0", "name": "foo"}},
    {"root": {"chatId": "1", "name": "bar"}},
])
ds2 = dfn.load([
    {"sessionId": 0, "name": "alice"},
    {"sessionId": 13, "name": "bob"}
])
ds1.join(other=ds2, key_left=lambda x: x['root']['chatId'], key_right=lambda x: x['sessionId']).take(10)

[{'0': [{'root': {'chatId': '0', 'name': 'foo'}}],
  '1': [{'root': {'chatId': '1', 'name': 'bar'}}],
  0: [{'sessionId': 0, 'name': 'alice'}],
  13: [{'sessionId': 13, 'name': 'bob'}]}]

## cache
You can cache transformed data in memory

In [2]:
# Filter the even numbers and cache the filtered result
ds = (
    dfn.load([1,2,3,4,5])
    .filter(lambda x: x%2==0)
    .cache()
)
# First pass over the dataset
for x in ds:
    print(x)
print('--')
# Second pass over the same dataset yields the same elements
for x in ds:
    print(x)

2
4
--
2
4


In [24]:
import time
import datafun as dfn

def long_function(x):
    """Simulate an expensive step: sleep for half a second, then return x unchanged."""
    time.sleep(0.5)
    return x

# Scale the values, run the slow step, cache its results, then filter the cached values
ds = (
    dfn.load([1,2,3,4,5,6,7,8])
    .map(lambda x: x*10)
    .map(lambda x: long_function(x))
    .cache()
    .filter(lambda x: x%20==0)
)

In [26]:
# The first iteration actually executes long_function, once per loaded element (8 times),
# because the slow map step sits before the filter
for x in ds:
    print(x)

20
40
60
80


In [27]:
# The second iteration takes the results from memory instead of running long_function again
for x in ds:
    print(x)

20
40
60
80


## repeat

In [5]:
# repeat() starts over from the first element once the data is exhausted,
# so take(10) can return more elements than were loaded
ds = dfn.load([1,2,3]).repeat()

ds.take(10)

[1, 2, 3, 1, 2, 3, 1, 2, 3, 1]