# operations

In [1]:
import datafun as dfn
import pydlib as dl

In [2]:
# Sample data: (index, word) pairs, wrapped in a datafun Dataset
an_iterable = [
    (1, "The"),
    (2, "cat"),
    (3, "is"),
    (4, "on"),
    (5, "the"),
    (6, "table"),
]
ds = dfn.load(an_iterable)

## Basics (loop, show, take, take_while, collect)


In [3]:
# A Dataset is iterable: loop over it to visit each element in order
for item in ds:
    print(item)

(1, 'The')
(2, 'cat')
(3, 'is')
(4, 'on')
(5, 'the')
(6, 'table')


In [4]:
# Preview the first 3 elements; show() returns them joined into one string
ds.show(3) # str

"(1, 'The'), (2, 'cat'), (3, 'is')"

In [5]:
# Take the first 3 elements; unlike show(), take() returns them as a list
ds.take(3) # list

[(1, 'The'), (2, 'cat'), (3, 'is')]

In [6]:
# Take elements for as long as the predicate holds (here: first field <= 4)
ds.take_while(lambda x: x[0] <= 4)

[(1, 'The'), (2, 'cat'), (3, 'is'), (4, 'on')]

In [7]:
# Materialize every element into a list (a progress line is printed while collecting)
ds.collect()

Collecting examples: 6it [00:00, 1308.88it/s, total_read=6]


[(1, 'The'), (2, 'cat'), (3, 'is'), (4, 'on'), (5, 'the'), (6, 'table')]

## filter

In [8]:
# Keep tuples whose word equals "the" ignoring case; only 2 match, so take(3) returns 2
ds.filter(lambda x: x[1].lower() == "the").take(3)

[(1, 'The'), (5, 'the')]

## map


In [9]:
# Map each tuple to its first field (the index), then take the first 3 results
ds.map(lambda x: x[0]).take(3)

[1, 2, 3]

## flat_map

In [10]:
# Example 1: called with no function, flat_map flattens each tuple into its fields
ds.flat_map().show(5)

'1, The, 2, cat, 3'

In [11]:
# Example 2: pull a list out of nested dictionary fields
ds2 = dfn.load([
    {'path': {'to': {'list': [1, 2, 3]}}},
    {'path': {'to': {'list': [40, 50, 60]}}},
])
# Each nested list is emitted one element at a time
ds2.flat_map(lambda record: dl.get(record, "path.to.list")).collect()


Collecting examples: 6it [00:00, 4822.89it/s, total_read=2]


[1, 2, 3, 40, 50, 60]

## unique


In [13]:
# Drop duplicates keyed on the lowercased word x[1]; the first occurrence wins,
# so (5, 'the') is removed because (1, 'The') was already seen
ds.unique(lambda x: x[1].lower()).collect()

Collecting examples: 5it [00:00, 4806.67it/s, total_read=6]


[(1, 'The'), (2, 'cat'), (3, 'is'), (4, 'on'), (6, 'table')]

## sampling


In [15]:
# Sample the dataset (p=0.5, seed=1), then compute COUNT by collecting the
# sampled elements into a list and taking its length (5023 of 10000 here)
len(
    dfn.load(range(10000))
    .sampling(p=0.5, seed=1)
    .collect()
)

Collecting examples: 5023it [00:00, 56639.40it/s, total_read=10.0K]


5023

## aggregate

In [16]:
# Same COUNT as the sampling example, but computed with 'aggregate'
# instead of collecting into a list
ds3 = (
    dfn.load(range(10000))
    .sampling(p=0.5, seed=1)
    .aggregate(
        init=lambda: 0,
        # The current element is ignored; agg is the running count,
        # of the same type as the value returned by init
        agg=lambda _elem, agg: agg + 1,
    )
)
ds3.take()

[5023]

In [17]:
# Compute the SUM of the first n natural numbers, then add 1 in the final reduce step
n = 100
print(
    dfn.load(range(1, n + 1))
    .aggregate(
        init=lambda: 0,
        # curr is the current element, agg is the running aggregate
        # (same type as the value returned by init)
        agg=lambda curr, agg: curr + agg,
        # reduce is applied to the final aggregate: 5050 + 1
        reduce=lambda x: x + 1,
    )
    .show()
)

# Closed-form check: n(n+1)/2 + 1. True division yields a float,
# which is why this line prints 5051.0 rather than 5051.
sum_of_n = n*(n+1)/2
print(sum_of_n + 1) # To check correctness

5051
5051.0


## limit

In [19]:
# limit() returns a Dataset, so a very large source can be looped over as if it were small.
# NOTE: this rebinds `ds`, replacing the word-tuple dataset that the earlier cells use.
ds = dfn.load(range(1000000000000000)).limit(5).map(lambda x: x**2)
for squared in ds:
    print(squared)

0
1
4
9
16
