In [1]:
import pandas
import dask.dataframe as daskDataFrame

person_IDs = [1,2,3,4,5,6,7,8,9,10]
person_last_names = ['Smith', 'Williams', 'Williams','Jackson','Johnson','Smith','Anderson','Christiansen','Carter','Davidson']
person_first_names = ['John', 'Bill', 'Jane','Cathy','Stuart','James','Felicity','Liam','Nancy','Christina']
person_DOBs = ['1982-10-06', '1990-07-04', '1989-05-06', '1974-01-24', '1995-06-05', '1984-04-16', '1976-09-15', '1992-10-02', '1986-02-05', '1993-08-11']

peoplePandasDataFrame = pandas.DataFrame({'Person ID':person_IDs, 
                                          'Last Name': person_last_names, 
                                          'First Name': person_first_names,
                                          'Date of Birth': person_DOBs},
                                         columns=['Person ID', 'Last Name', 'First Name', 'Date of Birth'])

peopleDaskDataFrame = daskDataFrame.from_pandas(peoplePandasDataFrame, npartitions=2)

In [2]:
print(peopleDaskDataFrame.compute())
print(peopleDaskDataFrame.divisions)
print(peopleDaskDataFrame.npartitions)

   Person ID     Last Name First Name Date of Birth
0          1         Smith       John    1982-10-06
1          2      Williams       Bill    1990-07-04
2          3      Williams       Jane    1989-05-06
3          4       Jackson      Cathy    1974-01-24
4          5       Johnson     Stuart    1995-06-05
5          6         Smith      James    1984-04-16
6          7      Anderson   Felicity    1976-09-15
7          8  Christiansen       Liam    1992-10-02
8          9        Carter      Nancy    1986-02-05
9         10      Davidson  Christina    1993-08-11
(0, 5, 9)
2


### map_partition # takes all row as 1 argument

Above shows a couple useful attributes of Dask DataFrames that can be used to inspect how a DataFrame is partitioned. The first attribute, divisions, (0, 5, 9), shows the boundaries of the partitioning scheme (remember that partitions are created on the index). This might look strange since there are two partitions but three boundaries. Each partition’s boundary consists of pairs of numbers from the list of divisions. The boundary for the first partition is “from 0 up to (but not including) 5,” meaning it will contain rows 0, 1, 2, 3, and 4. The boundary for the second partition is “from 5 through (and including) 9,” meaning it will contain rows 5, 6, 7, 8, and 9. The last partition always includes the upper boundary, whereas the other partitions go up to but don’t include their upper boundary.

In [44]:
peopleDaskDataFrame.map_partitions(len).compute()

0    5
1    5
dtype: int64

In [45]:
people_filtered = peopleDaskDataFrame[peopleDaskDataFrame['Last Name'] != 'Williams']

In [46]:
print(people_filtered.map_partitions(len).compute())

0    3
1    5
dtype: int64


In [47]:
print(type(people_filtered.map_partitions(len)))

<class 'dask.dataframe.core.Series'>


In [48]:
people_filtered.compute()

Unnamed: 0,Person ID,Last Name,First Name,Date of Birth
0,1,Smith,John,1982-10-06
3,4,Jackson,Cathy,1974-01-24
4,5,Johnson,Stuart,1995-06-05
5,6,Smith,James,1984-04-16
6,7,Anderson,Felicity,1976-09-15
7,8,Christiansen,Liam,1992-10-02
8,9,Carter,Nancy,1986-02-05
9,10,Davidson,Christina,1993-08-11


In [49]:
### applymap

In [41]:
dfa = pandas.DataFrame([[1, 2.12], [3.356, 4.567]])
print(type(dfa))
print(dfa)

<class 'pandas.core.frame.DataFrame'>
       0      1
0  1.000  2.120
1  3.356  4.567


In [45]:
def mySquare(x):
    print('x => ', x)
    print('type(x) => ', type(x))


In [46]:
dfa[1].map(mySquare)

x =>  2.12
type(x) =>  <class 'float'>
x =>  4.567
type(x) =>  <class 'float'>


0    None
1    None
Name: 1, dtype: object

In [47]:
dfa.applymap(mySquare)

x =>  1.0
type(x) =>  <class 'float'>
x =>  3.356
type(x) =>  <class 'float'>
x =>  2.12
type(x) =>  <class 'float'>
x =>  4.567
type(x) =>  <class 'float'>


Unnamed: 0,0,1
0,,
1,,


In [8]:
dfa.applymap(lambda x: len(str(x)))

Unnamed: 0,0,1
0,3,4
1,5,5


In [5]:
dfa[0]

0    1.000
1    3.356
Name: 0, dtype: float64

In [55]:
peopleDaskDataFrame.compute()

Unnamed: 0,Person ID,Last Name,First Name,Date of Birth
0,1,Smith,John,1982-10-06
1,2,Williams,Bill,1990-07-04
2,3,Williams,Jane,1989-05-06
3,4,Jackson,Cathy,1974-01-24
4,5,Johnson,Stuart,1995-06-05
5,6,Smith,James,1984-04-16
6,7,Anderson,Felicity,1976-09-15
7,8,Christiansen,Liam,1992-10-02
8,9,Carter,Nancy,1986-02-05
9,10,Davidson,Christina,1993-08-11


In [72]:
peopleDaskDataFrame = peopleDaskDataFrame.set_index('First Name')

In [76]:
peopleDaskDataFrame.compute()

Unnamed: 0_level_0,Person ID,Last Name,Date of Birth
First Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Bill,2,Williams,1990-07-04
Cathy,4,Jackson,1974-01-24
Christina,10,Davidson,1993-08-11
Felicity,7,Anderson,1976-09-15
James,6,Smith,1984-04-16
Jane,3,Williams,1989-05-06
John,1,Smith,1982-10-06
Liam,8,Christiansen,1992-10-02
Nancy,9,Carter,1986-02-05
Stuart,5,Johnson,1995-06-05


In [84]:
count = 0
def myF(row):
    print("type(row) => ", type(row))
    print("row.dtypes => ", row.dtypes)
    print("row['Last Name'] => ", row['Last Name'])
    print("row => \n", row)

#     count = count+1
    return row['Person ID'] * 10
# a = peopleDaskDataFrame.applymap(myF)
a = peopleDaskDataFrame.apply(myF, axis=1)

type(row) =>  <class 'pandas.core.series.Series'>
row.dtypes =>  object
row['Last Name'] =>  foo
row => 
 Person ID          1
Last Name        foo
Date of Birth    foo
Name: a, dtype: object
type(row) =>  <class 'pandas.core.series.Series'>
row.dtypes =>  object
row['Last Name'] =>  foo
row => 
 Person ID          1
Last Name        foo
Date of Birth    foo
Name: b, dtype: object


You did not provide metadata, so Dask is running your function on a small dataset to guess output types. It is possible that Dask will guess incorrectly.
To provide an explicit output types or to silence this message, please provide the `meta=` keyword, as described in the map or apply function that you are using.
  Before: .apply(func)
  After:  .apply(func, meta=(None, 'int64'))



In [86]:
a.nlargest(3).compute()

type(row) =>  <class 'pandas.core.series.Series'>
row.dtypes =>  object
row['Last Name'] =>  Williams
row => 
 Person ID                 2
Last Name          Williams
Date of Birth    1990-07-04
Name: Bill, dtype: object
type(row) =>  <class 'pandas.core.series.Series'>
row.dtypes =>  type(row) =>  <class 'pandas.core.series.Series'>object
row['Last Name'] =>  Jackson
row => 
 Person ID                 4
Last Name           Jackson
Date of Birth    1974-01-24
Name: Cathy, dtype: object
type(row) =>  <class 'pandas.core.series.Series'>
row.dtypes =>  object
row['Last Name'] =>  Davidson

row.dtypes =>  object
row['Last Name'] =>  Smithrow => 
 Person ID                10
Last Name          Davidson
Date of Birth    1993-08-11
Name: Christina, dtype: object
row => 
 Person ID                 6
Last Name             Smith
Date of Birth    1984-04-16
Name: James, dtype: object
type(row) =>  <class 'pandas.core.series.Series'>
row.dtypes =>  object
row['Last Name'] =>  Williams
row => 
 Per

First Name
Christina    100
Nancy         90
Liam          80
dtype: int64