In [None]:
import warnings
warnings.filterwarnings('ignore')

In [5]:
import dask
import dask.bag as db
import dask.array as da
import numpy as np
from dask import delayed
import dask.dataframe as dd
import pandas as pd

The dask.dataframe application programming interface (API) is a subset of the Pandas API, so it should be familiar to Pandas users. There are some slight alterations due to the parallel nature of dask. As with all dask collections, one triggers computation by calling the .compute() method.

Use when: 

situations where Pandas is commonly needed, but when Pandas fails due to data size or computation speed.

    Manipulating large datasets, even when those datasets don’t fit in memory
    Accelerating long computations by using many cores
    Distributed computing on large datasets with standard Pandas operations like groupby, join, and time series computations

`class dask.dataframe.DataFrame(dsk, name, meta, divisions)`

dsk: dict

    The dask graph to compute this DataFrame

name: str

    The key prefix that specifies which keys in the dask comprise this particular DataFrame

meta: pandas.DataFrame

    An empty pandas.DataFrame with names, dtypes, and index matching the expected output.

divisions: tuple of index values

    Values along which we partition our blocks on the index

`abs()` return an object with absolute value taken-only applicable to objects that are all numeric  
`add(other, axis='columns', level=None, fill_value=None)` addition of dataframe and other, element-wise (binary operator add). Equivalent to dataframe + other, but with support to substitute a fill_value for missing data in one of the inputs.

`align(other, join='outer', axis=None, fill_value=None)` align two objects on their axes with the specified join method for each axis index

`append(other)` append rows of other to the end of this frame, returning a new object. Columns not in this frame are added as new columns.

In [17]:
df = pd.DataFrame([[1, 2], [3, 4]], columns=list('AB')) 

In [18]:
ddf = dd.from_pandas(df, npartitions=2)

In [19]:
df2 = pd.DataFrame([[5, 6], [7, 8]], columns=list('AB'))

In [26]:
ddf2 = dd.from_pandas(df2, npartitions=2)

In [30]:
ddf.compute(), ddf2.compute()

(   A  B
 0  1  2
 1  3  4,    A  B
 0  5  6
 1  7  8)

In [21]:
df.append(df2)

Unnamed: 0,A,B
0,1,2
1,3,4
0,5,6
1,7,8


In [11]:
df.append(df2, ignore_index=True)

Unnamed: 0,A,B
0,1,2
1,3,4
2,5,6
3,7,8


`apply(func, axis=0, args=(), meta='__no_default__', **kwds)` Only axis=1 is supported (and must be specified explicitly)

In [12]:
df = pd.DataFrame({'x': [1, 2, 3, 4, 5],
                    'y': [1., 2., 3., 4., 5.]})
ddf = dd.from_pandas(df, npartitions=2)

In [13]:
def myadd(row, a, b=1):
    """Return the total of one row plus the two offsets ``a`` and ``b``.

    ``row`` is whatever ``apply(..., axis=1)`` hands over for a single row;
    it only needs to provide a ``.sum()`` method. ``a`` is a required
    offset and ``b`` an optional one (defaults to 1).
    """
    row_total = row.sum()
    return row_total + a + b

In [16]:
res = ddf.apply(myadd, axis=1, args=(2,), b=1.5)
res.compute()

0     5.5
1     7.5
2     9.5
3    11.5
4    13.5
dtype: float64

`applymap(func, meta='__no_default__')` apply a function to a DataFrame that is intended to operate elementwise, i.e. like doing map(func, series) for each series in the DataFrame

In [34]:
df = pd.DataFrame(np.random.randn(3, 3))

In [35]:
df

Unnamed: 0,0,1,2
0,-0.877162,-0.715678,0.927947
1,1.098698,-0.358726,0.441249
2,-0.974668,-1.512826,-0.146479


In [36]:
df = df.applymap(lambda x: '%.2f' % x)  

In [37]:
df

Unnamed: 0,0,1,2
0,-0.88,-0.72,0.93
1,1.1,-0.36,0.44
2,-0.97,-1.51,-0.15


`assign(**kwargs)` assign new columns to a DataFrame, returning a new object (a copy) with all the original columns in addition to the new ones.

In [39]:
df = pd.DataFrame({'A': range(1, 11), 'B': np.random.randn(10)})  

In [40]:
df.assign(ln_A = lambda x: np.log(x.A))  

Unnamed: 0,A,B,ln_A
0,1,-0.992423,0.0
1,2,-0.597461,0.693147
2,3,1.970561,1.098612
3,4,-0.700434,1.386294
4,5,-0.652417,1.609438
5,6,1.218822,1.791759
6,7,-1.508239,1.94591
7,8,2.330752,2.079442
8,9,1.009,2.197225
9,10,-0.483622,2.302585


In [41]:
newcol = np.log(df['A'])
df.assign(ln_A=newcol) 

Unnamed: 0,A,B,ln_A
0,1,-0.992423,0.0
1,2,-0.597461,0.693147
2,3,1.970561,1.098612
3,4,-0.700434,1.386294
4,5,-0.652417,1.609438
5,6,1.218822,1.791759
6,7,-1.508239,1.94591
7,8,2.330752,2.079442
8,9,1.009,2.197225
9,10,-0.483622,2.302585


`astype(dtype)` cast a pandas object to a specified dtype 

`categorize(df, columns=None, index=None, split_every=None, **kwargs)` convert columns of the DataFrame to category dtype.  

    columns : list, optional

        A list of column names to convert to categoricals. By default any column with an object dtype is converted to a categorical, and any unknown categoricals are made known.

    index : bool, optional

        Whether to categorize the index. By default, object indices are converted to categorical, and unknown categorical indices are made known. Set True to always categorize the index, False to never.

    split_every : int, optional

        Group partitions into groups of this size while performing a tree-reduction. If set to False, no tree-reduction will be used. Default is 16.

    kwargs

        Keyword arguments are passed on to compute.

`clip(lower=None, upper=None, out=None)` trim values at input threshold(s).

In [44]:
df.clip(-1.0, 5) 

Unnamed: 0,A,B
0,1.0,-0.992423
1,2.0,-0.597461
2,3.0,1.970561
3,4.0,-0.700434
4,5.0,-0.652417
5,5.0,1.218822
6,5.0,-1.0
7,5.0,2.330752
8,5.0,1.009
9,5.0,-0.483622


In [50]:
df.clip_lower(1)

Unnamed: 0,A,B
0,1.0,1.0
1,2.0,1.0
2,3.0,1.970561
3,4.0,1.0
4,5.0,1.0
5,6.0,1.218822
6,7.0,1.0
7,8.0,2.330752
8,9.0,1.009
9,10.0,1.0


In [53]:
df.clip_upper(4)

Unnamed: 0,A,B
0,1.0,-0.992423
1,2.0,-0.597461
2,3.0,1.970561
3,4.0,-0.700434
4,4.0,-0.652417
5,4.0,1.218822
6,4.0,-1.508239
7,4.0,2.330752
8,4.0,1.009
9,4.0,-0.483622


`combine(other, func, fill_value=None, overwrite=True)` add two DataFrame objects and do not propagate NaN values, so if for a (column, time) one frame is missing a value, it will default to the other frame’s value (which might be NaN as well)

In [56]:
df1 = pd.DataFrame({'A': [0, 0], 'B': [4, 4]})  
df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]})  
df1.combine(df2, lambda s1, s2: s1 if s1.sum() < s2.sum() else s2)  

Unnamed: 0,A,B
0,0,3
1,0,3


`combine_first(other)` combine two DataFrame objects and default to non-null values in frame calling the method. Result index columns will be the union of the respective indexes and columns

In [57]:
df1 = pd.DataFrame([[1, np.nan]])  
df2 = pd.DataFrame([[3, 4]])  
df1.combine_first(df2)

Unnamed: 0,0,1
0,1,4.0


`compute(**kwargs)` compute the dask collection.  this turns a lazy Dask collection into its in-memory equivalent. For example a Dask.array turns into a numpy.array() and a Dask.dataframe turns into a Pandas dataframe. The entire dataset must fit into memory before calling this operation.

`copy()` make a copy of the dataframe (shallow)

`corr(method='pearson', min_periods=None, split_every=False)` compute pairwise correlation of columns, excluding NA/null values.  method : {‘pearson’, ‘kendall’, ‘spearman’}

`count(axis=None, split_every=False)` return Series with number of non-NA/null observations over requested axis. Works with non-floating point data as well (detects NaN and None)

`cov(min_periods=None, split_every=False)` compute pairwise covariance of columns, excluding NA/null values

`cummax(axis=None, skipna=True)` return cumulative max over requested axis.  
`cummin(axis=None, skipna=True)`  
`cumprod(axis=None, skipna=True)`  
`cumsum(axis=None, skipna=True)`  

`describe(split_every=False)` 

`diff(periods=1, axis=0)` 1st discrete difference of object

`div(other, axis='columns', level=None, fill_value=None)` floating division of dataframe and other, element-wise (binary operator truediv).  equivalent to dataframe / other, but with support to substitute a fill_value for missing data in one of the inputs.

`drop(labels, axis=0, errors='raise')` returns a new object with labels in requested axis removed.

In [65]:
df = pd.DataFrame(np.arange(12).reshape(3,4), 
                  columns=['A', 'B', 'C', 'D'])

In [66]:
df.drop(['B', 'C'], axis=1)

Unnamed: 0,A,D
0,0,3
1,4,7
2,8,11


`drop_duplicates(split_every=None, split_out=1, **kwargs)` return DataFrame with duplicate rows removed, optionally only considering certain columns.  

    subset : column label or sequence of labels, optional

        Only consider certain columns for identifying duplicates, by default use all of the columns

    keep : {‘first’, ‘last’, False}, default ‘first’

        first : Drop duplicates except for the first occurrence.
        last : Drop duplicates except for the last occurrence.
        False : Drop all duplicates.
    
    inplace : boolean, default False

        Whether to drop duplicates in place or to return a copy

`dropna(how='any', subset=None)` return object with labels on given axis omitted where alternately any or all of the data are missing

In [67]:
df = pd.DataFrame([[np.nan, 2, np.nan, 0], [3, 4, np.nan, 1],  
                   [np.nan, np.nan, np.nan, 5]], columns=list('ABCD'))

In [68]:
df

Unnamed: 0,A,B,C,D
0,,2.0,,0
1,3.0,4.0,,1
2,,,,5


In [71]:
df.dropna(axis=1, how='all') # col drop

Unnamed: 0,A,B,D
0,,2.0,0
1,3.0,4.0,1
2,,,5


In [72]:
df.dropna(axis=1, how='any') 

Unnamed: 0,D
0,0
1,1
2,5


In [73]:
df.dropna(axis=0, how='all') # row drop

Unnamed: 0,A,B,C,D
0,,2.0,,0
1,3.0,4.0,,1
2,,,,5


In [74]:
df.dropna(thresh=2) # keep only rows with at least 2 non-na values

Unnamed: 0,A,B,C,D
0,,2.0,,0
1,3.0,4.0,,1


`eq(other, axis='columns', level=None)` wrapper for flexible comparison methods

`eval(expr, inplace=None, **kwargs)` evaluate an expression in the context of the calling DataFrame instance

In [77]:
df = pd.DataFrame(np.random.randn(10, 2), columns=list('ab')) 
df.eval('a + b')

0    1.054118
1    0.260768
2   -0.592560
3   -0.035097
4   -0.357299
5   -0.775453
6   -0.500414
7    1.351187
8    0.963988
9    1.337153
dtype: float64

`fillna(value=None, method=None, limit=None, axis=None)` fill NA/NaN values using the specified method where `method : {‘backfill’, ‘bfill’, ‘pad’, ‘ffill’, None}, default None`

In [78]:
df = pd.DataFrame([[np.nan, 2, np.nan, 0],  
                   [3, 4, np.nan, 1],
                   [np.nan, np.nan, np.nan, 5],
                   [np.nan, 3, np.nan, 4]],
                    columns=list('ABCD'))

In [80]:
df

Unnamed: 0,A,B,C,D
0,,2.0,,0
1,3.0,4.0,,1
2,,,,5
3,,3.0,,4


In [79]:
df.fillna(0)

Unnamed: 0,A,B,C,D
0,0.0,2.0,0.0,0
1,3.0,4.0,0.0,1
2,0.0,0.0,0.0,5
3,0.0,3.0,0.0,4


In [81]:
df.fillna(method='ffill') # fill forward (propagate)

Unnamed: 0,A,B,C,D
0,,2.0,,0
1,3.0,4.0,,1
2,3.0,4.0,,5
3,3.0,3.0,,4


In [82]:
df.fillna(method='bfill') # fill backwards

Unnamed: 0,A,B,C,D
0,3.0,2.0,,0
1,3.0,4.0,,1
2,,3.0,,5
3,,3.0,,4


In [86]:
values = {'A': 0, 'B': 1, 'C': 2, 'D': 3}  #replace all NaN elements in column ‘A’, ‘B’, ‘C’, 
                                           #and ‘D’, with 0, 1, 2, and 3 respectively.
df.fillna(value=values)  

Unnamed: 0,A,B,C,D
0,0.0,2.0,2.0,0
1,3.0,4.0,2.0,1
2,0.0,1.0,2.0,5
3,0.0,3.0,2.0,4


In [87]:
df.fillna(value=values, limit=1)# only replace first NaN

Unnamed: 0,A,B,C,D
0,0.0,2.0,2.0,0
1,3.0,4.0,,1
2,,1.0,,5
3,,3.0,,4


`first(offset)` convenience method for subsetting initial periods of time series data based on a date offset.
`ts.first('10D') -> First 10 days`

`floordiv(other, axis='columns', level=None, fill_value=None)` integer division of dataframe and other, element-wise (binary operator floordiv). equivalent to `dataframe // other`, but with support to substitute a `fill_value` for missing data in one of the inputs.

`get_partition(n)` get a dask DataFrame/Series representing the nth partition.



`groupby(by=None, **kwargs)` group series using mapper (dict or key function, apply given function to group, return result as series) or by a series of columns.

	
by : mapping, function, str, or iterable

    Used to determine the groups for the groupby. If by is a function, it’s called on each value of the object’s index. If a dict or Series is passed, the Series or dict VALUES will be used to determine the groups (the Series’ values are first aligned; see .align() method). If an ndarray is passed, the values are used as-is to determine the groups. A str or list of strs may be passed to group by the columns in self

axis : int, default 0

level : int, level name, or sequence of such, default None

    If the axis is a MultiIndex (hierarchical), group by a particular level or levels

as_index : boolean, default True

    For aggregated output, return object with group labels as the index. Only relevant for DataFrame input. as_index=False is effectively “SQL-style” grouped output

sort : boolean, default True

    Sort group keys. Get better performance by turning this off. Note this does not influence the order of observations within each group. groupby preserves the order of rows within each group.

group_keys : boolean, default True

    When calling apply, add group keys to index to identify pieces

squeeze : boolean, default False

    reduce the dimensionality of the return type if possible, otherwise return a consistent type

`head(n=5, npartitions=1, compute=True)` first n rows of the dataset



`idxmax(axis=None, skipna=True, split_every=False)` return index of first occurrence of maximum over requested axis. NA/null values are excluded. 

`idxmin(axis=None, skipna=True, split_every=False)` return index of first occurrence of minimum over requested axis. NA/null values are excluded.

`info(buf=None, verbose=False, memory_usage=False)` summary of Dask DataFrame

`isin(values)` return boolean DataFrame showing whether each element in the DataFrame is contained in values.

In [91]:
df = pd.DataFrame({'A': [1, 2, 3], 'B': ['a', 'b', 'f']})  # when values are a list
df.isin([1, 3, 12, 'a'])

Unnamed: 0,A,B
0,True,True
1,False,False
2,True,False


In [92]:
df = pd.DataFrame({'A': [1, 2, 3], 'B': [1, 4, 7]}) # when values are a dict
df.isin({'A': [1, 3], 'B': [4, 7, 12]})  

Unnamed: 0,A,B
0,True,False
1,False,True
2,True,True


In [93]:
df = pd.DataFrame({'A': [1, 2, 3], 'B': ['a', 'b', 'f']})  # when values is a Series or DataFrame:
other = pd.DataFrame({'A': [1, 3, 3, 2], 'B': ['e', 'f', 'f', 'e']})
df.isin(other)

Unnamed: 0,A,B
0,True,False
1,False,False
2,True,True


`iterrows()` iterate over DataFrame rows as (index, Series) pairs.

In [95]:
df = pd.DataFrame([[1, 1.5]], columns=['int', 'float']) 
row = next(df.iterrows())[1]
df

Unnamed: 0,int,float
0,1,1.5


In [96]:
row

int      1.0
float    1.5
Name: 0, dtype: float64

`itertuples()` iterate over DataFrame rows as namedtuples, with index value as first element of the tuple.

In [97]:
df = pd.DataFrame({'col1': [1, 2], 'col2': [0.1, 0.2]}, 
                  index=['a', 'b'])
df

Unnamed: 0,col1,col2
a,1,0.1
b,2,0.2


In [98]:
for row in df.itertuples():  
    print(row)

Pandas(Index='a', col1=1, col2=0.1)
Pandas(Index='b', col1=2, col2=0.2)


In [99]:
for row in df.iterrows():
    print(row)

('a', col1    1.0
col2    0.1
Name: a, dtype: float64)
('b', col1    2.0
col2    0.2
Name: b, dtype: float64)


`join(other, on=None, how='left', lsuffix='', rsuffix='', npartitions=None, shuffle=None)` join columns with other DataFrame either on index or on a key column. Efficiently Join multiple DataFrame objects by index at once by passing a list.

other : DataFrame, Series with name field set, or list of DataFrame

    Index should be similar to one of the columns in this one. If a Series is passed, its name attribute must be set, and that will be used as the column name in the resulting joined DataFrame

on : column name, tuple/list of column names, or array-like

    Column(s) in the caller to join on the index in other, otherwise joins index-on-index. If multiple columns are given, the passed DataFrame must have a MultiIndex. Can pass an array as the join key if not already contained in the calling DataFrame. Like an Excel VLOOKUP operation

how : {‘left’, ‘right’, ‘outer’, ‘inner’}, default: ‘left’

    How to handle the operation of the two objects.

        left: use calling frame’s index (or column if on is specified)
        right: use other frame’s index
        outer: form union of calling frame’s index (or column if on is specified) with other frame’s index, and sort it lexicographically
        inner: form intersection of calling frame’s index (or column if on is specified) with other frame’s index, preserving the order of the calling’s one
lsuffix : string

    Suffix to use from left frame’s overlapping columns

rsuffix : string

    Suffix to use from right frame’s overlapping columns

sort : boolean, default False

    Order result DataFrame lexicographically by the join key. If False, the order of the join key depends on the join type (how keyword)

In [100]:
caller = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3', 'K4', 'K5'],
                       'A': ['A0', 'A1', 'A2', 'A3', 'A4', 'A5']})
caller

Unnamed: 0,A,key
0,A0,K0
1,A1,K1
2,A2,K2
3,A3,K3
4,A4,K4
5,A5,K5


In [102]:
other = pd.DataFrame({'key': ['K0', 'K1', 'K2'],
                      'B': ['B0', 'B1', 'B2']})
other

Unnamed: 0,B,key
0,B0,K0
1,B1,K1
2,B2,K2


In [103]:
caller.join(other, lsuffix='_caller', rsuffix='_other') # join using indexes

Unnamed: 0,A,key_caller,B,key_other
0,A0,K0,B0,K0
1,A1,K1,B1,K1
2,A2,K2,B2,K2
3,A3,K3,,
4,A4,K4,,
5,A5,K5,,


In [104]:
#If we want to join using the key columns, we need to set 
#key to be the index in both caller and other. The joined DataFrame will have key as its index.
caller.set_index('key').join(other.set_index('key'))  

Unnamed: 0_level_0,A,B
key,Unnamed: 1_level_1,Unnamed: 2_level_1
K0,A0,B0
K1,A1,B1
K2,A2,B2
K3,A3,
K4,A4,
K5,A5,


In [106]:
#Another option to join using the key columns is to use the on parameter. 
#DataFrame.join always uses other’s index but we can use any column in the caller. 
#This method preserves the original caller’s index in the result.
caller.join(other.set_index('key'), on='key')  

Unnamed: 0,A,key,B
0,A0,K0,B0
1,A1,K1,B1
2,A2,K2,B2
3,A3,K3,
4,A4,K4,
5,A5,K5,


`last(offset)` convenience method for subsetting final periods of time series data based on a date offset.
`ts.last('5M') -> Last 5 months`

`loc` purely label-location based indexer for selection by label.

`map_overlap(func, before, after, *args, **kwargs)` apply a function to each partition, sharing rows with adjacent partitions. this can be useful for implementing windowing functions such as `df.rolling(...).mean()` or `df.diff()`.

In [110]:
df = pd.DataFrame({'x': [1, 2, 4, 7, 11],
                   'y': [1., 2., 3., 4., 5.]})
ddf = dd.from_pandas(df, npartitions=2)

In [112]:
ddf.compute()

Unnamed: 0,x,y
0,1,1.0
1,2,2.0
2,4,3.0
3,7,4.0
4,11,5.0


a rolling sum with a trailing moving window of size 2 can be computed by overlapping 2 rows before each partition, and then mapping calls to `df.rolling(2).sum()`

In [113]:
ddf.map_overlap(lambda df: df.rolling(2).sum(), 2, 0).compute()

Unnamed: 0,x,y
0,,
1,3.0,3.0
2,6.0,5.0
3,11.0,7.0
4,18.0,9.0


In [114]:
ddf.map_overlap(lambda df: df.rolling(2).mean(), 2, 0).compute()

Unnamed: 0,x,y
0,,
1,1.5,1.5
2,3.0,2.5
3,5.5,3.5
4,9.0,4.5


The pandas `diff` method computes a discrete difference shifted by a number of periods (can be positive or negative). This can be implemented by mapping calls to `df.diff` to each partition after prepending/appending that many rows, depending on sign:

In [115]:
def diff(df, periods=1):
    """Compute a discrete difference across the partitions of ``df``.

    Each partition is differenced with ``DataFrame.diff(periods)`` after
    borrowing the rows it needs from its neighbours via ``map_overlap``.

    Parameters
    ----------
    df : object exposing ``map_overlap(func, before, after, **kwargs)``
        Typically a dask DataFrame.
    periods : int, default 1
        Shift used for the difference; may be positive or negative.

    Returns
    -------
    Whatever ``df.map_overlap`` returns (a lazy collection for dask).
    """
    # A positive shift reads earlier rows, so they are prepended from the
    # previous partition; a negative shift reads later rows, so they are
    # appended from the next partition instead.
    before, after = (periods, 0) if periods > 0 else (0, -periods)
    # Pass the computed overlaps; the original passed ``periods, 0``, which
    # gave a negative ``before`` and no ``after`` rows for negative shifts.
    return df.map_overlap(lambda part, periods=1: part.diff(periods),
                          before, after, periods=periods)

In [116]:
diff(ddf, 1).compute()

Unnamed: 0,x,y
0,,
1,1.0,1.0
2,2.0,1.0
3,3.0,1.0
4,4.0,1.0


`map_partitions(func, *args, **kwargs)` apply Python function on each DataFrame partition.

In [117]:
df = pd.DataFrame({'x': [1, 2, 3, 4, 5],
                   'y': [1., 2., 3., 4., 5.]})
ddf = dd.from_pandas(df, npartitions=2)

one can use map_partitions to apply a function on each partition. Extra arguments and keywords can optionally be provided, and will be passed to the function after the partition.

Here we apply a function with arguments and keywords to a DataFrame, resulting in a Series:

In [118]:
def myadd(df, a, b=1):
    """Add columns ``x`` and ``y`` of a partition, then add ``a`` and ``b``.

    ``df`` is one partition with ``x`` and ``y`` columns. ``a`` is a
    required offset and ``b`` an optional one (defaults to 1). The result
    is the element-wise sum.
    """
    column_total = df.x + df.y
    return column_total + a + b
res = ddf.map_partitions(myadd, 1, b=2)

In [122]:
ddf.compute()

Unnamed: 0,x,y
0,1,1.0
1,2,2.0
2,3,3.0
3,4,4.0
4,5,5.0


In [120]:
res.compute()

0     5.0
1     7.0
2     9.0
3    11.0
4    13.0
dtype: float64

In [123]:
res = ddf.map_partitions(lambda df: df.assign(z=df.x * df.y))

In [125]:
res.compute() #map a function that takes in a DataFrame, and returns a DataFrame with a new column:

Unnamed: 0,x,y,z
0,1,1.0,1.0
1,2,2.0,4.0
2,3,3.0,9.0
3,4,4.0,16.0
4,5,5.0,25.0


`mask(cond, other=nan)` return an object of same shape as self and whose corresponding entries are from self where cond is False and otherwise are from other.

In [126]:
s = pd.Series(range(5))  
s.where(s > 0)

0    NaN
1    1.0
2    2.0
3    3.0
4    4.0
dtype: float64

In [127]:
s.mask(s > 0)

0    0.0
1    NaN
2    NaN
3    NaN
4    NaN
dtype: float64

In [128]:
s.where(s > 1, 10)

0    10
1    10
2     2
3     3
4     4
dtype: int64

`max(axis=None, skipna=True, split_every=False)`  returns the maximum of the values in the object.

`mean(axis=None, skipna=True, split_every=False)` return the mean of the values for the requested axis

`merge(right, how='inner', on=None, left_on=None, right_on=None, left_index=False, right_index=False, suffixes=('_x', '_y'), indicator=False, npartitions=None, shuffle=None)` merge DataFrame objects by performing a database-style join operation by columns or indexes.  If joining columns on columns, the DataFrame indexes will be ignored. Otherwise if joining indexes on indexes or indexes on a column or columns, the index will be passed on.

	
right : DataFrame

how : {‘left’, ‘right’, ‘outer’, ‘inner’}, default ‘inner’

    left: use only keys from left frame, similar to a SQL left outer join; preserve key order
    right: use only keys from right frame, similar to a SQL right outer join; preserve key order
    outer: use union of keys from both frames, similar to a SQL full outer join; sort keys lexicographically
    inner: use intersection of keys from both frames, similar to a SQL inner join; preserve the order of the left keys
on : label or list

    Field names to join on. Must be found in both DataFrames. If on is None and not merging on indexes, then it merges on the intersection of the columns by default.

left_on : label or list, or array-like

    Field names to join on in left DataFrame. Can be a vector or list of vectors of the length of the DataFrame to use a particular vector as the join key instead of columns

right_on : label or list, or array-like

    Field names to join on in right DataFrame or vector/list of vectors per left_on docs

left_index : boolean, default False

    Use the index from the left DataFrame as the join key(s). If it is a MultiIndex, the number of keys in the other DataFrame (either the index or a number of columns) must match the number of levels

right_index : boolean, default False

    Use the index from the right DataFrame as the join key. Same caveats as left_index

sort : boolean, default False

    Sort the join keys lexicographically in the result DataFrame. If False, the order of the join keys depends on the join type (how keyword)

suffixes : 2-length sequence (tuple, list, …)

    Suffix to apply to overlapping column names in the left and right side, respectively

copy : boolean, default True

    If False, do not copy data unnecessarily

indicator : boolean or string, default False

    If True, adds a column to output DataFrame called “_merge” with information on the source of each row. If string, column with information on source of each row will be added to output DataFrame, and column will be named value of string. Information column is Categorical-type and takes on a value of “left_only” for observations whose merge key only appears in ‘left’ DataFrame, “right_only” for observations whose merge key only appears in ‘right’ DataFrame, and “both” if the observation’s merge key is found in both.


validate : string, default None

    If specified, checks if merge is of specified type.

    “one_to_one” or “1:1”: check if merge keys are unique in both left and right datasets.
    “one_to_many” or “1:m”: check if merge keys are unique in left dataset.
    “many_to_one” or “m:1”: check if merge keys are unique in right dataset.
    “many_to_many” or “m:m”: allowed, but does not result in checks.

`min(axis=None, skipna=True, split_every=False)`  returns the minimum of the values in the object.

`mod(other, axis='columns', level=None, fill_value=None)` modulo of dataframe and other, element-wise (binary operator mod).  equivalent to `dataframe % other`, but with support to substitute a fill_value for missing data in one of the inputs.

`mul(other, axis='columns', level=None, fill_value=None)` multiplication of dataframe and other, element-wise (binary operator mul).

`nlargest(n=5, columns=None, split_every=None)` return the rows of a DataFrame sorted by the n largest values of columns.

In [131]:
df = pd.DataFrame({'a': [1, 10, 8, 11, -1],  
                'b': list('abdce'),
                'c': [1.0, 2.0, np.nan, 3.0, 4.0]})
df.nlargest(3, 'a')  

Unnamed: 0,a,b,c
3,11,c,3.0
1,10,b,2.0
2,8,d,


`nsmallest(n=5, columns=None, split_every=None)` return rows of a DataFrame sorted by the n smallest values of columns.

In [132]:
df.nsmallest(3, 'a')

Unnamed: 0,a,b,c
4,-1,e,4.0
0,1,a,1.0
2,8,d,


`nunique_approx(split_every=None)` approximate number of unique rows. this method uses the HyperLogLog algorithm for cardinality estimation to compute the approximate number of unique rows. The approximate error is 0.406%.

`persist(**kwargs)` persist this dask collection into memory.  this turns a lazy Dask collection into a Dask collection with the same metadata, but now with the results fully computed or actively computing in the background.

`pipe(func, *args, **kwargs)` apply func(self, *args, **kwargs)

`pivot_table(index=None, columns=None, values=None, aggfunc='mean')` create a spreadsheet-style pivot table as a DataFrame. Target columns must have category dtype to infer result’s columns. `index`, `columns`, `values` and `aggfunc` must be all scalar.  where `aggfunc : {‘mean’, ‘sum’, ‘count’}, default ‘mean’`

`pow(other, axis='columns', level=None, fill_value=None)` exponential power of dataframe and other, element-wise (binary operator pow).

`prod(axis=None, skipna=True, split_every=False)` return the product of the values for the requested axis

`quantile(q=0.5, axis=0)` approximate row-wise and precise column-wise quantiles of DataFrame

`query(expr, **kwargs)` filter dataframe with complex expression

`radd(other, axis='columns', level=None, fill_value=None)` addition of dataframe and other, element-wise (binary operator radd).

`random_split(frac, random_state=None)` pseudorandomly split dataframe into different pieces row-wise

In [148]:
a, b = ddf.random_split([0.5, 0.5])  

In [149]:
a.compute()

Unnamed: 0,x,y
1,2,2.0
3,4,4.0


In [150]:
b.compute()

Unnamed: 0,x,y
0,1,1.0
2,3,3.0
4,5,5.0


In [177]:
a, b, c = ddf.random_split([0.6, 0.2, 0.2], random_state=2)  

In [178]:
a.compute()

Unnamed: 0,x,y
0,1,1.0
1,2,2.0
3,4,4.0
4,5,5.0


In [179]:
b.compute()

Unnamed: 0,x,y
2,3,3.0


In [180]:
c.compute()

Unnamed: 0,x,y


`rdiv(other, axis='columns', level=None, fill_value=None)` floating division of dataframe and other, element-wise (binary operator rtruediv).

`reduction(chunk, aggregate=None, combine=None, meta='__no_default__', token=None, split_every=None, chunk_kwargs=None, aggregate_kwargs=None, combine_kwargs=None, **kwargs)` row-wise reductions.
chunk : callable

    Function to operate on each partition. Should return a pandas.DataFrame, pandas.Series, or a scalar.

aggregate : callable, optional

    Function to operate on the concatenated result of chunk. If not specified, defaults to chunk. Used to do the final aggregation in a tree reduction.

    The input to aggregate depends on the output of chunk. If the output of chunk is a:

        scalar: Input is a Series, with one row per partition.
        Series: Input is a DataFrame, with one row per partition. Columns are the rows in the output series.
        DataFrame: Input is a DataFrame, with one row per partition. Columns are the columns in the output dataframes.
        Should return a pandas.DataFrame, pandas.Series, or a scalar.

combine : callable, optional

    Function to operate on intermediate concatenated results of chunk in a tree-reduction. If not provided, defaults to aggregate. The input/output requirements should match that of aggregate described above.

meta : pd.DataFrame, pd.Series, dict, iterable, tuple, optional

    An empty pd.DataFrame or pd.Series that matches the dtypes and column names of the output. This metadata is necessary for many algorithms in dask dataframe to work. For ease of use, some alternative inputs are also available. Instead of a DataFrame, a dict of {name: dtype} or iterable of (name, dtype) can be provided. Instead of a series, a tuple of (name, dtype) can be used. If not provided, dask will try to infer the metadata. This may lead to unexpected results, so providing meta is recommended. For more information, see dask.dataframe.utils.make_meta.

token : str, optional

    The name to use for the output keys.

split_every : int, optional

    Group partitions into groups of this size while performing a tree-reduction. If set to False, no tree-reduction will be used, and all intermediates will be concatenated and passed to aggregate. Default is 8.

chunk_kwargs : dict, optional

    Keyword arguments to pass on to chunk only.

aggregate_kwargs : dict, optional

    Keyword arguments to pass on to aggregate only.

combine_kwargs : dict, optional

    Keyword arguments to pass on to combine only.

kwargs :

    All remaining keywords will be passed to chunk, combine, and aggregate.

In [181]:
df = pd.DataFrame({'x': range(50), 'y': range(50, 100)})

In [183]:
ddf = dd.from_pandas(df, npartitions=4)

In [189]:
ddf

Unnamed: 0_level_0,x,y
npartitions=4,Unnamed: 1_level_1,Unnamed: 2_level_1
0,int64,int64
13,...,...
26,...,...
39,...,...
49,...,...


In [185]:
#count the number of rows in a DataFrame. To do this, count the number of 
#rows in each partition, then sum the results:
res = ddf.reduction(lambda x: x.count(),
                    aggregate=lambda x: x.sum())

In [186]:
res.compute()

x    50
y    50
dtype: int64

In [190]:
#Count the number of rows in a Series with elements greater than or equal to a value (provided via a keyword).
def count_greater(x, value=0):
    """Count the elements of ``x`` that are greater than or equal to ``value``."""
    at_or_above = x >= value
    return at_or_above.sum()
res = ddf.x.reduction(count_greater, aggregate=lambda x: x.sum(), # series (col) x
                      chunk_kwargs={'value': 25})
res.compute()

25

In [192]:
res = ddf.y.reduction(count_greater, aggregate=lambda x: x.sum(), # series (col) y
                      chunk_kwargs={'value': 25})
res.compute()

50

In [193]:
# Aggregate both the sum and count of a Series at the same time:
def sum_and_count(x):
    """Return a two-entry Series holding the ``sum`` and ``count`` of ``x``."""
    stats = {'sum': x.sum()}
    stats['count'] = x.count()
    return pd.Series(stats)
res = ddf.x.reduction(sum_and_count, aggregate=lambda x: x.sum())
res.compute()

count      50
sum      1225
dtype: int64

In [194]:
res = ddf.y.reduction(sum_and_count, aggregate=lambda x: x.sum())
res.compute()

count      50
sum      3725
dtype: int64

Doing the same, but for a DataFrame. Here chunk returns a DataFrame, meaning the input to aggregate is a DataFrame with an index with non-unique entries for both ‘x’ and ‘y’. We groupby the index, and sum each group to get the final result.

In [196]:
def sum_and_count(x):
    """Return per-column sums and counts of ``x`` as a DataFrame.

    The result has one row per label of ``x.sum()`` / ``x.count()`` and two
    columns, 'sum' and 'count'.
    """
    totals = x.sum()
    counts = x.count()
    return pd.DataFrame({'sum': totals, 'count': counts})

res = ddf.reduction(sum_and_count,
                    aggregate=lambda x: x.groupby(level=0).sum())
res.compute()

Unnamed: 0,count,sum
x,50,1225
y,50,3725


`rename(index=None, columns=None)` alter axis labels.  function / dict values must be unique (1-to-1). Labels not contained in a dict / Series will be left as-is. Extra labels listed don’t throw an error.

mapper, index, columns : dict-like or function, optional

    dict-like or functions transformations to apply to that axis’ values. Use either mapper and axis to specify the axis to target with mapper, or index and columns.

axis : int or str, optional

    Axis to target with mapper. Can be either the axis name (‘index’, ‘columns’) or number (0, 1). The default is ‘index’.

copy : boolean, default True

    Also copy underlying data

inplace : boolean, default False

    Whether to return a new DataFrame. If True then value of copy is ignored.

level : int or level name, default None

    In case of a MultiIndex, only rename labels in the specified level.

In [197]:
df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})  
df.rename(index=str, columns={"A": "a", "B": "c"})  

Unnamed: 0,a,c
0,1,4
1,2,5
2,3,6


In [198]:
df.rename(index=str, columns={"A": "a", "C": "c"})  

Unnamed: 0,a,B
0,1,4
1,2,5
2,3,6


In [199]:
df.rename(str.lower, axis='columns')  

Unnamed: 0,a,b
0,1,4
1,2,5
2,3,6


In [200]:
df.rename({1: 2, 2: 4}, axis='index')  

Unnamed: 0,A,B
0,1,4
2,2,5
4,3,6


`repartition(divisions=None, npartitions=None, freq=None, force=False)` repartition dataframe along new divisions

`resample(rule, how=None, closed=None, label=None)` convenience method for frequency conversion and resampling of time series. Object must have a datetime-like index (DatetimeIndex, PeriodIndex, or TimedeltaIndex), or pass datetime-like values to the on or level keyword.

rule : string

    the offset string or object representing target conversion

axis : int, optional, default 0

closed : {‘right’, ‘left’}

    Which side of bin interval is closed. The default is ‘left’ for all frequency offsets except for ‘M’, ‘A’, ‘Q’, ‘BM’, ‘BA’, ‘BQ’, and ‘W’ which all have a default of ‘right’.

label : {‘right’, ‘left’}

    Which bin edge label to label bucket with. The default is ‘left’ for all frequency offsets except for ‘M’, ‘A’, ‘Q’, ‘BM’, ‘BA’, ‘BQ’, and ‘W’ which all have a default of ‘right’.

convention : {‘start’, ‘end’, ‘s’, ‘e’}

    For PeriodIndex only, controls whether to use the start or end of rule

loffset : timedelta

    Adjust the resampled time labels

base : int, default 0

    For frequencies that evenly subdivide 1 day, the “origin” of the aggregated intervals. For example, for ‘5min’ frequency, base could range from 0 through 4. Defaults to 0

on : string, optional

    For a DataFrame, column to use instead of index for resampling. Column must be datetime-like.

level : string or int, optional

    For a MultiIndex, level (name or number) to use for resampling. Level must be datetime-like.


In [201]:
index = pd.date_range('1/1/2000', periods=9, freq='T') 

In [202]:
index

DatetimeIndex(['2000-01-01 00:00:00', '2000-01-01 00:01:00',
               '2000-01-01 00:02:00', '2000-01-01 00:03:00',
               '2000-01-01 00:04:00', '2000-01-01 00:05:00',
               '2000-01-01 00:06:00', '2000-01-01 00:07:00',
               '2000-01-01 00:08:00'],
              dtype='datetime64[ns]', freq='T')

In [203]:
series = pd.Series(range(9), index=index)

In [204]:
series

2000-01-01 00:00:00    0
2000-01-01 00:01:00    1
2000-01-01 00:02:00    2
2000-01-01 00:03:00    3
2000-01-01 00:04:00    4
2000-01-01 00:05:00    5
2000-01-01 00:06:00    6
2000-01-01 00:07:00    7
2000-01-01 00:08:00    8
Freq: T, dtype: int64

In [205]:
#downsample the series into 3 minute bins and sum the values of the timestamps falling into a bin.
series.resample('3T').sum()

2000-01-01 00:00:00     3
2000-01-01 00:03:00    12
2000-01-01 00:06:00    21
Freq: 3T, dtype: int64

downsample the series into 3 minute bins as above, but label each bin using the right edge instead of the left. Please note that the value in the bucket used as the label is not included in the bucket, which it labels. For example, in the original series the bucket 2000-01-01 00:03:00 contains the value 3, but the summed value in the resampled bucket with the label 2000-01-01 00:03:00 does not include 3 (if it did, the summed value would be 6, not 3). To include this value close the right side of the bin interval as illustrated in the example below this one.

In [206]:
series.resample('3T', label='right').sum()  

2000-01-01 00:03:00     3
2000-01-01 00:06:00    12
2000-01-01 00:09:00    21
Freq: 3T, dtype: int64

In [207]:
# downsample the series into 3 minute bins as above, but close the right side of the bin interval.
series.resample('3T', label='right', closed='right').sum()  

2000-01-01 00:00:00     0
2000-01-01 00:03:00     6
2000-01-01 00:06:00    15
2000-01-01 00:09:00    15
Freq: 3T, dtype: int64

In [208]:
# upsample the series into 30 second bins
series.resample('30S').asfreq()[0:5] #select first 5 rows  

2000-01-01 00:00:00    0.0
2000-01-01 00:00:30    NaN
2000-01-01 00:01:00    1.0
2000-01-01 00:01:30    NaN
2000-01-01 00:02:00    2.0
Freq: 30S, dtype: float64

In [209]:
# Upsample the series into 30 second bins and fill the NaN values by
# forward-filling: each new slot takes the last preceding observed value.
# `ffill()` replaces `pad()`, which was deprecated in pandas 1.4 and removed in 2.0.
series.resample('30S').ffill()[0:5]

2000-01-01 00:00:00    0
2000-01-01 00:00:30    0
2000-01-01 00:01:00    1
2000-01-01 00:01:30    1
2000-01-01 00:02:00    2
Freq: 30S, dtype: int64

In [211]:
#upsample the series into 30 second bins and fill the NaN values using the bfill method.
series.resample('30S').bfill()[0:5]  

2000-01-01 00:00:00    0
2000-01-01 00:00:30    1
2000-01-01 00:01:00    1
2000-01-01 00:01:30    2
2000-01-01 00:02:00    2
Freq: 30S, dtype: int64

In [212]:
# pass a custom function with apply
def custom_resampler(array_like):
    """Return the sum of ``array_like`` (via ``np.sum``) plus 5."""
    total = np.sum(array_like)
    return total + 5

In [213]:
series.resample('3T').apply(custom_resampler)  

2000-01-01 00:00:00     8
2000-01-01 00:03:00    17
2000-01-01 00:06:00    26
Freq: 3T, dtype: int64

for a Series with a PeriodIndex, the keyword convention can be used to control whether to use the start or end of rule.

In [214]:
s = pd.Series([1, 2], index=pd.period_range('2012-01-01', 
                                            freq='A',
                                            periods=2))

In [215]:
s

2012    1
2013    2
Freq: A-DEC, dtype: int64

In [216]:
#resample by month using ‘start’ convention. Values are assigned to the first month of the period.
s.resample('M', convention='start').asfreq().head()  

2012-01    1.0
2012-02    NaN
2012-03    NaN
2012-04    NaN
2012-05    NaN
Freq: M, dtype: float64

In [217]:
#resample by month using ‘end’ convention. Values are assigned to the last month of the period.
s.resample('M', convention='end').asfreq()  

2012-12    1.0
2013-01    NaN
2013-02    NaN
2013-03    NaN
2013-04    NaN
2013-05    NaN
2013-06    NaN
2013-07    NaN
2013-08    NaN
2013-09    NaN
2013-10    NaN
2013-11    NaN
2013-12    2.0
Freq: M, dtype: float64

for DataFrame objects, the keyword on can be used to specify the column instead of the index for resampling.

In [218]:
df = pd.DataFrame(data=9*[range(4)], columns=['a', 'b', 'c', 'd'])
df['time'] = pd.date_range('1/1/2000', periods=9, freq='T')  
df.resample('3T', on='time').sum()  

Unnamed: 0_level_0,a,b,c,d
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2000-01-01 00:00:00,0,3,6,9
2000-01-01 00:03:00,0,3,6,9
2000-01-01 00:06:00,0,3,6,9


`reset_index(drop=False)` reset the index to the default index.  unlike in pandas, the reset dask.dataframe index will not be monotonically increasing from 0. Instead, it will restart at 0 for each partition (e.g. `index1 = [0, ..., 10], index2 = [0, ...]`). This is due to the inability to statically know the full length of the index.

`rfloordiv(other, axis='columns', level=None, fill_value=None)` integer division of dataframe and other, element-wise (binary operator rfloordiv).

`rmod(other, axis='columns', level=None, fill_value=None)` modulo of dataframe and other, element-wise (binary operator rmod).

`rmul(other, axis='columns', level=None, fill_value=None)` multiplication of dataframe and other, element-wise (binary operator rmul).

`rolling(window, min_periods=None, freq=None, center=False, win_type=None, axis=0)` provides rolling transformations.

window : int, str, offset

    Size of the moving window. This is the number of observations used for calculating the statistic. The window size must not be so large as to span more than one adjacent partition. If using an offset or offset alias like ‘5D’, the data must have a DatetimeIndex

min_periods : int, default None

    Minimum number of observations in window required to have a value (otherwise result is NA).

center : boolean, default False

    Set the labels at the center of the window.

win_type : string, default None

    Provide a window type. The recognized window types are identical to pandas.

axis : int, default 0

`round(decimals=0)` round a DataFrame to a variable number of decimal places.

`rpow(other, axis='columns', level=None, fill_value=None)` exponential power of dataframe and other, element-wise (binary operator rpow).

`rsub(other, axis='columns', level=None, fill_value=None)` subtraction of dataframe and other, element-wise (binary operator rsub).

`rtruediv(other, axis='columns', level=None, fill_value=None)` floating division of dataframe and other, element-wise (binary operator rtruediv).

`sample(frac, replace=False, random_state=None)` random sample of items

`sem(axis=None, skipna=None, ddof=1, split_every=False)` return unbiased standard error of the mean over requested axis.  normalized by N-1 by default. This can be changed using the ddof argument

`shift(periods=1, freq=None, axis=0)` shift index by desired number of periods with an optional time freq

`std(axis=None, skipna=True, ddof=1, split_every=False)` return sample standard deviation over requested axis. normalized by N-1 by default. This can be changed using the ddof argument

`sub(other, axis='columns', level=None, fill_value=None)` subtraction of dataframe and other, element-wise (binary operator sub).

`sum(axis=None, skipna=True, split_every=False)` return the sum of the values for the requested axis

`truediv(other, axis='columns', level=None, fill_value=None)` floating division of dataframe and other, element-wise (binary operator truediv).

`var(axis=None, skipna=True, ddof=1, split_every=False)` return unbiased variance over requested axis.

`visualize(filename='mydask', format=None, optimize_graph=False, **kwargs)` render the computation of this object’s task graph using graphviz.  

filename : str or None, optional

    The name (without an extension) of the file to write to disk. If filename is None, no file will be written, and we communicate with dot using only pipes.

format : {‘png’, ‘pdf’, ‘dot’, ‘svg’, ‘jpeg’, ‘jpg’}, optional

    Format in which to write output file. Default is ‘png’.

optimize_graph : bool, optional

    If True, the graph is optimized before rendering. Otherwise, the graph is displayed as is. Default is False.

color: {None, ‘order’}, optional

    Options to color nodes. Provide cmap= keyword for additional colormap

**kwargs

    Additional keyword arguments to forward to to_graphviz.

`where(cond, other=nan)` return an object of same shape as self and whose corresponding entries are from self where cond is True and otherwise are from other.

### Series - many similar as DataFrame

`class dask.dataframe.Series(dsk, name, meta, divisions)` 
	
dsk: dict

    The dask graph to compute this Series

name: str

    The key prefix that specifies which keys in the dask comprise this particular Series

meta: pandas.Series

    An empty pandas.Series with names, dtypes, and index matching the expected output.

divisions: tuple of index values

    Values along which we partition our blocks on the index

### DataFrameGroupBy

`class dask.dataframe.groupby.DataFrameGroupBy(df, by=None, slice=None)` 

`agg(arg, split_every=None, split_out=1)` aggregate using callable, string, dict, or list of string/callables

arg : callable, string, dictionary, or list of string/callables

    Function to use for aggregating the data. If a function, must either work when passed a DataFrame or when passed to DataFrame.apply. For a DataFrame, can pass a dict, if the keys are DataFrame column names.

    Accepted Combinations are:

        string function name
        function
        list of functions
        dict of column names -> functions (or list of functions)

In [270]:
df = pd.DataFrame({'A': [1, 1, 2, 2],  
                   'B': [1, 2, 3, 4],
                   'C': np.random.randn(4)})

In [271]:
# aggregation is for each column.
df.groupby('A').agg('min') 

Unnamed: 0_level_0,B,C
A,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1,-0.24703
2,3,-0.316788


In [272]:
# multiple aggregations
df.groupby('A').agg(['min', 'max'])  

Unnamed: 0_level_0,B,B,C,C
Unnamed: 0_level_1,min,max,min,max
A,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,1,2,-0.24703,0.654628
2,3,4,-0.316788,0.305735


In [273]:
# select a column for aggregation
df.groupby('A').B.agg(['min', 'max'])  

Unnamed: 0_level_0,min,max
A,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1,2
2,3,4


In [274]:
# different aggregations per column
df.groupby('A').agg({'B': ['min', 'max'], 'C': 'sum'})

Unnamed: 0_level_0,B,B,C
Unnamed: 0_level_1,min,max,sum
A,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,1,2,0.407598
2,3,4,-0.011052


`apply(func, meta='__no_default__')` parallel version of pandas GroupBy.apply

`count(split_every=None, split_out=1)` compute count of group, excluding missing values

`cumcount(axis=None)` number each item in each group from 0 to the length of that group - 1.

In [275]:
df = pd.DataFrame([['a'], ['a'], ['a'], ['b'], ['b'], ['a']],
                  columns=['A'])

In [276]:
df.groupby('A').cumcount()

0    0
1    1
2    2
3    0
4    1
5    3
dtype: int64

`cumprod(axis=0)` cumulative product for each group

`cumsum(axis=0)` cumulative sum for each group

`max(split_every=None, split_out=1)` compute max of group values (`min()` is the counterpart and computes the min of group values)

`mean(split_every=None, split_out=1)` compute mean of groups, excluding missing values

`size(split_every=None, split_out=1)` compute group sizes  

`std(ddof=1, split_every=None, split_out=1)` compute standard deviation of groups, excluding missing values

`sum(split_every=None, split_out=1)` compute sum of group values

`var(ddof=1, split_every=None, split_out=1)` compute variance of groups, excluding missing values

`class dask.dataframe.groupby.SeriesGroupBy(df, by=None, slice=None)` 

### Storage and Conversion

`dask.dataframe.read_csv(urlpath, blocksize=64000000, collection=True, lineterminator=None, compression=None, sample=256000, enforce=False, assume_missing=False, storage_options=None, **kwargs)`  

`dask.dataframe.read_table(urlpath, blocksize=64000000, collection=True, lineterminator=None, compression=None, sample=256000, enforce=False, assume_missing=False, storage_options=None, **kwargs)`  

`dask.dataframe.read_parquet(path, columns=None, filters=None, categories=None, index=None, storage_options=None, engine='auto')`  

`dask.dataframe.read_hdf(pattern, key, start=0, stop=None, columns=None, chunksize=1000000, sorted_index=False, lock=True, mode='a')`  

`dask.dataframe.read_sql_table(table, uri, index_col, divisions=None, npartitions=None, limits=None, columns=None, bytes_per_chunk=268435456, head_rows=5, schema=None, meta=None, **kwargs)`  

`dask.dataframe.from_array(x, chunksize=50000, columns=None)`  

`dask.dataframe.from_pandas(data, npartitions=None, chunksize=None, sort=True, name=None)`  

`dask.dataframe.from_dask_array(x, columns=None)`  

`dask.dataframe.from_delayed(dfs, meta=None, divisions=None, prefix='from-delayed')`  

To go the other way, most of these have a corresponding `to_*` method (e.g. `to_csv`, `to_parquet`, `to_hdf`, `to_delayed`).